@@ -0,0 +1,391 @@
+import { Buffer } from "buffer";
+import { randomBytes } from "crypto";
+import { Readable } from "stream";
+
+// Adapted from https://github.com/Migushthe2nd/MsEdgeTTS
+
+/**
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
+ */
+export enum VOLUME {
+  SILENT = "silent",
+  X_SOFT = "x-soft",
+  SOFT = "soft",
+  MEDIUM = "medium",
+  LOUD = "loud",
+  X_LOUD = "x-loud",
+  DEFAULT = "default",
+}
+
+/**
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
+ */
+export enum RATE {
+  X_SLOW = "x-slow",
+  SLOW = "slow",
+  MEDIUM = "medium",
+  FAST = "fast",
+  X_FAST = "x-fast",
+  DEFAULT = "default",
+}
+
+/**
+ * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
+ */
+export enum PITCH {
+  X_LOW = "x-low",
+  LOW = "low",
+  MEDIUM = "medium",
+  HIGH = "high",
+  X_HIGH = "x-high",
+  DEFAULT = "default",
+}
+
+/**
+ * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
+ */
+export enum OUTPUT_FORMAT {
+  // Streaming =============================
+  // AMR_WB_16000HZ = "amr-wb-16000hz",
+  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
+  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
+  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
+  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
+  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
+  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
+  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
+  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
+  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
+  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
+  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
+  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
+  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
+  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
+  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
+  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
+  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
+  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
+  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
+  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
+  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
+  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
+  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
+  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
+  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
+  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
+  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
+  // Non-streaming =============================
+  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
+  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
+  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
+  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
+  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
+  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
+  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
+}
+
+export type Voice = {
+  Name: string;
+  ShortName: string;
+  Gender: string;
+  Locale: string;
+  SuggestedCodec: string;
+  FriendlyName: string;
+  Status: string;
+};
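+
+// An illustrative example (not verbatim service output) of one entry returned
+// by the voices endpoint, matching the fields above:
+//   {
+//     "Name": "Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)",
+//     "ShortName": "en-US-AriaNeural",
+//     "Gender": "Female",
+//     "Locale": "en-US",
+//     "SuggestedCodec": "audio-24khz-48kbitrate-mono-mp3",
+//     "FriendlyName": "Microsoft Aria Online (Natural) - English (United States)",
+//     "Status": "GA"
+//   }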
+
+export class ProsodyOptions {
+  /**
+   * The pitch to use.
+   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
+   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
+   */
+  pitch?: PITCH | string = "+0Hz";
+  /**
+   * The rate to use.
+   * Can be any {@link RATE}, a relative number (0.5), or a string with a relative percentage (+50%).
+   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
+   */
+  rate?: RATE | string | number = 1.0;
+  /**
+   * The volume to use.
+   * Can be any {@link VOLUME}, an absolute number (0-100), a string with a relative number (+50), or a relative percentage (+50%).
+   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
+   */
+  volume?: VOLUME | string | number = 100.0;
+}
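+
+// A minimal usage sketch (illustrative): prosody values can mix enum members,
+// relative strings, and plain numbers, e.g.
+//   const prosody = new ProsodyOptions();
+//   prosody.pitch = "+2st";        // two semitones up
+//   prosody.rate = 1.2;            // 20% faster
+//   prosody.volume = VOLUME.LOUD;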
+
+export class MsEdgeTTS {
+  static OUTPUT_FORMAT = OUTPUT_FORMAT;
+  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
+  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
+  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
+  private static BINARY_DELIM = "Path:audio\r\n";
+  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
+  private readonly _enableLogger: boolean;
+  private _ws: WebSocket | undefined;
+  private _voice: any;
+  private _voiceLocale: any;
+  private _outputFormat: any;
+  private _streams: { [key: string]: Readable } = {};
+  private _startTime = 0;
+
+  private _log(...o: any[]) {
+    if (this._enableLogger) {
+      console.log(...o);
+    }
+  }
+
+  /**
+   * Create a new `MsEdgeTTS` instance.
+   *
+   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
+   */
+  public constructor(enableLogger: boolean = false) {
+    this._enableLogger = enableLogger;
+  }
+
+  private async _send(message: string) {
+    // Retry the connection up to three times while the socket is not open.
+    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
+      if (i === 1) {
+        this._startTime = Date.now();
+      }
+      this._log("connecting: ", i);
+      await this._initClient();
+    }
+    this._ws!.send(message);
+  }
+
+  private _initClient() {
+    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
+
+    this._ws.binaryType = "arraybuffer";
+    return new Promise((resolve, reject) => {
+      this._ws!.onopen = () => {
+        this._log(
+          "Connected in",
+          (Date.now() - this._startTime) / 1000,
+          "seconds",
+        );
+        this._send(
+          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
+          {
+            "context": {
+              "synthesis": {
+                "audio": {
+                  "metadataoptions": {
+                    "sentenceBoundaryEnabled": "false",
+                    "wordBoundaryEnabled": "false"
+                  },
+                  "outputFormat": "${this._outputFormat}"
+                }
+              }
+            }
+          }
+        `,
+        ).then(resolve);
+      };
+      this._ws!.onmessage = (m: any) => {
+        const buffer = Buffer.from(m.data as ArrayBuffer);
+        const message = buffer.toString();
+        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
+        if (message.includes("Path:turn.start")) {
+          // start of turn, ignore
+        } else if (message.includes("Path:turn.end")) {
+          // end of turn, close the stream (it may already have been destroyed)
+          this._streams[requestId]?.push(null);
+        } else if (message.includes("Path:response")) {
+          // context response, ignore
+        } else if (
+          message.includes("Path:audio") &&
+          m.data instanceof ArrayBuffer
+        ) {
+          this._pushAudioData(buffer, requestId);
+        } else {
+          this._log("UNKNOWN MESSAGE", message);
+        }
+      };
+      this._ws!.onclose = () => {
+        this._log(
+          "disconnected after:",
+          (Date.now() - this._startTime) / 1000,
+          "seconds",
+        );
+        // End any streams that are still open.
+        for (const requestId in this._streams) {
+          this._streams[requestId].push(null);
+        }
+      };
+      this._ws!.onerror = function (error: any) {
+        reject(new Error("Connect Error: " + error));
+      };
+    });
+  }
+
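+  // Binary messages from the service consist of text headers terminated by
+  // the "Path:audio\r\n" delimiter, followed by the raw audio payload.
+  // Everything after the delimiter is pushed to the caller's stream.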
+  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
+    const audioStartIndex =
+      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
+      MsEdgeTTS.BINARY_DELIM.length;
+    const audioData = audioBuffer.subarray(audioStartIndex);
+    this._streams[requestId]?.push(audioData);
+    this._log("received audio chunk, size: ", audioData.length);
+  }
+
+  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
+    // The SSML is built by plain string concatenation, in case future updates
+    // to the Edge API restrict which elements are allowed.
+    options = { ...new ProsodyOptions(), ...options };
+    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
+        <voice name="${this._voice}">
+            <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
+                ${input}
+            </prosody>
+        </voice>
+    </speak>`;
+  }
+
+  /**
+   * Fetch the list of voices available in Microsoft Edge.
+   * Note that this is not the complete list: all voices supported by the speech service [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
+   */
+  getVoices(): Promise<Voice[]> {
+    return fetch(MsEdgeTTS.VOICES_URL)
+      .then((response) => {
+        if (!response.ok) {
+          throw new Error("Network response was not ok");
+        }
+        return response.json();
+      })
+      .then((data) => data as Voice[]);
+  }
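+
+  // Usage sketch (illustrative):
+  //   const tts = new MsEdgeTTS();
+  //   const voices = await tts.getVoices();
+  //   const english = voices.filter((v) => v.Locale.startsWith("en-"));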
+
+  /**
+   * Sets the required information for the speech to be synthesised and initializes a new WebSocket connection.
+   * Must be called at least once before text can be synthesised.
+   * Saved in this instance. Can be called at any time to update the metadata.
+   *
+   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
+   * @param outputFormat any {@link OUTPUT_FORMAT}
+   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
+   */
+  async setMetadata(
+    voiceName: string,
+    outputFormat: OUTPUT_FORMAT,
+    voiceLocale?: string,
+  ) {
+    const oldVoice = this._voice;
+    const oldVoiceLocale = this._voiceLocale;
+    const oldOutputFormat = this._outputFormat;
+
+    this._voice = voiceName;
+    this._voiceLocale = voiceLocale;
+    if (!this._voiceLocale) {
+      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
+      if (!voiceLangMatch)
+        throw new Error("Could not infer voiceLocale from voiceName!");
+      this._voiceLocale = voiceLangMatch[0];
+    }
+    this._outputFormat = outputFormat;
+
+    const changed =
+      oldVoice !== this._voice ||
+      oldVoiceLocale !== this._voiceLocale ||
+      oldOutputFormat !== this._outputFormat;
+
+    // Create a new client if the metadata changed or the socket is not open.
+    // Also guards against `this._ws` being undefined on the first call.
+    if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
+      this._startTime = Date.now();
+      await this._initClient();
+    }
+  }
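+
+  // Usage sketch (illustrative; "en-US-AriaNeural" is just an example voice):
+  //   const tts = new MsEdgeTTS();
+  //   await tts.setMetadata(
+  //     "en-US-AriaNeural",
+  //     OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3,
+  //   );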
+
+  private _metadataCheck() {
+    if (!this._ws)
+      throw new Error(
+        "Speech synthesis not configured yet. Run setMetadata before calling toStream, toArrayBuffer, or rawToStream.",
+      );
+  }
+
+  /**
+   * Close the WebSocket connection.
+   */
+  close() {
+    this._ws?.close();
+  }
+
+  /**
+   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
+   *
+   * @param input the text to synthesise. Can include SSML elements.
+   * @param options (optional) {@link ProsodyOptions}
+   * @returns {Readable} - a `stream.Readable` with the audio data
+   */
+  toStream(input: string, options?: ProsodyOptions): Readable {
+    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
+    return stream;
+  }
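+
+  // Usage sketch (illustrative, Node-only): pipe the audio to a file once
+  // setMetadata() has been awaited (createWriteStream comes from "fs").
+  //   const audio = tts.toStream("Hello, world!", { rate: RATE.FAST });
+  //   audio.pipe(createWriteStream("hello.mp3"));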
+
+  /**
+   * Synthesises text and buffers the entire audio into a single {@link ArrayBuffer}.
+   *
+   * @param input the text to synthesise. Can include SSML elements.
+   * @param options (optional) {@link ProsodyOptions}
+   */
+  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
+    return new Promise((resolve, reject) => {
+      const data: Uint8Array[] = [];
+      const readable = this.toStream(input, options);
+      readable.on("data", (chunk) => {
+        data.push(chunk);
+      });
+
+      readable.on("end", () => {
+        // Slice out only the occupied region: because of Buffer pooling,
+        // `buf.buffer` may be larger than the data itself.
+        const buf = Buffer.concat(data);
+        resolve(buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength));
+      });
+
+      readable.on("error", (err) => {
+        reject(err);
+      });
+    });
+  }
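+
+  // Usage sketch (illustrative): buffer the whole utterance, e.g. to decode it
+  // with the Web Audio API (`audioCtx` is assumed to be an AudioContext).
+  //   const data = await tts.toArrayBuffer("Hello, world!");
+  //   const audio = await audioCtx.decodeAudioData(data);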
+
+  /**
+   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template; the basic SSML elements must be provided in the request.
+   *
+   * @param requestSSML the SSML to send. The required SSML elements must be present for the request to work.
+   * @returns {Readable} - a `stream.Readable` with the audio data
+   */
+  rawToStream(requestSSML: string): Readable {
+    const { stream } = this._rawSSMLRequest(requestSSML);
+    return stream;
+  }
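+
+  // Usage sketch (illustrative): the <speak> and <voice> elements must be
+  // supplied by the caller.
+  //   const stream = tts.rawToStream(
+  //     `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="en-US">
+  //        <voice name="en-US-AriaNeural">Hello, world!</voice>
+  //      </speak>`,
+  //   );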
+
+  private _rawSSMLRequest(requestSSML: string): {
+    stream: Readable;
+    requestId: string;
+  } {
+    this._metadataCheck();
+
+    const requestId = randomBytes(16).toString("hex");
+    const request =
+      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
+    ` + requestSSML.trim();
+    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
+    const self = this;
+    const stream = new Readable({
+      read() {},
+      destroy(error: Error | null, callback: (error: Error | null) => void) {
+        delete self._streams[requestId];
+        callback(error);
+      },
+    });
+    this._streams[requestId] = stream;
+    void this._send(request);
+    return { stream, requestId };
+  }
+}
|