// ms_edge_tts.ts

// import axios from "axios";
import { Buffer } from "buffer";
import { randomBytes } from "crypto";
import { Readable } from "stream";
// Modified according to https://github.com/Migushthe2nd/MsEdgeTTS
/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume
 */
export enum VOLUME {
  SILENT = "silent",
  X_SOFT = "x-soft",
  SOFT = "soft",
  MEDIUM = "medium",
  LOUD = "loud",
  X_LOUD = "x-loud",
  DEFAULT = "default",
}
/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking
 */
export enum RATE {
  X_SLOW = "x-slow",
  SLOW = "slow",
  MEDIUM = "medium",
  FAST = "fast",
  X_FAST = "x-fast",
  DEFAULT = "default",
}
/**
 * https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline
 */
export enum PITCH {
  X_LOW = "x-low",
  LOW = "low",
  MEDIUM = "medium",
  HIGH = "high",
  X_HIGH = "x-high",
  DEFAULT = "default",
}
/**
 * Only a few of the [possible formats](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-text-to-speech#audio-outputs) are accepted.
 */
export enum OUTPUT_FORMAT {
  // Streaming =============================
  // AMR_WB_16000HZ = "amr-wb-16000hz",
  // AUDIO_16KHZ_16BIT_32KBPS_MONO_OPUS = "audio-16khz-16bit-32kbps-mono-opus",
  // AUDIO_16KHZ_32KBITRATE_MONO_MP3 = "audio-16khz-32kbitrate-mono-mp3",
  // AUDIO_16KHZ_64KBITRATE_MONO_MP3 = "audio-16khz-64kbitrate-mono-mp3",
  // AUDIO_16KHZ_128KBITRATE_MONO_MP3 = "audio-16khz-128kbitrate-mono-mp3",
  // AUDIO_24KHZ_16BIT_24KBPS_MONO_OPUS = "audio-24khz-16bit-24kbps-mono-opus",
  // AUDIO_24KHZ_16BIT_48KBPS_MONO_OPUS = "audio-24khz-16bit-48kbps-mono-opus",
  AUDIO_24KHZ_48KBITRATE_MONO_MP3 = "audio-24khz-48kbitrate-mono-mp3",
  AUDIO_24KHZ_96KBITRATE_MONO_MP3 = "audio-24khz-96kbitrate-mono-mp3",
  // AUDIO_24KHZ_160KBITRATE_MONO_MP3 = "audio-24khz-160kbitrate-mono-mp3",
  // AUDIO_48KHZ_96KBITRATE_MONO_MP3 = "audio-48khz-96kbitrate-mono-mp3",
  // AUDIO_48KHZ_192KBITRATE_MONO_MP3 = "audio-48khz-192kbitrate-mono-mp3",
  // OGG_16KHZ_16BIT_MONO_OPUS = "ogg-16khz-16bit-mono-opus",
  // OGG_24KHZ_16BIT_MONO_OPUS = "ogg-24khz-16bit-mono-opus",
  // OGG_48KHZ_16BIT_MONO_OPUS = "ogg-48khz-16bit-mono-opus",
  // RAW_8KHZ_8BIT_MONO_ALAW = "raw-8khz-8bit-mono-alaw",
  // RAW_8KHZ_8BIT_MONO_MULAW = "raw-8khz-8bit-mono-mulaw",
  // RAW_8KHZ_16BIT_MONO_PCM = "raw-8khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_PCM = "raw-16khz-16bit-mono-pcm",
  // RAW_16KHZ_16BIT_MONO_TRUESILK = "raw-16khz-16bit-mono-truesilk",
  // RAW_22050HZ_16BIT_MONO_PCM = "raw-22050hz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_PCM = "raw-24khz-16bit-mono-pcm",
  // RAW_24KHZ_16BIT_MONO_TRUESILK = "raw-24khz-16bit-mono-truesilk",
  // RAW_44100HZ_16BIT_MONO_PCM = "raw-44100hz-16bit-mono-pcm",
  // RAW_48KHZ_16BIT_MONO_PCM = "raw-48khz-16bit-mono-pcm",
  // WEBM_16KHZ_16BIT_MONO_OPUS = "webm-16khz-16bit-mono-opus",
  // WEBM_24KHZ_16BIT_24KBPS_MONO_OPUS = "webm-24khz-16bit-24kbps-mono-opus",
  WEBM_24KHZ_16BIT_MONO_OPUS = "webm-24khz-16bit-mono-opus",
  // Non-streaming =============================
  // RIFF_8KHZ_8BIT_MONO_ALAW = "riff-8khz-8bit-mono-alaw",
  // RIFF_8KHZ_8BIT_MONO_MULAW = "riff-8khz-8bit-mono-mulaw",
  // RIFF_8KHZ_16BIT_MONO_PCM = "riff-8khz-16bit-mono-pcm",
  // RIFF_22050HZ_16BIT_MONO_PCM = "riff-22050hz-16bit-mono-pcm",
  // RIFF_24KHZ_16BIT_MONO_PCM = "riff-24khz-16bit-mono-pcm",
  // RIFF_44100HZ_16BIT_MONO_PCM = "riff-44100hz-16bit-mono-pcm",
  // RIFF_48KHZ_16BIT_MONO_PCM = "riff-48khz-16bit-mono-pcm",
}
export type Voice = {
  Name: string;
  ShortName: string;
  Gender: string;
  Locale: string;
  SuggestedCodec: string;
  FriendlyName: string;
  Status: string;
};
export class ProsodyOptions {
  /**
   * The pitch to use.
   * Can be any {@link PITCH}, or a relative frequency in Hz (+50Hz), a relative semitone (+2st), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,pitch,-Indicates%20the%20baseline)
   */
  pitch?: PITCH | string = "+0Hz";
  /**
   * The rate to use.
   * Can be any {@link RATE}, or a relative number (0.5), or string with a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,rate,-Indicates%20the%20speaking)
   */
  rate?: RATE | string | number = 1.0;
  /**
   * The volume to use.
   * Can be any {@link VOLUME}, or an absolute number (0, 100), a string with a relative number (+50), or a relative percentage (+50%).
   * [SSML documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-voice#:~:text=Optional-,volume,-Indicates%20the%20volume)
   */
  volume?: VOLUME | string | number = 100.0;
}
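// Illustrative example (not from the original source): ProsodyOptions values can mix
// the enums above with relative strings or plain numbers, e.g.
//   const options: ProsodyOptions = { pitch: PITCH.HIGH, rate: "+20%", volume: 80 };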
export class MsEdgeTTS {
  static OUTPUT_FORMAT = OUTPUT_FORMAT;
  private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
  private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static SYNTH_URL = `wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
  private static BINARY_DELIM = "Path:audio\r\n";
  private static VOICE_LANG_REGEX = /\w{2}-\w{2}/;
  private readonly _enableLogger;
  private _ws: WebSocket | undefined;
  private _voice: any;
  private _voiceLocale: any;
  private _outputFormat: any;
  private _streams: { [key: string]: Readable } = {};
  private _startTime = 0;
  private _log(...o: any[]) {
    if (this._enableLogger) {
      console.log(...o);
    }
  }
  /**
   * Create a new `MsEdgeTTS` instance.
   *
   * @param enableLogger=false whether to enable the built-in logger. This logs connection inits, disconnects, and incoming data to the console
   */
  public constructor(enableLogger: boolean = false) {
    this._enableLogger = enableLogger;
  }
  private async _send(message: any) {
    // Retry connecting up to 3 times while the socket is not yet open.
    for (let i = 1; i <= 3 && this._ws!.readyState !== this._ws!.OPEN; i++) {
      if (i === 1) {
        this._startTime = Date.now();
      }
      this._log("connecting: ", i);
      await this._initClient();
    }
    this._ws!.send(message);
  }
  private _initClient() {
    this._ws = new WebSocket(MsEdgeTTS.SYNTH_URL);
    this._ws.binaryType = "arraybuffer";
    return new Promise((resolve, reject) => {
      this._ws!.onopen = () => {
        this._log(
          "Connected in",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        this._send(
          `Content-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n
          {
            "context": {
              "synthesis": {
                "audio": {
                  "metadataoptions": {
                    "sentenceBoundaryEnabled": "false",
                    "wordBoundaryEnabled": "false"
                  },
                  "outputFormat": "${this._outputFormat}"
                }
              }
            }
          }
          `,
        ).then(resolve);
      };
      this._ws!.onmessage = (m: any) => {
        const buffer = Buffer.from(m.data as ArrayBuffer);
        const message = buffer.toString();
        const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)![1];
        if (message.includes("Path:turn.start")) {
          // start of turn, ignore
        } else if (message.includes("Path:turn.end")) {
          // end of turn, close stream
          this._streams[requestId].push(null);
        } else if (message.includes("Path:response")) {
          // context response, ignore
        } else if (
          message.includes("Path:audio") &&
          m.data instanceof ArrayBuffer
        ) {
          this._pushAudioData(buffer, requestId);
        } else {
          this._log("UNKNOWN MESSAGE", message);
        }
      };
      this._ws!.onclose = () => {
        this._log(
          "disconnected after:",
          (Date.now() - this._startTime) / 1000,
          "seconds",
        );
        for (const requestId in this._streams) {
          this._streams[requestId].push(null);
        }
      };
      this._ws!.onerror = function (error: any) {
        reject("Connect Error: " + error);
      };
    });
  }
  private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    const audioStartIndex =
      audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) +
      MsEdgeTTS.BINARY_DELIM.length;
    const audioData = audioBuffer.subarray(audioStartIndex);
    this._streams[requestId].push(audioData);
    this._log("received audio chunk, size: ", audioData?.length);
  }
  private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
    // in case future updates to the edge API block these elements, we'll be concatenating strings.
    options = { ...new ProsodyOptions(), ...options };
    return `<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="${this._voiceLocale}">
      <voice name="${this._voice}">
        <prosody pitch="${options.pitch}" rate="${options.rate}" volume="${options.volume}">
          ${input}
        </prosody>
      </voice>
    </speak>`;
  }
  /**
   * Fetch the list of voices available in Microsoft Edge.
   * This is not the full set, however; the complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
   */
  // getVoices(): Promise<Voice[]> {
  //   return new Promise((resolve, reject) => {
  //     axios
  //       .get(MsEdgeTTS.VOICES_URL)
  //       .then((res) => resolve(res.data))
  //       .catch(reject);
  //   });
  // }
  getVoices(): Promise<Voice[]> {
    return fetch(MsEdgeTTS.VOICES_URL)
      .then((response) => {
        if (!response.ok) {
          throw new Error("Network response was not ok");
        }
        return response.json();
      })
      .then((data) => data as Voice[])
      .catch((error) => {
        throw error;
      });
  }
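  // Example (illustrative): picking a voice ShortName from the fetched list, e.g.
  //   const voices = await tts.getVoices();
  //   const aria = voices.find((v) => v.ShortName === "en-US-AriaNeural");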
  /**
   * Sets the required information for the speech to be synthesised and initializes a new WebSocket connection.
   * Must be called at least once before text can be synthesised.
   * Saved in this instance. Can be called at any time to update the metadata.
   *
   * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
   * @param outputFormat any {@link OUTPUT_FORMAT}
   * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
   */
  async setMetadata(
    voiceName: string,
    outputFormat: OUTPUT_FORMAT,
    voiceLocale?: string,
  ) {
    const oldVoice = this._voice;
    const oldVoiceLocale = this._voiceLocale;
    const oldOutputFormat = this._outputFormat;
    this._voice = voiceName;
    this._voiceLocale = voiceLocale;
    if (!this._voiceLocale) {
      const voiceLangMatch = MsEdgeTTS.VOICE_LANG_REGEX.exec(this._voice);
      if (!voiceLangMatch)
        throw new Error("Could not infer voiceLocale from voiceName!");
      this._voiceLocale = voiceLangMatch[0];
    }
    this._outputFormat = outputFormat;
    const changed =
      oldVoice !== this._voice ||
      oldVoiceLocale !== this._voiceLocale ||
      oldOutputFormat !== this._outputFormat;
    // create new client (also guards against _ws being undefined on the first call)
    if (changed || !this._ws || this._ws.readyState !== this._ws.OPEN) {
      this._startTime = Date.now();
      await this._initClient();
    }
  }
  private _metadataCheck() {
    if (!this._ws)
      throw new Error(
        "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.",
      );
  }
  /**
   * Close the WebSocket connection.
   */
  close() {
    this._ws?.close();
  }
  /**
   * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SSML template}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  toStream(input: string, options?: ProsodyOptions): Readable {
    const { stream } = this._rawSSMLRequest(this._SSMLTemplate(input, options));
    return stream;
  }
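  // Example (illustrative, Node only): piping the stream to a file, assuming an MP3
  // output format was set via setMetadata, e.g.
  //   tts.toStream("Hello world").pipe(fs.createWriteStream("out.mp3"));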
  /**
   * Buffers the full synthesised audio and resolves with it as an {@link ArrayBuffer}.
   *
   * @param input the text to synthesise. Can include SSML elements.
   * @param options (optional) {@link ProsodyOptions}
   */
  toArrayBuffer(input: string, options?: ProsodyOptions): Promise<ArrayBuffer> {
    return new Promise((resolve, reject) => {
      const data: Uint8Array[] = [];
      const readable = this.toStream(input, options);
      readable.on("data", (chunk) => {
        data.push(chunk);
      });
      readable.on("end", () => {
        // Copy into an exact-length ArrayBuffer: Buffer.concat can reuse a larger
        // pooled allocation, so `.buffer` alone may include bytes past the audio.
        const merged = Buffer.concat(data);
        const arrayBuffer = new ArrayBuffer(merged.byteLength);
        new Uint8Array(arrayBuffer).set(merged);
        resolve(arrayBuffer);
      });
      readable.on("error", (err) => {
        reject(err);
      });
    });
  }
  /**
   * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template; basic SSML must be provided in the request.
   *
   * @param requestSSML the SSML to send. SSML elements are required for the request to work.
   * @returns {Readable} - a `stream.Readable` with the audio data
   */
  rawToStream(requestSSML: string): Readable {
    const { stream } = this._rawSSMLRequest(requestSSML);
    return stream;
  }
  private _rawSSMLRequest(requestSSML: string): {
    stream: Readable;
    requestId: string;
  } {
    this._metadataCheck();
    const requestId = randomBytes(16).toString("hex");
    const request =
      `X-RequestId:${requestId}\r\nContent-Type:application/ssml+xml\r\nPath:ssml\r\n\r\n
      ` + requestSSML.trim();
    // https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
    const self = this;
    const stream = new Readable({
      read() {},
      destroy(error: Error | null, callback: (error: Error | null) => void) {
        delete self._streams[requestId];
        callback(error);
      },
    });
    this._streams[requestId] = stream;
    this._send(request).then();
    return { stream, requestId };
  }
}
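// --- Usage sketch (illustrative, not part of the original module) -----------------
// Assumes an environment with a global WebSocket (browser, or Node with a polyfill)
// and that the Edge read-aloud endpoints above are reachable; "en-US-AriaNeural" is
// just an example ShortName.
//
//   const tts = new MsEdgeTTS();
//   await tts.setMetadata(
//     "en-US-AriaNeural",
//     OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3,
//   );
//   const audio = await tts.toArrayBuffer("Hello, world!", { rate: "+10%" });
//   tts.close();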