// realtime-chat.tsx

import VoiceIcon from "@/app/icons/voice.svg";
import VoiceOffIcon from "@/app/icons/voice-off.svg";
import Close24Icon from "@/app/icons/close-24.svg";
import PowerIcon from "@/app/icons/power.svg";

import styles from "./realtime-chat.module.scss";
import clsx from "clsx";

import { useState, useRef, useCallback, useEffect } from "react";

import { useAccessStore, useChatStore, ChatMessage } from "@/app/store";
import { IconButton } from "@/app/components/button";

import {
  Modality,
  RTClient,
  RTInputAudioItem,
  RTResponse,
  TurnDetection,
} from "rt-client";
import { AudioHandler } from "@/app/lib/audio";

interface RealtimeChatProps {
  onClose?: () => void;
  onStartVoice?: () => void;
  onPausedVoice?: () => void;
}
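
/**
 * Voice chat panel backed by the OpenAI / Azure OpenAI realtime API
 * (through the `rt-client` wrapper). Streams microphone audio to the
 * model and plays the assistant's audio reply as it arrives.
 *
 * Usage sketch (hypothetical call site; `setShowRealtimePanel` is an
 * assumed parent-side state setter, not part of this file):
 *
 *   <RealtimeChat
 *     onStartVoice={() => console.log("voice started")}
 *     onClose={() => setShowRealtimePanel(false)}
 *   />
 */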
export function RealtimeChat({
  onClose,
  onStartVoice,
  onPausedVoice,
}: RealtimeChatProps) {
  // These refs and the chat store are currently only referenced by the
  // legacy implementation commented out at the bottom of this file.
  const currentItemId = useRef<string>("");
  const currentBotMessage = useRef<ChatMessage | null>(null);
  const currentUserMessage = useRef<ChatMessage | null>(null);
  const accessStore = useAccessStore.getState();
  const chatStore = useChatStore();

  const [isRecording, setIsRecording] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);

  // Connection settings; fixed defaults for now (no UI changes them yet).
  const [modality, setModality] = useState("audio");
  const [isAzure, setIsAzure] = useState(false);
  const [endpoint, setEndpoint] = useState("");
  const [deployment, setDeployment] = useState("");
  const [useVAD, setUseVAD] = useState(true);

  const clientRef = useRef<RTClient | null>(null);
  const audioHandlerRef = useRef<AudioHandler | null>(null);
  const apiKey = accessStore.openaiApiKey;
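
  // Toggle the realtime connection. On connect, build either an Azure
  // client (endpoint + deployment) or an OpenAI client (model name),
  // configure the session, and kick off the fire-and-forget event loop.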
  const handleConnect = async () => {
    if (!isConnected) {
      try {
        setIsConnecting(true);
        clientRef.current = isAzure
          ? new RTClient(new URL(endpoint), { key: apiKey }, { deployment })
          : new RTClient(
              { key: apiKey },
              { model: "gpt-4o-realtime-preview-2024-10-01" },
            );
        const modalities: Modality[] =
          modality === "audio" ? ["text", "audio"] : ["text"];
        const turnDetection: TurnDetection = useVAD
          ? { type: "server_vad" }
          : null;
        clientRef.current.configure({
          instructions: "Hi",
          input_audio_transcription: { model: "whisper-1" },
          turn_detection: turnDetection,
          tools: [],
          temperature: 0.9,
          modalities,
        });
        startResponseListener();
        setIsConnected(true);
      } catch (error) {
        console.error("Connection failed:", error);
      } finally {
        setIsConnecting(false);
      }
    } else {
      await disconnect();
    }
  };
  const disconnect = async () => {
    if (clientRef.current) {
      try {
        await clientRef.current.close();
        clientRef.current = null;
        setIsConnected(false);
      } catch (error) {
        console.error("Disconnect failed:", error);
      }
    }
  };
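
  // Long-lived loop over the server event stream: dispatches model
  // responses and transcribed user audio items. Iteration errors are
  // only logged while a client still exists, so closing the connection
  // does not surface a spurious error.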
  const startResponseListener = async () => {
    if (!clientRef.current) return;
    try {
      for await (const serverEvent of clientRef.current.events()) {
        if (serverEvent.type === "response") {
          await handleResponse(serverEvent);
        } else if (serverEvent.type === "input_audio") {
          await handleInputAudio(serverEvent);
        }
      }
    } catch (error) {
      if (clientRef.current) {
        console.error("Response iteration error:", error);
      }
    }
  };
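
  // Consume one assistant response: accumulate text (or audio-transcript)
  // chunks into a single message, and stream audio chunks straight to the
  // speaker. Transcript and audio of the same content item are drained
  // concurrently so playback is never blocked on transcription.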
  const handleResponse = async (response: RTResponse) => {
    for await (const item of response) {
      if (item.type === "message" && item.role === "assistant") {
        const message = {
          type: item.role,
          content: "",
        };
        // setMessages((prevMessages) => [...prevMessages, message]);
        for await (const content of item) {
          if (content.type === "text") {
            for await (const text of content.textChunks()) {
              message.content += text;
              // setMessages((prevMessages) => {
              //   prevMessages[prevMessages.length - 1].content = message.content;
              //   return [...prevMessages];
              // });
            }
          } else if (content.type === "audio") {
            const textTask = async () => {
              for await (const text of content.transcriptChunks()) {
                message.content += text;
                // setMessages((prevMessages) => {
                //   prevMessages[prevMessages.length - 1].content =
                //     message.content;
                //   return [...prevMessages];
                // });
              }
            };
            const audioTask = async () => {
              audioHandlerRef.current?.startStreamingPlayback();
              for await (const audio of content.audioChunks()) {
                audioHandlerRef.current?.playChunk(audio);
              }
            };
            await Promise.all([textTask(), audioTask()]);
          }
        }
      }
    }
  };
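
  // When the user starts speaking, stop any assistant playback (barge-in)
  // and wait for the server to finish processing the captured audio.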
  const handleInputAudio = async (item: RTInputAudioItem) => {
    audioHandlerRef.current?.stopStreamingPlayback();
    await item.waitForCompletion();
    // setMessages((prevMessages) => [
    //   ...prevMessages,
    //   {
    //     type: "user",
    //     content: item.transcription || "",
    //   },
    // ]);
  };
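
  // Start/stop microphone capture. With server-side VAD the model decides
  // when a turn ends; without it, stopping manually commits the buffered
  // audio and explicitly asks the model to generate a response.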
  const toggleRecording = async () => {
    if (!isRecording && clientRef.current) {
      try {
        if (!audioHandlerRef.current) {
          audioHandlerRef.current = new AudioHandler();
          await audioHandlerRef.current.initialize();
        }
        await audioHandlerRef.current.startRecording(async (chunk) => {
          await clientRef.current?.sendAudio(chunk);
        });
        setIsRecording(true);
      } catch (error) {
        console.error("Failed to start recording:", error);
      }
    } else if (audioHandlerRef.current) {
      try {
        audioHandlerRef.current.stopRecording();
        if (!useVAD) {
          const inputAudio = await clientRef.current?.commitAudio();
          await handleInputAudio(inputAudio!);
          await clientRef.current?.generateResponse();
        }
        setIsRecording(false);
      } catch (error) {
        console.error("Failed to stop recording:", error);
      }
    }
  };
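
  // Set up the audio pipeline once on mount; tear down the connection and
  // release audio resources on unmount.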
  useEffect(() => {
    const initAudioHandler = async () => {
      const handler = new AudioHandler();
      await handler.initialize();
      audioHandlerRef.current = handler;
    };
    initAudioHandler().catch(console.error);

    return () => {
      disconnect();
      audioHandlerRef.current?.close().catch(console.error);
    };
  }, []);
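
  // ------------------------------------------------------------------
  // Legacy implementation, kept commented out for reference. It drove a
  // WebSocket `RealtimeClient` directly (instead of `rt-client`) and
  // persisted per-message wav recordings into the chat session.
  // ------------------------------------------------------------------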
  // useEffect(() => {
  //   if (
  //     clientRef.current?.getTurnDetectionType() === "server_vad" &&
  //     audioData
  //   ) {
  //     // console.log("appendInputAudio", audioData);
  //     // send the recorded 16-bit PCM audio to OpenAI
  //     clientRef.current?.appendInputAudio(audioData);
  //   }
  // }, [audioData]);

  // useEffect(() => {
  //   console.log("isRecording", isRecording);
  //   if (!isRecording.current) return;
  //   if (!clientRef.current) {
  //     const apiKey = accessStore.openaiApiKey;
  //     const client = (clientRef.current = new RealtimeClient({
  //       url: "wss://api.openai.com/v1/realtime",
  //       apiKey,
  //       dangerouslyAllowAPIKeyInBrowser: true,
  //       debug: true,
  //     }));
  //     client
  //       .connect()
  //       .then(() => {
  //         // TODO: seed the real conversation context here
  //         client.sendUserMessageContent([
  //           {
  //             type: `input_text`,
  //             text: `Hi`,
  //             // text: `For testing purposes, I want you to list ten car brands. Number each item, e.g. "one (or whatever number you are one): the item name".`
  //           },
  //         ]);
  //         // let the server decide when the speaker starts and stops talking
  //         client.updateSession({
  //           turn_detection: { type: "server_vad" },
  //         });
  //         client.on("realtime.event", (realtimeEvent) => {
  //           // debug logging
  //           console.log("realtime.event", realtimeEvent);
  //         });
  //         client.on("conversation.interrupted", async () => {
  //           if (currentBotMessage.current) {
  //             stopPlaying();
  //             try {
  //               client.cancelResponse(
  //                 currentBotMessage.current?.id,
  //                 currentTime(),
  //               );
  //             } catch (e) {
  //               console.error(e);
  //             }
  //           }
  //         });
  //         client.on("conversation.updated", async (event: any) => {
  //           // console.log("currentSession", chatStore.currentSession());
  //           // const items = client.conversation.getItems();
  //           const content = event?.item?.content?.[0]?.transcript || "";
  //           const text = event?.item?.content?.[0]?.text || "";
  //           // console.log(
  //           //   "conversation.updated",
  //           //   event,
  //           //   "content[0]",
  //           //   event?.item?.content?.[0]?.transcript,
  //           //   "formatted",
  //           //   event?.item?.formatted?.transcript,
  //           //   "content",
  //           //   content,
  //           //   "text",
  //           //   text,
  //           //   event?.item?.status,
  //           //   event?.item?.role,
  //           //   items.length,
  //           //   items,
  //           // );
  //           const { item, delta } = event;
  //           const { role, id, status, formatted } = item || {};
  //           if (id && role == "assistant") {
  //             if (
  //               !currentBotMessage.current ||
  //               currentBotMessage.current?.id != id
  //             ) {
  //               // create the assistant message and save it to the session
  //               currentBotMessage.current = createMessage({ id, role });
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat([
  //                   currentBotMessage.current!,
  //                 ]);
  //               });
  //             }
  //             if (currentBotMessage.current?.id != id) {
  //               stopPlaying();
  //             }
  //             if (content) {
  //               currentBotMessage.current.content = content;
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat();
  //               });
  //             }
  //             if (delta?.audio) {
  //               // delta.audio is an Int16Array; play it right away
  //               addInt16PCM(delta.audio);
  //             }
  //             // console.log(
  //             //   "updated try save wavFile",
  //             //   status,
  //             //   currentBotMessage.current?.audio_url,
  //             //   formatted?.audio,
  //             // );
  //             if (
  //               status == "completed" &&
  //               !currentBotMessage.current?.audio_url &&
  //               formatted?.audio?.length
  //             ) {
  //               // pack as a wav file and save it; TODO: mp3 would save space
  //               const botMessage = currentBotMessage.current;
  //               const wavFile = new WavPacker().pack(sampleRate, {
  //                 bitsPerSample: 16,
  //                 channelCount: 1,
  //                 data: formatted?.audio,
  //               });
  //               // attach the file to the item; wavFile.url can be played via an <audio> tag
  //               item.formatted.file = wavFile;
  //               uploadImageRemote(wavFile.blob).then((audio_url) => {
  //                 botMessage.audio_url = audio_url;
  //                 chatStore.updateCurrentSession((session) => {
  //                   session.messages = session.messages.concat();
  //                 });
  //               });
  //             }
  //             if (
  //               status == "completed" &&
  //               !currentBotMessage.current?.content
  //             ) {
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.filter(
  //                   (m) => m.id !== currentBotMessage.current?.id,
  //                 );
  //               });
  //             }
  //           }
  //           if (id && role == "user" && !text) {
  //             if (
  //               !currentUserMessage.current ||
  //               currentUserMessage.current?.id != id
  //             ) {
  //               // create the user message and save it to the session
  //               currentUserMessage.current = createMessage({ id, role });
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat([
  //                   currentUserMessage.current!,
  //                 ]);
  //               });
  //             }
  //             if (content) {
  //               // pack as a wav file and save it; TODO: mp3 would save space
  //               const userMessage = currentUserMessage.current;
  //               const wavFile = new WavPacker().pack(sampleRate, {
  //                 bitsPerSample: 16,
  //                 channelCount: 1,
  //                 data: formatted?.audio,
  //               });
  //               // attach the file to the item; wavFile.url can be played via an <audio> tag
  //               item.formatted.file = wavFile;
  //               uploadImageRemote(wavFile.blob).then((audio_url) => {
  //                 // update the message content
  //                 userMessage.content = content;
  //                 // update the message audio_url
  //                 userMessage.audio_url = audio_url;
  //                 chatStore.updateCurrentSession((session) => {
  //                   session.messages = session.messages.concat();
  //                 });
  //               });
  //             }
  //           }
  //         });
  //       })
  //       .catch((e) => {
  //         console.error("Error", e);
  //       });
  //   }
  //   return () => {
  //     stop();
  //     // TODO: close the client
  //     clientRef.current?.disconnect();
  //   };
  // }, [isRecording.current]);
  // Handlers for parent-driven voice start/pause; not wired into the UI yet.
  const handleStartVoice = useCallback(() => {
    onStartVoice?.();
    handleConnect();
  }, []);

  const handlePausedVoice = () => {
    onPausedVoice?.();
  };

  const handleClose = () => {
    onClose?.();
    disconnect();
  };
  return (
    <div className={styles["realtime-chat"]}>
      <div className={clsx(styles["circle-mic"], styles["pulse"])}>
        <div className={styles["icon-center"]}></div>
      </div>
      <div className={styles["bottom-icons"]}>
        <div>
          <IconButton
            icon={isRecording ? <VoiceOffIcon /> : <VoiceIcon />}
            onClick={toggleRecording}
            disabled={!isConnected}
            bordered
            shadow
          />
        </div>
        <div className={styles["icon-center"]}>
          <IconButton
            icon={<PowerIcon />}
            text={
              isConnecting
                ? "Connecting..."
                : isConnected
                  ? "Disconnect"
                  : "Connect"
            }
            onClick={handleConnect}
            disabled={isConnecting}
            bordered
            shadow
          />
        </div>
        <div>
          <IconButton
            icon={<Close24Icon />}
            onClick={handleClose}
            disabled={!isConnected}
            bordered
            shadow
          />
        </div>
      </div>
    </div>
  );
}