realtime-chat.tsx

import VoiceIcon from "@/app/icons/voice.svg";
import VoiceOffIcon from "@/app/icons/voice-off.svg";
import Close24Icon from "@/app/icons/close-24.svg";
import PowerIcon from "@/app/icons/power.svg";
import styles from "./realtime-chat.module.scss";
import clsx from "clsx";
import { useState, useRef, useEffect } from "react";
import {
  useAccessStore,
  useChatStore,
  ChatMessage,
  createMessage,
} from "@/app/store";
import { IconButton } from "@/app/components/button";
import {
  Modality,
  RTClient,
  RTInputAudioItem,
  RTResponse,
  TurnDetection,
} from "rt-client";
import { AudioHandler } from "@/app/lib/audio";

interface RealtimeChatProps {
  onClose?: () => void;
  onStartVoice?: () => void;
  onPausedVoice?: () => void;
}

export function RealtimeChat({
  onClose,
  onStartVoice,
  onPausedVoice,
}: RealtimeChatProps) {
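  // These refs are only read by the commented-out legacy handler further
  // down; the active code path does not use them.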
  const currentItemId = useRef<string>("");
  const currentBotMessage = useRef<ChatMessage | null>(null);
  const currentUserMessage = useRef<ChatMessage | null>(null);
  const accessStore = useAccessStore.getState();
  const chatStore = useChatStore();
  const session = chatStore.currentSession();

  const [isRecording, setIsRecording] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);
  const [modality, setModality] = useState("audio");
  const [isAzure, setIsAzure] = useState(false);
  const [endpoint, setEndpoint] = useState("");
  const [deployment, setDeployment] = useState("");
  const [useVAD, setUseVAD] = useState(true);

  const clientRef = useRef<RTClient | null>(null);
  const audioHandlerRef = useRef<AudioHandler | null>(null);

  const apiKey = accessStore.openaiApiKey;
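  // Toggle the realtime connection: create and configure an RTClient
  // (Azure or OpenAI) on connect, or tear the existing client down when
  // already connected.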
  const handleConnect = async () => {
    if (!isConnected) {
      try {
        setIsConnecting(true);
        clientRef.current = isAzure
          ? new RTClient(new URL(endpoint), { key: apiKey }, { deployment })
          : new RTClient(
              { key: apiKey },
              { model: "gpt-4o-realtime-preview-2024-10-01" },
            );
        const modalities: Modality[] =
          modality === "audio" ? ["text", "audio"] : ["text"];
        const turnDetection: TurnDetection = useVAD
          ? { type: "server_vad" }
          : null;
        clientRef.current.configure({
          instructions: "Hi",
          input_audio_transcription: { model: "whisper-1" },
          turn_detection: turnDetection,
          tools: [],
          temperature: 0.9,
          modalities,
        });
        startResponseListener();
        setIsConnected(true);
      } catch (error) {
        console.error("Connection failed:", error);
      } finally {
        setIsConnecting(false);
      }
    } else {
      await disconnect();
    }
  };

  const disconnect = async () => {
    if (clientRef.current) {
      try {
        await clientRef.current.close();
        clientRef.current = null;
        setIsConnected(false);
      } catch (error) {
        console.error("Disconnect failed:", error);
      }
    }
  };
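  // Iterate over server events for the lifetime of the connection and route
  // each one to the matching handler.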
  const startResponseListener = async () => {
    if (!clientRef.current) return;
    try {
      for await (const serverEvent of clientRef.current.events()) {
        if (serverEvent.type === "response") {
          await handleResponse(serverEvent);
        } else if (serverEvent.type === "input_audio") {
          await handleInputAudio(serverEvent);
        }
      }
    } catch (error) {
      // Only surface the error while a client is still active; closing the
      // client aborts the iterator, which is expected.
      if (clientRef.current) {
        console.error("Response iteration error:", error);
      }
    }
  };
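  // Drain one assistant response: accumulate text/transcript chunks into a
  // single ChatMessage while streaming audio chunks to the speaker.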
  const handleResponse = async (response: RTResponse) => {
    for await (const item of response) {
      if (item.type === "message" && item.role === "assistant") {
        const botMessage = createMessage({
          role: item.role,
          content: "",
        });
        for await (const content of item) {
          if (content.type === "text") {
            for await (const text of content.textChunks()) {
              botMessage.content += text;
            }
          } else if (content.type === "audio") {
            const textTask = async () => {
              for await (const text of content.transcriptChunks()) {
                botMessage.content += text;
              }
            };
            const audioTask = async () => {
              audioHandlerRef.current?.startStreamingPlayback();
              for await (const audio of content.audioChunks()) {
                audioHandlerRef.current?.playChunk(audio);
              }
            };
            // Consume the transcript and audio streams concurrently.
            await Promise.all([textTask(), audioTask()]);
          }
        }
        // Persist the finished message once all content parts are consumed,
        // so text-only responses are saved as well.
        chatStore.updateTargetSession(session, (session) => {
          botMessage.date = new Date().toLocaleString();
          session.messages = session.messages.concat([botMessage]);
        });
      }
    }
  };
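  // Handle a completed user audio turn: stop any bot playback, wait for the
  // transcription, and append the transcript as a user message.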
  const handleInputAudio = async (item: RTInputAudioItem) => {
    audioHandlerRef.current?.stopStreamingPlayback();
    await item.waitForCompletion();
    const userMessage = createMessage({
      role: "user",
      content: item.transcription,
    });
    chatStore.updateTargetSession(session, (session) => {
      session.messages = session.messages.concat([userMessage]);
    });
  };
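  // Start or stop microphone capture. Without server-side VAD, stopping also
  // commits the buffered audio and asks the model for a response.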
  const toggleRecording = async () => {
    if (!isRecording && clientRef.current) {
      try {
        if (!audioHandlerRef.current) {
          audioHandlerRef.current = new AudioHandler();
          await audioHandlerRef.current.initialize();
        }
        await audioHandlerRef.current.startRecording(async (chunk) => {
          await clientRef.current?.sendAudio(chunk);
        });
        setIsRecording(true);
      } catch (error) {
        console.error("Failed to start recording:", error);
      }
    } else if (audioHandlerRef.current) {
      try {
        audioHandlerRef.current.stopRecording();
        if (!useVAD) {
          const inputAudio = await clientRef.current?.commitAudio();
          await handleInputAudio(inputAudio!);
          await clientRef.current?.generateResponse();
        }
        setIsRecording(false);
      } catch (error) {
        console.error("Failed to stop recording:", error);
      }
    }
  };
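  // Set up the shared AudioHandler on mount and tear everything down on
  // unmount.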
  useEffect(() => {
    const initAudioHandler = async () => {
      const handler = new AudioHandler();
      await handler.initialize();
      audioHandlerRef.current = handler;
    };
    initAudioHandler().catch(console.error);

    return () => {
      disconnect();
      audioHandlerRef.current?.close().catch(console.error);
    };
  }, []);
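  // NOTE: the block below is an earlier implementation built on the
  // RealtimeClient websocket API; it is left commented out in the source.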
  // useEffect(() => {
  //   if (
  //     clientRef.current?.getTurnDetectionType() === "server_vad" &&
  //     audioData
  //   ) {
  //     // console.log("appendInputAudio", audioData);
  //     // send the recorded 16-bit PCM audio to openai
  //     clientRef.current?.appendInputAudio(audioData);
  //   }
  // }, [audioData]);

  // useEffect(() => {
  //   console.log("isRecording", isRecording);
  //   if (!isRecording.current) return;
  //   if (!clientRef.current) {
  //     const apiKey = accessStore.openaiApiKey;
  //     const client = (clientRef.current = new RealtimeClient({
  //       url: "wss://api.openai.com/v1/realtime",
  //       apiKey,
  //       dangerouslyAllowAPIKeyInBrowser: true,
  //       debug: true,
  //     }));
  //     client
  //       .connect()
  //       .then(() => {
  //         // TODO: set the real conversation context
  //         client.sendUserMessageContent([
  //           {
  //             type: `input_text`,
  //             text: `Hi`,
  //             // text: `For testing purposes, I want you to list ten car brands. Number each item, e.g. "one (or whatever number you are one): the item name".`
  //           },
  //         ]);
  //         // let the server decide when the speaker starts and stops talking
  //         client.updateSession({
  //           turn_detection: { type: "server_vad" },
  //         });
  //         client.on("realtime.event", (realtimeEvent) => {
  //           // debugging
  //           console.log("realtime.event", realtimeEvent);
  //         });
  //         client.on("conversation.interrupted", async () => {
  //           if (currentBotMessage.current) {
  //             stopPlaying();
  //             try {
  //               client.cancelResponse(
  //                 currentBotMessage.current?.id,
  //                 currentTime(),
  //               );
  //             } catch (e) {
  //               console.error(e);
  //             }
  //           }
  //         });
  //         client.on("conversation.updated", async (event: any) => {
  //           // console.log("currentSession", chatStore.currentSession());
  //           // const items = client.conversation.getItems();
  //           const content = event?.item?.content?.[0]?.transcript || "";
  //           const text = event?.item?.content?.[0]?.text || "";
  //           // console.log(
  //           //   "conversation.updated",
  //           //   event,
  //           //   "content[0]",
  //           //   event?.item?.content?.[0]?.transcript,
  //           //   "formatted",
  //           //   event?.item?.formatted?.transcript,
  //           //   "content",
  //           //   content,
  //           //   "text",
  //           //   text,
  //           //   event?.item?.status,
  //           //   event?.item?.role,
  //           //   items.length,
  //           //   items,
  //           // );
  //           const { item, delta } = event;
  //           const { role, id, status, formatted } = item || {};
  //           if (id && role == "assistant") {
  //             if (
  //               !currentBotMessage.current ||
  //               currentBotMessage.current?.id != id
  //             ) {
  //               // create assistant message and save to session
  //               currentBotMessage.current = createMessage({ id, role });
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat([
  //                   currentBotMessage.current!,
  //                 ]);
  //               });
  //             }
  //             if (currentBotMessage.current?.id != id) {
  //               stopPlaying();
  //             }
  //             if (content) {
  //               currentBotMessage.current.content = content;
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat();
  //               });
  //             }
  //             if (delta?.audio) {
  //               // typeof delta.audio is Int16Array
  //               // play it directly
  //               addInt16PCM(delta.audio);
  //             }
  //             // console.log(
  //             //   "updated try save wavFile",
  //             //   status,
  //             //   currentBotMessage.current?.audio_url,
  //             //   formatted?.audio,
  //             // );
  //             if (
  //               status == "completed" &&
  //               !currentBotMessage.current?.audio_url &&
  //               formatted?.audio?.length
  //             ) {
  //               // convert to a wav file for saving; TODO: mp3 would use less space
  //               const botMessage = currentBotMessage.current;
  //               const wavFile = new WavPacker().pack(sampleRate, {
  //                 bitsPerSample: 16,
  //                 channelCount: 1,
  //                 data: formatted?.audio,
  //               });
  //               // attach the audio file to the item; wavFile.url can be played with an <audio> tag
  //               item.formatted.file = wavFile;
  //               uploadImageRemote(wavFile.blob).then((audio_url) => {
  //                 botMessage.audio_url = audio_url;
  //                 chatStore.updateCurrentSession((session) => {
  //                   session.messages = session.messages.concat();
  //                 });
  //               });
  //             }
  //             if (
  //               status == "completed" &&
  //               !currentBotMessage.current?.content
  //             ) {
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.filter(
  //                   (m) => m.id !== currentBotMessage.current?.id,
  //                 );
  //               });
  //             }
  //           }
  //           if (id && role == "user" && !text) {
  //             if (
  //               !currentUserMessage.current ||
  //               currentUserMessage.current?.id != id
  //             ) {
  //               // create user message and save to session
  //               currentUserMessage.current = createMessage({ id, role });
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat([
  //                   currentUserMessage.current!,
  //                 ]);
  //               });
  //             }
  //             if (content) {
  //               // convert to a wav file for saving; TODO: mp3 would use less space
  //               const userMessage = currentUserMessage.current;
  //               const wavFile = new WavPacker().pack(sampleRate, {
  //                 bitsPerSample: 16,
  //                 channelCount: 1,
  //                 data: formatted?.audio,
  //               });
  //               // attach the audio file to the item; wavFile.url can be played with an <audio> tag
  //               item.formatted.file = wavFile;
  //               uploadImageRemote(wavFile.blob).then((audio_url) => {
  //                 // update message content
  //                 userMessage.content = content;
  //                 // update message audio_url
  //                 userMessage.audio_url = audio_url;
  //                 chatStore.updateCurrentSession((session) => {
  //                   session.messages = session.messages.concat();
  //                 });
  //               });
  //             }
  //           }
  //         });
  //       })
  //       .catch((e) => {
  //         console.error("Error", e);
  //       });
  //   }
  //   return () => {
  //     stop();
  //     // TODO: close the client
  //     clientRef.current?.disconnect();
  //   };
  // }, [isRecording.current]);
  const handleClose = () => {
    onClose?.();
    disconnect();
  };

  return (
    <div className={styles["realtime-chat"]}>
      <div className={clsx(styles["circle-mic"], styles["pulse"])}>
        <div className={styles["icon-center"]}></div>
      </div>
      <div className={styles["bottom-icons"]}>
        <div>
          <IconButton
            icon={isRecording ? <VoiceOffIcon /> : <VoiceIcon />}
            onClick={toggleRecording}
            disabled={!isConnected}
            bordered
            shadow
          />
        </div>
        <div className={styles["icon-center"]}>
          <IconButton
            icon={<PowerIcon />}
            text={
              isConnecting
                ? "Connecting..."
                : isConnected
                  ? "Disconnect"
                  : "Connect"
            }
            onClick={handleConnect}
            disabled={isConnecting}
            bordered
            shadow
          />
        </div>
        <div>
          <IconButton
            icon={<Close24Icon />}
            onClick={handleClose}
            bordered
            shadow
          />
        </div>
      </div>
    </div>
  );
}