realtime-chat.tsx

import VoiceIcon from "@/app/icons/voice.svg";
import VoiceOffIcon from "@/app/icons/voice-off.svg";
import Close24Icon from "@/app/icons/close-24.svg";
import PowerIcon from "@/app/icons/power.svg";
import styles from "./realtime-chat.module.scss";
import clsx from "clsx";
import { useState, useRef, useEffect } from "react";
import {
  useAccessStore,
  useChatStore,
  ChatMessage,
  createMessage,
} from "@/app/store";
import { IconButton } from "@/app/components/button";
import {
  Modality,
  RTClient,
  RTInputAudioItem,
  RTResponse,
  TurnDetection,
} from "rt-client";
import { AudioHandler } from "@/app/lib/audio";
import { uploadImage } from "@/app/utils/chat";
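
// Props for the realtime voice-chat panel; all callbacks are optional.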
interface RealtimeChatProps {
  onClose?: () => void;
  onStartVoice?: () => void;
  onPausedVoice?: () => void;
}
export function RealtimeChat({
  onClose,
  onStartVoice,
  onPausedVoice,
}: RealtimeChatProps) {
  const currentItemId = useRef<string>("");
  const currentBotMessage = useRef<ChatMessage | null>(null);
  const currentUserMessage = useRef<ChatMessage | null>(null);
  const accessStore = useAccessStore.getState();
  const chatStore = useChatStore();
  const session = chatStore.currentSession();

  const [isRecording, setIsRecording] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);
  const [modality, setModality] = useState("audio");
  const [isAzure, setIsAzure] = useState(false);
  const [endpoint, setEndpoint] = useState("");
  const [deployment, setDeployment] = useState("");
  const [useVAD, setUseVAD] = useState(true);

  const clientRef = useRef<RTClient | null>(null);
  const audioHandlerRef = useRef<AudioHandler | null>(null);

  const apiKey = accessStore.openaiApiKey;
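
  // Toggle the realtime connection. On connect, build an RTClient for either
  // Azure (endpoint + deployment) or OpenAI (fixed preview model), configure
  // the session (Whisper transcription, optional server VAD, modalities), and
  // start the server-event listener. If already connected, disconnect instead.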
  const handleConnect = async () => {
    if (!isConnected) {
      try {
        setIsConnecting(true);
        clientRef.current = isAzure
          ? new RTClient(new URL(endpoint), { key: apiKey }, { deployment })
          : new RTClient(
              { key: apiKey },
              { model: "gpt-4o-realtime-preview-2024-10-01" },
            );
        const modalities: Modality[] =
          modality === "audio" ? ["text", "audio"] : ["text"];
        const turnDetection: TurnDetection = useVAD
          ? { type: "server_vad" }
          : null;
        clientRef.current.configure({
          instructions: "Hi",
          input_audio_transcription: { model: "whisper-1" },
          turn_detection: turnDetection,
          tools: [],
          temperature: 0.9,
          modalities,
        });
        startResponseListener();
        setIsConnected(true);
      } catch (error) {
        console.error("Connection failed:", error);
      } finally {
        setIsConnecting(false);
      }
    } else {
      await disconnect();
    }
  };
  const disconnect = async () => {
    if (clientRef.current) {
      try {
        await clientRef.current.close();
        clientRef.current = null;
        setIsConnected(false);
      } catch (error) {
        console.error("Disconnect failed:", error);
      }
    }
  };
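
  // Long-lived loop over server events for the lifetime of the connection,
  // dispatching assistant responses and completed user audio turns. Iteration
  // errors are only logged while a client still exists, so the error raised
  // when disconnect() closes the stream is silently ignored.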
  const startResponseListener = async () => {
    if (!clientRef.current) return;
    try {
      for await (const serverEvent of clientRef.current.events()) {
        if (serverEvent.type === "response") {
          await handleResponse(serverEvent);
        } else if (serverEvent.type === "input_audio") {
          await handleInputAudio(serverEvent);
        }
      }
    } catch (error) {
      if (clientRef.current) {
        console.error("Response iteration error:", error);
      }
    }
  };
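
  // Stream one assistant response into the chat: append an empty bot message
  // first, fill its content from text/transcript chunks while audio chunks
  // play, then upload the captured audio so the message carries an audio_url.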
  const handleResponse = async (response: RTResponse) => {
    for await (const item of response) {
      console.log("handleResponse", item);
      if (item.type === "message" && item.role === "assistant") {
        const botMessage = createMessage({
          role: item.role,
          content: "",
        });
        // add the bot message to the session first
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat([botMessage]);
        });
        for await (const content of item) {
          if (content.type === "text") {
            for await (const text of content.textChunks()) {
              botMessage.content += text;
            }
          } else if (content.type === "audio") {
            const textTask = async () => {
              for await (const text of content.transcriptChunks()) {
                botMessage.content += text;
              }
            };
            const audioTask = async () => {
              audioHandlerRef.current?.startStreamingPlayback();
              for await (const audio of content.audioChunks()) {
                audioHandlerRef.current?.playChunk(audio);
              }
            };
            await Promise.all([textTask(), audioTask()]);
          }
        }
        // upload the played audio and attach the resulting audio_url
        const blob = audioHandlerRef.current?.savePlayFile();
        if (blob) {
          uploadImage(blob).then((audio_url) => {
            botMessage.audio_url = audio_url;
            botMessage.date = new Date().toLocaleString();
            // persist the streamed text and the audio_url
            chatStore.updateTargetSession(session, (session) => {
              session.messages = session.messages.concat();
            });
          });
        }
      }
    }
  };
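
  // Handle a completed user audio turn: stop any assistant playback, wait for
  // the transcription, and append it as a user message.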
  const handleInputAudio = async (item: RTInputAudioItem) => {
    audioHandlerRef.current?.stopStreamingPlayback();
    await item.waitForCompletion();
    const { audioStartMillis, audioEndMillis } = item;
    // TODO: save the input audio_url and update the session
    console.log("handleInputAudio", item, audioStartMillis, audioEndMillis);
    const userMessage = createMessage({
      role: "user",
      content: item.transcription,
    });
    chatStore.updateTargetSession(session, (session) => {
      session.messages = session.messages.concat([userMessage]);
    });
  };
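
  // Start or stop microphone capture. While recording, every audio chunk is
  // forwarded to the server; when server VAD is off, stopping also commits the
  // buffered audio and explicitly requests a model response.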
  const toggleRecording = async () => {
    if (!isRecording && clientRef.current) {
      try {
        if (!audioHandlerRef.current) {
          audioHandlerRef.current = new AudioHandler();
          await audioHandlerRef.current.initialize();
        }
        await audioHandlerRef.current.startRecording(async (chunk) => {
          await clientRef.current?.sendAudio(chunk);
        });
        setIsRecording(true);
      } catch (error) {
        console.error("Failed to start recording:", error);
      }
    } else if (audioHandlerRef.current) {
      try {
        audioHandlerRef.current.stopRecording();
        if (!useVAD) {
          const inputAudio = await clientRef.current?.commitAudio();
          if (inputAudio) {
            await handleInputAudio(inputAudio);
          }
          await clientRef.current?.generateResponse();
        }
        setIsRecording(false);
      } catch (error) {
        console.error("Failed to stop recording:", error);
      }
    }
  };
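
  // Create the audio handler once on mount; on unmount, close the realtime
  // connection and release audio resources.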
  useEffect(() => {
    const initAudioHandler = async () => {
      const handler = new AudioHandler();
      await handler.initialize();
      audioHandlerRef.current = handler;
    };
    initAudioHandler().catch(console.error);

    return () => {
      disconnect();
      audioHandlerRef.current?.close().catch(console.error);
    };
  }, []);
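
  // NOTE: the effects below come from an earlier implementation based on the
  // `RealtimeClient` WebSocket API and are left commented out.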
  // useEffect(() => {
  //   if (
  //     clientRef.current?.getTurnDetectionType() === "server_vad" &&
  //     audioData
  //   ) {
  //     // console.log("appendInputAudio", audioData);
  //     // send the recorded 16-bit PCM audio to OpenAI
  //     clientRef.current?.appendInputAudio(audioData);
  //   }
  // }, [audioData]);

  // useEffect(() => {
  //   console.log("isRecording", isRecording);
  //   if (!isRecording.current) return;
  //   if (!clientRef.current) {
  //     const apiKey = accessStore.openaiApiKey;
  //     const client = (clientRef.current = new RealtimeClient({
  //       url: "wss://api.openai.com/v1/realtime",
  //       apiKey,
  //       dangerouslyAllowAPIKeyInBrowser: true,
  //       debug: true,
  //     }));
  //     client
  //       .connect()
  //       .then(() => {
  //         // TODO: set the real conversation context
  //         client.sendUserMessageContent([
  //           {
  //             type: `input_text`,
  //             text: `Hi`,
  //             // text: `For testing purposes, I want you to list ten car brands. Number each item, e.g. "one (or whatever number you are one): the item name".`
  //           },
  //         ]);
  //         // let the server detect when the speaker starts and stops
  //         client.updateSession({
  //           turn_detection: { type: "server_vad" },
  //         });
  //         client.on("realtime.event", (realtimeEvent) => {
  //           // debug logging
  //           console.log("realtime.event", realtimeEvent);
  //         });
  //         client.on("conversation.interrupted", async () => {
  //           if (currentBotMessage.current) {
  //             stopPlaying();
  //             try {
  //               client.cancelResponse(
  //                 currentBotMessage.current?.id,
  //                 currentTime(),
  //               );
  //             } catch (e) {
  //               console.error(e);
  //             }
  //           }
  //         });
  //         client.on("conversation.updated", async (event: any) => {
  //           // console.log("currentSession", chatStore.currentSession());
  //           // const items = client.conversation.getItems();
  //           const content = event?.item?.content?.[0]?.transcript || "";
  //           const text = event?.item?.content?.[0]?.text || "";
  //           // console.log(
  //           //   "conversation.updated",
  //           //   event,
  //           //   "content[0]",
  //           //   event?.item?.content?.[0]?.transcript,
  //           //   "formatted",
  //           //   event?.item?.formatted?.transcript,
  //           //   "content",
  //           //   content,
  //           //   "text",
  //           //   text,
  //           //   event?.item?.status,
  //           //   event?.item?.role,
  //           //   items.length,
  //           //   items,
  //           // );
  //           const { item, delta } = event;
  //           const { role, id, status, formatted } = item || {};
  //           if (id && role == "assistant") {
  //             if (
  //               !currentBotMessage.current ||
  //               currentBotMessage.current?.id != id
  //             ) {
  //               // create the assistant message and save it to the session
  //               currentBotMessage.current = createMessage({ id, role });
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat([
  //                   currentBotMessage.current!,
  //                 ]);
  //               });
  //             }
  //             if (currentBotMessage.current?.id != id) {
  //               stopPlaying();
  //             }
  //             if (content) {
  //               currentBotMessage.current.content = content;
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat();
  //               });
  //             }
  //             if (delta?.audio) {
  //               // delta.audio is an Int16Array; play it directly
  //               addInt16PCM(delta.audio);
  //             }
  //             // console.log(
  //             //   "updated try save wavFile",
  //             //   status,
  //             //   currentBotMessage.current?.audio_url,
  //             //   formatted?.audio,
  //             // );
  //             if (
  //               status == "completed" &&
  //               !currentBotMessage.current?.audio_url &&
  //               formatted?.audio?.length
  //             ) {
  //               // save as a wav file; TODO: mp3 would take less space
  //               const botMessage = currentBotMessage.current;
  //               const wavFile = new WavPacker().pack(sampleRate, {
  //                 bitsPerSample: 16,
  //                 channelCount: 1,
  //                 data: formatted?.audio,
  //               });
  //               // attach the file to the item; wavFile.url is playable in an <audio> tag
  //               item.formatted.file = wavFile;
  //               uploadImageRemote(wavFile.blob).then((audio_url) => {
  //                 botMessage.audio_url = audio_url;
  //                 chatStore.updateCurrentSession((session) => {
  //                   session.messages = session.messages.concat();
  //                 });
  //               });
  //             }
  //             if (
  //               status == "completed" &&
  //               !currentBotMessage.current?.content
  //             ) {
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.filter(
  //                   (m) => m.id !== currentBotMessage.current?.id,
  //                 );
  //               });
  //             }
  //           }
  //           if (id && role == "user" && !text) {
  //             if (
  //               !currentUserMessage.current ||
  //               currentUserMessage.current?.id != id
  //             ) {
  //               // create the user message and save it to the session
  //               currentUserMessage.current = createMessage({ id, role });
  //               chatStore.updateCurrentSession((session) => {
  //                 session.messages = session.messages.concat([
  //                   currentUserMessage.current!,
  //                 ]);
  //               });
  //             }
  //             if (content) {
  //               // save as a wav file; TODO: mp3 would take less space
  //               const userMessage = currentUserMessage.current;
  //               const wavFile = new WavPacker().pack(sampleRate, {
  //                 bitsPerSample: 16,
  //                 channelCount: 1,
  //                 data: formatted?.audio,
  //               });
  //               // attach the file to the item; wavFile.url is playable in an <audio> tag
  //               item.formatted.file = wavFile;
  //               uploadImageRemote(wavFile.blob).then((audio_url) => {
  //                 // update message content
  //                 userMessage.content = content;
  //                 // update message audio_url
  //                 userMessage.audio_url = audio_url;
  //                 chatStore.updateCurrentSession((session) => {
  //                   session.messages = session.messages.concat();
  //                 });
  //               });
  //             }
  //           }
  //         });
  //       })
  //       .catch((e) => {
  //         console.error("Error", e);
  //       });
  //   }
  //   return () => {
  //     stop();
  //     // TODO: close the client
  //     clientRef.current?.disconnect();
  //   };
  // }, [isRecording.current]);
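
  // Notify the parent that the panel is closing, then tear down the connection.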
  const handleClose = () => {
    onClose?.();
    disconnect();
  };
  return (
    <div className={styles["realtime-chat"]}>
      <div
        className={clsx(styles["circle-mic"], {
          [styles["pulse"]]: true,
        })}
      >
        <div className={styles["icon-center"]}></div>
      </div>
      <div className={styles["bottom-icons"]}>
        <div>
          <IconButton
            icon={isRecording ? <VoiceOffIcon /> : <VoiceIcon />}
            onClick={toggleRecording}
            disabled={!isConnected}
            bordered
            shadow
          />
        </div>
        <div className={styles["icon-center"]}>
          <IconButton
            icon={<PowerIcon />}
            text={
              isConnecting
                ? "Connecting..."
                : isConnected
                  ? "Disconnect"
                  : "Connect"
            }
            onClick={handleConnect}
            disabled={isConnecting}
            bordered
            shadow
          />
        </div>
        <div>
          <IconButton
            icon={<Close24Icon />}
            onClick={handleClose}
            bordered
            shadow
          />
        </div>
      </div>
    </div>
  );
}