realtime-chat.tsx

import VoiceIcon from "@/app/icons/voice.svg";
import VoiceOffIcon from "@/app/icons/voice-off.svg";
import PowerIcon from "@/app/icons/power.svg";
import styles from "./realtime-chat.module.scss";
import clsx from "clsx";
import { useState, useRef, useEffect } from "react";
import { useChatStore, createMessage, useAppConfig } from "@/app/store";
import { IconButton } from "@/app/components/button";
import {
  Modality,
  RTClient,
  RTInputAudioItem,
  RTResponse,
  TurnDetection,
} from "rt-client";
import { AudioHandler } from "@/app/lib/audio";
import { uploadImage } from "@/app/utils/chat";

interface RealtimeChatProps {
  onClose?: () => void;
  onStartVoice?: () => void;
  onPausedVoice?: () => void;
}
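
// Realtime voice chat panel: opens an rt-client session against the OpenAI or
// Azure realtime endpoint, streams microphone audio up, plays streamed audio
// responses back, and mirrors transcripts into the chat store.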
export function RealtimeChat({
  onClose,
  onStartVoice,
  onPausedVoice,
}: RealtimeChatProps) {
  const chatStore = useChatStore();
  const session = chatStore.currentSession();
  const config = useAppConfig();

  const [status, setStatus] = useState("");
  const [isRecording, setIsRecording] = useState(false);
  const [isConnected, setIsConnected] = useState(false);
  const [isConnecting, setIsConnecting] = useState(false);
  const [modality, setModality] = useState("audio");
  const [useVAD, setUseVAD] = useState(true);

  const clientRef = useRef<RTClient | null>(null);
  const audioHandlerRef = useRef<AudioHandler | null>(null);
  const initRef = useRef(false);

  const temperature = config.realtimeConfig.temperature;
  const apiKey = config.realtimeConfig.apiKey;
  const model = config.realtimeConfig.model;
  const azure = config.realtimeConfig.provider === "Azure";
  const azureEndpoint = config.realtimeConfig.azure.endpoint;
  const azureDeployment = config.realtimeConfig.azure.deployment;
  const voice = config.realtimeConfig.voice;
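
  // Connect / disconnect toggle: builds an RTClient (Azure or OpenAI),
  // configures the session, starts the server-event listener, and replays
  // recent chat history so the realtime session has conversational context.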
  const handleConnect = async () => {
    if (isConnecting) return;
    if (!isConnected) {
      try {
        setIsConnecting(true);
        clientRef.current = azure
          ? new RTClient(
              new URL(azureEndpoint),
              { key: apiKey },
              { deployment: azureDeployment },
            )
          : new RTClient({ key: apiKey }, { model });
        const modalities: Modality[] =
          modality === "audio" ? ["text", "audio"] : ["text"];
        const turnDetection: TurnDetection = useVAD
          ? { type: "server_vad" }
          : null;
        clientRef.current.configure({
          instructions: "",
          voice,
          input_audio_transcription: { model: "whisper-1" },
          turn_detection: turnDetection,
          tools: [],
          temperature,
          modalities,
        });
        startResponseListener();
        setIsConnected(true);
        try {
          const recentMessages = chatStore.getMessagesWithMemory();
          for (const message of recentMessages) {
            const { role, content } = message;
            if (typeof content === "string") {
              await clientRef.current.sendItem({
                type: "message",
                role: role as any,
                content: [
                  {
                    type: (role === "assistant" ? "text" : "input_text") as any,
                    text: content as string,
                  },
                ],
              });
            }
          }
        } catch (error) {
          console.error("Set message failed:", error);
        }
      } catch (error) {
        console.error("Connection failed:", error);
        setStatus("Connection failed");
      } finally {
        setIsConnecting(false);
      }
    } else {
      await disconnect();
    }
  };

  const disconnect = async () => {
    if (clientRef.current) {
      try {
        await clientRef.current.close();
        clientRef.current = null;
        setIsConnected(false);
      } catch (error) {
        console.error("Disconnect failed:", error);
      }
    }
  };
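
  // Iterate server events emitted by rt-client and dispatch them to the
  // response / input-audio handlers.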
  const startResponseListener = async () => {
    if (!clientRef.current) return;
    try {
      for await (const serverEvent of clientRef.current.events()) {
        if (serverEvent.type === "response") {
          await handleResponse(serverEvent);
        } else if (serverEvent.type === "input_audio") {
          await handleInputAudio(serverEvent);
        }
      }
    } catch (error) {
      // only report iteration errors while the client is still alive;
      // closing the client aborts this loop and is expected
      if (clientRef.current) {
        console.error("Response iteration error:", error);
      }
    }
  };
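
  // Stream one assistant response: append a bot message to the session,
  // accumulate text / transcript chunks into it, play audio chunks as they
  // arrive, then upload the played audio and attach its URL to the message.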
  const handleResponse = async (response: RTResponse) => {
    for await (const item of response) {
      if (item.type === "message" && item.role === "assistant") {
        const botMessage = createMessage({
          role: item.role,
          content: "",
        });
        // add the bot message to the session first, then stream into it
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat([botMessage]);
        });
        for await (const content of item) {
          if (content.type === "text") {
            for await (const text of content.textChunks()) {
              botMessage.content += text;
            }
          } else if (content.type === "audio") {
            const textTask = async () => {
              for await (const text of content.transcriptChunks()) {
                botMessage.content += text;
              }
            };
            const audioTask = async () => {
              audioHandlerRef.current?.startStreamingPlayback();
              for await (const audio of content.audioChunks()) {
                audioHandlerRef.current?.playChunk(audio);
              }
            };
            // consume transcript text and audio chunks concurrently
            await Promise.all([textTask(), audioTask()]);
          }
          // update message.content
          chatStore.updateTargetSession(session, (session) => {
            session.messages = session.messages.concat();
          });
        }
        // upload the played audio and get its audio_url
        const blob = audioHandlerRef.current?.savePlayFile();
        uploadImage(blob!).then((audio_url) => {
          botMessage.audio_url = audio_url;
          // update text and audio_url
          chatStore.updateTargetSession(session, (session) => {
            session.messages = session.messages.concat();
          });
        });
      }
    }
  };
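
  // A user audio turn has completed: record its transcription as a user
  // message and upload the captured audio clip.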
  const handleInputAudio = async (item: RTInputAudioItem) => {
    await item.waitForCompletion();
    if (item.transcription) {
      const userMessage = createMessage({
        role: "user",
        content: item.transcription,
      });
      chatStore.updateTargetSession(session, (session) => {
        session.messages = session.messages.concat([userMessage]);
      });
      // save the input audio_url on the user message and update the session
      const { audioStartMillis, audioEndMillis } = item;
      // upload the recorded audio and get its audio_url
      const blob = audioHandlerRef.current?.saveRecordFile(
        audioStartMillis,
        audioEndMillis,
      );
      uploadImage(blob!).then((audio_url) => {
        userMessage.audio_url = audio_url;
        chatStore.updateTargetSession(session, (session) => {
          session.messages = session.messages.concat();
        });
      });
    }
    // stop streaming playback once the input audio has been received
    audioHandlerRef.current?.stopStreamingPlayback();
  };
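
  // Start or stop microphone capture. With server VAD disabled, stopping also
  // commits the buffered audio and explicitly requests a response.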
  const toggleRecording = async () => {
    if (!isRecording && clientRef.current) {
      try {
        if (!audioHandlerRef.current) {
          audioHandlerRef.current = new AudioHandler();
          await audioHandlerRef.current.initialize();
        }
        await audioHandlerRef.current.startRecording(async (chunk) => {
          await clientRef.current?.sendAudio(chunk);
        });
        setIsRecording(true);
      } catch (error) {
        console.error("Failed to start recording:", error);
      }
    } else if (audioHandlerRef.current) {
      try {
        audioHandlerRef.current.stopRecording();
        if (!useVAD) {
          const inputAudio = await clientRef.current?.commitAudio();
          await handleInputAudio(inputAudio!);
          await clientRef.current?.generateResponse();
        }
        setIsRecording(false);
      } catch (error) {
        console.error("Failed to stop recording:", error);
      }
    }
  };
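
  // One-time setup on mount: initialize the audio handler, connect, and start
  // recording; tear everything down on unmount.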
  useEffect(() => {
    // prevent duplicate initialization
    if (initRef.current) return;
    initRef.current = true;

    const initAudioHandler = async () => {
      const handler = new AudioHandler();
      await handler.initialize();
      audioHandlerRef.current = handler;
      await handleConnect();
      await toggleRecording();
    };
    initAudioHandler().catch((error) => {
      setStatus(String(error));
      console.error(error);
    });

    return () => {
      if (isRecording) {
        toggleRecording();
      }
      audioHandlerRef.current?.close().catch(console.error);
      disconnect();
    };
  }, []);

  // update session params
  useEffect(() => {
    clientRef.current?.configure({ voice });
  }, [voice]);
  useEffect(() => {
    clientRef.current?.configure({ temperature });
  }, [temperature]);

  const handleClose = async () => {
    onClose?.();
    if (isRecording) {
      await toggleRecording();
    }
    disconnect().catch(console.error);
  };

  return (
    <div className={styles["realtime-chat"]}>
      <div
        className={clsx(styles["circle-mic"], {
          [styles["pulse"]]: isRecording,
        })}
      >
        <div className={styles["icon-center"]}></div>
      </div>
      <div className={styles["bottom-icons"]}>
        <div>
          <IconButton
            icon={isRecording ? <VoiceIcon /> : <VoiceOffIcon />}
            onClick={toggleRecording}
            disabled={!isConnected}
            shadow
            bordered
          />
        </div>
        <div className={styles["icon-center"]}>{status}</div>
        <div>
          <IconButton
            icon={<PowerIcon />}
            onClick={handleClose}
            shadow
            bordered
          />
        </div>
      </div>
    </div>
  );
}
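
// Usage sketch (assumption, not part of this file): a parent view would
// typically mount the panel conditionally and hide it again via onClose, e.g.
//
//   {showRealtimeChat && (
//     <RealtimeChat onClose={() => setShowRealtimeChat(false)} />
//   )}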