Przeglądaj źródła

Upload the response audio and store its audio_url on the session message

lloydzhou 1 rok temu
rodzic
commit
cf46d5ad63
2 zmienionych plików z 49 dodań i 4 usunięć
  1. 19 4
      app/components/realtime-chat/realtime-chat.tsx
  2. 30 0
      app/lib/audio.ts

+ 19 - 4
app/components/realtime-chat/realtime-chat.tsx

@@ -25,6 +25,7 @@ import {
   TurnDetection,
 } from "rt-client";
 import { AudioHandler } from "@/app/lib/audio";
+import { uploadImage } from "@/app/utils/chat";
 
 interface RealtimeChatProps {
   onClose?: () => void;
@@ -126,11 +127,16 @@ export function RealtimeChat({
 
   const handleResponse = async (response: RTResponse) => {
     for await (const item of response) {
+      console.log("handleResponse", item);
       if (item.type === "message" && item.role === "assistant") {
         const botMessage = createMessage({
           role: item.role,
           content: "",
         });
+        // add bot message first
+        chatStore.updateTargetSession(session, (session) => {
+          session.messages = session.messages.concat([botMessage]);
+        });
         for await (const content of item) {
           if (content.type === "text") {
             for await (const text of content.textChunks()) {
@@ -149,12 +155,18 @@ export function RealtimeChat({
               }
             };
             await Promise.all([textTask(), audioTask()]);
-            chatStore.updateTargetSession(session, (session) => {
-              botMessage.date = new Date().toLocaleString();
-              session.messages = session.messages.concat([botMessage]);
-            });
           }
         }
+        // upload audio get audio_url
+        const blob = audioHandlerRef.current?.savePlayFile();
+        uploadImage(blob).then((audio_url) => {
+          botMessage.audio_url = audio_url;
+          botMessage.date = new Date().toLocaleString();
+          // update text and audio_url
+          chatStore.updateTargetSession((session) => {
+            session.messages = session.messages.concat();
+          });
+        });
       }
     }
   };
@@ -162,6 +174,9 @@ export function RealtimeChat({
   const handleInputAudio = async (item: RTInputAudioItem) => {
     audioHandlerRef.current?.stopStreamingPlayback();
     await item.waitForCompletion();
+    const { audioStartMillis, audioEndMillis } = item;
+    // TODO, save input audio_url, and update session
+    console.log("handleInputAudio", item, audioStartMillis, audioEndMillis);
     const userMessage = createMessage({
       role: "user",
       content: item.transcription,

+ 30 - 0
app/lib/audio.ts

@@ -8,6 +8,7 @@ export class AudioHandler {
   private nextPlayTime: number = 0;
   private isPlaying: boolean = false;
   private playbackQueue: AudioBufferSourceNode[] = [];
+  private playBuffer: Int16Array[] = [];
 
   constructor() {
     this.context = new AudioContext({ sampleRate: this.sampleRate });
@@ -84,12 +85,14 @@ export class AudioHandler {
     this.isPlaying = false;
     this.playbackQueue.forEach((source) => source.stop());
     this.playbackQueue = [];
+    this.playBuffer = [];
   }
 
   playChunk(chunk: Uint8Array) {
     if (!this.isPlaying) return;
 
     const int16Data = new Int16Array(chunk.buffer);
+    this.playBuffer.push.apply(this.playBuffer, int16Data); // save playBuffer
 
     const float32Data = new Float32Array(int16Data.length);
     for (let i = 0; i < int16Data.length; i++) {
@@ -125,6 +128,33 @@ export class AudioHandler {
       this.nextPlayTime = this.context.currentTime;
     }
   }
+  /**
+   * Wraps PCM samples in a 44-byte RIFF/WAVE header and returns the result
+   * as a Blob.
+   *
+   * The header describes single-channel audio at `this.sampleRate`. Every
+   * sample is written as a little-endian signed 16-bit integer.
+   *
+   * @param data - PCM samples; each element is stored as one 16-bit sample.
+   * @param bytesPerSample - Despite its name, this value is written into the
+   *   header's "bits per sample" field. Samples are always stored as 16 bits,
+   *   so only the default of 16 produces a self-consistent file.
+   * @returns A Blob containing the WAV header followed by the sample data.
+   */
+  _saveData(data: Int16Array, bytesPerSample = 16): Blob {
+    const headerLength = 44;
+    const numberOfChannels = 1;
+    const storedBytesPerSample = 2; // samples are always written via setInt16
+    const blockAlign = numberOfChannels * storedBytesPerSample;
+    const dataLength = data.length;
+    const dataBytes = dataLength * storedBytesPerSample;
+    const wav = new Uint8Array(headerLength + dataBytes);
+    const view = new DataView(wav.buffer);
+    // Chunk identifiers are ASCII tags, written big-endian so the bytes land
+    // in reading order; every numeric field is little-endian.
+    view.setUint32(0, 1380533830, false); // RIFF identifier 'RIFF'
+    view.setUint32(4, 36 + dataBytes, true); // total file size minus the 8-byte RIFF chunk header
+    view.setUint32(8, 1463899717, false); // RIFF type 'WAVE'
+    view.setUint32(12, 1718449184, false); // format chunk identifier 'fmt '
+    view.setUint32(16, 16, true); // format chunk length
+    view.setUint16(20, 1, true); // sample format (1 = uncompressed PCM)
+    view.setUint16(22, numberOfChannels, true); // channel count
+    view.setUint32(24, this.sampleRate, true); // sample rate
+    // Byte rate must equal sampleRate * blockAlign; a larger value makes
+    // players that rely on it under-report the clip duration.
+    view.setUint32(28, this.sampleRate * blockAlign, true); // byte rate
+    view.setUint16(32, blockAlign, true); // block align (channel count * bytes per sample)
+    view.setUint16(34, bytesPerSample, true); // bits per sample
+    view.setUint32(36, 1684108385, false); // data chunk identifier 'data'
+    view.setUint32(40, dataBytes, true); // data chunk length
+    for (let i = 0; i < dataLength; i++) {
+      view.setInt16(headerLength + i * storedBytesPerSample, data[i], true);
+    }
+    // The payload is a RIFF/WAVE file, so label it as WAV rather than MPEG.
+    return new Blob([view], { type: "audio/wav" });
+  }
+  /**
+   * Exports the audio captured in `playBuffer` by handing it to `_saveData`,
+   * which wraps it in a RIFF/WAVE container.
+   *
+   * @returns The Blob produced by `_saveData` for the current `playBuffer`.
+   */
+  savePlayFile() {
+    const recorded = this.playBuffer;
+    return this._saveData(recorded);
+  }
   async close() {
     this.workletNode?.disconnect();
     this.source?.disconnect();