From 08f172544d7c04eed86a9a0e8cf14da1803b899c Mon Sep 17 00:00:00 2001
From: Jack Merrill <me@jackmerrill.com>
Date: Mon, 5 May 2025 11:07:55 -0400
Subject: [PATCH] more work on TTS

---
 app/dashboard/documents/[id]/page.tsx        |   4 +-
 app/dashboard/page.tsx                       |   2 +
 components/MarkdownRenderer.tsx              |  28 ++---
 components/TTSProvider.tsx                   | 110 +++++++++++--------
 public/workers/kokoro-worker.js              |  61 ++++++----
 supabase/functions/process-document/index.ts |  24 +---
 6 files changed, 130 insertions(+), 99 deletions(-)
diff --git a/app/dashboard/documents/[id]/page.tsx b/app/dashboard/documents/[id]/page.tsx
index abd4ddd..d223d0b 100644
--- a/app/dashboard/documents/[id]/page.tsx
+++ b/app/dashboard/documents/[id]/page.tsx
@@ -54,7 +54,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
   }
   const { data: documents, error: documentsError } = await supabase
     .from("documents")
-    .select("id, file_name, created_at, owner")
+    .select("*")
     .eq("owner", user.id)
     .order("created_at", { ascending: false });
 
@@ -76,6 +76,8 @@ export default async function DocumentPage(props: { params: { id: string } }) {
         <AppSidebar
           documents={documents.map((d) => {
             return {
+              id: d.id,
+              disabled: d.is_processing,
               name: d.file_name,
               url: `/dashboard/documents/${d.id}`,
               emoji: "📄",
diff --git a/app/dashboard/page.tsx b/app/dashboard/page.tsx
index 330b40a..ad3a32a 100644
--- a/app/dashboard/page.tsx
+++ b/app/dashboard/page.tsx
@@ -47,6 +47,8 @@ export default async function Page() {
       <AppSidebar
         documents={documents.map((d) => {
           return {
+            id: d.id,
+            disabled: d.is_processing,
             name: d.file_name,
             url: `/dashboard/documents/${d.id}`,
             emoji: "📄",
diff --git a/components/MarkdownRenderer.tsx b/components/MarkdownRenderer.tsx
index e0eaefa..07d4c4c 100644
--- a/components/MarkdownRenderer.tsx
+++ b/components/MarkdownRenderer.tsx
@@ -20,7 +20,10 @@ export type OCRData = {
   index: number;
   images: string[];
   markdown: string;
-  citations: Record<string, string>;
+  citations: {
+    text: string;
+    number: string;
+  }[];
   dimensions: {
     dpi: number;
     width: number;
@@ -64,16 +67,15 @@ export default function MarkdownRenderer({
 
   let totalCitations = 0;
   ocr.forEach((page) => {
-    Object.entries(page.citations).forEach(([key, value]) => {
-      if (value) {
-        totalCitations++;
-        citations.push({
-          text: value,
-          page: page.index,
-          index: key,
-          number: Number(totalCitations),
-        });
-      }
+    // Citation numbers restart on each page: key every entry by (page, in-text number).
+    (Array.isArray(page.citations) ? page.citations : []).forEach((citation) => {
+      totalCitations += 1;
+      citations.push({
+        text: citation.text,
+        page: page.index,
+        index: String(citation.number), // number as it appears in the page text
+        number: totalCitations, // 1-based running count across all pages
+      });
     });
   });
 
@@ -128,7 +130,8 @@ export default function MarkdownRenderer({
       }
 
       const citation = citations.find(
-        (c) => c.index === referenceNumber && c.page === page.index
+        (c) =>
+          c.page === page.index && c.index === referenceNumber
       );
 
       if (!citation) {
@@ -146,7 +149,6 @@ export default function MarkdownRenderer({
           </PopoverTrigger>
           <PopoverContent className="w-56 overflow-hidden rounded-lg p-0">
             <div className="p-4">
-              {/* Replace with actual reference content */}
               <p>{citation.text}</p>
             </div>
           </PopoverContent>
diff --git a/components/TTSProvider.tsx b/components/TTSProvider.tsx
index d04e872..d3072e9 100644
--- a/components/TTSProvider.tsx
+++ b/components/TTSProvider.tsx
@@ -83,33 +83,35 @@ export const TTSProvider = ({
     if (cached) {
       return cached;
     }
-    worker.current!.postMessage({
-      type: "generate",
-      text: sentence,
-      voice: selectedSpeaker,
-    });
-    setStatus("running");
-    setLoadingMessage("Generating audio...");
+
     return new Promise((resolve, reject) => {
-      worker.current!.addEventListener(
-        "message",
-        (e: any) => {
-          if (e.data.status === "complete") {
-            localStorage.setItem(key, e.data.audio);
-            resolve(e.data.audio);
-          } else if (e.data.status === "error") {
-            toast.error(`Error generating audio: ${e.data.error}`);
-            reject(e.data.error);
-          }
-        },
-        { once: true }
-      );
+      const handleMessage = (e: MessageEvent) => {
+        if (e.data.index !== index) return; // Ignore messages for other indices
+
+        if (e.data.status === "complete") {
+          localStorage.setItem(key, e.data.audio);
+          worker.current!.removeEventListener("message", handleMessage); // Clean up listener
+          resolve(e.data.audio);
+        } else if (e.data.status === "error") {
+          worker.current!.removeEventListener("message", handleMessage); // Clean up listener
+          toast.error(`Error generating audio: ${e.data.error}`);
+          reject(e.data.error);
+        }
+      };
+
+      worker.current!.addEventListener("message", handleMessage);
+
+      worker.current!.postMessage({
+        type: "generate",
+        index,
+        text: sentence,
+        voice: selectedSpeaker,
+      });
     });
   }
 
   // We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted.
   useEffect(() => {
-    // Create the worker if it does not yet exist.
     console.log("Initializing worker...");
     worker.current ??= new Worker("/workers/kokoro-worker.js", {
       type: "module",
@@ -117,7 +119,6 @@ export const TTSProvider = ({
 
     console.log("Worker initialized");
 
-    // Create a callback function for messages from the worker thread.
     const onMessageReceived = (e: any) => {
       switch (e.data.status) {
         case "device":
@@ -132,56 +133,71 @@ export const TTSProvider = ({
           break;
         case "complete":
           const { audio, text } = e.data;
-          // Generation complete: re-enable the "Generate" button
           setResults((prev) => [{ text, src: audio }, ...prev]);
           setStatus("ready");
           break;
       }
     };
 
-    console.log("onmessagereceived");
-
     const onErrorReceived = (e: any) => {
       console.error("Worker error:", e);
       setError(e.message);
     };
 
-    console.log("Attaching event listeners to worker");
-
-    // Attach the callback function as an event listener.
     worker.current.addEventListener("message", onMessageReceived);
     worker.current.addEventListener("error", onErrorReceived);
 
-    console.log(worker.current);
-    // Define a cleanup function for when the component is unmounted.
     return () => {
       worker.current!.removeEventListener("message", onMessageReceived);
       worker.current!.removeEventListener("error", onErrorReceived);
     };
   }, []);
 
-  // Pre-buffer current and next 2 sentences.
+  // Pre-buffer current and next 5 sentences.
   useEffect(() => {
+    let isCancelled = false;
+
     async function preloadBuffer() {
       const newBuffer = [...ttsBuffer];
-      const end = Math.min(sentences.length, currentSentence + 3);
+      const end = Math.min(sentences.length, currentSentence + 5); // Preload 5 sentences ahead
+
       for (let i = currentSentence; i < end; i++) {
+        if (isCancelled) break;
         if (!newBuffer[i]) {
           console.log("Preloading TTS for sentence:", i, sentences[i]);
-          newBuffer[i] = await generateTTSForIndex(
-            removeMarkdown(sentences[i]),
-            i
-          );
+          try {
+            newBuffer[i] = await generateTTSForIndex(
+              removeMarkdown(sentences[i]),
+              i
+            );
+          } catch (error) {
+            console.error("Error preloading TTS:", error);
+          }
         }
       }
-      setTtsBuffer(newBuffer);
+
+      if (!isCancelled) {
+        setTtsBuffer((prev) => {
+          // Only update state if the buffer has changed
+          if (JSON.stringify(prev) !== JSON.stringify(newBuffer)) {
+            return newBuffer;
+          }
+          return prev;
+        });
+      }
     }
+
     preloadBuffer();
-    // eslint-disable-next-line react-hooks/exhaustive-deps
-  }, [currentSentence, sentences.join(" ")]);
+
+    return () => {
+      isCancelled = true; // Cancel preloading if the component unmounts or dependencies change
+    };
+  }, [currentSentence, sentences]);
 
   const playSentence = async (index: number) => {
+    // No early return when index === currentSentence: playInOrder() starts from the current sentence and would skip it.
     setCurrentSentence(index);
+
     let audioUrl = ttsBuffer[index];
     if (!audioUrl) {
       audioUrl = await generateTTSForIndex(
@@ -194,6 +210,7 @@ export const TTSProvider = ({
         return updated;
       });
     }
+
     if (audioRef.current) {
       audioRef.current.src = audioUrl;
       await new Promise((res) => {
@@ -211,16 +228,21 @@ export const TTSProvider = ({
 
   const playInOrder = async (index: number) => {
     if (index < 0 || index >= sentences.length) return;
-    console.log("Playing in order from index:", index);
+    if (index === currentSentence && playing) return; // Prevent redundant playback
     setCurrentSentence(index);
+    setPlaying(true);
+
     for (let i = index; i < sentences.length; i++) {
       console.log("Playing sentence:", i, sentences[i]);
-      await playSentence(i);
-      if (i < sentences.length - 1) {
-        console.log("Waiting for next sentence...");
-        await new Promise((resolve) => setTimeout(resolve, 1000));
+      try {
+        await playSentence(i);
+      } catch (error) {
+        console.error("Error playing sentence:", error);
+        break; // Stop playback on error
       }
     }
+
+    setPlaying(false);
   };
 
   const pause = () => {
diff --git a/public/workers/kokoro-worker.js b/public/workers/kokoro-worker.js
index 40a088b..dd9e267 100644
--- a/public/workers/kokoro-worker.js
+++ b/public/workers/kokoro-worker.js
@@ -1,6 +1,9 @@
 console.log("Initializing Kokoro TTS Worker");
 
-import { KokoroTTS } from "https://cdn.jsdelivr.net/npm/kokoro-js@1.2.0/+esm";
+import {
+  KokoroTTS,
+  TextSplitterStream,
+} from "https://cdn.jsdelivr.net/npm/kokoro-js@1.2.0/+esm";
 async function detectWebGPU() {
   try {
     const adapter = await navigator.gpu.requestAdapter();
@@ -35,29 +38,43 @@ const tts = await KokoroTTS.from_pretrained(model_id, {
   },
 });
 
+// Each request is synthesized on its own with tts.generate(). A shared
+// tts.stream() generator must not be reused here: breaking out of
+// `for await` closes it, so every request after the first would get no reply.
+
+/**
+ * Handles a generation request from the main thread.
+ *
+ * Reads { text, voice, index } from e.data and always replies with a message
+ * that echoes `index`, so the caller can match the reply to its request:
+ *   { status: "complete", audio, text, index } on success,
+ *   { status: "error", error, index } on failure.
+ */
+self.addEventListener("message", async (e) => {
+  const { text, voice, index } = e.data;
+
+  console.log(
+    `Generating speech for text: "${text}" with voice: ${voice}, index: ${index}`
+  );
+
+  try {
+    // Generate the audio for this sentence with the requested voice.
+    const audio = await tts.generate(text, { voice });
+    const base64Audio = await blobToBase64(audio.toBlob());
+    self.postMessage({
+      status: "complete",
+      audio: base64Audio,
+      text,
+      index, // Echo the request index so the reply can be routed
+    });
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    self.postMessage({ status: "error", error: message, index });
+  }
+});
+
 console.log("Kokoro TTS model loaded successfully");
 
 self.postMessage({ status: "ready", voices: tts.voices, device });
 
 console.log("Available voices:", tts.voices);
-
-// Listen for messages from the main thread
-self.addEventListener("message", async (e) => {
-  const { text, voice } = e.data;
-
-  try {
-    // Generate speech
-    console.log(`Generating speech for text: "${text}" with voice: ${voice}`);
-    const audio = await tts.generate(text, { voice });
-
-    // Send the audio file back to the main thread
-    const blob = audio.toBlob();
-    self.postMessage({
-      status: "complete",
-      audio: await blobToBase64(blob),
-      text,
-    });
-  } catch (error) {
-    self.postMessage({ status: "error", error: error.message });
-  }
-});
diff --git a/supabase/functions/process-document/index.ts b/supabase/functions/process-document/index.ts
index af45377..d3d7a21 100644
--- a/supabase/functions/process-document/index.ts
+++ b/supabase/functions/process-document/index.ts
@@ -14,7 +14,7 @@ const client = new Mistral({
 const PROCESSING_PROMPT = `
 You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
 
-The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
+The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
 Any images should be included.
 Do not return the Markdown as a code block, only as a raw string, without any new lines.
 
@@ -35,7 +35,7 @@ Return the final result as a text object with the following structure (without c
   "citations": [
     {
       "number": 1, // The number as it appears in the text
-      "text": "Citation text 1"
+      "text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
     },
     {
       "number": 2,
@@ -138,7 +138,7 @@ Deno.serve(async (req) => {
           message: "File ID found in form data.",
         });
         const docId = formData.get("id");
-
+        console.log("Document ID:", docId, formData);
         const { data: documentData, error: documentError } = await supabase
           .from("documents")
           .select("*")
@@ -170,24 +170,9 @@ Deno.serve(async (req) => {
           throw new Error("Document record not found");
         }
 
-        const { data: storageData, error: storageError } = await supabaseServer
-          .from("storage.objects")
-          .select("name")
-          .eq("id", documentData.raw_file)
-          .single();
-
-        if (storageError) {
-          console.error("Error fetching file name:", storageError);
-          sendEvent("error", {
-            message: "Error fetching file name",
-            error: storageError,
-          });
-          throw new Error("Storage data fetch failed");
-        }
-
         const { data: fileData, error: fileError } = await supabase.storage
           .from("documents")
-          .download(storageData.name);
+          .download(`${user.id}/${uuid}.pdf`);
 
         if (fileError) {
           console.error("Error downloading file from storage:", fileError);
@@ -425,6 +410,7 @@ Deno.serve(async (req) => {
 
             const content = split[0].trim();
             const citationsStr = split[1]?.trim() || "{}";
+            console.log(`[${page.index}] Citations: ${citationsStr}`);
             const citations = JSON.parse(citationsStr).citations || {};
 
             console.log("Generating Markdown for page:", page.index);