From 08f172544d7c04eed86a9a0e8cf14da1803b899c Mon Sep 17 00:00:00 2001 From: Jack Merrill Date: Mon, 5 May 2025 11:07:55 -0400 Subject: [PATCH] more work on TTS --- app/dashboard/documents/[id]/page.tsx | 4 +- app/dashboard/page.tsx | 2 + components/MarkdownRenderer.tsx | 28 ++--- components/TTSProvider.tsx | 110 +++++++++++-------- public/workers/kokoro-worker.js | 61 ++++++---- supabase/functions/process-document/index.ts | 24 +--- 6 files changed, 130 insertions(+), 99 deletions(-) diff --git a/app/dashboard/documents/[id]/page.tsx b/app/dashboard/documents/[id]/page.tsx index abd4ddd..d223d0b 100644 --- a/app/dashboard/documents/[id]/page.tsx +++ b/app/dashboard/documents/[id]/page.tsx @@ -54,7 +54,7 @@ export default async function DocumentPage(props: { params: { id: string } }) { } const { data: documents, error: documentsError } = await supabase .from("documents") - .select("id, file_name, created_at, owner") + .select("*") .eq("owner", user.id) .order("created_at", { ascending: false }); @@ -76,6 +76,8 @@ export default async function DocumentPage(props: { params: { id: string } }) { { return { + id: d.id, + disabled: d.is_processing, name: d.file_name, url: `/dashboard/documents/${d.id}`, emoji: "📄", diff --git a/app/dashboard/page.tsx b/app/dashboard/page.tsx index 330b40a..ad3a32a 100644 --- a/app/dashboard/page.tsx +++ b/app/dashboard/page.tsx @@ -47,6 +47,8 @@ export default async function Page() { { return { + id: d.id, + disabled: d.is_processing, name: d.file_name, url: `/dashboard/documents/${d.id}`, emoji: "📄", diff --git a/components/MarkdownRenderer.tsx b/components/MarkdownRenderer.tsx index e0eaefa..07d4c4c 100644 --- a/components/MarkdownRenderer.tsx +++ b/components/MarkdownRenderer.tsx @@ -20,7 +20,10 @@ export type OCRData = { index: number; images: string[]; markdown: string; - citations: Record; + citations: { + text: string; + number: string; + }[]; dimensions: { dpi: number; width: number; @@ -64,16 +67,15 @@ export default function MarkdownRenderer({ let totalCitations = 0; ocr.forEach((page) => { - Object.entries(page.citations).forEach(([key, value]) => { - if (value) { - totalCitations++; - citations.push({ - text: value, - page: page.index, - index: key, - number: Number(totalCitations), - }); - } + // each page has its own citations (1-N), so we need to map them correctly + page.citations.forEach((citation, index) => { + totalCitations += 1; + citations.push({ + text: citation.text, + page: page.index, + index: (totalCitations + index).toString(), // unique index across all pages + number: totalCitations + index + 1, // 1-based numbering + }); }); }); @@ -128,7 +130,8 @@ export default function MarkdownRenderer({ } const citation = citations.find( - (c) => c.index === referenceNumber && c.page === page.index + (c) => + c.index === referenceNumber || c.number.toString() === referenceNumber ); if (!citation) { @@ -146,7 +149,6 @@ export default function MarkdownRenderer({
- {/* Replace with actual reference content */}

{citation.text}

diff --git a/components/TTSProvider.tsx b/components/TTSProvider.tsx index d04e872..d3072e9 100644 --- a/components/TTSProvider.tsx +++ b/components/TTSProvider.tsx @@ -83,33 +83,35 @@ export const TTSProvider = ({ if (cached) { return cached; } - worker.current!.postMessage({ - type: "generate", - text: sentence, - voice: selectedSpeaker, - }); - setStatus("running"); - setLoadingMessage("Generating audio..."); + return new Promise((resolve, reject) => { - worker.current!.addEventListener( - "message", - (e: any) => { - if (e.data.status === "complete") { - localStorage.setItem(key, e.data.audio); - resolve(e.data.audio); - } else if (e.data.status === "error") { - toast.error(`Error generating audio: ${e.data.error}`); - reject(e.data.error); - } - }, - { once: true } - ); + const handleMessage = (e: MessageEvent) => { + if (e.data.index !== index) return; // Ignore messages for other indices + + if (e.data.status === "complete") { + localStorage.setItem(key, e.data.audio); + worker.current!.removeEventListener("message", handleMessage); // Clean up listener + resolve(e.data.audio); + } else if (e.data.status === "error") { + worker.current!.removeEventListener("message", handleMessage); // Clean up listener + toast.error(`Error generating audio: ${e.data.error}`); + reject(e.data.error); + } + }; + + worker.current!.addEventListener("message", handleMessage); + + worker.current!.postMessage({ + type: "generate", + index, + text: sentence, + voice: selectedSpeaker, + }); }); } // We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted. useEffect(() => { - // Create the worker if it does not yet exist. console.log("Initializing worker..."); worker.current ??= new Worker("/workers/kokoro-worker.js", { type: "module", @@ -117,7 +119,6 @@ export const TTSProvider = ({ console.log("Worker initialized"); - // Create a callback function for messages from the worker thread. const onMessageReceived = (e: any) => { switch (e.data.status) { case "device": @@ -132,56 +133,71 @@ export const TTSProvider = ({ break; case "complete": const { audio, text } = e.data; - // Generation complete: re-enable the "Generate" button setResults((prev) => [{ text, src: audio }, ...prev]); setStatus("ready"); break; } }; - console.log("onmessagereceived"); - const onErrorReceived = (e: any) => { console.error("Worker error:", e); setError(e.message); }; - console.log("Attaching event listeners to worker"); - - // Attach the callback function as an event listener. worker.current.addEventListener("message", onMessageReceived); worker.current.addEventListener("error", onErrorReceived); - console.log(worker.current); - // Define a cleanup function for when the component is unmounted. return () => { worker.current!.removeEventListener("message", onMessageReceived); worker.current!.removeEventListener("error", onErrorReceived); }; }, []); - // Pre-buffer current and next 2 sentences. + // Pre-buffer current and next 5 sentences. useEffect(() => { + let isCancelled = false; + async function preloadBuffer() { const newBuffer = [...ttsBuffer]; - const end = Math.min(sentences.length, currentSentence + 3); + const end = Math.min(sentences.length, currentSentence + 5); // Preload 5 sentences ahead + for (let i = currentSentence; i < end; i++) { + if (isCancelled) break; if (!newBuffer[i]) { console.log("Preloading TTS for sentence:", i, sentences[i]); - newBuffer[i] = await generateTTSForIndex( - removeMarkdown(sentences[i]), - i - ); + try { + newBuffer[i] = await generateTTSForIndex( + removeMarkdown(sentences[i]), + i + ); + } catch (error) { + console.error("Error preloading TTS:", error); + } } } - setTtsBuffer(newBuffer); + + if (!isCancelled) { + setTtsBuffer((prev) => { + // Only update state if the buffer has changed + if (JSON.stringify(prev) !== JSON.stringify(newBuffer)) { + return newBuffer; + } + return prev; + }); + } } + preloadBuffer(); - // eslint-disable-next-line react-hooks/exhaustive-deps - }, [currentSentence, sentences.join(" ")]); + + return () => { + isCancelled = true; // Cancel preloading if the component unmounts or dependencies change + }; + }, [currentSentence, sentences]); const playSentence = async (index: number) => { + if (index === currentSentence) return; // Prevent redundant updates setCurrentSentence(index); + let audioUrl = ttsBuffer[index]; if (!audioUrl) { audioUrl = await generateTTSForIndex( @@ -194,6 +210,7 @@ export const TTSProvider = ({ return updated; }); } + if (audioRef.current) { audioRef.current.src = audioUrl; await new Promise((res) => { @@ -211,16 +228,21 @@ export const TTSProvider = ({ const playInOrder = async (index: number) => { if (index < 0 || index >= sentences.length) return; - console.log("Playing in order from index:", index); + if (index === currentSentence && playing) return; // Prevent redundant playback setCurrentSentence(index); + setPlaying(true); + for (let i = index; i < sentences.length; i++) { console.log("Playing sentence:", i, sentences[i]); - await playSentence(i); - if (i < sentences.length - 1) { - console.log("Waiting for next sentence..."); - await new Promise((resolve) => setTimeout(resolve, 1000)); + try { + await playSentence(i); + } catch (error) { + console.error("Error playing sentence:", error); + break; // Stop playback on error } } + + setPlaying(false); }; const pause = () => { diff --git a/public/workers/kokoro-worker.js b/public/workers/kokoro-worker.js index 40a088b..dd9e267 100644 --- a/public/workers/kokoro-worker.js +++ b/public/workers/kokoro-worker.js @@ -1,6 +1,9 @@ console.log("Initializing Kokoro TTS Worker"); -import { KokoroTTS } from "https://cdn.jsdelivr.net/npm/kokoro-js@1.2.0/+esm"; +import { + KokoroTTS, + TextSplitterStream, +} from "https://cdn.jsdelivr.net/npm/kokoro-js@1.2.0/+esm"; async function detectWebGPU() { try { const adapter = await navigator.gpu.requestAdapter(); @@ -35,29 +38,43 @@ const tts = await KokoroTTS.from_pretrained(model_id, { }, }); +const splitter = new TextSplitterStream(); +const stream = tts.stream(splitter); +let index = 0; + +// Listen for messages from the main thread +self.addEventListener("message", async (e) => { + const { text, voice, index } = e.data; + + console.log( + `Generating speech for text: "${text}" with voice: ${voice}, index: ${index}` + ); + + // Push the text to the splitter + splitter.push(text); + splitter.push(""); // Signal the end of the text + + // Process the stream and include the correct index + for await (const { text: processedText, phonemes, audio } of stream) { + console.log({ processedText, phonemes }); + + const blob = audio.toBlob(); + const base64Audio = await blobToBase64(blob); + + self.postMessage({ + status: "complete", + audio: base64Audio, + text: processedText, + phonemes, + index, // Include the index from the original message + }); + + break; // Stop processing after the first chunk for this message + } +}); + console.log("Kokoro TTS model loaded successfully"); self.postMessage({ status: "ready", voices: tts.voices, device }); console.log("Available voices:", tts.voices); - -// Listen for messages from the main thread -self.addEventListener("message", async (e) => { - const { text, voice } = e.data; - - try { - // Generate speech - console.log(`Generating speech for text: "${text}" with voice: ${voice}`); - const audio = await tts.generate(text, { voice }); - - // Send the audio file back to the main thread - const blob = audio.toBlob(); - self.postMessage({ - status: "complete", - audio: await blobToBase64(blob), - text, - }); - } catch (error) { - self.postMessage({ status: "error", error: error.message }); - } -}); diff --git a/supabase/functions/process-document/index.ts b/supabase/functions/process-document/index.ts index af45377..d3d7a21 100644 --- a/supabase/functions/process-document/index.ts +++ b/supabase/functions/process-document/index.ts @@ -14,7 +14,7 @@ const client = new Mistral({ const PROCESSING_PROMPT = ` You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format. -The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. +The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** Any images should be included. Do not return the Markdown as a code block, only as a raw string, without any new lines. @@ -35,7 +35,7 @@ Return the final result as a text object with the following structure (without c "citations": [ { "number": 1, // The number as it appears in the text - "text": "Citation text 1" + "text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped }, { "number": 2, @@ -138,7 +138,7 @@ Deno.serve(async (req) => { message: "File ID found in form data.", }); const docId = formData.get("id"); - + console.log("Document ID:", docId, formData); const { data: documentData, error: documentError } = await supabase .from("documents") .select("*") @@ -170,24 +170,9 @@ Deno.serve(async (req) => { throw new Error("Document record not found"); } - const { data: storageData, error: storageError } = await supabaseServer - .from("storage.objects") - .select("name") - .eq("id", documentData.raw_file) - .single(); - - if (storageError) { - console.error("Error fetching file name:", storageError); - sendEvent("error", { - message: "Error fetching file name", - error: storageError, - }); - throw new Error("Storage data fetch failed"); - } - const { data: fileData, error: fileError } = await supabase.storage .from("documents") - .download(storageData.name); + .download(`${user.id}/${uuid}.pdf`); if (fileError) { console.error("Error downloading file from storage:", fileError); @@ -425,6 +410,7 @@ Deno.serve(async (req) => { const content = split[0].trim(); const citationsStr = split[1]?.trim() || "{}"; + console.log(`[${page.index}] Citations: ${citationsStr}`); const citations = JSON.parse(citationsStr).citations || {}; console.log("Generating Markdown for page:", page.index);