more work on TTS
This commit is contained in:
parent
16b552262e
commit
08f172544d
@ -54,7 +54,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
|
||||
}
|
||||
const { data: documents, error: documentsError } = await supabase
|
||||
.from("documents")
|
||||
.select("id, file_name, created_at, owner")
|
||||
.select("*")
|
||||
.eq("owner", user.id)
|
||||
.order("created_at", { ascending: false });
|
||||
|
||||
@ -76,6 +76,8 @@ export default async function DocumentPage(props: { params: { id: string } }) {
|
||||
<AppSidebar
|
||||
documents={documents.map((d) => {
|
||||
return {
|
||||
id: d.id,
|
||||
disabled: d.is_processing,
|
||||
name: d.file_name,
|
||||
url: `/dashboard/documents/${d.id}`,
|
||||
emoji: "📄",
|
||||
|
@ -47,6 +47,8 @@ export default async function Page() {
|
||||
<AppSidebar
|
||||
documents={documents.map((d) => {
|
||||
return {
|
||||
id: d.id,
|
||||
disabled: d.is_processing,
|
||||
name: d.file_name,
|
||||
url: `/dashboard/documents/${d.id}`,
|
||||
emoji: "📄",
|
||||
|
@ -20,7 +20,10 @@ export type OCRData = {
|
||||
index: number;
|
||||
images: string[];
|
||||
markdown: string;
|
||||
citations: Record<string, string>;
|
||||
citations: {
|
||||
text: string;
|
||||
number: string;
|
||||
}[];
|
||||
dimensions: {
|
||||
dpi: number;
|
||||
width: number;
|
||||
@ -64,16 +67,15 @@ export default function MarkdownRenderer({
|
||||
|
||||
let totalCitations = 0;
|
||||
ocr.forEach((page) => {
|
||||
Object.entries(page.citations).forEach(([key, value]) => {
|
||||
if (value) {
|
||||
totalCitations++;
|
||||
// each page has its own citations (1-N), so we need to map them correctly
|
||||
page.citations.forEach((citation, index) => {
|
||||
totalCitations += 1;
|
||||
citations.push({
|
||||
text: value,
|
||||
text: citation.text,
|
||||
page: page.index,
|
||||
index: key,
|
||||
number: Number(totalCitations),
|
||||
index: (totalCitations + index).toString(), // unique index across all pages
|
||||
number: totalCitations + index + 1, // 1-based numbering
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
@ -128,7 +130,8 @@ export default function MarkdownRenderer({
|
||||
}
|
||||
|
||||
const citation = citations.find(
|
||||
(c) => c.index === referenceNumber && c.page === page.index
|
||||
(c) =>
|
||||
c.index === referenceNumber || c.number.toString() === referenceNumber
|
||||
);
|
||||
|
||||
if (!citation) {
|
||||
@ -146,7 +149,6 @@ export default function MarkdownRenderer({
|
||||
</PopoverTrigger>
|
||||
<PopoverContent className="w-56 overflow-hidden rounded-lg p-0">
|
||||
<div className="p-4">
|
||||
{/* Replace with actual reference content */}
|
||||
<p>{citation.text}</p>
|
||||
</div>
|
||||
</PopoverContent>
|
||||
|
@ -83,33 +83,35 @@ export const TTSProvider = ({
|
||||
if (cached) {
|
||||
return cached;
|
||||
}
|
||||
worker.current!.postMessage({
|
||||
type: "generate",
|
||||
text: sentence,
|
||||
voice: selectedSpeaker,
|
||||
});
|
||||
setStatus("running");
|
||||
setLoadingMessage("Generating audio...");
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
worker.current!.addEventListener(
|
||||
"message",
|
||||
(e: any) => {
|
||||
const handleMessage = (e: MessageEvent) => {
|
||||
if (e.data.index !== index) return; // Ignore messages for other indices
|
||||
|
||||
if (e.data.status === "complete") {
|
||||
localStorage.setItem(key, e.data.audio);
|
||||
worker.current!.removeEventListener("message", handleMessage); // Clean up listener
|
||||
resolve(e.data.audio);
|
||||
} else if (e.data.status === "error") {
|
||||
worker.current!.removeEventListener("message", handleMessage); // Clean up listener
|
||||
toast.error(`Error generating audio: ${e.data.error}`);
|
||||
reject(e.data.error);
|
||||
}
|
||||
},
|
||||
{ once: true }
|
||||
);
|
||||
};
|
||||
|
||||
worker.current!.addEventListener("message", handleMessage);
|
||||
|
||||
worker.current!.postMessage({
|
||||
type: "generate",
|
||||
index,
|
||||
text: sentence,
|
||||
voice: selectedSpeaker,
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
// We use the `useEffect` hook to setup the worker as soon as the `App` component is mounted.
|
||||
useEffect(() => {
|
||||
// Create the worker if it does not yet exist.
|
||||
console.log("Initializing worker...");
|
||||
worker.current ??= new Worker("/workers/kokoro-worker.js", {
|
||||
type: "module",
|
||||
@ -117,7 +119,6 @@ export const TTSProvider = ({
|
||||
|
||||
console.log("Worker initialized");
|
||||
|
||||
// Create a callback function for messages from the worker thread.
|
||||
const onMessageReceived = (e: any) => {
|
||||
switch (e.data.status) {
|
||||
case "device":
|
||||
@ -132,56 +133,71 @@ export const TTSProvider = ({
|
||||
break;
|
||||
case "complete":
|
||||
const { audio, text } = e.data;
|
||||
// Generation complete: re-enable the "Generate" button
|
||||
setResults((prev) => [{ text, src: audio }, ...prev]);
|
||||
setStatus("ready");
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
console.log("onmessagereceived");
|
||||
|
||||
const onErrorReceived = (e: any) => {
|
||||
console.error("Worker error:", e);
|
||||
setError(e.message);
|
||||
};
|
||||
|
||||
console.log("Attaching event listeners to worker");
|
||||
|
||||
// Attach the callback function as an event listener.
|
||||
worker.current.addEventListener("message", onMessageReceived);
|
||||
worker.current.addEventListener("error", onErrorReceived);
|
||||
|
||||
console.log(worker.current);
|
||||
// Define a cleanup function for when the component is unmounted.
|
||||
return () => {
|
||||
worker.current!.removeEventListener("message", onMessageReceived);
|
||||
worker.current!.removeEventListener("error", onErrorReceived);
|
||||
};
|
||||
}, []);
|
||||
|
||||
// Pre-buffer current and next 2 sentences.
|
||||
// Pre-buffer current and next 5 sentences.
|
||||
useEffect(() => {
|
||||
let isCancelled = false;
|
||||
|
||||
async function preloadBuffer() {
|
||||
const newBuffer = [...ttsBuffer];
|
||||
const end = Math.min(sentences.length, currentSentence + 3);
|
||||
const end = Math.min(sentences.length, currentSentence + 5); // Preload 5 sentences ahead
|
||||
|
||||
for (let i = currentSentence; i < end; i++) {
|
||||
if (isCancelled) break;
|
||||
if (!newBuffer[i]) {
|
||||
console.log("Preloading TTS for sentence:", i, sentences[i]);
|
||||
try {
|
||||
newBuffer[i] = await generateTTSForIndex(
|
||||
removeMarkdown(sentences[i]),
|
||||
i
|
||||
);
|
||||
} catch (error) {
|
||||
console.error("Error preloading TTS:", error);
|
||||
}
|
||||
}
|
||||
setTtsBuffer(newBuffer);
|
||||
}
|
||||
|
||||
if (!isCancelled) {
|
||||
setTtsBuffer((prev) => {
|
||||
// Only update state if the buffer has changed
|
||||
if (JSON.stringify(prev) !== JSON.stringify(newBuffer)) {
|
||||
return newBuffer;
|
||||
}
|
||||
return prev;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
preloadBuffer();
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
}, [currentSentence, sentences.join(" ")]);
|
||||
|
||||
return () => {
|
||||
isCancelled = true; // Cancel preloading if the component unmounts or dependencies change
|
||||
};
|
||||
}, [currentSentence, sentences]);
|
||||
|
||||
const playSentence = async (index: number) => {
|
||||
if (index === currentSentence) return; // Prevent redundant updates
|
||||
setCurrentSentence(index);
|
||||
|
||||
let audioUrl = ttsBuffer[index];
|
||||
if (!audioUrl) {
|
||||
audioUrl = await generateTTSForIndex(
|
||||
@ -194,6 +210,7 @@ export const TTSProvider = ({
|
||||
return updated;
|
||||
});
|
||||
}
|
||||
|
||||
if (audioRef.current) {
|
||||
audioRef.current.src = audioUrl;
|
||||
await new Promise((res) => {
|
||||
@ -211,16 +228,21 @@ export const TTSProvider = ({
|
||||
|
||||
const playInOrder = async (index: number) => {
|
||||
if (index < 0 || index >= sentences.length) return;
|
||||
console.log("Playing in order from index:", index);
|
||||
if (index === currentSentence && playing) return; // Prevent redundant playback
|
||||
setCurrentSentence(index);
|
||||
setPlaying(true);
|
||||
|
||||
for (let i = index; i < sentences.length; i++) {
|
||||
console.log("Playing sentence:", i, sentences[i]);
|
||||
try {
|
||||
await playSentence(i);
|
||||
if (i < sentences.length - 1) {
|
||||
console.log("Waiting for next sentence...");
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000));
|
||||
} catch (error) {
|
||||
console.error("Error playing sentence:", error);
|
||||
break; // Stop playback on error
|
||||
}
|
||||
}
|
||||
|
||||
setPlaying(false);
|
||||
};
|
||||
|
||||
const pause = () => {
|
||||
|
@ -1,6 +1,9 @@
|
||||
console.log("Initializing Kokoro TTS Worker");
|
||||
|
||||
import { KokoroTTS } from "https://cdn.jsdelivr.net/npm/kokoro-js@1.2.0/+esm";
|
||||
import {
|
||||
KokoroTTS,
|
||||
TextSplitterStream,
|
||||
} from "https://cdn.jsdelivr.net/npm/kokoro-js@1.2.0/+esm";
|
||||
async function detectWebGPU() {
|
||||
try {
|
||||
const adapter = await navigator.gpu.requestAdapter();
|
||||
@ -35,29 +38,43 @@ const tts = await KokoroTTS.from_pretrained(model_id, {
|
||||
},
|
||||
});
|
||||
|
||||
const splitter = new TextSplitterStream();
|
||||
const stream = tts.stream(splitter);
|
||||
let index = 0;
|
||||
|
||||
// Listen for messages from the main thread
|
||||
self.addEventListener("message", async (e) => {
|
||||
const { text, voice, index } = e.data;
|
||||
|
||||
console.log(
|
||||
`Generating speech for text: "${text}" with voice: ${voice}, index: ${index}`
|
||||
);
|
||||
|
||||
// Push the text to the splitter
|
||||
splitter.push(text);
|
||||
splitter.push(""); // Signal the end of the text
|
||||
|
||||
// Process the stream and include the correct index
|
||||
for await (const { text: processedText, phonemes, audio } of stream) {
|
||||
console.log({ processedText, phonemes });
|
||||
|
||||
const blob = audio.toBlob();
|
||||
const base64Audio = await blobToBase64(blob);
|
||||
|
||||
self.postMessage({
|
||||
status: "complete",
|
||||
audio: base64Audio,
|
||||
text: processedText,
|
||||
phonemes,
|
||||
index, // Include the index from the original message
|
||||
});
|
||||
|
||||
break; // Stop processing after the first chunk for this message
|
||||
}
|
||||
});
|
||||
|
||||
console.log("Kokoro TTS model loaded successfully");
|
||||
|
||||
self.postMessage({ status: "ready", voices: tts.voices, device });
|
||||
|
||||
console.log("Available voices:", tts.voices);
|
||||
|
||||
// Listen for messages from the main thread
|
||||
self.addEventListener("message", async (e) => {
|
||||
const { text, voice } = e.data;
|
||||
|
||||
try {
|
||||
// Generate speech
|
||||
console.log(`Generating speech for text: "${text}" with voice: ${voice}`);
|
||||
const audio = await tts.generate(text, { voice });
|
||||
|
||||
// Send the audio file back to the main thread
|
||||
const blob = audio.toBlob();
|
||||
self.postMessage({
|
||||
status: "complete",
|
||||
audio: await blobToBase64(blob),
|
||||
text,
|
||||
});
|
||||
} catch (error) {
|
||||
self.postMessage({ status: "error", error: error.message });
|
||||
}
|
||||
});
|
||||
|
@ -14,7 +14,7 @@ const client = new Mistral({
|
||||
const PROCESSING_PROMPT = `
|
||||
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
|
||||
|
||||
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
|
||||
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
|
||||
Any images should be included.
|
||||
Do not return the Markdown as a code block, only as a raw string, without any new lines.
|
||||
|
||||
@ -35,7 +35,7 @@ Return the final result as a text object with the following structure (without c
|
||||
"citations": [
|
||||
{
|
||||
"number": 1, // The number as it appears in the text
|
||||
"text": "Citation text 1"
|
||||
"text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
|
||||
},
|
||||
{
|
||||
"number": 2,
|
||||
@ -138,7 +138,7 @@ Deno.serve(async (req) => {
|
||||
message: "File ID found in form data.",
|
||||
});
|
||||
const docId = formData.get("id");
|
||||
|
||||
console.log("Document ID:", docId, formData);
|
||||
const { data: documentData, error: documentError } = await supabase
|
||||
.from("documents")
|
||||
.select("*")
|
||||
@ -170,24 +170,9 @@ Deno.serve(async (req) => {
|
||||
throw new Error("Document record not found");
|
||||
}
|
||||
|
||||
const { data: storageData, error: storageError } = await supabaseServer
|
||||
.from("storage.objects")
|
||||
.select("name")
|
||||
.eq("id", documentData.raw_file)
|
||||
.single();
|
||||
|
||||
if (storageError) {
|
||||
console.error("Error fetching file name:", storageError);
|
||||
sendEvent("error", {
|
||||
message: "Error fetching file name",
|
||||
error: storageError,
|
||||
});
|
||||
throw new Error("Storage data fetch failed");
|
||||
}
|
||||
|
||||
const { data: fileData, error: fileError } = await supabase.storage
|
||||
.from("documents")
|
||||
.download(storageData.name);
|
||||
.download(`${user.id}/${uuid}.pdf`);
|
||||
|
||||
if (fileError) {
|
||||
console.error("Error downloading file from storage:", fileError);
|
||||
@ -425,6 +410,7 @@ Deno.serve(async (req) => {
|
||||
|
||||
const content = split[0].trim();
|
||||
const citationsStr = split[1]?.trim() || "{}";
|
||||
console.log(`[${page.index}] Citations: ${citationsStr}`);
|
||||
const citations = JSON.parse(citationsStr).citations || {};
|
||||
|
||||
console.log("Generating Markdown for page:", page.index);
|
||||
|
Loading…
x
Reference in New Issue
Block a user