diff --git a/bun.lockb b/bun.lockb
index 84a569d..794f9fb 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/components/TTSProvider.tsx b/components/TTSProvider.tsx
index 4d3654b..d04e872 100644
--- a/components/TTSProvider.tsx
+++ b/components/TTSProvider.tsx
@@ -8,6 +8,7 @@ import React, {
   ReactNode,
 } from "react";
 import removeMarkdown from "remove-markdown";
+import { toast } from "sonner";
 
 // More robust sentence splitter using Intl.Segmenter for better accuracy.
 function splitIntoSentences(text: string): string[] {
@@ -49,7 +50,10 @@ export const TTSProvider = ({
 }) => {
   // Combine pages and split into sentences.
   const fullText = pages.join("\n");
-  const sentences = splitIntoSentences(fullText);
+  // Drop segments that are empty after trimming or are only a literal "\n" sequence.
+  const sentences = splitIntoSentences(fullText).filter(
+    (sentence) => sentence.trim() !== "\\n" && sentence.trim() !== ""
+  );
 
   const [currentSentence, setCurrentSentence] = useState(0);
   const [ttsBuffer, setTtsBuffer] = useState<(string | null)[]>(
@@ -95,6 +99,14 @@ export const TTSProvider = ({
-            localStorage.setItem(key, e.data.audio);
+            // The worker now sends audio as a base64 data URL, which can be
+            // large. Caching is best-effort: a failed write (e.g. quota
+            // exceeded) must not stop the promise below from resolving.
+            try {
+              localStorage.setItem(key, e.data.audio);
+            } catch (storageError) {
+              console.warn("Could not cache TTS audio:", storageError);
+            }
             resolve(e.data.audio);
           } else if (e.data.status === "error") {
+            toast.error(`Error generating audio: ${e.data.error}`);
             reject(e.data.error);
           }
         },
@@ -163,6 +175,7 @@ export const TTSProvider = ({
       const end = Math.min(sentences.length, currentSentence + 3);
       for (let i = currentSentence; i < end; i++) {
         if (!newBuffer[i]) {
+          console.log("Preloading TTS for sentence:", i, sentences[i]);
           newBuffer[i] = await generateTTSForIndex(
             removeMarkdown(sentences[i]),
             i
@@ -191,7 +204,14 @@ export const TTSProvider = ({
     }
     if (audioRef.current) {
       audioRef.current.src = audioUrl;
-      await audioRef.current.play();
+      const audio = audioRef.current;
+      // Wait until playback has finished so sentences can be played back to
+      // back; reject when playback cannot start or fails so callers never hang.
+      await new Promise<void>((resolve, reject) => {
+        audio.onended = () => resolve();
+        audio.onerror = () => reject(new Error("Audio playback failed"));
+        audio.play().catch(reject);
+      });
     }
   };
 
@@ -202,10 +222,13 @@ export const TTSProvider = ({
 
   const playInOrder = async (index: number) => {
     if (index < 0 || index >= sentences.length) return;
+    console.log("Playing in order from index:", index);
     setCurrentSentence(index);
     for (let i = index; i < sentences.length; i++) {
+      console.log("Playing sentence:", i, sentences[i]);
       await playSentence(i);
       if (i < sentences.length - 1) {
+        console.log("Waiting for next sentence...");
         await new Promise((resolve) => setTimeout(resolve, 1000));
       }
     }
diff --git a/public/workers/kokoro-worker.js b/public/workers/kokoro-worker.js
index 3b95646..40a088b 100644
--- a/public/workers/kokoro-worker.js
+++ b/public/workers/kokoro-worker.js
@@ -9,6 +9,20 @@ async function detectWebGPU() {
     return false;
   }
 }
+
+/**
+ * Reads a Blob with FileReader and resolves with its base64 data URL.
+ * Rejects with the reader's error if the blob cannot be read.
+ */
+function blobToBase64(blob) {
+  return new Promise((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onload = () => resolve(reader.result);
+    reader.onerror = () => reject(reader.error);
+    reader.readAsDataURL(blob);
+  });
+}
+
 // Device detection
 const device = (await detectWebGPU()) ? "webgpu" : "wasm";
 self.postMessage({ status: "device", device });
@@ -20,6 +34,14 @@ const model_id = "onnx-community/Kokoro-82M-v1.0-ONNX";
 const tts = await KokoroTTS.from_pretrained(model_id, {
   dtype: device === "wasm" ? "q8" : "fp32",
   device,
+  // NOTE(review): per the kokoro-js API the option is snake_case and receives
+  // a progress-info object (not a 0-1 fraction) - confirm against the docs.
+  progress_callback: (progress) => {
+    self.postMessage({ status: "progress", progress });
+    if (progress.status === "progress") {
+      console.log(`Loading progress: ${progress.file} ${Math.round(progress.progress)}%`);
+    }
+  },
 });
 console.log("Kokoro TTS model loaded successfully");
 
@@ -34,13 +56,14 @@ self.addEventListener("message", async (e) => {
 
   try {
     // Generate speech
+    console.log(`Generating speech for text: "${text}" with voice: ${voice}`);
     const audio = await tts.generate(text, { voice });
 
     // Send the audio file back to the main thread
     const blob = audio.toBlob();
     self.postMessage({
       status: "complete",
-      audio: URL.createObjectURL(blob),
+      audio: await blobToBase64(blob),
       text,
     });
   } catch (error) {