diff --git a/app/actions.ts b/app/actions.ts index 784a66e..a2f3899 100644 --- a/app/actions.ts +++ b/app/actions.ts @@ -6,6 +6,8 @@ import { headers } from "next/headers"; import { redirect } from "next/navigation"; import { Provider } from "@supabase/supabase-js"; import { revalidatePath } from "next/cache"; +import { ElevenLabsClient } from "elevenlabs"; +import { Readable } from "stream"; export const signUpAction = async (formData: FormData) => { const email = formData.get("email")?.toString(); @@ -75,79 +77,61 @@ export const signInAction = async (formData: FormData) => { redirect("/dashboard"); }; -export const forgotPasswordAction = async (formData: FormData) => { - const email = formData.get("email")?.toString(); - const supabase = await createClient(); - const origin = (await headers()).get("origin"); - const callbackUrl = formData.get("callbackUrl")?.toString(); - - if (!email) { - return encodedRedirect("error", "/forgot-password", "Email is required"); - } - - const { error } = await supabase.auth.resetPasswordForEmail(email, { - redirectTo: `${origin}/auth/callback?redirect_to=/protected/reset-password`, - }); - - if (error) { - console.error(error.message); - return encodedRedirect( - "error", - "/forgot-password", - "Could not reset password" - ); - } - - if (callbackUrl) { - return redirect(callbackUrl); - } - - return encodedRedirect( - "success", - "/forgot-password", - "Check your email for a link to reset your password." - ); -}; - -export const resetPasswordAction = async (formData: FormData) => { - const supabase = await createClient(); - - const password = formData.get("password") as string; - const confirmPassword = formData.get("confirmPassword") as string; - - if (!password || !confirmPassword) { - encodedRedirect( - "error", - "/protected/reset-password", - "Password and confirm password are required" - ); - } - - if (password !== confirmPassword) { - encodedRedirect( - "error", - "/protected/reset-password", - "Passwords do not match" - ); - } - - const { error } = await supabase.auth.updateUser({ - password: password, - }); - - if (error) { - encodedRedirect( - "error", - "/protected/reset-password", - "Password update failed" - ); - } - - encodedRedirect("success", "/protected/reset-password", "Password updated"); -}; - export const signOutAction = async () => { const supabase = await createClient(); await supabase.auth.signOut(); return redirect("/login"); }; + +const elevenLabs = new ElevenLabsClient({ + apiKey: process.env.ELEVENLABS_API_KEY, +}); + +export const synthesizeTTSAction = async (data: { + text: string; + previous_text: string; + next_text: string; + voice: string; + index: number; +}) => { + const { text, voice, index } = data; + console.log("Generating TTS for text:", text, "with voice:", voice); + + if (!text) { + throw new Error("Text is required"); + } + + try { + // Call ElevenLabs API to generate the audio + const audioStream = await elevenLabs.textToSpeech.convertAsStream( + "gUABw7pXQjhjt0kNFBTF", + { + text, + output_format: "mp3_44100_128", + voice_settings: { + stability: 0.75, + speed: 1.0, + similarity_boost: 0.75, + }, + previous_text: data.previous_text, + next_text: data.next_text, + model_id: "eleven_multilingual_v2", + } + ); + const chunks: Buffer[] = []; + + for await (const chunk of audioStream) { + chunks.push(chunk); + } + + const audioBuffer = Buffer.concat(chunks); + + // Create a Blob from the Buffer + const audioBlob = new Blob([audioBuffer], { type: "audio/mpeg" }); + + return audioBlob; + } catch (error) { + console.error("Error generating TTS:", error); + throw new Error("Failed to generate TTS audio stream"); + } +}; diff --git a/app/api/process-document/route.ts b/app/api/process-document/route.ts new file mode 100644 index 0000000..7f271af --- /dev/null +++ b/app/api/process-document/route.ts @@ -0,0 +1,198 @@ +import { NextRequest, NextResponse } from "next/server"; +import { Mistral } from "@mistralai/mistralai"; +import pLimit from "p-limit"; +import { createClient } from "@/utils/supabase/server"; + +const corsHeaders = { + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Headers": + "authorization, x-client-info, apikey, content-type", +}; + +const apiKey = process.env.MISTRAL_API_KEY!; +const client = new Mistral({ apiKey }); + +const PROCESSING_PROMPT = ` +You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format. + +The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** +Any images should be included. +Do not return the Markdown as a code block, only as a raw string, without any new lines. + +No data or information should ever be removed, it should only be processed and formatted. + +There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a tag**) and put them into an object where the key is the reference number and the value is the text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines. + +The Markdown should be human-readable and well-formatted. The markdown string should properly sanitized and should not break a JSON parser when returned as the final format. + +Return the final result as a text object with the following structure (without code block formatting): + +""" + + +--------- + +{ + "citations": [ + { + "number": 1, // The number as it appears in the text + "text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped + }, + { + "number": 2, + "text": "Citation text 2" + } + ] +} +""" + +Do not return the text object as a code block, only as a raw string. +`; + +async function getCitations(citationsStr: string) { + try { + const citations = JSON.parse(citationsStr).citations || {}; + + return + } + + +} + +export async function POST(req: NextRequest) { + if (req.method === "OPTIONS") { + return new NextResponse(null, { + headers: { + ...corsHeaders, + "Access-Control-Allow-Methods": "POST, OPTIONS", + }, + }); + } + + const formData = await req.formData(); + const accessToken = formData.get("access_token") as string; + const refreshToken = formData.get("refresh_token") as string; + + if (!formData.has("file") || !accessToken || !refreshToken) { + return NextResponse.json( + { + error: "Missing required fields: file, access_token, or refresh_token", + }, + { status: 400 } + ); + } + + const supabase = await createClient(); + + const file = formData.get("file") as File; + const fileName = file.name; + const uuid = crypto.randomUUID(); + + try { + // Authenticate the user + const { + data: { user }, + error: sessionError, + } = await supabase.auth.setSession({ + access_token: accessToken, + refresh_token: refreshToken, + }); + + if (sessionError) { + throw new Error("Failed to set session: " + sessionError.message); + } + + if (!user) { + throw new Error("User not authenticated"); + } + + // Upload the file to Supabase storage + const { data: storageData, error: storageError } = await supabase.storage + .from("documents") + .upload(`${user.id}/${uuid}.pdf`, file); + + if (storageError) { + throw new Error("Failed to upload file: " + storageError.message); + } + + // Insert document record + const { error: docError } = await supabase.from("documents").insert({ + id: uuid, + file_name: file.name, + owner: user.id, + raw_file: storageData.id, + is_processing: true, + }); + + if (docError) { + throw new Error("Failed to insert document record: " + docError.message); + } + + // Upload file to Mistral + const uploadedPdf = await client.files.upload({ + file: { fileName, content: file }, + purpose: "ocr", + }); + + const signedUrl = await client.files.getSignedUrl({ + fileId: uploadedPdf.id, + }); + + // Process OCR + const ocrResponse = await client.ocr.process({ + model: "mistral-ocr-latest", + document: { type: "document_url", documentUrl: signedUrl.url }, + }); + + const limit = pLimit(2); + const promises = ocrResponse.pages.map((page) => + limit(async () => { + const response = await client.chat.complete({ + model: "mistral-small-latest", + messages: [ + { + role: "system", + content: [{ type: "text", text: PROCESSING_PROMPT }], + }, + { role: "user", content: [{ type: "text", text: page.markdown }] }, + ], + }); + + const split = response.choices[0].message.content.split("---------"); + const content = split[0].trim(); + const citationsStr = split[1]?.trim() || "{}"; + console.log(citationsStr); + + const citations = await getCitations(citationsStr); + + return { + ...page, + markdown: content, + citations, + }; + }) + ); + + const results = await Promise.all(promises); + + // Update document record with OCR data + const { error: updateError } = await supabase + .from("documents") + .update({ ocr_data: results, is_processing: false }) + .eq("id", uuid); + + if (updateError) { + throw new Error( + "Failed to update document record: " + updateError.message + ); + } + + return NextResponse.json({ + message: "Document processed successfully", + results, + }); + } catch (error: any) { + console.error("Error processing document:", error); + return NextResponse.json({ error: error.message }, { status: 500 }); + } +} diff --git a/bun.lockb b/bun.lockb index e0ee5d7..f87940c 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/components/KokoroReader.tsx b/components/KokoroReader.tsx index 6c38a84..f80c9a4 100644 --- a/components/KokoroReader.tsx +++ b/components/KokoroReader.tsx @@ -22,6 +22,7 @@ export default function KokoroReader({ pages }: { pages: any[] }) { playSentence, playInOrder, status, + pause, } = useTTS(); const [playing, setPlaying] = useState(false); @@ -40,6 +41,11 @@ export default function KokoroReader({ pages }: { pages: any[] }) { playInOrder(currentSentence || 0); }; + const paused = () => { + setPlaying(false); + pause(); + }; + return (
@@ -48,13 +54,13 @@ export default function KokoroReader({ pages }: { pages: any[] }) { variant="ghost" size="icon" className="h-10 w-10" - onClick={play} + onClick={status === "ready" ? play : paused} disabled={status === null} > {status === "running" ? ( ) : ( - Play + {playing ? "Pause" : "Play"} )} {playing ? : } diff --git a/components/TTSProvider.tsx b/components/TTSProvider.tsx index 7b85e16..6c2338d 100644 --- a/components/TTSProvider.tsx +++ b/components/TTSProvider.tsx @@ -10,7 +10,7 @@ import React, { } from "react"; import removeMarkdown from "remove-markdown"; import { toast } from "sonner"; -import * as Echogarden from "echogarden"; +import { synthesizeTTSAction } from "@/app/actions"; // More robust sentence splitter using Intl.Segmenter for better accuracy. function splitIntoSentences(text: string): string[] { @@ -50,7 +50,6 @@ export const TTSProvider = ({ children: ReactNode; }) => { const supabase = createClient(); - // Combine pages and split into sentences. const fullText = pages.join("\n"); const sentences = splitIntoSentences(fullText).filter( (sentence) => sentence.trim() !== "\\n" && sentence.trim() !== "" @@ -63,30 +62,42 @@ export const TTSProvider = ({ const [voices, setVoices] = useState([]); const [status, setStatus] = useState<"ready" | "running" | null>("ready"); + // Cache for preloaded audio + const audioCache = useRef>(new Map()); + + // Preload audio for a range of sentences + const preloadAudio = async (startIndex: number, range: number = 3) => { + for ( + let i = startIndex; + i < Math.min(sentences.length, startIndex + range); + i++ + ) { + if (!audioCache.current.has(i)) { + console.log(`Preloading audio for sentence ${i}: ${sentences[i]}`); + try { + const audioUrl = await generateTTS(sentences[i], i); + audioCache.current.set(i, audioUrl); // Cache the audio URL + } catch (error) { + console.error(`Error preloading audio for sentence ${i}:`, error); + } + } + } + }; + async function generateTTS(sentence: string, index: number): Promise { try { - // const { data, error } = await supabase.functions.invoke("generate-tts", { - // body: { - // text: sentence, - // voice: selectedSpeaker, - // index, - // }, - // }); - - const { audio } = await Echogarden.synthesize(sentence, { - engine: "kokoro", + const blob = await synthesizeTTSAction({ + text: sentence, + previous_text: sentences[index - 1] || "", + next_text: sentences[index + 1] || "", voice: selectedSpeaker, + index, }); - - setStatus("running"); - - if (!Buffer.isBuffer(audio)) { - throw new Error("Audio data is not a buffer"); + if (!blob) { + throw new Error("Failed to generate TTS"); } - - const blob = new Blob([audio], { type: "audio/wav" }); - - return URL.createObjectURL(blob); + const audioUrl = URL.createObjectURL(blob); + return audioUrl; } catch (error) { console.error("Error generating TTS:", error); toast.error("Failed to generate TTS. Please try again."); @@ -99,7 +110,15 @@ export const TTSProvider = ({ const sentence = removeMarkdown(sentences[index]); try { - const audioUrl = await generateTTS(sentence, index); + let audioUrl = audioCache.current.get(index); + if (!audioUrl) { + console.log( + `Audio not preloaded for sentence ${index}, generating on the fly.` + ); + audioUrl = await generateTTS(sentence, index); + audioCache.current.set(index, audioUrl); // Cache the generated audio + } + if (audioRef.current) { audioRef.current.src = audioUrl; await new Promise((res) => { @@ -125,6 +144,7 @@ export const TTSProvider = ({ console.log("Playing sentence:", i, sentences[i]); try { await playSentence(i); + preloadAudio(i + 1); // Preload the next sentence after playing } catch (error) { console.error("Error playing sentence:", error); break; // Stop playback on error @@ -151,6 +171,11 @@ export const TTSProvider = ({ } }; + // Preload sentences when the current sentence changes + useEffect(() => { + preloadAudio(currentSentence); + }, [currentSentence]); + const value: TTSContextType = { sentences, currentSentence, diff --git a/components/UploadZone.tsx b/components/UploadZone.tsx index 58f6e47..2c52db0 100644 --- a/components/UploadZone.tsx +++ b/components/UploadZone.tsx @@ -27,7 +27,7 @@ export default function UploadZone({ user }: { user?: { id: string } }) { body.append("access_token", data.session?.access_token || ""); body.append("refresh_token", data.session?.refresh_token || ""); - const edgeFunctionUrl = `${process.env.NEXT_PUBLIC_SUPABASE_URL}/functions/v1/process-document`; + const edgeFunctionUrl = `/api/process-document`; // Start listening to the SSE stream const eventSource = new SSE(edgeFunctionUrl, { diff --git a/package.json b/package.json index 3e662aa..2dae9b9 100644 --- a/package.json +++ b/package.json @@ -29,6 +29,7 @@ "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "echogarden": "^2.7.0", + "elevenlabs": "^1.57.0", "kokoro-js": "^1.2.0", "lucide-react": "^0.486.0", "next": "latest",