i gave up we're using elevenlabs

2025-05-06 01:43:35 -04:00 · 2025-05-06 01:43:35 -04:00 · 2e2a0f28b4
commit 2e2a0f28b4
parent 1f0f09d254
7 changed files with 310 additions and 96 deletions
--- a/app/actions.ts
+++ b/app/actions.ts
@ -6,6 +6,8 @@ import { headers } from "next/headers";
 import { redirect } from "next/navigation";
 import { Provider } from "@supabase/supabase-js";
 import { revalidatePath } from "next/cache";
+import { ElevenLabsClient } from "elevenlabs";
+import { Readable } from "stream";

 export const signUpAction = async (formData: FormData) => {
  const email = formData.get("email")?.toString();
@ -75,79 +77,61 @@ export const signInAction = async (formData: FormData) => {
  redirect("/dashboard");
 };

-export const forgotPasswordAction = async (formData: FormData) => {
-  const email = formData.get("email")?.toString();
-  const supabase = await createClient();
-  const origin = (await headers()).get("origin");
-  const callbackUrl = formData.get("callbackUrl")?.toString();
-
-  if (!email) {
-    return encodedRedirect("error", "/forgot-password", "Email is required");
-  }
-
-  const { error } = await supabase.auth.resetPasswordForEmail(email, {
-    redirectTo: `${origin}/auth/callback?redirect_to=/protected/reset-password`,
-  });
-
-  if (error) {
-    console.error(error.message);
-    return encodedRedirect(
-      "error",
-      "/forgot-password",
-      "Could not reset password"
-    );
-  }
-
-  if (callbackUrl) {
-    return redirect(callbackUrl);
-  }
-
-  return encodedRedirect(
-    "success",
-    "/forgot-password",
-    "Check your email for a link to reset your password."
-  );
-};
-
-export const resetPasswordAction = async (formData: FormData) => {
-  const supabase = await createClient();
-
-  const password = formData.get("password") as string;
-  const confirmPassword = formData.get("confirmPassword") as string;
-
-  if (!password || !confirmPassword) {
-    encodedRedirect(
-      "error",
-      "/protected/reset-password",
-      "Password and confirm password are required"
-    );
-  }
-
-  if (password !== confirmPassword) {
-    encodedRedirect(
-      "error",
-      "/protected/reset-password",
-      "Passwords do not match"
-    );
-  }
-
-  const { error } = await supabase.auth.updateUser({
-    password: password,
-  });
-
-  if (error) {
-    encodedRedirect(
-      "error",
-      "/protected/reset-password",
-      "Password update failed"
-    );
-  }
-
-  encodedRedirect("success", "/protected/reset-password", "Password updated");
-};
-
 export const signOutAction = async () => {
  const supabase = await createClient();
  await supabase.auth.signOut();
  return redirect("/login");
 };
+// Module-level ElevenLabs client shared by the TTS action below; no check here that ELEVENLABS_API_KEY is set.
+const elevenLabs = new ElevenLabsClient({
+  apiKey: process.env.ELEVENLABS_API_KEY,
+});
+// Sends `text` (plus its neighbouring sentences as context) to ElevenLabs and returns the collected MP3 audio as a Blob.
+export const synthesizeTTSAction = async (data: {
+  text: string;
+  previous_text: string;
+  next_text: string;
+  voice: string;
+  index: number;
+}) => {
+  const { text, voice, index } = data; // `index` is not used below; `voice` is only logged
+  console.log("Generating TTS for text:", text, "with voice:", voice);
+
+  if (!text) {
+    throw new Error("Text is required"); // thrown outside the try, so this message reaches the caller as-is
+  }
+
+  try {
+    // Request the audio as a stream from ElevenLabs
+    const audioStream = await elevenLabs.textToSpeech.convertAsStream(
+      "gUABw7pXQjhjt0kNFBTF", // hard-coded voice id; the `voice` argument does not select it
+      {
+        text,
+        output_format: "mp3_44100_128",
+        voice_settings: {
+          stability: 0.75,
+          speed: 1.0,
+          similarity_boost: 0.75,
+        },
+        previous_text: data.previous_text,
+        next_text: data.next_text,
+        model_id: "eleven_multilingual_v2",
+      }
+    );
+    const chunks: Buffer[] = [];
+    // Drain the whole stream into memory before building the result
+    for await (const chunk of audioStream) {
+      chunks.push(chunk);
+    }
+
+    const audioBuffer = Buffer.concat(chunks);
+
+    // Wrap the bytes in a Blob tagged as MP3 (NOTE(review): confirm a Blob return value survives the action boundary to the client)
+    const audioBlob = new Blob([audioBuffer], { type: "audio/mpeg" });
+
+    return audioBlob;
+  } catch (error) {
+    console.error("Error generating TTS:", error); // the underlying error is only logged here
+    throw new Error("Failed to generate TTS audio stream"); // callers see this generic message instead
+  }
+};
--- a/app/api/process-document/route.ts
+++ b/app/api/process-document/route.ts
@ -0,0 +1,198 @@
+import { NextRequest, NextResponse } from "next/server";
+import { Mistral } from "@mistralai/mistralai";
+import pLimit from "p-limit";
+import { createClient } from "@/utils/supabase/server";
+// CORS headers spread into the OPTIONS response built inside POST below; the origin is a wildcard.
+const corsHeaders = {
+  "Access-Control-Allow-Origin": "*",
+  "Access-Control-Allow-Headers":
+    "authorization, x-client-info, apikey, content-type",
+};
+// Mistral SDK client created at module load; the `!` only silences the type checker, a missing key is not detected here.
+const apiKey = process.env.MISTRAL_API_KEY!;
+const client = new Mistral({ apiKey });
+// System prompt for the per-page cleanup call: asks for cleaned Markdown, a "---------" separator line, then a JSON object holding "citations".
+const PROCESSING_PROMPT = `
+You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
+
+The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
+Any images should be included.
+Do not return the Markdown as a code block, only as a raw string, without any new lines.
+
+No data or information should ever be removed, it should only be processed and formatted.
+
+There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into an object where the key is the reference number and the value is the text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.
+
+The Markdown should be human-readable and well-formatted. The markdown string should properly sanitized and should not break a JSON parser when returned as the final format.
+
+Return the final result as a text object with the following structure (without code block formatting):
+
+"""
+<processed markdown text>
+
+---------
+
+{
+  "citations": [
+    {
+      "number": 1, // The number as it appears in the text
+      "text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
+    },
+    {
+      "number": 2,
+      "text": "Citation text 2"
+    }
+  ]
+}
+"""
+
+Do not return the text object as a code block, only as a raw string.
+`; // NOTE(review): the prose asks for a number-keyed object while the example shows an array of {number, text} — confirm which shape consumers expect
+
+// Extracts `citations` from the model's JSON tail; LLM output may be invalid JSON, so failures are logged and yield {}.
+async function getCitations(citationsStr: string) {
+  try {
+    return JSON.parse(citationsStr).citations || {};
+  } catch (error) {
+    console.error("Failed to parse citations JSON:", error);
+    return {};
+  }
+}
+// Handles a document upload: stores the file in Supabase, OCRs it with Mistral, cleans each page with a chat model, and saves the pages on the document row.
+export async function POST(req: NextRequest) {
+  if (req.method === "OPTIONS") { // NOTE(review): this function is exported as POST, so this preflight branch looks unreachable — confirm and move it to an OPTIONS export
+    return new NextResponse(null, {
+      headers: {
+        ...corsHeaders,
+        "Access-Control-Allow-Methods": "POST, OPTIONS",
+      },
+    });
+  }
+
+  const formData = await req.formData();
+  const accessToken = formData.get("access_token") as string;
+  const refreshToken = formData.get("refresh_token") as string;
+
+  if (!formData.has("file") || !accessToken || !refreshToken) {
+    return NextResponse.json(
+      {
+        error: "Missing required fields: file, access_token, or refresh_token",
+      },
+      { status: 400 }
+    );
+  }
+
+  const supabase = await createClient();
+
+  const file = formData.get("file") as File; // unchecked cast: only presence of "file" was verified above, not that it is a File
+  const fileName = file.name;
+  const uuid = crypto.randomUUID(); // used both as the storage object name and as the documents row id
+
+  try {
+    // Authenticate by installing the tokens sent in the form as the Supabase session
+    const {
+      data: { user },
+      error: sessionError,
+    } = await supabase.auth.setSession({
+      access_token: accessToken,
+      refresh_token: refreshToken,
+    });
+
+    if (sessionError) {
+      throw new Error("Failed to set session: " + sessionError.message);
+    }
+
+    if (!user) {
+      throw new Error("User not authenticated");
+    }
+
+    // Upload the file to Supabase storage under the user's id
+    const { data: storageData, error: storageError } = await supabase.storage
+      .from("documents")
+      .upload(`${user.id}/${uuid}.pdf`, file); // always stored with a .pdf suffix; the upload's real type is not checked here
+
+    if (storageError) {
+      throw new Error("Failed to upload file: " + storageError.message);
+    }
+
+    // Insert the document record, flagged as still processing
+    const { error: docError } = await supabase.from("documents").insert({
+      id: uuid,
+      file_name: file.name,
+      owner: user.id,
+      raw_file: storageData.id,
+      is_processing: true, // NOTE(review): the catch block below never clears this flag when a later step fails
+    });
+
+    if (docError) {
+      throw new Error("Failed to insert document record: " + docError.message);
+    }
+
+    // Upload the same file to Mistral and obtain a signed URL for the OCR call
+    const uploadedPdf = await client.files.upload({
+      file: { fileName, content: file },
+      purpose: "ocr",
+    });
+
+    const signedUrl = await client.files.getSignedUrl({
+      fileId: uploadedPdf.id,
+    });
+
+    // Run OCR over the whole document
+    const ocrResponse = await client.ocr.process({
+      model: "mistral-ocr-latest",
+      document: { type: "document_url", documentUrl: signedUrl.url },
+    });
+
+    const limit = pLimit(2); // run at most two page clean-up requests at a time
+    const promises = ocrResponse.pages.map((page) =>
+      limit(async () => {
+        const response = await client.chat.complete({
+          model: "mistral-small-latest",
+          messages: [
+            {
+              role: "system",
+              content: [{ type: "text", text: PROCESSING_PROMPT }],
+            },
+            { role: "user", content: [{ type: "text", text: page.markdown }] },
+          ],
+        });
+        // Split on the separator the prompt asks for (NOTE(review): assumes choices[0].message.content is a non-null string — confirm)
+        const split = response.choices[0].message.content.split("---------");
+        const content = split[0].trim();
+        const citationsStr = split[1]?.trim() || "{}"; // no separator in the reply means no citations
+        console.log(citationsStr);
+
+        const citations = await getCitations(citationsStr);
+
+        return {
+            ...page,
+            markdown: content,
+            citations,
+        };
+      })
+    );
+
+    const results = await Promise.all(promises); // one rejected page rejects the whole batch
+
+    // Update document record with the processed pages and clear the processing flag
+    const { error: updateError } = await supabase
+      .from("documents")
+      .update({ ocr_data: results, is_processing: false })
+      .eq("id", uuid);
+
+    if (updateError) {
+      throw new Error(
+        "Failed to update document record: " + updateError.message
+      );
+    }
+
+    return NextResponse.json({
+      message: "Document processed successfully",
+      results,
+    });
+  } catch (error: any) {
+    console.error("Error processing document:", error);
+    return NextResponse.json({ error: error.message }, { status: 500 }); // every failure inside the try is reported as a 500 with its message
+  }
+}
--- a/bun.lockb
+++ b/bun.lockb
--- a/components/KokoroReader.tsx
+++ b/components/KokoroReader.tsx
@ -22,6 +22,7 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
    playSentence,
    playInOrder,
    status,
+    pause,
  } = useTTS();

  const [playing, setPlaying] = useState(false);
@ -40,6 +41,11 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
    playInOrder(currentSentence || 0);
  };

+  const paused = () => { // click handler used by the button when status is not "ready"
+    setPlaying(false); // `playing` drives the Play/Pause icon and label
+    pause(); // delegate to the `pause` callback obtained from useTTS()
+  };
+
  return (
    <div className="flex flex-col items-center justify-center pt-4 relative overflow-hidden font-sans">
      <div className="max-w-3xl w-full relative z-[2]">
@ -48,13 +54,13 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
            variant="ghost"
            size="icon"
            className="h-10 w-10"
-            onClick={play}
+            onClick={status === "ready" ? play : paused}
            disabled={status === null}
          >
            {status === "running" ? (
              <Loader className="animate-spin" />
            ) : (
-              <span className="sr-only">Play</span>
+              <span className="sr-only">{playing ? "Pause" : "Play"}</span>
            )}
            {playing ? <Pause /> : <Play />}
          </Button>
--- a/components/TTSProvider.tsx
+++ b/components/TTSProvider.tsx
@ -10,7 +10,7 @@ import React, {
 } from "react";
 import removeMarkdown from "remove-markdown";
 import { toast } from "sonner";
-import * as Echogarden from "echogarden";
+import { synthesizeTTSAction } from "@/app/actions";

 // More robust sentence splitter using Intl.Segmenter for better accuracy.
 function splitIntoSentences(text: string): string[] {
@ -50,7 +50,6 @@ export const TTSProvider = ({
  children: ReactNode;
 }) => {
  const supabase = createClient();
-  // Combine pages and split into sentences.
  const fullText = pages.join("\n");
  const sentences = splitIntoSentences(fullText).filter(
    (sentence) => sentence.trim() !== "\\n" && sentence.trim() !== ""
@ -63,30 +62,42 @@ export const TTSProvider = ({
  const [voices, setVoices] = useState<any[]>([]);
  const [status, setStatus] = useState<"ready" | "running" | null>("ready");

+  // Cache of generated audio URLs keyed by sentence index; held in a ref so it persists across renders
+  const audioCache = useRef<Map<number, string>>(new Map());
+
+  // Preload audio for up to `range` sentences starting at `startIndex`,
+  // skipping indices that are already in the cache.
+  const preloadAudio = async (startIndex: number, range: number = 3) => {
+    const end = Math.min(sentences.length, startIndex + range);
+    for (let i = startIndex; i < end; i++) {
+      if (audioCache.current.has(i)) continue;
+      console.log(`Preloading audio for sentence ${i}: ${sentences[i]}`);
+      try {
+        // Strip markdown just as the playback path does before generateTTS;
+        // otherwise the cached clip is synthesized from raw markdown text.
+        const audioUrl = await generateTTS(removeMarkdown(sentences[i]), i);
+        audioCache.current.set(i, audioUrl);
+      } catch (error) {
+        // Best-effort: a failed preload is logged and the loop moves on.
+        console.error(`Error preloading audio for sentence ${i}:`, error);
+      }
+    }
+  };
+
  async function generateTTS(sentence: string, index: number): Promise<string> {
    try {
-      // const { data, error } = await supabase.functions.invoke("generate-tts", {
-      //   body: {
-      //     text: sentence,
-      //     voice: selectedSpeaker,
-      //     index,
-      //   },
-      // });
-
-      const { audio } = await Echogarden.synthesize(sentence, {
-        engine: "kokoro",
+      const blob = await synthesizeTTSAction({
+        text: sentence,
+        previous_text: sentences[index - 1] || "",
+        next_text: sentences[index + 1] || "",
        voice: selectedSpeaker,
+        index,
      });
-
-      setStatus("running");
-
-      if (!Buffer.isBuffer(audio)) {
-        throw new Error("Audio data is not a buffer");
+      if (!blob) {
+        throw new Error("Failed to generate TTS");
      }
-
-      const blob = new Blob([audio], { type: "audio/wav" });
-
-      return URL.createObjectURL(blob);
+      const audioUrl = URL.createObjectURL(blob);
+      return audioUrl;
    } catch (error) {
      console.error("Error generating TTS:", error);
      toast.error("Failed to generate TTS. Please try again.");
@ -99,7 +110,15 @@ export const TTSProvider = ({

    const sentence = removeMarkdown(sentences[index]);
    try {
-      const audioUrl = await generateTTS(sentence, index);
+      let audioUrl = audioCache.current.get(index);
+      if (!audioUrl) {
+        console.log(
+          `Audio not preloaded for sentence ${index}, generating on the fly.`
+        );
+        audioUrl = await generateTTS(sentence, index);
+        audioCache.current.set(index, audioUrl); // Cache the generated audio
+      }
+
      if (audioRef.current) {
        audioRef.current.src = audioUrl;
        await new Promise((res) => {
@ -125,6 +144,7 @@ export const TTSProvider = ({
      console.log("Playing sentence:", i, sentences[i]);
      try {
        await playSentence(i);
+        preloadAudio(i + 1); // Preload the next sentence after playing
      } catch (error) {
        console.error("Error playing sentence:", error);
        break; // Stop playback on error
@ -151,6 +171,11 @@ export const TTSProvider = ({
    }
  };

+  // Whenever the current sentence changes, preload the window of sentences starting at it
+  useEffect(() => {
+    preloadAudio(currentSentence);
+  }, [currentSentence]);
+
  const value: TTSContextType = {
    sentences,
    currentSentence,
--- a/components/UploadZone.tsx
+++ b/components/UploadZone.tsx
@ -27,7 +27,7 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
    body.append("access_token", data.session?.access_token || "");
    body.append("refresh_token", data.session?.refresh_token || "");

-    const edgeFunctionUrl = `${process.env.NEXT_PUBLIC_SUPABASE_URL}/functions/v1/process-document`;
+    const edgeFunctionUrl = `/api/process-document`;

    // Start listening to the SSE stream
    const eventSource = new SSE(edgeFunctionUrl, {
--- a/package.json
+++ b/package.json
@ -29,6 +29,7 @@
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "echogarden": "^2.7.0",
+    "elevenlabs": "^1.57.0",
    "kokoro-js": "^1.2.0",
    "lucide-react": "^0.486.0",
    "next": "latest",