I gave up; we're using ElevenLabs

This commit is contained in:
Jack Merrill 2025-05-06 01:43:35 -04:00
parent 1f0f09d254
commit 2e2a0f28b4
No known key found for this signature in database
GPG Key ID: FD574AFF96E99636
7 changed files with 310 additions and 96 deletions

View File

@ -6,6 +6,8 @@ import { headers } from "next/headers";
import { redirect } from "next/navigation";
import { Provider } from "@supabase/supabase-js";
import { revalidatePath } from "next/cache";
import { ElevenLabsClient } from "elevenlabs";
import { Readable } from "stream";
export const signUpAction = async (formData: FormData) => {
const email = formData.get("email")?.toString();
@ -75,79 +77,61 @@ export const signInAction = async (formData: FormData) => {
redirect("/dashboard");
};
/**
 * Server action for the "forgot password" form.
 *
 * Reads `email` (required) and `callbackUrl` (optional) from the submitted
 * form and calls Supabase `resetPasswordForEmail` with `redirectTo` set to
 * `<origin>/auth/callback?redirect_to=/protected/reset-password`, where
 * `<origin>` is the request's `origin` header.
 *
 * Outcomes:
 * - no email: error redirect to `/forgot-password`
 * - Supabase reports an error: the message is logged, then error redirect
 * - success and a `callbackUrl` was supplied: redirect to `callbackUrl`
 * - success otherwise: success redirect to `/forgot-password`
 */
export const forgotPasswordAction = async (formData: FormData) => {
  // Small reader so both form fields are extracted the same way.
  const readField = (name: string) => formData.get(name)?.toString();

  const email = readField("email");
  const supabase = await createClient();
  const requestHeaders = await headers();
  const origin = requestHeaders.get("origin");
  const callbackUrl = readField("callbackUrl");

  if (!email) {
    return encodedRedirect("error", "/forgot-password", "Email is required");
  }

  const redirectTo = `${origin}/auth/callback?redirect_to=/protected/reset-password`;
  const { error: resetError } = await supabase.auth.resetPasswordForEmail(
    email,
    { redirectTo }
  );

  if (resetError) {
    console.error(resetError.message);
    return encodedRedirect(
      "error",
      "/forgot-password",
      "Could not reset password"
    );
  }

  // A caller-supplied callback URL takes precedence over the default
  // success banner.
  if (callbackUrl) {
    return redirect(callbackUrl);
  }

  return encodedRedirect(
    "success",
    "/forgot-password",
    "Check your email for a link to reset your password."
  );
};
/**
 * Server action for the "reset password" form.
 *
 * Reads `password` and `confirmPassword` from the form, checks that both are
 * present and equal, then calls Supabase `auth.updateUser` with the new
 * password. Every outcome is reported through `encodedRedirect` targeting
 * `/protected/reset-password`.
 *
 * NOTE(review): none of the `encodedRedirect` calls are returned, so the
 * early exits rely on `encodedRedirect` not returning normally (presumably
 * it wraps Next.js `redirect`, which throws — confirm).
 */
export const resetPasswordAction = async (formData: FormData) => {
  const supabase = await createClient();
  const password = formData.get("password") as string;
  const confirmPassword = formData.get("confirmPassword") as string;

  const resetPath = "/protected/reset-password";
  const fail = (message: string) =>
    encodedRedirect("error", resetPath, message);

  const missingField = !password || !confirmPassword;
  if (missingField) {
    fail("Password and confirm password are required");
  }

  if (password !== confirmPassword) {
    fail("Passwords do not match");
  }

  const { error: updateError } = await supabase.auth.updateUser({ password });
  if (updateError) {
    fail("Password update failed");
  }

  encodedRedirect("success", resetPath, "Password updated");
};
/**
 * Server action that signs the current user out through Supabase and then
 * redirects to `/login`.
 */
export const signOutAction = async () => {
  const { auth } = await createClient();
  await auth.signOut();
  return redirect("/login");
};
// ElevenLabs API client, constructed once at module load. The API key is read
// from the ELEVENLABS_API_KEY environment variable.
const elevenLabs = new ElevenLabsClient({
apiKey: process.env.ELEVENLABS_API_KEY,
});
/**
 * Server action that turns one sentence into MP3 audio via ElevenLabs.
 *
 * @param data.text           Sentence to synthesize; must be non-empty.
 * @param data.previous_text  Text preceding the sentence, forwarded to the API
 *                            as `previous_text`.
 * @param data.next_text      Text following the sentence, forwarded to the API
 *                            as `next_text`.
 * @param data.voice          Only written to the log; the voice ID sent to
 *                            ElevenLabs is the hard-coded one below.
 * @param data.index          Accepted but not used by this function.
 * @returns A `Blob` of type `audio/mpeg` holding the full audio.
 * @throws Error("Text is required") when `text` is empty.
 * @throws Error("Failed to generate TTS audio stream") when the API call or
 *         stream consumption fails (the underlying error is logged).
 */
export const synthesizeTTSAction = async (data: {
  text: string;
  previous_text: string;
  next_text: string;
  voice: string;
  index: number;
}) => {
  const { text, voice, previous_text, next_text } = data;
  console.log("Generating TTS for text:", text, "with voice:", voice);

  if (!text) {
    throw new Error("Text is required");
  }

  try {
    // Request a streamed MP3 rendition of the sentence from ElevenLabs.
    const stream = await elevenLabs.textToSpeech.convertAsStream(
      "gUABw7pXQjhjt0kNFBTF",
      {
        text,
        output_format: "mp3_44100_128",
        voice_settings: {
          stability: 0.75,
          speed: 1.0,
          similarity_boost: 0.75,
        },
        previous_text,
        next_text,
        model_id: "eleven_multilingual_v2",
      }
    );

    // Drain the stream fully before building the Blob.
    const pieces: Buffer[] = [];
    for await (const piece of stream) {
      pieces.push(piece);
    }

    return new Blob([Buffer.concat(pieces)], { type: "audio/mpeg" });
  } catch (error) {
    console.error("Error generating TTS:", error);
    throw new Error("Failed to generate TTS audio stream");
  }
};

View File

@ -0,0 +1,198 @@
import { NextRequest, NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import pLimit from "p-limit";
import { createClient } from "@/utils/supabase/server";
// CORS headers used by this route: any origin is allowed, and the listed
// request headers are permitted.
const corsHeaders = {
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Headers":
"authorization, x-client-info, apikey, content-type",
};
// Mistral API key from the environment. The `!` only silences the type
// checker; nothing here verifies at runtime that the variable is set.
const apiKey = process.env.MISTRAL_API_KEY!;
// Shared Mistral client, created once at module load.
const client = new Mistral({ apiKey });
// System prompt sent to the chat model together with each OCR'd page.
// It asks for the cleaned Markdown, then a line of nine dashes, then a JSON
// object with a `citations` array of `{ number, text }` entries. The POST
// handler in this file splits the model reply on that dash separator.
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.
No data or information should ever be removed, it should only be processed and formatted.
There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into an object where the key is the reference number and the value is the text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.
The Markdown should be human-readable and well-formatted. The markdown string should properly sanitized and should not break a JSON parser when returned as the final format.
Return the final result as a text object with the following structure (without code block formatting):
"""
<processed markdown text>
---------
{
"citations": [
{
"number": 1, // The number as it appears in the text
"text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
},
{
"number": 2,
"text": "Citation text 2"
}
]
}
"""
Do not return the text object as a code block, only as a raw string.
`;
/**
 * Parses the citations section of a model reply into a clean list.
 *
 * The input is expected to be a JSON object with a `citations` member. Both
 * shapes the prompt can plausibly produce are accepted:
 *   - an array of `{ number, text }` objects, or
 *   - a map of `"<number>": "<text>"`.
 *
 * Stray wrappers around the JSON (`"""` or a Markdown code fence) are
 * stripped before parsing. Entries without a numeric `number` and a string
 * `text` are dropped.
 *
 * @param citationsStr Raw text that should contain the citations JSON.
 * @returns The valid citations in input order; an empty array when the input
 *          is empty, is not valid JSON, or carries no usable citations.
 *          Never throws.
 */
async function getCitations(
  citationsStr: string
): Promise<Array<{ number: number; text: string }>> {
  const cleaned = citationsStr
    .trim()
    .replace(/^(?:"""|```(?:json)?)\s*/i, "")
    .replace(/\s*(?:"""|```)$/, "")
    .trim();

  if (cleaned === "") {
    return [];
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(cleaned);
  } catch (error) {
    // Model output is untrusted; a malformed block must not fail the page.
    console.warn("Could not parse citations JSON:", error);
    return [];
  }

  if (typeof parsed !== "object" || parsed === null) {
    return [];
  }
  const rawCitations = (parsed as { citations?: unknown }).citations;
  if (typeof rawCitations !== "object" || rawCitations === null) {
    return [];
  }

  // Normalize both accepted shapes to [number-ish, text-ish] pairs.
  const pairs: Array<[unknown, unknown]> = Array.isArray(rawCitations)
    ? rawCitations.map((entry: unknown): [unknown, unknown] => {
        if (typeof entry !== "object" || entry === null) {
          return [undefined, undefined];
        }
        const { number, text } = entry as { number?: unknown; text?: unknown };
        return [number, text];
      })
    : Object.entries(rawCitations);

  const citations: Array<{ number: number; text: string }> = [];
  for (const [rawNumber, rawText] of pairs) {
    // Only real numbers or non-empty numeric strings count; this avoids
    // Number(null) / Number("") silently becoming 0.
    let refNumber = Number.NaN;
    if (typeof rawNumber === "number") {
      refNumber = rawNumber;
    } else if (typeof rawNumber === "string" && rawNumber.trim() !== "") {
      refNumber = Number(rawNumber);
    }

    if (Number.isFinite(refNumber) && typeof rawText === "string") {
      citations.push({ number: refNumber, text: rawText });
    }
  }
  return citations;
}
/**
 * Answers CORS preflight requests for this route.
 *
 * Next.js route handlers are dispatched by the exported HTTP-method name, so
 * an OPTIONS request never reaches `POST`; the preflight response therefore
 * lives in its own export.
 */
export async function OPTIONS() {
  return new NextResponse(null, {
    headers: {
      ...corsHeaders,
      "Access-Control-Allow-Methods": "POST, OPTIONS",
    },
  });
}

/**
 * Flattens a chat-completion message `content` value into plain text.
 * Accepts a plain string or an array of chunks; chunks without a string
 * `text` member are ignored. Anything else yields an empty string.
 */
function messageContentToText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  return content
    .map((chunk: unknown) => {
      if (typeof chunk !== "object" || chunk === null) {
        return "";
      }
      const { text } = chunk as { text?: unknown };
      return typeof text === "string" ? text : "";
    })
    .join("");
}

/**
 * Splits a model reply into its Markdown part and its citations-JSON part.
 * The separator is the LAST run of nine or more dashes, so dash runs inside
 * the Markdown itself (tables, horizontal rules) do not truncate the content.
 * When no separator exists the whole reply is Markdown and citations default
 * to "{}".
 */
function splitProcessedPage(raw: string): {
  markdown: string;
  citationsStr: string;
} {
  const dashRun = /-{9,}/g;
  let separator: RegExpExecArray | null = null;
  for (let hit = dashRun.exec(raw); hit !== null; hit = dashRun.exec(raw)) {
    separator = hit;
  }

  if (separator === null) {
    return { markdown: raw.trim(), citationsStr: "{}" };
  }

  const markdown = raw.slice(0, separator.index).trim();
  const citationsStr =
    raw.slice(separator.index + separator[0].length).trim() || "{}";
  return { markdown, citationsStr };
}

/**
 * Handles a document upload: stores the file, OCRs it with Mistral, cleans
 * each page with a chat model, and saves the result on the document record.
 *
 * Expects multipart form data with:
 *   - `file`           the document to process
 *   - `access_token`   Supabase access token of the uploading user
 *   - `refresh_token`  Supabase refresh token of the uploading user
 *
 * Responses (all JSON, all carrying the CORS headers):
 *   - 400 when a required field is missing or `file` is not a file
 *   - 500 with `{ error }` when any processing step fails
 *   - 200 with `{ message, results }` on success
 *
 * NOTE(review): if a step fails after the document row has been inserted,
 * the row is left with `is_processing: true`; decide how failures should be
 * recorded.
 */
export async function POST(req: NextRequest) {
  const formData = await req.formData();
  const accessToken = formData.get("access_token");
  const refreshToken = formData.get("refresh_token");
  const file = formData.get("file");

  // FormData values are `string | File | null`; a string here means the
  // client sent a plain text field instead of a file.
  if (
    file === null ||
    typeof file === "string" ||
    typeof accessToken !== "string" ||
    accessToken === "" ||
    typeof refreshToken !== "string" ||
    refreshToken === ""
  ) {
    return NextResponse.json(
      {
        error: "Missing required fields: file, access_token, or refresh_token",
      },
      { status: 400, headers: corsHeaders }
    );
  }

  const supabase = await createClient();
  const fileName = file.name;
  const uuid = crypto.randomUUID();

  try {
    // Authenticate the user
    const {
      data: { user },
      error: sessionError,
    } = await supabase.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });
    if (sessionError) {
      throw new Error("Failed to set session: " + sessionError.message);
    }
    if (!user) {
      throw new Error("User not authenticated");
    }

    // Upload the file to Supabase storage
    const { data: storageData, error: storageError } = await supabase.storage
      .from("documents")
      .upload(`${user.id}/${uuid}.pdf`, file);
    if (storageError) {
      throw new Error("Failed to upload file: " + storageError.message);
    }

    // Insert document record
    const { error: docError } = await supabase.from("documents").insert({
      id: uuid,
      file_name: fileName,
      owner: user.id,
      raw_file: storageData.id,
      is_processing: true,
    });
    if (docError) {
      throw new Error("Failed to insert document record: " + docError.message);
    }

    // Upload file to Mistral
    const uploadedPdf = await client.files.upload({
      file: { fileName, content: file },
      purpose: "ocr",
    });
    const signedUrl = await client.files.getSignedUrl({
      fileId: uploadedPdf.id,
    });

    // Process OCR
    const ocrResponse = await client.ocr.process({
      model: "mistral-ocr-latest",
      document: { type: "document_url", documentUrl: signedUrl.url },
    });

    // Clean up pages with at most two chat requests in flight.
    const limit = pLimit(2);
    const promises = ocrResponse.pages.map((page) =>
      limit(async () => {
        const response = await client.chat.complete({
          model: "mistral-small-latest",
          messages: [
            {
              role: "system",
              content: [{ type: "text", text: PROCESSING_PROMPT }],
            },
            { role: "user", content: [{ type: "text", text: page.markdown }] },
          ],
        });

        const reply = messageContentToText(
          response.choices?.[0]?.message?.content
        );
        const { markdown, citationsStr } = splitProcessedPage(reply);
        console.log(citationsStr);
        const citations = await getCitations(citationsStr);

        return {
          ...page,
          // An empty reply must not wipe the page: keep the raw OCR text.
          markdown: markdown || page.markdown,
          citations,
        };
      })
    );
    const results = await Promise.all(promises);

    // Update document record with OCR data
    const { error: updateError } = await supabase
      .from("documents")
      .update({ ocr_data: results, is_processing: false })
      .eq("id", uuid);
    if (updateError) {
      throw new Error(
        "Failed to update document record: " + updateError.message
      );
    }

    return NextResponse.json(
      {
        message: "Document processed successfully",
        results,
      },
      { headers: corsHeaders }
    );
  } catch (error: unknown) {
    console.error("Error processing document:", error);
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json(
      { error: message },
      { status: 500, headers: corsHeaders }
    );
  }
}

BIN
bun.lockb

Binary file not shown.

View File

@ -22,6 +22,7 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
playSentence,
playInOrder,
status,
pause,
} = useTTS();
const [playing, setPlaying] = useState(false);
@ -40,6 +41,11 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
playInOrder(currentSentence || 0);
};
// Pause handler for the toolbar button: clears the local `playing` flag, then
// calls `pause` obtained from `useTTS()`.
const paused = () => {
setPlaying(false);
pause();
};
return (
<div className="flex flex-col items-center justify-center pt-4 relative overflow-hidden font-sans">
<div className="max-w-3xl w-full relative z-[2]">
@ -48,13 +54,13 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
variant="ghost"
size="icon"
className="h-10 w-10"
onClick={play}
onClick={status === "ready" ? play : paused}
disabled={status === null}
>
{status === "running" ? (
<Loader className="animate-spin" />
) : (
<span className="sr-only">Play</span>
<span className="sr-only">{playing ? "Pause" : "Play"}</span>
)}
{playing ? <Pause /> : <Play />}
</Button>

View File

@ -10,7 +10,7 @@ import React, {
} from "react";
import removeMarkdown from "remove-markdown";
import { toast } from "sonner";
import * as Echogarden from "echogarden";
import { synthesizeTTSAction } from "@/app/actions";
// More robust sentence splitter using Intl.Segmenter for better accuracy.
function splitIntoSentences(text: string): string[] {
@ -50,7 +50,6 @@ export const TTSProvider = ({
children: ReactNode;
}) => {
const supabase = createClient();
// Combine pages and split into sentences.
const fullText = pages.join("\n");
const sentences = splitIntoSentences(fullText).filter(
(sentence) => sentence.trim() !== "\\n" && sentence.trim() !== ""
@ -63,30 +62,42 @@ export const TTSProvider = ({
const [voices, setVoices] = useState<any[]>([]);
const [status, setStatus] = useState<"ready" | "running" | null>("ready");
// Cache for preloaded audio
const audioCache = useRef<Map<number, string>>(new Map());
/**
 * Generates and caches audio for up to `range` sentences starting at
 * `startIndex`, stopping at the end of `sentences`.
 *
 * Sentences are handled one after another. An index already present in
 * `audioCache` is skipped. A failure for one sentence is logged and does not
 * stop the remaining ones.
 */
const preloadAudio = async (startIndex: number, range: number = 3) => {
  let cursor = startIndex;
  while (cursor < Math.min(sentences.length, startIndex + range)) {
    const position = cursor;
    cursor += 1;

    if (audioCache.current.has(position)) {
      continue;
    }

    console.log(
      `Preloading audio for sentence ${position}: ${sentences[position]}`
    );
    try {
      const url = await generateTTS(sentences[position], position);
      audioCache.current.set(position, url);
    } catch (error) {
      console.error(
        `Error preloading audio for sentence ${position}:`,
        error
      );
    }
  }
};
async function generateTTS(sentence: string, index: number): Promise<string> {
try {
// const { data, error } = await supabase.functions.invoke("generate-tts", {
// body: {
// text: sentence,
// voice: selectedSpeaker,
// index,
// },
// });
const { audio } = await Echogarden.synthesize(sentence, {
engine: "kokoro",
const blob = await synthesizeTTSAction({
text: sentence,
previous_text: sentences[index - 1] || "",
next_text: sentences[index + 1] || "",
voice: selectedSpeaker,
index,
});
setStatus("running");
if (!Buffer.isBuffer(audio)) {
throw new Error("Audio data is not a buffer");
if (!blob) {
throw new Error("Failed to generate TTS");
}
const blob = new Blob([audio], { type: "audio/wav" });
return URL.createObjectURL(blob);
const audioUrl = URL.createObjectURL(blob);
return audioUrl;
} catch (error) {
console.error("Error generating TTS:", error);
toast.error("Failed to generate TTS. Please try again.");
@ -99,7 +110,15 @@ export const TTSProvider = ({
const sentence = removeMarkdown(sentences[index]);
try {
const audioUrl = await generateTTS(sentence, index);
let audioUrl = audioCache.current.get(index);
if (!audioUrl) {
console.log(
`Audio not preloaded for sentence ${index}, generating on the fly.`
);
audioUrl = await generateTTS(sentence, index);
audioCache.current.set(index, audioUrl); // Cache the generated audio
}
if (audioRef.current) {
audioRef.current.src = audioUrl;
await new Promise((res) => {
@ -125,6 +144,7 @@ export const TTSProvider = ({
console.log("Playing sentence:", i, sentences[i]);
try {
await playSentence(i);
preloadAudio(i + 1); // Preload the next sentence after playing
} catch (error) {
console.error("Error playing sentence:", error);
break; // Stop playback on error
@ -151,6 +171,11 @@ export const TTSProvider = ({
}
};
// Preload sentences when the current sentence changes
useEffect(() => {
preloadAudio(currentSentence);
}, [currentSentence]);
const value: TTSContextType = {
sentences,
currentSentence,

View File

@ -27,7 +27,7 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
body.append("access_token", data.session?.access_token || "");
body.append("refresh_token", data.session?.refresh_token || "");
const edgeFunctionUrl = `${process.env.NEXT_PUBLIC_SUPABASE_URL}/functions/v1/process-document`;
const edgeFunctionUrl = `/api/process-document`;
// Start listening to the SSE stream
const eventSource = new SSE(edgeFunctionUrl, {

View File

@ -29,6 +29,7 @@
"class-variance-authority": "^0.7.1",
"clsx": "^2.1.1",
"echogarden": "^2.7.0",
"elevenlabs": "^1.57.0",
"kokoro-js": "^1.2.0",
"lucide-react": "^0.486.0",
"next": "latest",