i gave up we're using elevenlabs
This commit is contained in:
parent
1f0f09d254
commit
2e2a0f28b4
126
app/actions.ts
126
app/actions.ts
@ -6,6 +6,8 @@ import { headers } from "next/headers";
|
||||
import { redirect } from "next/navigation";
|
||||
import { Provider } from "@supabase/supabase-js";
|
||||
import { revalidatePath } from "next/cache";
|
||||
import { ElevenLabsClient } from "elevenlabs";
|
||||
import { Readable } from "stream";
|
||||
|
||||
export const signUpAction = async (formData: FormData) => {
|
||||
const email = formData.get("email")?.toString();
|
||||
@ -75,79 +77,61 @@ export const signInAction = async (formData: FormData) => {
|
||||
redirect("/dashboard");
|
||||
};
|
||||
|
||||
/**
 * Server action behind the "forgot password" form.
 *
 * Reads `email` (required) and `callbackUrl` (optional) from the submitted
 * form and asks Supabase to send a password-reset email whose link points at
 * `/auth/callback` on the requesting origin. Every exit path returns the
 * result of `redirect` or `encodedRedirect`.
 */
export const forgotPasswordAction = async (formData: FormData) => {
  const email = formData.get("email")?.toString();
  const supabase = await createClient();
  const requestHeaders = await headers();
  const origin = requestHeaders.get("origin");
  const callbackUrl = formData.get("callbackUrl")?.toString();

  if (!email) {
    return encodedRedirect("error", "/forgot-password", "Email is required");
  }

  // The emailed link goes through the auth callback before landing on the
  // reset-password page.
  const resetLink = `${origin}/auth/callback?redirect_to=/protected/reset-password`;
  const result = await supabase.auth.resetPasswordForEmail(email, {
    redirectTo: resetLink,
  });

  if (result.error) {
    console.error(result.error.message);
    return encodedRedirect(
      "error",
      "/forgot-password",
      "Could not reset password"
    );
  }

  // A caller-supplied callback URL takes precedence over the default
  // success message.
  return callbackUrl
    ? redirect(callbackUrl)
    : encodedRedirect(
        "success",
        "/forgot-password",
        "Check your email for a link to reset your password."
      );
};
|
||||
|
||||
/**
 * Server action behind the "reset password" form.
 *
 * Validates that `password` and `confirmPassword` are both present and equal,
 * then updates the signed-in user's password through Supabase.
 *
 * Every branch explicitly returns the result of `encodedRedirect`, matching
 * `forgotPasswordAction`. This guarantees that a failed validation can never
 * fall through to `updateUser`, regardless of whether the redirect helper
 * interrupts execution by throwing.
 */
export const resetPasswordAction = async (formData: FormData) => {
  const supabase = await createClient();

  const password = formData.get("password")?.toString();
  const confirmPassword = formData.get("confirmPassword")?.toString();

  if (!password || !confirmPassword) {
    return encodedRedirect(
      "error",
      "/protected/reset-password",
      "Password and confirm password are required"
    );
  }

  if (password !== confirmPassword) {
    return encodedRedirect(
      "error",
      "/protected/reset-password",
      "Passwords do not match"
    );
  }

  const { error } = await supabase.auth.updateUser({ password });

  if (error) {
    return encodedRedirect(
      "error",
      "/protected/reset-password",
      "Password update failed"
    );
  }

  return encodedRedirect(
    "success",
    "/protected/reset-password",
    "Password updated"
  );
};
|
||||
|
||||
/**
 * Server action that signs the current user out of Supabase and then
 * redirects to the login page.
 */
export const signOutAction = async () => {
  const client = await createClient();
  await client.auth.signOut();

  return redirect("/login");
};
|
||||
|
||||
// ElevenLabs SDK client created once at module load and reused by the TTS
// action; the API key is read from the ELEVENLABS_API_KEY environment variable.
const elevenLabs = new ElevenLabsClient({ apiKey: process.env.ELEVENLABS_API_KEY });
|
||||
|
||||
/**
 * Server action that synthesizes one piece of text to MP3 audio via ElevenLabs.
 *
 * `previous_text` and `next_text` are forwarded to the API together with
 * `text`. The `voice` and `index` fields are accepted, but the request always
 * uses the fixed voice ID below.
 *
 * @returns A `Blob` of type `audio/mpeg` containing the complete audio.
 * @throws Error("Text is required") when `text` is empty.
 * @throws Error("Failed to generate TTS audio stream") when the API call or
 *   reading the stream fails.
 */
export const synthesizeTTSAction = async (data: {
  text: string;
  previous_text: string;
  next_text: string;
  voice: string;
  index: number;
}) => {
  console.log("Generating TTS for text:", data.text, "with voice:", data.voice);

  if (!data.text) {
    throw new Error("Text is required");
  }

  try {
    // Request the audio as a stream from ElevenLabs.
    const stream = await elevenLabs.textToSpeech.convertAsStream(
      "gUABw7pXQjhjt0kNFBTF",
      {
        text: data.text,
        output_format: "mp3_44100_128",
        voice_settings: {
          stability: 0.75,
          speed: 1.0,
          similarity_boost: 0.75,
        },
        previous_text: data.previous_text,
        next_text: data.next_text,
        model_id: "eleven_multilingual_v2",
      }
    );

    // Drain the stream fully, then hand the bytes back as a single Blob.
    const parts: Buffer[] = [];
    for await (const part of stream) {
      parts.push(part);
    }

    return new Blob([Buffer.concat(parts)], { type: "audio/mpeg" });
  } catch (error) {
    console.error("Error generating TTS:", error);
    throw new Error("Failed to generate TTS audio stream");
  }
};
|
||||
|
198
app/api/process-document/route.ts
Normal file
198
app/api/process-document/route.ts
Normal file
@ -0,0 +1,198 @@
|
||||
import { NextRequest, NextResponse } from "next/server";
|
||||
import { Mistral } from "@mistralai/mistralai";
|
||||
import pLimit from "p-limit";
|
||||
import { createClient } from "@/utils/supabase/server";
|
||||
|
||||
// CORS headers shared by this route: any origin is allowed, and the listed
// request headers are permitted.
const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers": [
    "authorization",
    "x-client-info",
    "apikey",
    "content-type",
  ].join(", "),
};
|
||||
|
||||
// Mistral SDK client shared by every request this module handles. The key is
// read once at module load; the non-null assertion means a missing
// MISTRAL_API_KEY is not detected here.
const apiKey = process.env.MISTRAL_API_KEY!;
const client = new Mistral({ apiKey: apiKey });
|
||||
|
||||
// System prompt sent with every OCR page. The model must answer with the
// cleaned Markdown, then a line of nine dashes, then a JSON object holding the
// extracted citations; the route handler splits the reply on that separator.
// The JSON example is kept free of comments so the model is not encouraged to
// emit output that JSON.parse would reject.
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.

The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.

No data or information should ever be removed, it should only be processed and formatted.

There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into the "citations" array shown below, with one entry per reference containing its number as it appears in the text and its text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.

The Markdown should be human-readable and well-formatted. The markdown string should be properly sanitized and should not break a JSON parser when returned as the final format.

Return the final result as a text object with the following structure (without code block formatting). The JSON part must be strictly valid JSON and must not contain comments:

"""
<processed markdown text>

---------

{
  "citations": [
    {
      "number": 1,
      "text": "Citation text 1"
    },
    {
      "number": 2,
      "text": "Citation text 2"
    }
  ]
}
"""

Do not return the text object as a code block, only as a raw string.
`;
|
||||
|
||||
/** One citation extracted from a page: its in-text number and its text. */
type Citation = { number: number; text: string };

/**
 * Builds a Citation from loosely-typed values, or returns null when the pair
 * is unusable. Numeric strings are accepted for the number so that
 * object-keyed input ({"1": "text"}) can be normalized too.
 */
function toCitation(number: unknown, text: unknown): Citation | null {
  const numeric =
    typeof number === "string" && number.trim() !== ""
      ? Number(number)
      : number;
  if (typeof numeric !== "number" || !Number.isFinite(numeric)) return null;
  if (typeof text !== "string") return null;
  return { number: numeric, text };
}

/**
 * Parses the citations section of a model reply.
 *
 * Accepts the documented shape `{"citations": [{"number": 1, "text": "…"}]}`
 * and, defensively, an object keyed by reference number
 * (`{"citations": {"1": "…"}}`). Malformed entries are dropped.
 *
 * @param citationsStr JSON text following the separator in the model reply.
 * @returns The normalized citations; an empty array when the input is not
 *   valid JSON or contains no usable citations. Never rejects.
 */
async function getCitations(citationsStr: string): Promise<Citation[]> {
  let parsed: unknown;
  try {
    parsed = JSON.parse(citationsStr);
  } catch (error) {
    // Model output is untrusted; a bad citations section must not fail the page.
    console.warn("Could not parse citations JSON:", error);
    return [];
  }

  const raw =
    typeof parsed === "object" && parsed !== null
      ? (parsed as { citations?: unknown }).citations
      : undefined;
  if (typeof raw !== "object" || raw === null) return [];

  const candidates: Array<Citation | null> = Array.isArray(raw)
    ? raw.map((entry: unknown) =>
        typeof entry === "object" && entry !== null
          ? toCitation(
              (entry as Record<string, unknown>).number,
              (entry as Record<string, unknown>).text
            )
          : null
      )
    : Object.entries(raw).map(([key, value]) => toCitation(key, value));

  return candidates.filter((c): c is Citation => c !== null);
}
|
||||
|
||||
// Separator the model is instructed to place between the Markdown and the
// citations JSON.
const SECTION_SEPARATOR = "---------";

/**
 * Splits a model reply into its Markdown part and its citations JSON text.
 * The LAST separator is used so that long dash runs inside the page content
 * (for example table rules) do not truncate the Markdown.
 */
function splitProcessedPage(output: string): {
  markdown: string;
  citationsStr: string;
} {
  const at = output.lastIndexOf(SECTION_SEPARATOR);
  if (at === -1) {
    return { markdown: output.trim(), citationsStr: "{}" };
  }
  return {
    markdown: output.slice(0, at).trim(),
    citationsStr: output.slice(at + SECTION_SEPARATOR.length).trim() || "{}",
  };
}

/**
 * Answers CORS preflight requests. A route module's POST export is only
 * invoked for POST requests, so the preflight reply needs its own handler.
 */
export async function OPTIONS() {
  return new NextResponse(null, {
    headers: {
      ...corsHeaders,
      "Access-Control-Allow-Methods": "POST, OPTIONS",
    },
  });
}

/**
 * Uploads a document, runs OCR on it and stores the processed pages.
 *
 * Expects multipart form data with `file`, `access_token` and
 * `refresh_token`. Steps: authenticate with the supplied tokens, store the
 * file in the `documents` bucket, insert a `documents` row flagged
 * `is_processing`, OCR the file with Mistral, post-process each page with a
 * chat completion (two at a time), then save the results on the row.
 *
 * @returns 200 with `{ message, results }` on success, 400 when a required
 *   field is missing, 500 with `{ error }` when any step fails.
 */
export async function POST(req: NextRequest) {
  const formData = await req.formData();
  const accessToken = formData.get("access_token");
  const refreshToken = formData.get("refresh_token");
  const file = formData.get("file");

  // `file` must be an uploaded file, not a plain text field.
  if (
    file === null ||
    typeof file === "string" ||
    typeof accessToken !== "string" ||
    !accessToken ||
    typeof refreshToken !== "string" ||
    !refreshToken
  ) {
    return NextResponse.json(
      {
        error: "Missing required fields: file, access_token, or refresh_token",
      },
      { status: 400 }
    );
  }

  const supabase = await createClient();

  const fileName = file.name;
  const uuid = crypto.randomUUID();
  // Tracks whether the row exists so a later failure can clear its
  // processing flag.
  let documentInserted = false;

  try {
    // Authenticate the user
    const {
      data: { user },
      error: sessionError,
    } = await supabase.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });

    if (sessionError) {
      throw new Error("Failed to set session: " + sessionError.message);
    }

    if (!user) {
      throw new Error("User not authenticated");
    }

    // Upload the file to Supabase storage
    const { data: storageData, error: storageError } = await supabase.storage
      .from("documents")
      .upload(`${user.id}/${uuid}.pdf`, file);

    if (storageError) {
      throw new Error("Failed to upload file: " + storageError.message);
    }

    // Insert document record
    const { error: docError } = await supabase.from("documents").insert({
      id: uuid,
      file_name: fileName,
      owner: user.id,
      raw_file: storageData.id,
      is_processing: true,
    });

    if (docError) {
      throw new Error("Failed to insert document record: " + docError.message);
    }
    documentInserted = true;

    // Upload file to Mistral
    const uploadedPdf = await client.files.upload({
      file: { fileName, content: file },
      purpose: "ocr",
    });

    const signedUrl = await client.files.getSignedUrl({
      fileId: uploadedPdf.id,
    });

    // Process OCR
    const ocrResponse = await client.ocr.process({
      model: "mistral-ocr-latest",
      document: { type: "document_url", documentUrl: signedUrl.url },
    });

    // Post-process pages with at most two chat completions in flight.
    const limit = pLimit(2);
    const promises = ocrResponse.pages.map((page, pageNumber) =>
      limit(async () => {
        const response = await client.chat.complete({
          model: "mistral-small-latest",
          messages: [
            {
              role: "system",
              content: [{ type: "text", text: PROCESSING_PROMPT }],
            },
            { role: "user", content: [{ type: "text", text: page.markdown }] },
          ],
        });

        // The completion may carry no choices or non-string content.
        const output = response.choices?.[0]?.message?.content;
        if (typeof output !== "string") {
          throw new Error(
            `Model returned no text content for page ${pageNumber + 1}`
          );
        }

        const { markdown, citationsStr } = splitProcessedPage(output);
        const citations = await getCitations(citationsStr);

        return {
          ...page,
          markdown,
          citations,
        };
      })
    );

    const results = await Promise.all(promises);

    // Update document record with OCR data
    const { error: updateError } = await supabase
      .from("documents")
      .update({ ocr_data: results, is_processing: false })
      .eq("id", uuid);

    if (updateError) {
      throw new Error(
        "Failed to update document record: " + updateError.message
      );
    }

    return NextResponse.json({
      message: "Document processed successfully",
      results,
    });
  } catch (error: unknown) {
    console.error("Error processing document:", error);

    // Best effort: do not leave the row flagged as processing forever.
    if (documentInserted) {
      try {
        const { error: resetError } = await supabase
          .from("documents")
          .update({ is_processing: false })
          .eq("id", uuid);
        if (resetError) {
          console.error("Failed to clear processing flag:", resetError.message);
        }
      } catch (resetFailure) {
        console.error("Failed to clear processing flag:", resetFailure);
      }
    }

    const message = error instanceof Error ? error.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
|
@ -22,6 +22,7 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
|
||||
playSentence,
|
||||
playInOrder,
|
||||
status,
|
||||
pause,
|
||||
} = useTTS();
|
||||
|
||||
const [playing, setPlaying] = useState(false);
|
||||
@ -40,6 +41,11 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
|
||||
playInOrder(currentSentence || 0);
|
||||
};
|
||||
|
||||
const paused = () => {
|
||||
setPlaying(false);
|
||||
pause();
|
||||
};
|
||||
|
||||
return (
|
||||
<div className="flex flex-col items-center justify-center pt-4 relative overflow-hidden font-sans">
|
||||
<div className="max-w-3xl w-full relative z-[2]">
|
||||
@ -48,13 +54,13 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
|
||||
variant="ghost"
|
||||
size="icon"
|
||||
className="h-10 w-10"
|
||||
onClick={play}
|
||||
onClick={status === "ready" ? play : paused}
|
||||
disabled={status === null}
|
||||
>
|
||||
{status === "running" ? (
|
||||
<Loader className="animate-spin" />
|
||||
) : (
|
||||
<span className="sr-only">Play</span>
|
||||
<span className="sr-only">{playing ? "Pause" : "Play"}</span>
|
||||
)}
|
||||
{playing ? <Pause /> : <Play />}
|
||||
</Button>
|
||||
|
@ -10,7 +10,7 @@ import React, {
|
||||
} from "react";
|
||||
import removeMarkdown from "remove-markdown";
|
||||
import { toast } from "sonner";
|
||||
import * as Echogarden from "echogarden";
|
||||
import { synthesizeTTSAction } from "@/app/actions";
|
||||
|
||||
// More robust sentence splitter using Intl.Segmenter for better accuracy.
|
||||
function splitIntoSentences(text: string): string[] {
|
||||
@ -50,7 +50,6 @@ export const TTSProvider = ({
|
||||
children: ReactNode;
|
||||
}) => {
|
||||
const supabase = createClient();
|
||||
// Combine pages and split into sentences.
|
||||
const fullText = pages.join("\n");
|
||||
const sentences = splitIntoSentences(fullText).filter(
|
||||
(sentence) => sentence.trim() !== "\\n" && sentence.trim() !== ""
|
||||
@ -63,30 +62,42 @@ export const TTSProvider = ({
|
||||
const [voices, setVoices] = useState<any[]>([]);
|
||||
const [status, setStatus] = useState<"ready" | "running" | null>("ready");
|
||||
|
||||
// Cache for preloaded audio
|
||||
const audioCache = useRef<Map<number, string>>(new Map());
|
||||
|
||||
/**
 * Preloads audio for up to `range` sentences starting at `startIndex` and
 * stores the resulting object URLs in `audioCache`.
 *
 * Sentences are synthesized one after another. The text is passed through
 * `removeMarkdown` first so that preloaded audio is generated from the same
 * text that `playSentence` uses when it synthesizes on demand. Failures are
 * logged and do not stop the remaining sentences from being preloaded.
 */
const preloadAudio = async (startIndex: number, range: number = 3) => {
  // Guard against a missing or negative start index.
  const first = Number.isInteger(startIndex) ? Math.max(0, startIndex) : 0;
  const end = Math.min(sentences.length, first + range);

  for (let i = first; i < end; i++) {
    if (audioCache.current.has(i)) continue;

    console.log(`Preloading audio for sentence ${i}: ${sentences[i]}`);
    try {
      const audioUrl = await generateTTS(removeMarkdown(sentences[i]), i);
      if (!audioUrl) continue;

      if (audioCache.current.has(i)) {
        // Another caller cached this sentence while we were waiting; keep the
        // existing entry and release the duplicate.
        URL.revokeObjectURL(audioUrl);
      } else {
        audioCache.current.set(i, audioUrl); // Cache the audio URL
      }
    } catch (error) {
      console.error(`Error preloading audio for sentence ${i}:`, error);
    }
  }
};
|
||||
|
||||
async function generateTTS(sentence: string, index: number): Promise<string> {
|
||||
try {
|
||||
// const { data, error } = await supabase.functions.invoke("generate-tts", {
|
||||
// body: {
|
||||
// text: sentence,
|
||||
// voice: selectedSpeaker,
|
||||
// index,
|
||||
// },
|
||||
// });
|
||||
|
||||
const { audio } = await Echogarden.synthesize(sentence, {
|
||||
engine: "kokoro",
|
||||
const blob = await synthesizeTTSAction({
|
||||
text: sentence,
|
||||
previous_text: sentences[index - 1] || "",
|
||||
next_text: sentences[index + 1] || "",
|
||||
voice: selectedSpeaker,
|
||||
index,
|
||||
});
|
||||
|
||||
setStatus("running");
|
||||
|
||||
if (!Buffer.isBuffer(audio)) {
|
||||
throw new Error("Audio data is not a buffer");
|
||||
if (!blob) {
|
||||
throw new Error("Failed to generate TTS");
|
||||
}
|
||||
|
||||
const blob = new Blob([audio], { type: "audio/wav" });
|
||||
|
||||
return URL.createObjectURL(blob);
|
||||
const audioUrl = URL.createObjectURL(blob);
|
||||
return audioUrl;
|
||||
} catch (error) {
|
||||
console.error("Error generating TTS:", error);
|
||||
toast.error("Failed to generate TTS. Please try again.");
|
||||
@ -99,7 +110,15 @@ export const TTSProvider = ({
|
||||
|
||||
const sentence = removeMarkdown(sentences[index]);
|
||||
try {
|
||||
const audioUrl = await generateTTS(sentence, index);
|
||||
let audioUrl = audioCache.current.get(index);
|
||||
if (!audioUrl) {
|
||||
console.log(
|
||||
`Audio not preloaded for sentence ${index}, generating on the fly.`
|
||||
);
|
||||
audioUrl = await generateTTS(sentence, index);
|
||||
audioCache.current.set(index, audioUrl); // Cache the generated audio
|
||||
}
|
||||
|
||||
if (audioRef.current) {
|
||||
audioRef.current.src = audioUrl;
|
||||
await new Promise((res) => {
|
||||
@ -125,6 +144,7 @@ export const TTSProvider = ({
|
||||
console.log("Playing sentence:", i, sentences[i]);
|
||||
try {
|
||||
await playSentence(i);
|
||||
preloadAudio(i + 1); // Preload the next sentence after playing
|
||||
} catch (error) {
|
||||
console.error("Error playing sentence:", error);
|
||||
break; // Stop playback on error
|
||||
@ -151,6 +171,11 @@ export const TTSProvider = ({
|
||||
}
|
||||
};
|
||||
|
||||
// Preload sentences when the current sentence changes
|
||||
useEffect(() => {
|
||||
preloadAudio(currentSentence);
|
||||
}, [currentSentence]);
|
||||
|
||||
const value: TTSContextType = {
|
||||
sentences,
|
||||
currentSentence,
|
||||
|
@ -27,7 +27,7 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
|
||||
body.append("access_token", data.session?.access_token || "");
|
||||
body.append("refresh_token", data.session?.refresh_token || "");
|
||||
|
||||
const edgeFunctionUrl = `${process.env.NEXT_PUBLIC_SUPABASE_URL}/functions/v1/process-document`;
|
||||
const edgeFunctionUrl = `/api/process-document`;
|
||||
|
||||
// Start listening to the SSE stream
|
||||
const eventSource = new SSE(edgeFunctionUrl, {
|
||||
|
@ -29,6 +29,7 @@
|
||||
"class-variance-authority": "^0.7.1",
|
||||
"clsx": "^2.1.1",
|
||||
"echogarden": "^2.7.0",
|
||||
"elevenlabs": "^1.57.0",
|
||||
"kokoro-js": "^1.2.0",
|
||||
"lucide-react": "^0.486.0",
|
||||
"next": "latest",
|
||||
|
Loading…
x
Reference in New Issue
Block a user