refactor: update imports and improve TTS functionality with pause/resume support

This commit is contained in:
Jack Merrill 2025-05-07 19:30:38 -04:00
parent 2e2a0f28b4
commit 7f9bdee7f4
Signed by: jack
GPG Key ID: F6BFCA1B80EA6AF7
8 changed files with 272 additions and 86 deletions

View File

@ -115,7 +115,7 @@ export const synthesizeTTSAction = async (data: {
}, },
previous_text: data.previous_text, previous_text: data.previous_text,
next_text: data.next_text, next_text: data.next_text,
model_id: "eleven_multilingual_v2", model_id: "eleven_flash_v2_5", // use eleven_multilingual_v2 if this doesn't sound good
} }
); );
const chunks: Buffer[] = []; const chunks: Buffer[] = [];

View File

@ -15,7 +15,7 @@ const client = new Mistral({ apiKey });
const PROCESSING_PROMPT = ` const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format. You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** If there is a title to the document, it should be the first heading.
Any images should be included. Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines. Do not return the Markdown as a code block, only as a raw string, without any new lines.
@ -49,18 +49,36 @@ Return the final result as a text object with the following structure (without c
Do not return the text object as a code block, only as a raw string. Do not return the text object as a code block, only as a raw string.
`; `;
async function getCitations(citationsStr: string) { function getCitations(citationsStr: string) {
try { try {
const citations = JSON.parse(citationsStr).citations || {}; console.log("Parsing citations string:", citationsStr);
const citationsData = JSON.parse(citationsStr);
return console.log("Sanitizing citations...");
const sanitizedCitations = citationsData.citations.map((citation: any) => {
const sanitizedText = citation.text.replace(
/(https?:\/\/[^\s]+)/g,
(url: string) => encodeURI(url)
);
return {
...citation,
text: sanitizedText,
};
});
console.log("Sanitized citations:", sanitizedCitations);
return sanitizedCitations;
} catch (err) {
console.error("Error parsing or sanitizing citations:", err);
return [];
} }
} }
export async function POST(req: NextRequest) { export async function POST(req: NextRequest) {
console.log("Received POST request");
if (req.method === "OPTIONS") { if (req.method === "OPTIONS") {
console.log("Handling OPTIONS request");
return new NextResponse(null, { return new NextResponse(null, {
headers: { headers: {
...corsHeaders, ...corsHeaders,
@ -69,27 +87,16 @@ export async function POST(req: NextRequest) {
}); });
} }
const formData = await req.formData();
const accessToken = formData.get("access_token") as string;
const refreshToken = formData.get("refresh_token") as string;
if (!formData.has("file") || !accessToken || !refreshToken) {
return NextResponse.json(
{
error: "Missing required fields: file, access_token, or refresh_token",
},
{ status: 400 }
);
}
const supabase = await createClient();
const file = formData.get("file") as File;
const fileName = file.name;
const uuid = crypto.randomUUID();
try { try {
// Authenticate the user console.log("Parsing form data...");
const formData = await req.formData();
const accessToken = formData.get("access_token") as string;
const refreshToken = formData.get("refresh_token") as string;
console.log("Creating Supabase client...");
const supabase = await createClient();
console.log("Authenticating user...");
const { const {
data: { user }, data: { user },
error: sessionError, error: sessionError,
@ -99,54 +106,152 @@ export async function POST(req: NextRequest) {
}); });
if (sessionError) { if (sessionError) {
console.error("Failed to set session:", sessionError.message);
throw new Error("Failed to set session: " + sessionError.message); throw new Error("Failed to set session: " + sessionError.message);
} }
if (!user) { if (!user) {
console.error("User not authenticated");
throw new Error("User not authenticated"); throw new Error("User not authenticated");
} }
// Upload the file to Supabase storage var reprocessing = false;
const { data: storageData, error: storageError } = await supabase.storage var uuid = crypto.randomUUID();
.from("documents")
.upload(`${user.id}/${uuid}.pdf`, file);
if (storageError) { if (formData.has("id")) {
throw new Error("Failed to upload file: " + storageError.message); console.log("Reprocessing document...");
reprocessing = true;
console.log("File ID found in form data.");
const docId = formData.get("id");
console.log("Document ID:", docId, formData);
const { data: documentData, error: documentError } = await supabase
.from("documents")
.select("*")
.eq("id", docId!.toString())
.single();
if (documentError) {
console.error("Error fetching document record:", documentError);
throw new Error("Document record fetch failed");
}
if (documentData) {
await supabase
.from("documents")
.update({
is_processing: true,
})
.eq("id", documentData.id);
uuid = documentData.id;
} else {
console.error("Document record not found.");
throw new Error("Document record not found");
}
const { data: fileData, error: fileError } = await supabase.storage
.from("documents")
.download(`${user.id}/${uuid}.pdf`);
if (fileError) {
console.error("Error downloading file from storage:", fileError);
throw new Error("File download failed");
}
console.log("File downloaded from storage:", fileData);
formData.set("file", fileData);
} }
// Insert document record if (
const { error: docError } = await supabase.from("documents").insert({ !reprocessing &&
id: uuid, (!formData.has("file") || !accessToken || !refreshToken)
file_name: file.name, ) {
owner: user.id, console.error(
raw_file: storageData.id, "Missing required fields: file, access_token, or refresh_token"
is_processing: true, );
}); return NextResponse.json(
{
if (docError) { error:
throw new Error("Failed to insert document record: " + docError.message); "Missing required fields: file, access_token, or refresh_token",
},
{ status: 400 }
);
} }
// Upload file to Mistral let file = formData.get("file") as File;
const fileName = file.name;
if (!reprocessing) {
console.log("Generated UUID for file:", uuid);
console.log("Uploading file to Supabase storage...");
const { data: storageData, error: storageError } = await supabase.storage
.from("documents")
.upload(`${user.id}/${uuid}.pdf`, file);
if (storageError) {
console.error("Failed to upload file:", storageError.message);
throw new Error("Failed to upload file: " + storageError.message);
}
console.log("Inserting document record...");
const { error: docError } = await supabase.from("documents").insert({
id: uuid,
file_name: file.name,
owner: user.id,
raw_file: storageData.id,
is_processing: true,
});
if (docError) {
console.error("Failed to insert document record:", docError.message);
throw new Error(
"Failed to insert document record: " + docError.message
);
}
} else {
console.log("Reprocessing document...");
const { error: docError } = await supabase
.from("documents")
.update({
is_processing: true,
})
.eq("id", uuid);
if (docError) {
console.error("Error updating document record:", docError);
throw new Error("Document record update failed");
}
console.log("Document record updated successfully.");
}
console.log("Uploading file to Mistral...");
const uploadedPdf = await client.files.upload({ const uploadedPdf = await client.files.upload({
file: { fileName, content: file }, file: { fileName: `${uuid}.pdf`, content: file },
purpose: "ocr", purpose: "ocr",
}); });
console.log("Getting signed URL from Mistral...");
const signedUrl = await client.files.getSignedUrl({ const signedUrl = await client.files.getSignedUrl({
fileId: uploadedPdf.id, fileId: uploadedPdf.id,
}); });
// Process OCR console.log("Processing OCR...");
const ocrResponse = await client.ocr.process({ const ocrResponse = await client.ocr.process({
model: "mistral-ocr-latest", model: "mistral-ocr-latest",
document: { type: "document_url", documentUrl: signedUrl.url }, document: { type: "document_url", documentUrl: signedUrl.url },
includeImageBase64: true,
}); });
console.log("Processing OCR pages...");
const limit = pLimit(2); const limit = pLimit(2);
const promises = ocrResponse.pages.map((page) => const promises = ocrResponse.pages.map((page) =>
limit(async () => { limit(async () => {
console.log("Processing page:", page);
const response = await client.chat.complete({ const response = await client.chat.complete({
model: "mistral-small-latest", model: "mistral-small-latest",
messages: [ messages: [
@ -161,32 +266,34 @@ export async function POST(req: NextRequest) {
const split = response.choices[0].message.content.split("---------"); const split = response.choices[0].message.content.split("---------");
const content = split[0].trim(); const content = split[0].trim();
const citationsStr = split[1]?.trim() || "{}"; const citationsStr = split[1]?.trim() || "{}";
console.log(citationsStr); console.log("Citations string:", citationsStr);
const citations = await getCitations(citationsStr); const citations = getCitations(citationsStr);
return { return {
...page, ...page,
markdown: content, markdown: content,
citations, citations,
}; };
}) })
); );
const results = await Promise.all(promises); const results = await Promise.all(promises);
// Update document record with OCR data console.log("Updating document record with OCR data...");
const { error: updateError } = await supabase const { error: updateError } = await supabase
.from("documents") .from("documents")
.update({ ocr_data: results, is_processing: false }) .update({ ocr_data: results, is_processing: false })
.eq("id", uuid); .eq("id", uuid);
if (updateError) { if (updateError) {
console.error("Failed to update document record:", updateError.message);
throw new Error( throw new Error(
"Failed to update document record: " + updateError.message "Failed to update document record: " + updateError.message
); );
} }
console.log("Document processed successfully");
return NextResponse.json({ return NextResponse.json({
message: "Document processed successfully", message: "Document processed successfully",
results, results,

View File

@ -23,6 +23,7 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
playInOrder, playInOrder,
status, status,
pause, pause,
resume,
} = useTTS(); } = useTTS();
const [playing, setPlaying] = useState(false); const [playing, setPlaying] = useState(false);
@ -32,18 +33,17 @@ export default function KokoroReader({ pages }: { pages: any[] }) {
}, [status === "ready"]); }, [status === "ready"]);
const play = () => { const play = () => {
if (playing) { if (!playing && status === "paused") {
setPlaying(false); resume();
return; } else {
playInOrder(currentSentence || 0);
} }
setPlaying(true); setPlaying(true);
playInOrder(currentSentence || 0);
}; };
const paused = () => { const paused = () => {
setPlaying(false); setPlaying(false);
pause(); pause(); // Call the pause function from TTSProvider
}; };
return ( return (

View File

@ -90,7 +90,7 @@ export default function MarkdownRenderer({
<h3 className="text-lg font-medium mb-2 text-gray-300" {...props} /> <h3 className="text-lg font-medium mb-2 text-gray-300" {...props} />
), ),
h4: ({ node, ...props }) => ( h4: ({ node, ...props }) => (
<h4 className="text-lg font-medium mb-2 text-gray-300" {...props} /> <h4 className="text-lg font-bold mb-2 text-gray-300" {...props} />
), ),
p: ({ node, ...props }) => ( p: ({ node, ...props }) => (
<p className="leading-7 text-gray-200" {...props} /> <p className="leading-7 text-gray-200" {...props} />
@ -147,7 +147,7 @@ export default function MarkdownRenderer({
{...props} {...props}
/> />
</PopoverTrigger> </PopoverTrigger>
<PopoverContent className="w-56 overflow-hidden rounded-lg p-0"> <PopoverContent className="w-auto max-w-3xl bg-gray-900 overflow-hidden rounded-lg p-0">
<div className="p-4"> <div className="p-4">
<p>{citation.text}</p> <p>{citation.text}</p>
</div> </div>

View File

@ -29,7 +29,7 @@ interface TTSContextType {
currentSentence: number; currentSentence: number;
voices: any[]; voices: any[];
selectedSpeaker: string; selectedSpeaker: string;
status: "ready" | "running" | null; status: "ready" | "running" | "paused" | null;
setSelectedSpeaker: (speaker: string) => void; setSelectedSpeaker: (speaker: string) => void;
setCurrentSentence: (index: number) => void; setCurrentSentence: (index: number) => void;
playSentence: (index: number) => void; playSentence: (index: number) => void;
@ -60,11 +60,16 @@ export const TTSProvider = ({
const [selectedSpeaker, setSelectedSpeaker] = useState("af_heart"); const [selectedSpeaker, setSelectedSpeaker] = useState("af_heart");
const [voices, setVoices] = useState<any[]>([]); const [voices, setVoices] = useState<any[]>([]);
const [status, setStatus] = useState<"ready" | "running" | null>("ready"); const [status, setStatus] = useState<"ready" | "running" | "paused" | null>(
"ready"
);
// Cache for preloaded audio // Cache for preloaded audio
const audioCache = useRef<Map<number, string>>(new Map()); const audioCache = useRef<Map<number, string>>(new Map());
// Currently processing TTS
const [processing, setProcessing] = useState<number[]>([]);
// Preload audio for a range of sentences // Preload audio for a range of sentences
const preloadAudio = async (startIndex: number, range: number = 3) => { const preloadAudio = async (startIndex: number, range: number = 3) => {
for ( for (
@ -72,9 +77,10 @@ export const TTSProvider = ({
i < Math.min(sentences.length, startIndex + range); i < Math.min(sentences.length, startIndex + range);
i++ i++
) { ) {
if (!audioCache.current.has(i)) { if (!audioCache.current.has(i) && !processing.includes(i)) {
console.log(`Preloading audio for sentence ${i}: ${sentences[i]}`); console.log(`Preloading audio for sentence ${i}: ${sentences[i]}`);
try { try {
setProcessing((prev) => [...prev, i]); // Add to processing
const audioUrl = await generateTTS(sentences[i], i); const audioUrl = await generateTTS(sentences[i], i);
audioCache.current.set(i, audioUrl); // Cache the audio URL audioCache.current.set(i, audioUrl); // Cache the audio URL
} catch (error) { } catch (error) {
@ -140,7 +146,15 @@ export const TTSProvider = ({
if (index < 0 || index >= sentences.length) return; if (index < 0 || index >= sentences.length) return;
setCurrentSentence(index); setCurrentSentence(index);
// Introduce a flag to track whether playback should continue
let shouldContinue = true;
for (let i = index; i < sentences.length; i++) { for (let i = index; i < sentences.length; i++) {
if (!shouldContinue) {
console.log("Playback stopped or paused.");
break;
}
console.log("Playing sentence:", i, sentences[i]); console.log("Playing sentence:", i, sentences[i]);
try { try {
await playSentence(i); await playSentence(i);
@ -150,17 +164,22 @@ export const TTSProvider = ({
break; // Stop playback on error break; // Stop playback on error
} }
} }
// Reset the playback state when done
setStatus("ready");
}; };
const pause = () => { const pause = () => {
if (audioRef.current) { if (audioRef.current) {
audioRef.current.pause(); audioRef.current.pause();
setStatus("paused"); // Update the status to paused
} }
}; };
const resume = () => { const resume = () => {
if (audioRef.current) { if (audioRef.current && status === "paused") {
audioRef.current.play(); audioRef.current.play();
setStatus("running"); // Update the status to running
} }
}; };
@ -168,7 +187,10 @@ export const TTSProvider = ({
if (audioRef.current) { if (audioRef.current) {
audioRef.current.pause(); audioRef.current.pause();
audioRef.current.currentTime = 0; audioRef.current.currentTime = 0;
setStatus("ready"); // Update the status to ready
} }
// Reset the playback flag
shouldContinue = false;
}; };
// Preload sentences when the current sentence changes // Preload sentences when the current sentence changes

View File

@ -17,7 +17,7 @@ import {
Upload, Upload,
} from "lucide-react"; } from "lucide-react";
import { NavDocuments } from "@/components/nav-favorites"; import { NavDocuments } from "@/components/nav-documents";
import { NavMain } from "@/components/nav-main"; import { NavMain } from "@/components/nav-main";
import { NavSecondary } from "@/components/nav-secondary"; import { NavSecondary } from "@/components/nav-secondary";
import { import {

View File

@ -29,9 +29,11 @@ import {
} from "@/components/ui/sidebar"; } from "@/components/ui/sidebar";
import { createClient } from "@/utils/supabase/client"; import { createClient } from "@/utils/supabase/client";
import { toast } from "sonner"; import { toast } from "sonner";
import { SSE } from "sse.js";
import { useEffect, useState } from "react";
export function NavDocuments({ export function NavDocuments({
documents, documents: ogDocuments,
}: { }: {
documents: { documents: {
id: string; id: string;
@ -43,6 +45,53 @@ export function NavDocuments({
}) { }) {
const { isMobile } = useSidebar(); const { isMobile } = useSidebar();
const supabase = createClient(); const supabase = createClient();
const [documents, setDocuments] = useState(ogDocuments);
useEffect(() => {
// watch for changes in the documents table, update the state when it changes
const handleRecordInserted = (payload: any) => {
const newDocument = payload.new;
setDocuments((prev) => [...prev, newDocument]);
};
const handleRecordUpdated = (payload: any) => {
const updatedDocument = payload.new;
setDocuments((prev) =>
prev.map((doc) =>
doc.id === updatedDocument.id ? updatedDocument : doc
)
);
};
const handleRecordDeleted = (payload: any) => {
const deletedDocument = payload.old;
setDocuments((prev) =>
prev.filter((doc) => doc.id !== deletedDocument.id)
);
};
const subscription = supabase
.channel("documents")
.on(
"postgres_changes",
{ event: "INSERT", schema: "public", table: "documents" },
handleRecordInserted
)
.on(
"postgres_changes",
{ event: "UPDATE", schema: "public", table: "documents" },
handleRecordUpdated
)
.on(
"postgres_changes",
{ event: "DELETE", schema: "public", table: "documents" },
handleRecordDeleted
)
.subscribe();
return () => {
subscription.unsubscribe();
};
}, [ogDocuments, supabase]);
return ( return (
<SidebarGroup className="group-data-[collapsible=icon]:hidden"> <SidebarGroup className="group-data-[collapsible=icon]:hidden">
@ -92,19 +141,33 @@ export function NavDocuments({
session.data.session.refresh_token session.data.session.refresh_token
); );
toast.promise( const eventSource = new SSE(`/api/process-document`, {
supabase.functions.invoke("process-document", { payload: data,
body: data, headers: {
}), apikey: process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY!,
{ Authorization: `Bearer ${process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY}`,
loading: "Reprocessing document...", },
success: "Document reprocessed successfully", method: "POST",
error: (err) => { });
console.error("Error reprocessing document:", err);
return "Failed to reprocess document"; toast.loading("Reprocessing document...");
},
eventSource.onmessage = (event) => {
const message = JSON.parse(event.data);
if (message.status === "success") {
toast.success("Document reprocessed successfully");
eventSource.close();
} else if (message.status === "error") {
toast.error("Failed to reprocess document");
eventSource.close();
} }
); };
eventSource.onerror = (err) => {
console.error("SSE error:", err);
toast.error("An error occurred while reprocessing");
eventSource.close();
};
}} }}
> >
<RefreshCw className="text-muted-foreground" /> <RefreshCw className="text-muted-foreground" />
@ -119,12 +182,6 @@ export function NavDocuments({
</DropdownMenu> </DropdownMenu>
</SidebarMenuItem> </SidebarMenuItem>
))} ))}
<SidebarMenuItem>
<SidebarMenuButton className="text-sidebar-foreground/70">
<MoreHorizontal />
<span>More</span>
</SidebarMenuButton>
</SidebarMenuItem>
</SidebarMenu> </SidebarMenu>
</SidebarGroup> </SidebarGroup>
); );

View File

@ -14,7 +14,7 @@ const client = new Mistral({
const PROCESSING_PROMPT = ` const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format. You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** If there is a title to the document, it should be the first heading.
Any images should be included. Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines. Do not return the Markdown as a code block, only as a raw string, without any new lines.