From 16b552262e8329447d11c680be26684408c97bbd Mon Sep 17 00:00:00 2001
From: Jack Merrill
Date: Sun, 4 May 2025 13:57:08 -0400
Subject: [PATCH] better document (re)processing

---
 app/dashboard/documents/[id]/page.tsx        |   2 +-
 app/dashboard/upload/page.tsx                |   1 +
 app/dashboard/upload/process/route.ts        | 170 ---------------
 components/MarkdownRenderer.tsx              |  97 +++++++--
 components/UploadZone.tsx                    |   6 -
 components/app-sidebar.tsx                   |   8 +-
 components/nav-favorites.tsx                 |  56 +++--
 supabase/functions/process-document/index.ts | 217 +++++++++++++++----
 8 files changed, 304 insertions(+), 253 deletions(-)
 delete mode 100644 app/dashboard/upload/process/route.ts

diff --git a/app/dashboard/documents/[id]/page.tsx b/app/dashboard/documents/[id]/page.tsx
index 10136c6..abd4ddd 100644
--- a/app/dashboard/documents/[id]/page.tsx
+++ b/app/dashboard/documents/[id]/page.tsx
@@ -120,7 +120,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
             dangerouslySetInnerHTML={{ __html: String(processedContent) }}
           > */}
 
-
+
 
diff --git a/app/dashboard/upload/page.tsx b/app/dashboard/upload/page.tsx
index 38cfa4a..909eedc 100644
--- a/app/dashboard/upload/page.tsx
+++ b/app/dashboard/upload/page.tsx
@@ -44,6 +44,7 @@ export default async function Page() {
 {
           return {
+            id: d.id,
             disabled: d.is_processing,
             name: d.file_name,
             url: `/dashboard/documents/${d.id}`,
diff --git a/app/dashboard/upload/process/route.ts b/app/dashboard/upload/process/route.ts
deleted file mode 100644
index 634cbfe..0000000
--- a/app/dashboard/upload/process/route.ts
+++ /dev/null
@@ -1,170 +0,0 @@
-import { createClient } from "@/utils/supabase/server";
-import { NextResponse } from "next/server";
-import { Mistral } from "@mistralai/mistralai";
-import { redirect } from "next/navigation";
-import { ChatCompletionChoice } from "@mistralai/mistralai/models/components";
-import pLimit from "p-limit";
-
-const apiKey = process.env.MISTRAL_API_KEY;
-const client = new Mistral({ apiKey: apiKey });
-
-const PROCESSING_PROMPT = `
-You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
-
-The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
-Any images should be included.
-Do not return the Markdown as a code block, only as a raw string, without any new lines.
-
-The Markdown should be human-readable and well-formatted.
-`;
-
-export async function POST(request: Request) {
-  const supabase = await createClient();
-  const formData = await request.formData();
-  const file = formData.get("file") as File;
-  const fileName = formData.get("fileName") as string;
-  const id = formData.get("id") as string;
-
-  const uploaded_pdf = await client.files.upload({
-    file: {
-      fileName,
-      content: file,
-    },
-    purpose: "ocr",
-  });
-
-  const signedUrl = await client.files.getSignedUrl({
-    fileId: uploaded_pdf.id,
-  });
-
-  const ocrResponse = await client.ocr.process({
-    model: "mistral-ocr-latest",
-    document: {
-      type: "document_url",
-      documentUrl: signedUrl.url,
-    },
-  });
-
-  const limit = pLimit(1); // Limit to 1 concurrent request (adjust as needed)
-
-  const promises: Promise[] = [];
-
-  for (const page of ocrResponse.pages) {
-    const pagePromise = limit(async () => {
-      const response = await client.chat.complete({
-        model: "mistral-small-latest",
-        messages: [
-          {
-            role: "user",
-            content: [
-              {
-                type: "text",
-                text: PROCESSING_PROMPT,
-              },
-            ],
-          },
-        ],
-      });
-
-      if (!response.choices) {
-        console.error("No choices in response");
-        return;
-      }
-
-      const imageData: { [key: string]: string } = {};
-
-      if (page.images.length > 0) {
-        for (const img of page.images) {
-          imageData[img.id] = img.imageBase64!;
-        }
-      }
-
-      if (response.choices[0].message.content) {
-        const markdown = replaceImagesInMarkdown(
-          response.choices[0].message.content.toString(),
-          imageData
-        );
-
-        return {
-          ...page,
-          markdown,
-        };
-      } else {
-        console.error("Message content is undefined");
-      }
-    });
-
-    promises.push(pagePromise);
-  }
-
-  const results = await Promise.all(promises);
-  const sortedResults = results.sort((a, b) => a.index - b.index);
-
-  const { data, error } = await supabase
-    .from("documents")
-    .update({
-      ocr_data: sortedResults,
-    })
-    .eq("id", id);
-  if (error) {
-    console.error(error);
-    return NextResponse.json({ error: error.message }, { status: 500 });
-  }
-  return NextResponse.json({
-    id,
-  });
-}
-
-interface OCRResponse {
-  pages: {
-    markdown: string;
-    images: { id: string; image_base64: string }[];
-  }[];
-}
-
-function replaceImagesInMarkdown(
-  markdownStr: string,
-  imagesDict: { [key: string]: string }
-): string {
-  /**
-   * Replace image placeholders in markdown with base64-encoded images.
-   *
-   * Args:
-   *   markdownStr: Markdown text containing image placeholders
-   *   imagesDict: Dictionary mapping image IDs to base64 strings
-   *
-   * Returns:
-   *   Markdown text with images replaced by base64 data
-   */
-  for (const [imgName, base64Str] of Object.entries(imagesDict)) {
-    markdownStr = markdownStr.replace(
-      new RegExp(`!\\[${imgName}\\]\\(${imgName}\\)`, "g"),
-      `![${imgName}](${base64Str})`
-    );
-  }
-  return markdownStr;
-}
-
-function getCombinedMarkdown(ocrResponse: OCRResponse): string {
-  /**
-   * Combine OCR text and images into a single markdown document.
-   *
-   * Args:
-   *   ocrResponse: Response from OCR processing containing text and images
-   *
-   * Returns:
-   *   Combined markdown string with embedded images
-   */
-  const markdowns: string[] = [];
-  // Extract images from page
-  for (const page of ocrResponse.pages) {
-    const imageData: { [key: string]: string } = {};
-    for (const img of page.images) {
-      imageData[img.id] = img.image_base64;
-    }
-    // Replace image placeholders with actual images
-    markdowns.push(replaceImagesInMarkdown(page.markdown, imageData));
-  }
-
-  return markdowns.join("\n\n");
-}
diff --git a/components/MarkdownRenderer.tsx b/components/MarkdownRenderer.tsx
index 8b69114..e0eaefa 100644
--- a/components/MarkdownRenderer.tsx
+++ b/components/MarkdownRenderer.tsx
@@ -9,16 +9,29 @@ import ReactMarkdown, { Components } from "react-markdown";
 import rehypeRaw from "rehype-raw";
 import { useTTS } from "./TTSProvider";
 import rehypeHighlight from "@/lib/utils";
+import { Database } from "@/utils/supabase/types";
 
 // Utility to escape regex special characters:
 function escapeRegExp(text: string) {
   return text.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
 }
 
+export type OCRData = {
+  index: number;
+  images: string[];
+  markdown: string;
+  citations: Record<string, string>;
+  dimensions: {
+    dpi: number;
+    width: number;
+    height: number;
+  };
+};
+
 export default function MarkdownRenderer({
-  rawContent,
+  document,
 }: {
-  rawContent: string;
+  document: Database["public"]["Tables"]["documents"]["Row"];
 }) {
   // Obtain TTS info from context.
   // TTSProvider is already wrapping this component higher in the tree.
@@ -36,6 +49,34 @@ export default function MarkdownRenderer({
     [textToHighlight]
   );
 
+  const ocr = document?.ocr_data as OCRData[];
+
+  const rawContent = ocr.map((page) => page.markdown).join("\n") || "";
+
+  const citations: {
+    text: string;
+    page: number;
+    index: string;
+    number: number;
+  }[] = [];
+  const totalPages = ocr.length;
+  const totalSentences = sentences.length;
+
+  let totalCitations = 0;
+  ocr.forEach((page) => {
+    Object.entries(page.citations).forEach(([key, value]) => {
+      if (value) {
+        totalCitations++;
+        citations.push({
+          text: value,
+          page: page.index,
+          index: key,
+          number: Number(totalCitations),
+        });
+      }
+    });
+  });
+
   const components: Components = {
     h1: ({ node, ...props }) => (
@@ -77,23 +118,41 @@ export default function MarkdownRenderer({
         {...props}
       />
     ),
-    sup: ({ node, ...props }) => (
-      // TODO: get the references from the document and display them in a popover
-
-
-
-
-
-            {/* Replace with actual reference content */}
-
-            Reference content goes here.
-
-
-
-
-    ),
+    sup: ({ node, ...props }) => {
+      // Check if the text contains a reference number
+      const text = props.children!.toString();
+
+      const referenceNumber = text;
+      if (!referenceNumber) {
+        return ;
+      }
+
+      const citation = citations.find(
+        (c) => c.index === referenceNumber && c.page === page.index
+      );
+
+      if (!citation) {
+        return ;
+      }
+
+      return (
+        // TODO: get the references from the document and display them in a popover
+
+
+
+
+
+            {/* Replace with actual reference content */}
+
+            {citation.text}
+
+
+
+
+      );
+    },
   };
 
   return (
diff --git a/components/UploadZone.tsx b/components/UploadZone.tsx
index a373914..58f6e47 100644
--- a/components/UploadZone.tsx
+++ b/components/UploadZone.tsx
@@ -64,12 +64,6 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
       eventSource.close();
     });
 
-      // // Invoke the serverless function
-      // supabase.functions.invoke("process-document", {
-      //   body,
-      //   method: "POST",
-      // });
-
     toast.info(
       "Document is being processed in the background. You will be notified when it's ready."
     );
diff --git a/components/app-sidebar.tsx b/components/app-sidebar.tsx
index 66c5123..f4d344a 100644
--- a/components/app-sidebar.tsx
+++ b/components/app-sidebar.tsx
@@ -33,7 +33,13 @@ export function AppSidebar({
   documents,
   ...props
 }: React.ComponentProps & {
-  documents?: Array<{ name: string; url: string; emoji?: string }>;
+  documents: Array<{
+    id: string;
+    disabled?: boolean;
+    name: string;
+    url: string;
+    emoji?: string;
+  }>;
 }) {
   const data = {
     navMain: [
diff --git a/components/nav-favorites.tsx b/components/nav-favorites.tsx
index ddaa4c6..85c2b3c 100644
--- a/components/nav-favorites.tsx
+++ b/components/nav-favorites.tsx
@@ -6,6 +6,7 @@ import {
   Link,
   LoaderCircle,
   MoreHorizontal,
+  RefreshCw,
   StarOff,
   Trash2,
 } from "lucide-react";
@@ -26,11 +27,14 @@ import {
   SidebarMenuItem,
   useSidebar,
 } from "@/components/ui/sidebar";
+import { createClient } from "@/utils/supabase/client";
+import { toast } from "sonner";
 
 export function NavDocuments({
   documents,
 }: {
   documents: {
+    id: string;
     disabled?: boolean;
     name: string;
     url: string;
   }[];
 }) {
   const { isMobile } = useSidebar();
+  const supabase = createClient();
 
   return (
 
       Documents
 
       {documents.map((item) => (
-
+
 
           {item.disabled ? (
@@ -67,18 +72,43 @@ export function NavDocuments({
               side={isMobile ? "bottom" : "right"}
               align={isMobile ? "end" : "start"}
             >
-
-                Remove from Favorites
-
-
-
-                Copy Link
-
-
-
-                Open in New Tab
+ {
+                  const data = new FormData();
+
+                  const session = await supabase.auth.getSession();
+                  if (!session.data.session) {
+                    toast.error("You are not logged in");
+                    return;
+                  }
+
+                  data.append("id", item.id);
+                  data.append(
+                    "access_token",
+                    session.data.session.access_token
+                  );
+                  data.append(
+                    "refresh_token",
+                    session.data.session.refresh_token
+                  );
+
+                  toast.promise(
+                    supabase.functions.invoke("process-document", {
+                      body: data,
+                    }),
+                    {
+                      loading: "Reprocessing document...",
+                      success: "Document reprocessed successfully",
+                      error: (err) => {
+                        console.error("Error reprocessing document:", err);
+                        return "Failed to reprocess document";
+                      },
+                    }
+                  );
+                }}
+              >
+
+                Reprocess Document
diff --git a/supabase/functions/process-document/index.ts b/supabase/functions/process-document/index.ts
index d1388fe..af45377 100644
--- a/supabase/functions/process-document/index.ts
+++ b/supabase/functions/process-document/index.ts
@@ -32,11 +32,16 @@ Return the final result as a text object with the following structure (without c
 
 ---------
 
 {
-  "citations": {
-    "1": "Citation text for reference 1",
-    "2": "Citation text for reference 2",
-    // ... more citations
-  }
+  "citations": [
+    {
+      "number": 1, // The number as it appears in the text
+      "text": "Citation text 1"
+    },
+    {
+      "number": 2,
+      "text": "Citation text 2"
+    }
+  ]
 }
 """
 
@@ -97,12 +102,16 @@ Deno.serve(async (req) => {
     Deno.env.get("SUPABASE_ANON_KEY")
   );
 
+  const supabaseServer = createClient(
+    Deno.env.get("SUPABASE_URL"),
+    Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")
+  );
+
   const formData = await req.formData();
-  const file = formData.get("file");
   const accessToken = formData.get("access_token");
   const refreshToken = formData.get("refresh_token");
-  const fileName = file.name;
-  const uuid = crypto.randomUUID();
+  var reprocessing = false;
+  var uuid = crypto.randomUUID();
 
   const {
     data: { user },
@@ -121,6 +130,101 @@ Deno.serve(async (req) => {
     throw new Error("Setting session failed");
   }
 
+  if (formData.has("id")) {
+    console.log("Reprocessing document...");
+    reprocessing = true;
+    console.log("File ID found in form data.");
+    sendEvent("status", {
+      message: "File ID found in form data.",
+    });
+    const docId = formData.get("id");
+
+    const { data: documentData, error: documentError } = await supabase
+      .from("documents")
+      .select("*")
+      .eq("id", docId)
+      .single();
+
+    if (documentError) {
+      console.error("Error fetching document record:", documentError);
+      sendEvent("error", {
+        message: "Error fetching document record",
+        error: documentError,
+      });
+      throw new Error("Document record fetch failed");
+    }
+
+    if (documentData) {
+      await supabase
+        .from("documents")
+        .update({
+          is_processing: true,
+        })
+        .eq("id", documentData.id);
+      uuid = documentData.id;
+    } else {
+      console.error("Document record not found.");
+      sendEvent("error", {
+        message: "Document record not found",
+      });
+      throw new Error("Document record not found");
+    }
+
+    const { data: storageData, error: storageError } = await supabaseServer
+      .from("storage.objects")
+      .select("name")
+      .eq("id", documentData.raw_file)
+      .single();
+
+    if (storageError) {
+      console.error("Error fetching file name:", storageError);
+      sendEvent("error", {
+        message: "Error fetching file name",
+        error: storageError,
+      });
+      throw new Error("Storage data fetch failed");
+    }
+
+    const { data: fileData, error: fileError } = await supabase.storage
+      .from("documents")
+      .download(storageData.name);
+
+    if (fileError) {
+      console.error("Error downloading file from storage:", fileError);
+      sendEvent("error", {
+        message: "Error downloading file from storage",
+        error: fileError,
+      });
+      throw new Error("File download failed");
+    }
+
+    console.log("File downloaded from storage:", fileData);
+    sendEvent("status", {
+      message: "File downloaded from storage",
+      fileData,
+    });
+
+    formData.set("file", fileData);
+  }
+
+  if (!formData.has("file")) {
+    console.error("File not found in form data.");
+    sendEvent("error", {
+      message: "File not found in form data",
+    });
+    throw new Error("File not found");
+  }
+  if (!formData.has("access_token") || !formData.has("refresh_token")) {
+    console.error("Access token or refresh token not found in form data.");
+    sendEvent("error", {
+      message: "Access token or refresh token not found in form data",
+    });
+    throw new Error("Tokens not found");
+  }
+
+  const file = formData.get("file") as File;
+  const fileName = file.name;
+
   console.log("Generated UUID:", uuid);
   sendEvent("status", {
     message: "Generated UUID",
@@ -133,46 +237,73 @@ Deno.serve(async (req) => {
     user,
   });
 
-  const { data: storageData, error: storageError } = await supabase.storage
-    .from("documents")
-    .upload(`${user.id}/${uuid}.pdf`, file);
+  if (!reprocessing) {
+    const { data: storageData, error: storageError } =
+      await supabase.storage
+        .from("documents")
+        .upload(`${user.id}/${uuid}.pdf`, file);
 
-  if (storageError) {
-    console.error("Error uploading file to storage:", storageError);
-    sendEvent("error", {
-      message: "Error uploading file to storage",
-      error: storageError,
+    if (storageError) {
+      console.error("Error uploading file to storage:", storageError);
+      sendEvent("error", {
+        message: "Error uploading file to storage",
+        error: storageError,
+      });
+      throw new Error("File upload failed");
+    }
+
+    console.log("File uploaded to storage:", storageData);
+    sendEvent("status", {
+      message: "File uploaded to storage",
+      storageData,
     });
-    throw new Error("File upload failed");
-  }
 
-  console.log("File uploaded to storage:", storageData);
-  sendEvent("status", {
-    message: "File uploaded to storage",
-    storageData,
-  });
-
-  const { error: docError } = await supabase.from("documents").insert({
-    id: uuid,
-    file_name: file.name,
-    owner: user.id,
-    raw_file: storageData.id,
-    is_processing: true,
-  });
-
-  if (docError) {
-    console.error("Error inserting document record:", docError);
-    sendEvent("error", {
-      message: "Error inserting document record",
-      error: docError,
+    const { error: docError } = await supabase.from("documents").insert({
+      id: uuid,
+      file_name: file.name,
+      owner: user.id,
+      raw_file: storageData.id,
+      is_processing: true,
     });
-    throw new Error("Document record insertion failed");
-  }
 
-  console.log("Document record inserted successfully.");
-  sendEvent("status", {
-    message: "Document record inserted successfully",
-  });
+    if (docError) {
+      console.error("Error inserting document record:", docError);
+      sendEvent("error", {
+        message: "Error inserting document record",
+        error: docError,
+      });
+      throw new Error("Document record insertion failed");
+    }
+
+    console.log("Document record inserted successfully.");
+    sendEvent("status", {
+      message: "Document record inserted successfully",
+    });
+  } else {
+    console.log("Reprocessing document...");
+    sendEvent("status", {
+      message: "Reprocessing document",
+    });
+
+    const { error: docError } = await supabase
+      .from("documents")
+      .update({
+        is_processing: true,
+      })
+      .eq("id", uuid);
+    if (docError) {
+      console.error("Error updating document record:", docError);
+      sendEvent("error", {
+        message: "Error updating document record",
+        error: docError,
+      });
+      throw new Error("Document record update failed");
+    }
+    console.log("Document record updated successfully.");
+    sendEvent("status", {
+      message: "Document record updated successfully",
+    });
+  }
 
   console.log("Uploading file to Mistral...");
   sendEvent("status", {