better document (re)processing

This commit is contained in:
Jack Merrill 2025-05-04 13:57:08 -04:00
parent 3eda271635
commit 16b552262e
Signed by: jack
GPG Key ID: F6BFCA1B80EA6AF7
8 changed files with 304 additions and 253 deletions

View File

@ -120,7 +120,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
dangerouslySetInnerHTML={{ __html: String(processedContent) }} dangerouslySetInnerHTML={{ __html: String(processedContent) }}
></div> */} ></div> */}
<div className="mx-auto px-12 py-20 gap-2"> <div className="mx-auto px-12 py-20 gap-2">
<MarkdownRenderer rawContent={rawContent} /> <MarkdownRenderer document={document} />
</div> </div>
</SidebarInset> </SidebarInset>
</TTSProvider> </TTSProvider>

View File

@ -44,6 +44,7 @@ export default async function Page() {
<AppSidebar <AppSidebar
documents={documents.map((d) => { documents={documents.map((d) => {
return { return {
id: d.id,
disabled: d.is_processing, disabled: d.is_processing,
name: d.file_name, name: d.file_name,
url: `/dashboard/documents/${d.id}`, url: `/dashboard/documents/${d.id}`,

View File

@ -1,170 +0,0 @@
import { createClient } from "@/utils/supabase/server";
import { NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import { redirect } from "next/navigation";
import { ChatCompletionChoice } from "@mistralai/mistralai/models/components";
import pLimit from "p-limit";
// Mistral API key, read from the environment when this module is loaded.
const apiKey = process.env.MISTRAL_API_KEY;
// Mistral client used by the POST handler for file upload, OCR and chat calls.
const client = new Mistral({ apiKey: apiKey });
// Instruction text passed to the chat model for every OCR page in POST.
// NOTE(review): the prompt asks for output "without any new lines" while also
// asking for well-formatted Markdown with headings — confirm this is intended.
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.
The Markdown should be human-readable and well-formatted.
`;
/**
 * Route handler that OCRs an uploaded PDF with Mistral, asks a chat model to
 * clean up each page's Markdown, and stores the resulting pages in the
 * `ocr_data` column of the matching `documents` row.
 *
 * Expected multipart form fields:
 * - `file`: the PDF to process
 * - `fileName`: file name passed to Mistral on upload
 * - `id`: id of the `documents` row to update
 *
 * @param request - Incoming multipart/form-data request.
 * @returns JSON `{ id }` on success; `{ error }` with status 400 when a form
 *   field is missing or has the wrong kind; `{ error }` with status 500 when
 *   the database update fails.
 */
export async function POST(request: Request) {
  const supabase = await createClient();
  const formData = await request.formData();

  const file = formData.get("file");
  const fileName = formData.get("fileName");
  const id = formData.get("id");

  // FormData values are `File | string | null`; validate instead of casting so
  // a malformed request fails fast rather than deep inside the Mistral SDK.
  if (
    file === null ||
    typeof file === "string" ||
    typeof fileName !== "string" ||
    typeof id !== "string"
  ) {
    return NextResponse.json(
      { error: "Expected form fields: file, fileName, id" },
      { status: 400 }
    );
  }

  const uploadedPdf = await client.files.upload({
    file: {
      fileName,
      content: file,
    },
    purpose: "ocr",
  });

  const signedUrl = await client.files.getSignedUrl({
    fileId: uploadedPdf.id,
  });

  const ocrResponse = await client.ocr.process({
    model: "mistral-ocr-latest",
    document: {
      type: "document_url",
      documentUrl: signedUrl.url,
    },
    // The image placeholders are replaced with base64 data below, so the
    // OCR response has to carry that data.
    includeImageBase64: true,
  });

  const limit = pLimit(1); // Limit to 1 concurrent request (adjust as needed)

  const pages = await Promise.all(
    ocrResponse.pages.map((page) =>
      limit(async () => {
        const imageData: { [key: string]: string } = {};
        for (const img of page.images) {
          // Skip images without data instead of embedding "undefined".
          if (img.imageBase64) {
            imageData[img.id] = img.imageBase64;
          }
        }

        let cleaned = page.markdown;

        // Nothing to clean on a blank page; avoid a pointless model call.
        if (page.markdown.trim() !== "") {
          const response = await client.chat.complete({
            model: "mistral-small-latest",
            messages: [
              { role: "system", content: PROCESSING_PROMPT },
              // The page text itself must be part of the request; the
              // instructions alone give the model nothing to process.
              { role: "user", content: page.markdown },
            ],
          });

          const content = response.choices?.[0]?.message.content;
          if (typeof content === "string" && content !== "") {
            cleaned = content;
          } else {
            // Keep the raw OCR Markdown so the page is not lost and later
            // steps never see an undefined entry.
            console.error(
              `No cleaned Markdown returned for page ${page.index}; keeping raw OCR Markdown`
            );
          }
        }

        return {
          ...page,
          markdown: replaceImagesInMarkdown(cleaned, imageData),
        };
      })
    )
  );

  // `pages` is a fresh local array, so sorting in place is safe.
  pages.sort((a, b) => a.index - b.index);

  const { error } = await supabase
    .from("documents")
    .update({
      ocr_data: pages,
    })
    .eq("id", id);

  if (error) {
    console.error(error);
    return NextResponse.json({ error: error.message }, { status: 500 });
  }

  return NextResponse.json({
    id,
  });
}
/**
 * Minimal OCR result shape read by `getCombinedMarkdown`: a list of pages,
 * each with its Markdown text and the images referenced from that text.
 */
interface OCRResponse {
  pages: Array<{
    /** Markdown text of the page, possibly containing image placeholders. */
    markdown: string;
    /** Images of the page: placeholder id plus base64-encoded data. */
    images: Array<{ id: string; image_base64: string }>;
  }>;
}
/**
 * Replace image placeholders in Markdown with base64-encoded image data.
 *
 * A placeholder has the form `![<id>](<id>)`; every occurrence is rewritten
 * to `![<id>](<base64>)`. Both the search and the substitution are literal:
 * ids are not interpreted as regular expressions (so `.` in `img-0.jpeg`
 * only matches a dot) and `$` sequences in the data are not expanded.
 *
 * @param markdownStr - Markdown text containing image placeholders.
 * @param imagesDict - Maps image ids to the base64 string to embed.
 * @returns The Markdown with every known placeholder replaced.
 */
function replaceImagesInMarkdown(
  markdownStr: string,
  imagesDict: { [key: string]: string }
): string {
  let result = markdownStr;
  for (const [imgName, base64Str] of Object.entries(imagesDict)) {
    const placeholder = `![${imgName}](${imgName})`;
    // split/join performs a global, literal replacement without needing
    // regex escaping or `String.prototype.replaceAll`.
    result = result.split(placeholder).join(`![${imgName}](${base64Str})`);
  }
  return result;
}
/**
 * Combine the OCR text and images of all pages into one Markdown document.
 *
 * For each page, the image placeholders in its Markdown are replaced with the
 * page's base64 image data; the pages are then joined with a blank line.
 *
 * @param ocrResponse - OCR result containing per-page Markdown and images.
 * @returns Combined Markdown string with embedded images.
 */
function getCombinedMarkdown(ocrResponse: OCRResponse): string {
  return ocrResponse.pages
    .map(({ markdown, images }) => {
      // Lookup table from image id to base64 data for this page only.
      const base64ById: { [key: string]: string } = {};
      images.forEach(({ id, image_base64 }) => {
        base64ById[id] = image_base64;
      });
      return replaceImagesInMarkdown(markdown, base64ById);
    })
    .join("\n\n");
}

View File

@ -9,16 +9,29 @@ import ReactMarkdown, { Components } from "react-markdown";
import rehypeRaw from "rehype-raw"; import rehypeRaw from "rehype-raw";
import { useTTS } from "./TTSProvider"; import { useTTS } from "./TTSProvider";
import rehypeHighlight from "@/lib/utils"; import rehypeHighlight from "@/lib/utils";
import { Database } from "@/utils/supabase/types";
// Utility to escape regex special characters: // Utility to escape regex special characters:
function escapeRegExp(text: string) { function escapeRegExp(text: string) {
return text.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&"); return text.replace(/[-[\]{}()*+?.,\\^$|#\s]/g, "\\$&");
} }
/**
 * One processed page as stored in a document's `ocr_data` array.
 */
export type OCRData = {
  /** Page index; used as the page number when collecting citations. */
  index: number;
  // NOTE(review): presumably image references for the page — confirm against
  // what the processing function actually stores.
  images: Array<string>;
  /** Markdown text of the page; pages are joined to form the rendered document. */
  markdown: string;
  /** Citation text keyed by the reference index as it appears in the page. */
  citations: { [reference: string]: string };
  // NOTE(review): looks like the scanned page geometry — units not visible here.
  dimensions: { dpi: number; width: number; height: number };
};
export default function MarkdownRenderer({ export default function MarkdownRenderer({
rawContent, document,
}: { }: {
rawContent: string; document: Database["public"]["Tables"]["documents"]["Row"];
}) { }) {
// Obtain TTS info from context. // Obtain TTS info from context.
// TTSProvider is already wrapping this component higher in the tree. // TTSProvider is already wrapping this component higher in the tree.
@ -36,6 +49,34 @@ export default function MarkdownRenderer({
[textToHighlight] [textToHighlight]
); );
const ocr = document?.ocr_data as OCRData[];
const rawContent = ocr.map((page) => page.markdown).join("\n") || "";
const citations: {
text: string;
page: number;
index: string;
number: number;
}[] = [];
const totalPages = ocr.length;
const totalSentences = sentences.length;
let totalCitations = 0;
ocr.forEach((page) => {
Object.entries(page.citations).forEach(([key, value]) => {
if (value) {
totalCitations++;
citations.push({
text: value,
page: page.index,
index: key,
number: Number(totalCitations),
});
}
});
});
const components: Components = { const components: Components = {
h1: ({ node, ...props }) => ( h1: ({ node, ...props }) => (
<h1 className="text-2xl font-semibold mb-4 text-white" {...props} /> <h1 className="text-2xl font-semibold mb-4 text-white" {...props} />
@ -77,7 +118,24 @@ export default function MarkdownRenderer({
{...props} {...props}
/> />
), ),
sup: ({ node, ...props }) => ( sup: ({ node, ...props }) => {
// Check if the text contains a reference number
const text = props.children!.toString();
const referenceNumber = text;
if (!referenceNumber) {
return <sup {...props} data-error="no reference number" />;
}
const citation = citations.find(
(c) => c.index === referenceNumber && c.page === page.index
);
if (!citation) {
return <sup {...props} data-error="no citation found" />;
}
return (
// TODO: get the references from the document and display them in a popover // TODO: get the references from the document and display them in a popover
<Popover> <Popover>
<PopoverTrigger asChild> <PopoverTrigger asChild>
@ -89,11 +147,12 @@ export default function MarkdownRenderer({
<PopoverContent className="w-56 overflow-hidden rounded-lg p-0"> <PopoverContent className="w-56 overflow-hidden rounded-lg p-0">
<div className="p-4"> <div className="p-4">
{/* Replace with actual reference content */} {/* Replace with actual reference content */}
<p>Reference content goes here.</p> <p>{citation.text}</p>
</div> </div>
</PopoverContent> </PopoverContent>
</Popover> </Popover>
), );
},
}; };
return ( return (

View File

@ -64,12 +64,6 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
eventSource.close(); eventSource.close();
}); });
// // Invoke the serverless function
// supabase.functions.invoke("process-document", {
// body,
// method: "POST",
// });
toast.info( toast.info(
"Document is being processed in the background. You will be notified when it's ready." "Document is being processed in the background. You will be notified when it's ready."
); );

View File

@ -33,7 +33,13 @@ export function AppSidebar({
documents, documents,
...props ...props
}: React.ComponentProps<typeof Sidebar> & { }: React.ComponentProps<typeof Sidebar> & {
documents?: Array<{ name: string; url: string; emoji?: string }>; documents: Array<{
id: string;
disabled?: boolean;
name: string;
url: string;
emoji?: string;
}>;
}) { }) {
const data = { const data = {
navMain: [ navMain: [

View File

@ -6,6 +6,7 @@ import {
Link, Link,
LoaderCircle, LoaderCircle,
MoreHorizontal, MoreHorizontal,
RefreshCw,
StarOff, StarOff,
Trash2, Trash2,
} from "lucide-react"; } from "lucide-react";
@ -26,11 +27,14 @@ import {
SidebarMenuItem, SidebarMenuItem,
useSidebar, useSidebar,
} from "@/components/ui/sidebar"; } from "@/components/ui/sidebar";
import { createClient } from "@/utils/supabase/client";
import { toast } from "sonner";
export function NavDocuments({ export function NavDocuments({
documents, documents,
}: { }: {
documents: { documents: {
id: string;
disabled?: boolean; disabled?: boolean;
name: string; name: string;
url: string; url: string;
@ -38,13 +42,14 @@ export function NavDocuments({
}[]; }[];
}) { }) {
const { isMobile } = useSidebar(); const { isMobile } = useSidebar();
const supabase = createClient();
return ( return (
<SidebarGroup className="group-data-[collapsible=icon]:hidden"> <SidebarGroup className="group-data-[collapsible=icon]:hidden">
<SidebarGroupLabel>Documents</SidebarGroupLabel> <SidebarGroupLabel>Documents</SidebarGroupLabel>
<SidebarMenu> <SidebarMenu>
{documents.map((item) => ( {documents.map((item) => (
<SidebarMenuItem key={item.name} aria-disabled={item.disabled}> <SidebarMenuItem key={item.id} aria-disabled={item.disabled}>
<SidebarMenuButton asChild disabled={item.disabled}> <SidebarMenuButton asChild disabled={item.disabled}>
<a href={item.url} title={item.name}> <a href={item.url} title={item.name}>
{item.disabled ? ( {item.disabled ? (
@ -67,18 +72,43 @@ export function NavDocuments({
side={isMobile ? "bottom" : "right"} side={isMobile ? "bottom" : "right"}
align={isMobile ? "end" : "start"} align={isMobile ? "end" : "start"}
> >
<DropdownMenuItem> <DropdownMenuItem
<StarOff className="text-muted-foreground" /> onClick={async () => {
<span>Remove from Favorites</span> const data = new FormData();
</DropdownMenuItem>
<DropdownMenuSeparator /> const session = await supabase.auth.getSession();
<DropdownMenuItem> if (!session.data.session) {
<Link className="text-muted-foreground" /> toast.error("You are not logged in");
<span>Copy Link</span> return;
</DropdownMenuItem> }
<DropdownMenuItem>
<ArrowUpRight className="text-muted-foreground" /> data.append("id", item.id);
<span>Open in New Tab</span> data.append(
"access_token",
session.data.session.access_token
);
data.append(
"refresh_token",
session.data.session.refresh_token
);
toast.promise(
supabase.functions.invoke("process-document", {
body: data,
}),
{
loading: "Reprocessing document...",
success: "Document reprocessed successfully",
error: (err) => {
console.error("Error reprocessing document:", err);
return "Failed to reprocess document";
},
}
);
}}
>
<RefreshCw className="text-muted-foreground" />
<span>Reprocess Document</span>
</DropdownMenuItem> </DropdownMenuItem>
<DropdownMenuSeparator /> <DropdownMenuSeparator />
<DropdownMenuItem> <DropdownMenuItem>

View File

@ -32,11 +32,16 @@ Return the final result as a text object with the following structure (without c
--------- ---------
{ {
"citations": { "citations": [
"1": "Citation text for reference 1", {
"2": "Citation text for reference 2", "number": 1, // The number as it appears in the text
// ... more citations "text": "Citation text 1"
},
{
"number": 2,
"text": "Citation text 2"
} }
]
} }
""" """
@ -97,12 +102,16 @@ Deno.serve(async (req) => {
Deno.env.get("SUPABASE_ANON_KEY") Deno.env.get("SUPABASE_ANON_KEY")
); );
const supabaseServer = createClient(
Deno.env.get("SUPABASE_URL"),
Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")
);
const formData = await req.formData(); const formData = await req.formData();
const file = formData.get("file");
const accessToken = formData.get("access_token"); const accessToken = formData.get("access_token");
const refreshToken = formData.get("refresh_token"); const refreshToken = formData.get("refresh_token");
const fileName = file.name; var reprocessing = false;
const uuid = crypto.randomUUID(); var uuid = crypto.randomUUID();
const { const {
data: { user }, data: { user },
@ -121,6 +130,101 @@ Deno.serve(async (req) => {
throw new Error("Setting session failed"); throw new Error("Setting session failed");
} }
if (formData.has("id")) {
console.log("Reprocessing document...");
reprocessing = true;
console.log("File ID found in form data.");
sendEvent("status", {
message: "File ID found in form data.",
});
const docId = formData.get("id");
const { data: documentData, error: documentError } = await supabase
.from("documents")
.select("*")
.eq("id", docId)
.single();
if (documentError) {
console.error("Error fetching document record:", documentError);
sendEvent("error", {
message: "Error fetching document record",
error: documentError,
});
throw new Error("Document record fetch failed");
}
if (documentData) {
await supabase
.from("documents")
.update({
is_processing: true,
})
.eq("id", documentData.id);
uuid = documentData.id;
} else {
console.error("Document record not found.");
sendEvent("error", {
message: "Document record not found",
});
throw new Error("Document record not found");
}
const { data: storageData, error: storageError } = await supabaseServer
.from("storage.objects")
.select("name")
.eq("id", documentData.raw_file)
.single();
if (storageError) {
console.error("Error fetching file name:", storageError);
sendEvent("error", {
message: "Error fetching file name",
error: storageError,
});
throw new Error("Storage data fetch failed");
}
const { data: fileData, error: fileError } = await supabase.storage
.from("documents")
.download(storageData.name);
if (fileError) {
console.error("Error downloading file from storage:", fileError);
sendEvent("error", {
message: "Error downloading file from storage",
error: fileError,
});
throw new Error("File download failed");
}
console.log("File downloaded from storage:", fileData);
sendEvent("status", {
message: "File downloaded from storage",
fileData,
});
formData.set("file", fileData);
}
if (!formData.has("file")) {
console.error("File not found in form data.");
sendEvent("error", {
message: "File not found in form data",
});
throw new Error("File not found");
}
if (!formData.has("access_token") || !formData.has("refresh_token")) {
console.error("Access token or refresh token not found in form data.");
sendEvent("error", {
message: "Access token or refresh token not found in form data",
});
throw new Error("Tokens not found");
}
const file = formData.get("file") as File;
const fileName = file.name;
console.log("Generated UUID:", uuid); console.log("Generated UUID:", uuid);
sendEvent("status", { sendEvent("status", {
message: "Generated UUID", message: "Generated UUID",
@ -133,7 +237,9 @@ Deno.serve(async (req) => {
user, user,
}); });
const { data: storageData, error: storageError } = await supabase.storage if (!reprocessing) {
const { data: storageData, error: storageError } =
await supabase.storage
.from("documents") .from("documents")
.upload(`${user.id}/${uuid}.pdf`, file); .upload(`${user.id}/${uuid}.pdf`, file);
@ -173,6 +279,31 @@ Deno.serve(async (req) => {
sendEvent("status", { sendEvent("status", {
message: "Document record inserted successfully", message: "Document record inserted successfully",
}); });
} else {
console.log("Reprocessing document...");
sendEvent("status", {
message: "Reprocessing document",
});
const { error: docError } = await supabase
.from("documents")
.update({
is_processing: true,
})
.eq("id", uuid);
if (docError) {
console.error("Error updating document record:", docError);
sendEvent("error", {
message: "Error updating document record",
error: docError,
});
throw new Error("Document record update failed");
}
console.log("Document record updated successfully.");
sendEvent("status", {
message: "Document record updated successfully",
});
}
console.log("Uploading file to Mistral..."); console.log("Uploading file to Mistral...");
sendEvent("status", { sendEvent("status", {