better document (re)processing

This commit is contained in:
Jack Merrill 2025-05-04 13:57:08 -04:00
parent 3eda271635
commit 16b552262e
Signed by: jack
GPG Key ID: F6BFCA1B80EA6AF7
8 changed files with 304 additions and 253 deletions

View File

@ -120,7 +120,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
dangerouslySetInnerHTML={{ __html: String(processedContent) }}
></div> */}
<div className="mx-auto px-12 py-20 gap-2">
<MarkdownRenderer rawContent={rawContent} />
<MarkdownRenderer document={document} />
</div>
</SidebarInset>
</TTSProvider>

View File

@ -44,6 +44,7 @@ export default async function Page() {
<AppSidebar
documents={documents.map((d) => {
return {
id: d.id,
disabled: d.is_processing,
name: d.file_name,
url: `/dashboard/documents/${d.id}`,

View File

@ -1,170 +0,0 @@
import { createClient } from "@/utils/supabase/server";
import { NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import { redirect } from "next/navigation";
import { ChatCompletionChoice } from "@mistralai/mistralai/models/components";
import pLimit from "p-limit";
// API key for the Mistral SDK, read from the environment.
// `process.env` lookups yield `undefined` when MISTRAL_API_KEY is not set.
const apiKey = process.env.MISTRAL_API_KEY;
// Module-level Mistral client; the POST handler below uses it for the file
// upload, signed-URL lookup, OCR, and chat-completion calls.
const client = new Mistral({ apiKey: apiKey });
// Instruction text sent as the user message of the per-page chat completion
// in the POST handler.
// NOTE(review): the prompt asks for headings and "well-formatted" Markdown but
// also for output "without any new lines"; these requirements look
// contradictory — confirm the intended wording.
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.
The Markdown should be human-readable and well-formatted.
`;
/**
 * Run OCR on an uploaded PDF and store the cleaned-up pages on a document row.
 *
 * Expects multipart form data with:
 * - `file`: the PDF to process
 * - `fileName`: the name to give the file when uploading it to Mistral
 * - `id`: the `documents` row whose `ocr_data` column is updated
 *
 * Flow: upload the file to Mistral, OCR it via a signed URL, ask the chat
 * model to tidy each page's Markdown (one request at a time), swap image
 * placeholders for their base64 sources, then persist the pages sorted by
 * `index`.
 *
 * @returns JSON `{ id }` on success, `{ error }` with status 400 when a form
 *   field is missing or has the wrong kind, or `{ error }` with status 500
 *   when the database update fails.
 */
export async function POST(request: Request) {
  const supabase = await createClient();
  const formData = await request.formData();
  const file = formData.get("file");
  const fileName = formData.get("fileName");
  const id = formData.get("id");

  // Form entries are `File | string | null`; reject anything unexpected here
  // instead of failing later inside an SDK call.
  if (
    file === null ||
    typeof file === "string" ||
    typeof fileName !== "string" ||
    typeof id !== "string"
  ) {
    return NextResponse.json(
      { error: "Missing or invalid form fields: file, fileName, id" },
      { status: 400 }
    );
  }

  const uploaded_pdf = await client.files.upload({
    file: {
      fileName,
      content: file,
    },
    purpose: "ocr",
  });
  const signedUrl = await client.files.getSignedUrl({
    fileId: uploaded_pdf.id,
  });
  const ocrResponse = await client.ocr.process({
    model: "mistral-ocr-latest",
    document: {
      type: "document_url",
      documentUrl: signedUrl.url,
    },
  });

  const limit = pLimit(1); // Limit to 1 concurrent request (adjust as needed)
  const results = await Promise.all(
    ocrResponse.pages.map((page) =>
      limit(async () => {
        // Only keep images that actually carry data, so a missing payload
        // leaves the placeholder intact instead of producing "(undefined)".
        const imageData: { [key: string]: string } = {};
        for (const img of page.images) {
          if (img.imageBase64) {
            imageData[img.id] = img.imageBase64;
          }
        }

        const response = await client.chat.complete({
          model: "mistral-small-latest",
          messages: [
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: PROCESSING_PROMPT,
                },
                // The page's OCR output must accompany the instructions;
                // without it the model has nothing to clean up.
                {
                  type: "text",
                  text: page.markdown,
                },
              ],
            },
          ],
        });

        const content = response.choices?.[0]?.message.content;
        if (!response.choices) {
          console.error("No choices in response");
        } else if (!content) {
          console.error("Message content is undefined");
        }

        // Fall back to the raw OCR Markdown when the model returned nothing
        // usable, so every page stays in the result and sorting by `index`
        // never sees an undefined entry.
        const cleaned =
          typeof content === "string" && content.length > 0
            ? content
            : page.markdown;

        return {
          ...page,
          markdown: replaceImagesInMarkdown(cleaned, imageData),
        };
      })
    )
  );

  const sortedResults = [...results].sort((a, b) => a.index - b.index);

  const { error } = await supabase
    .from("documents")
    .update({
      ocr_data: sortedResults,
    })
    .eq("id", id);
  if (error) {
    console.error(error);
    return NextResponse.json({ error: error.message }, { status: 500 });
  }
  return NextResponse.json({
    id,
  });
}
/**
 * Shape of an OCR result as consumed by `getCombinedMarkdown`: a list of
 * pages, each with its Markdown text and the images referenced from it.
 */
interface OCRResponse {
  pages: Array<{
    /** Markdown text of the page, containing image placeholders. */
    markdown: string;
    /** Images on the page: placeholder ID plus the base64 string to embed. */
    images: Array<{ id: string; image_base64: string }>;
  }>;
}
/**
 * Replace image placeholders in Markdown with base64-encoded images.
 *
 * A placeholder has the form `![<id>](<id>)`. Every occurrence for each ID in
 * `imagesDict` is rewritten to `![<id>](<base64>)`.
 *
 * IDs and base64 strings are treated as literal text: an ID such as
 * `img-0.jpeg` only matches itself (the `.` is not a wildcard), IDs containing
 * regex metacharacters cannot throw, and `$` sequences in the replacement are
 * inserted verbatim.
 *
 * @param markdownStr Markdown text containing image placeholders
 * @param imagesDict Dictionary mapping image IDs to base64 strings
 * @returns Markdown text with images replaced by base64 data
 */
function replaceImagesInMarkdown(
  markdownStr: string,
  imagesDict: { [key: string]: string }
): string {
  let result = markdownStr;
  for (const [imgName, base64Str] of Object.entries(imagesDict)) {
    const placeholder = `![${imgName}](${imgName})`;
    // split/join is a literal, global replace: no regex is built from the ID
    // and no `$` patterns are expanded in the replacement text.
    result = result.split(placeholder).join(`![${imgName}](${base64Str})`);
  }
  return result;
}
/**
 * Combine OCR text and images into a single Markdown document.
 *
 * Each page's image placeholders are replaced with the page's own base64
 * image data, and the resulting pages are joined with a blank line.
 *
 * @param ocrResponse Response from OCR processing containing text and images
 * @returns Combined Markdown string with embedded images
 */
function getCombinedMarkdown(ocrResponse: OCRResponse): string {
  return ocrResponse.pages
    .map((page) => {
      // Build the ID -> base64 lookup for this page only.
      const imagesById = page.images.reduce<{ [key: string]: string }>(
        (lookup, img) => {
          lookup[img.id] = img.image_base64;
          return lookup;
        },
        {}
      );
      return replaceImagesInMarkdown(page.markdown, imagesById);
    })
    .join("\n\n");
}

View File

@ -9,16 +9,29 @@ import ReactMarkdown, { Components } from "react-markdown";
import rehypeRaw from "rehype-raw";
import { useTTS } from "./TTSProvider";
import rehypeHighlight from "@/lib/utils";
import { Database } from "@/utils/supabase/types";
/**
 * Escape a string so it can be embedded in a regular expression and matched
 * literally. Every regex metacharacter — and also `,`, `#`, `-`, and
 * whitespace — is prefixed with a backslash.
 */
function escapeRegExp(text: string) {
  const specialChars = /[-[\]{}()*+?.,\\^$|#\s]/g;
  return text.replace(specialChars, (match) => `\\${match}`);
}
/** Rendering metadata stored with a page (units are not stated here — TODO confirm). */
type OCRPageDimensions = {
  dpi: number;
  width: number;
  height: number;
};

/**
 * One processed page as stored in a document's `ocr_data` array.
 *
 * `MarkdownRenderer` joins the `markdown` of every page to build the rendered
 * content and walks `citations` with `Object.entries`, treating each key as
 * the citation's index and each value as its text.
 */
export type OCRData = {
  /** Page index; recorded as the `page` of each citation collected from it. */
  index: number;
  /** Image entries for the page (exact contents not visible here — TODO confirm). */
  images: string[];
  /** Markdown text of the page. */
  markdown: string;
  /** Citation text keyed by the reference index. */
  citations: Record<string, string>;
  dimensions: OCRPageDimensions;
};
export default function MarkdownRenderer({
rawContent,
document,
}: {
rawContent: string;
document: Database["public"]["Tables"]["documents"]["Row"];
}) {
// Obtain TTS info from context.
// TTSProvider is already wrapping this component higher in the tree.
@ -36,6 +49,34 @@ export default function MarkdownRenderer({
[textToHighlight]
);
const ocr = document?.ocr_data as OCRData[];
const rawContent = ocr.map((page) => page.markdown).join("\n") || "";
const citations: {
text: string;
page: number;
index: string;
number: number;
}[] = [];
const totalPages = ocr.length;
const totalSentences = sentences.length;
let totalCitations = 0;
ocr.forEach((page) => {
Object.entries(page.citations).forEach(([key, value]) => {
if (value) {
totalCitations++;
citations.push({
text: value,
page: page.index,
index: key,
number: Number(totalCitations),
});
}
});
});
const components: Components = {
h1: ({ node, ...props }) => (
<h1 className="text-2xl font-semibold mb-4 text-white" {...props} />
@ -77,23 +118,41 @@ export default function MarkdownRenderer({
{...props}
/>
),
sup: ({ node, ...props }) => (
// TODO: get the references from the document and display them in a popover
<Popover>
<PopoverTrigger asChild>
<sup
className="text-gray-200 cursor-pointer underline hover:cursor-pointer"
{...props}
/>
</PopoverTrigger>
<PopoverContent className="w-56 overflow-hidden rounded-lg p-0">
<div className="p-4">
{/* Replace with actual reference content */}
<p>Reference content goes here.</p>
</div>
</PopoverContent>
</Popover>
),
sup: ({ node, ...props }) => {
// Renderer for <sup> elements: treats the element's text as a citation key
// and, when a matching citation exists, shows its text in a popover.
// NOTE(review): the non-null assertion below throws if a <sup> is rendered
// without children — confirm that cannot happen.
const text = props.children!.toString();
const referenceNumber = text;
// An empty string means there is nothing to look up; render a plain <sup>
// tagged with the reason.
if (!referenceNumber) {
return <sup {...props} data-error="no reference number" />;
}
// NOTE(review): no `page` binding is visible in this scope in the portion of
// the component shown here (the only `page` in view is the parameter of the
// `ocr.forEach` callback above) — confirm where `page.index` comes from.
const citation = citations.find(
(c) => c.index === referenceNumber && c.page === page.index
);
// No matching citation: fall back to a plain <sup> tagged with the reason.
if (!citation) {
return <sup {...props} data-error="no citation found" />;
}
return (
// Wrap the marker in a popover whose content is the matched citation text.
<Popover>
<PopoverTrigger asChild>
<sup
className="text-gray-200 cursor-pointer underline hover:cursor-pointer"
{...props}
/>
</PopoverTrigger>
<PopoverContent className="w-56 overflow-hidden rounded-lg p-0">
<div className="p-4">
{/* Replace with actual reference content */}
<p>{citation.text}</p>
</div>
</PopoverContent>
</Popover>
);
},
};
return (

View File

@ -64,12 +64,6 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
eventSource.close();
});
// // Invoke the serverless function
// supabase.functions.invoke("process-document", {
// body,
// method: "POST",
// });
toast.info(
"Document is being processed in the background. You will be notified when it's ready."
);

View File

@ -33,7 +33,13 @@ export function AppSidebar({
documents,
...props
}: React.ComponentProps<typeof Sidebar> & {
documents?: Array<{ name: string; url: string; emoji?: string }>;
documents: Array<{
id: string;
disabled?: boolean;
name: string;
url: string;
emoji?: string;
}>;
}) {
const data = {
navMain: [

View File

@ -6,6 +6,7 @@ import {
Link,
LoaderCircle,
MoreHorizontal,
RefreshCw,
StarOff,
Trash2,
} from "lucide-react";
@ -26,11 +27,14 @@ import {
SidebarMenuItem,
useSidebar,
} from "@/components/ui/sidebar";
import { createClient } from "@/utils/supabase/client";
import { toast } from "sonner";
export function NavDocuments({
documents,
}: {
documents: {
id: string;
disabled?: boolean;
name: string;
url: string;
@ -38,13 +42,14 @@ export function NavDocuments({
}[];
}) {
const { isMobile } = useSidebar();
const supabase = createClient();
return (
<SidebarGroup className="group-data-[collapsible=icon]:hidden">
<SidebarGroupLabel>Documents</SidebarGroupLabel>
<SidebarMenu>
{documents.map((item) => (
<SidebarMenuItem key={item.name} aria-disabled={item.disabled}>
<SidebarMenuItem key={item.id} aria-disabled={item.disabled}>
<SidebarMenuButton asChild disabled={item.disabled}>
<a href={item.url} title={item.name}>
{item.disabled ? (
@ -67,18 +72,43 @@ export function NavDocuments({
side={isMobile ? "bottom" : "right"}
align={isMobile ? "end" : "start"}
>
<DropdownMenuItem>
<StarOff className="text-muted-foreground" />
<span>Remove from Favorites</span>
</DropdownMenuItem>
<DropdownMenuSeparator />
<DropdownMenuItem>
<Link className="text-muted-foreground" />
<span>Copy Link</span>
</DropdownMenuItem>
<DropdownMenuItem>
<ArrowUpRight className="text-muted-foreground" />
<span>Open in New Tab</span>
<DropdownMenuItem
onClick={async () => {
// Ask the "process-document" function to reprocess this document: the
// request carries the document id plus the current session's tokens.
const data = new FormData();
const session = await supabase.auth.getSession();
// Without a session there are no tokens to send, so stop here.
if (!session.data.session) {
toast.error("You are not logged in");
return;
}
data.append("id", item.id);
data.append(
"access_token",
session.data.session.access_token
);
data.append(
"refresh_token",
session.data.session.refresh_token
);
// NOTE(review): supabase-js `functions.invoke` is documented as resolving
// with `{ data, error }` rather than rejecting on failure — confirm. If so,
// this toast shows the success message even when the function fails, and
// the returned `error` needs to be checked (and thrown) for the error
// branch to fire.
toast.promise(
supabase.functions.invoke("process-document", {
body: data,
}),
{
loading: "Reprocessing document...",
success: "Document reprocessed successfully",
error: (err) => {
console.error("Error reprocessing document:", err);
return "Failed to reprocess document";
},
}
);
}}
>
<RefreshCw className="text-muted-foreground" />
<span>Reprocess Document</span>
</DropdownMenuItem>
<DropdownMenuSeparator />
<DropdownMenuItem>

View File

@ -32,11 +32,16 @@ Return the final result as a text object with the following structure (without c
---------
{
"citations": {
"1": "Citation text for reference 1",
"2": "Citation text for reference 2",
// ... more citations
}
"citations": [
{
"number": 1, // The number as it appears in the text
"text": "Citation text 1"
},
{
"number": 2,
"text": "Citation text 2"
}
]
}
"""
@ -97,12 +102,16 @@ Deno.serve(async (req) => {
Deno.env.get("SUPABASE_ANON_KEY")
);
const supabaseServer = createClient(
Deno.env.get("SUPABASE_URL"),
Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")
);
const formData = await req.formData();
const file = formData.get("file");
const accessToken = formData.get("access_token");
const refreshToken = formData.get("refresh_token");
const fileName = file.name;
const uuid = crypto.randomUUID();
var reprocessing = false;
var uuid = crypto.randomUUID();
const {
data: { user },
@ -121,6 +130,101 @@ Deno.serve(async (req) => {
throw new Error("Setting session failed");
}
if (formData.has("id")) {
console.log("Reprocessing document...");
reprocessing = true;
console.log("File ID found in form data.");
sendEvent("status", {
message: "File ID found in form data.",
});
const docId = formData.get("id");
const { data: documentData, error: documentError } = await supabase
.from("documents")
.select("*")
.eq("id", docId)
.single();
if (documentError) {
console.error("Error fetching document record:", documentError);
sendEvent("error", {
message: "Error fetching document record",
error: documentError,
});
throw new Error("Document record fetch failed");
}
if (documentData) {
await supabase
.from("documents")
.update({
is_processing: true,
})
.eq("id", documentData.id);
uuid = documentData.id;
} else {
console.error("Document record not found.");
sendEvent("error", {
message: "Document record not found",
});
throw new Error("Document record not found");
}
const { data: storageData, error: storageError } = await supabaseServer
.from("storage.objects")
.select("name")
.eq("id", documentData.raw_file)
.single();
if (storageError) {
console.error("Error fetching file name:", storageError);
sendEvent("error", {
message: "Error fetching file name",
error: storageError,
});
throw new Error("Storage data fetch failed");
}
const { data: fileData, error: fileError } = await supabase.storage
.from("documents")
.download(storageData.name);
if (fileError) {
console.error("Error downloading file from storage:", fileError);
sendEvent("error", {
message: "Error downloading file from storage",
error: fileError,
});
throw new Error("File download failed");
}
console.log("File downloaded from storage:", fileData);
sendEvent("status", {
message: "File downloaded from storage",
fileData,
});
formData.set("file", fileData);
}
if (!formData.has("file")) {
console.error("File not found in form data.");
sendEvent("error", {
message: "File not found in form data",
});
throw new Error("File not found");
}
if (!formData.has("access_token") || !formData.has("refresh_token")) {
console.error("Access token or refresh token not found in form data.");
sendEvent("error", {
message: "Access token or refresh token not found in form data",
});
throw new Error("Tokens not found");
}
const file = formData.get("file") as File;
const fileName = file.name;
console.log("Generated UUID:", uuid);
sendEvent("status", {
message: "Generated UUID",
@ -133,46 +237,73 @@ Deno.serve(async (req) => {
user,
});
const { data: storageData, error: storageError } = await supabase.storage
.from("documents")
.upload(`${user.id}/${uuid}.pdf`, file);
if (!reprocessing) {
const { data: storageData, error: storageError } =
await supabase.storage
.from("documents")
.upload(`${user.id}/${uuid}.pdf`, file);
if (storageError) {
console.error("Error uploading file to storage:", storageError);
sendEvent("error", {
message: "Error uploading file to storage",
error: storageError,
if (storageError) {
console.error("Error uploading file to storage:", storageError);
sendEvent("error", {
message: "Error uploading file to storage",
error: storageError,
});
throw new Error("File upload failed");
}
console.log("File uploaded to storage:", storageData);
sendEvent("status", {
message: "File uploaded to storage",
storageData,
});
throw new Error("File upload failed");
}
console.log("File uploaded to storage:", storageData);
sendEvent("status", {
message: "File uploaded to storage",
storageData,
});
const { error: docError } = await supabase.from("documents").insert({
id: uuid,
file_name: file.name,
owner: user.id,
raw_file: storageData.id,
is_processing: true,
});
if (docError) {
console.error("Error inserting document record:", docError);
sendEvent("error", {
message: "Error inserting document record",
error: docError,
const { error: docError } = await supabase.from("documents").insert({
id: uuid,
file_name: file.name,
owner: user.id,
raw_file: storageData.id,
is_processing: true,
});
throw new Error("Document record insertion failed");
}
console.log("Document record inserted successfully.");
sendEvent("status", {
message: "Document record inserted successfully",
});
if (docError) {
console.error("Error inserting document record:", docError);
sendEvent("error", {
message: "Error inserting document record",
error: docError,
});
throw new Error("Document record insertion failed");
}
console.log("Document record inserted successfully.");
sendEvent("status", {
message: "Document record inserted successfully",
});
} else {
console.log("Reprocessing document...");
sendEvent("status", {
message: "Reprocessing document",
});
const { error: docError } = await supabase
.from("documents")
.update({
is_processing: true,
})
.eq("id", uuid);
if (docError) {
console.error("Error updating document record:", docError);
sendEvent("error", {
message: "Error updating document record",
error: docError,
});
throw new Error("Document record update failed");
}
console.log("Document record updated successfully.");
sendEvent("status", {
message: "Document record updated successfully",
});
}
console.log("Uploading file to Mistral...");
sendEvent("status", {