better document (re)processing
This commit is contained in:
parent
3eda271635
commit
16b552262e
@ -120,7 +120,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
|
||||
dangerouslySetInnerHTML={{ __html: String(processedContent) }}
|
||||
></div> */}
|
||||
<div className="mx-auto px-12 py-20 gap-2">
|
||||
<MarkdownRenderer rawContent={rawContent} />
|
||||
<MarkdownRenderer document={document} />
|
||||
</div>
|
||||
</SidebarInset>
|
||||
</TTSProvider>
|
||||
|
@ -44,6 +44,7 @@ export default async function Page() {
|
||||
<AppSidebar
|
||||
documents={documents.map((d) => {
|
||||
return {
|
||||
id: d.id,
|
||||
disabled: d.is_processing,
|
||||
name: d.file_name,
|
||||
url: `/dashboard/documents/${d.id}`,
|
||||
|
@ -1,170 +0,0 @@
|
||||
import { createClient } from "@/utils/supabase/server";
|
||||
import { NextResponse } from "next/server";
|
||||
import { Mistral } from "@mistralai/mistralai";
|
||||
import { redirect } from "next/navigation";
|
||||
import { ChatCompletionChoice } from "@mistralai/mistralai/models/components";
|
||||
import pLimit from "p-limit";
|
||||
|
||||
// Mistral SDK client shared by the handler below.
// The API key is read from the environment once, when this module is loaded.
const apiKey = process.env.MISTRAL_API_KEY;
const client = new Mistral({ apiKey });
|
||||
|
||||
const PROCESSING_PROMPT = `
|
||||
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
|
||||
|
||||
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
|
||||
Any images should be included.
|
||||
Do not return the Markdown as a code block, only as a raw string, without any new lines.
|
||||
|
||||
The Markdown should be human-readable and well-formatted.
|
||||
`;
|
||||
|
||||
/**
 * Processes an uploaded document and stores the per-page OCR result.
 *
 * Steps:
 *  1. Upload the file to Mistral and run OCR on it through a signed URL.
 *  2. For every OCR page, ask a chat model to clean up the page's Markdown
 *     and inline the page's images via `replaceImagesInMarkdown`.
 *  3. Write the pages, ordered by `index`, to `ocr_data` on the `documents`
 *     row whose `id` matches the submitted `id`.
 *
 * Expects multipart form data with the fields `file`, `fileName` and `id`.
 *
 * @returns `{ id }` on success; status 400 with `{ error }` when a form field
 *          is missing or has the wrong type; status 500 with `{ error }` when
 *          the database update fails.
 */
export async function POST(request: Request) {
  const supabase = await createClient();
  const formData = await request.formData();

  const file = formData.get("file");
  const fileName = formData.get("fileName");
  const id = formData.get("id");

  // FormData entries are `File | string | null`; reject anything that is not
  // the expected kind instead of casting and failing later inside the SDK.
  if (
    file === null ||
    typeof file === "string" ||
    typeof fileName !== "string" ||
    typeof id !== "string"
  ) {
    return NextResponse.json(
      {
        error:
          "Expected form fields: file (File), fileName (string), id (string)",
      },
      { status: 400 }
    );
  }

  const uploadedPdf = await client.files.upload({
    file: {
      fileName,
      content: file,
    },
    purpose: "ocr",
  });

  const signedUrl = await client.files.getSignedUrl({
    fileId: uploadedPdf.id,
  });

  const ocrResponse = await client.ocr.process({
    model: "mistral-ocr-latest",
    document: {
      type: "document_url",
      documentUrl: signedUrl.url,
    },
  });

  const limit = pLimit(1); // Limit to 1 concurrent request (adjust as needed)

  const processedPages = await Promise.all(
    ocrResponse.pages.map((page) =>
      limit(async () => {
        const response = await client.chat.complete({
          model: "mistral-small-latest",
          messages: [
            {
              role: "user",
              content: [
                { type: "text", text: PROCESSING_PROMPT },
                // The prompt only carries instructions; the page text itself
                // must be sent too, otherwise the model has nothing to clean.
                { type: "text", text: page.markdown },
              ],
            },
          ],
        });

        const imageData: { [key: string]: string } = {};
        for (const img of page.images) {
          // Skip images without payload rather than inlining "undefined".
          if (img.imageBase64) {
            imageData[img.id] = img.imageBase64;
          }
        }

        const content = response.choices?.[0]?.message.content;
        if (!content) {
          // Keep the page (with its raw OCR markdown) so the stored document
          // has no holes and the sort below never sees `undefined`.
          console.error(
            `No usable completion for page ${page.index}; keeping raw OCR markdown`
          );
          return {
            ...page,
            markdown: replaceImagesInMarkdown(page.markdown, imageData),
          };
        }

        return {
          ...page,
          markdown: replaceImagesInMarkdown(content.toString(), imageData),
        };
      })
    )
  );

  // Copy before sorting so the array returned by Promise.all is not mutated.
  const sortedResults = [...processedPages].sort((a, b) => a.index - b.index);

  const { error } = await supabase
    .from("documents")
    .update({
      ocr_data: sortedResults,
    })
    .eq("id", id);

  if (error) {
    console.error(error);
    return NextResponse.json({ error: error.message }, { status: 500 });
  }

  return NextResponse.json({
    id,
  });
}
|
||||
|
||||
/**
 * Shape of an OCR result as read by `getCombinedMarkdown`: a list of pages,
 * each with its Markdown text and the images referenced from that text.
 */
interface OCRResponse {
  pages: Array<{
    /** Markdown text of the page. */
    markdown: string;
    /** Images of the page: `id` is the lookup key, `image_base64` the payload. */
    images: Array<{ id: string; image_base64: string }>;
  }>;
}
|
||||
|
||||
/**
 * Replace image placeholders in Markdown with base64-encoded images.
 *
 * A placeholder has the form `![<id>](<id>)`. Every occurrence whose id is a
 * key of `imagesDict` is rewritten to `![<id>](<base64>)`; all other text is
 * left untouched.
 *
 * @param markdownStr Markdown text containing image placeholders.
 * @param imagesDict  Dictionary mapping image IDs to base64 strings.
 * @returns Markdown text with the placeholders pointing at the base64 data.
 */
function replaceImagesInMarkdown(
  markdownStr: string,
  imagesDict: { [key: string]: string }
): string {
  let result = markdownStr;
  for (const [imgName, base64Str] of Object.entries(imagesDict)) {
    const placeholder = `![${imgName}](${imgName})`;
    const inlined = `![${imgName}](${base64Str})`;
    // split/join is a literal replace-all: the id is not interpreted as a
    // regular expression ("." in "img-0.jpeg" stays a dot) and "$" sequences
    // in the replacement are inserted verbatim.
    result = result.split(placeholder).join(inlined);
  }
  return result;
}
|
||||
|
||||
/**
 * Combine OCR text and images into a single Markdown document.
 *
 * For each page an id -> base64 lookup is built from the page's images and
 * handed to `replaceImagesInMarkdown` together with the page's Markdown.
 * The resulting page strings are joined with a blank line between them.
 *
 * @param ocrResponse Response from OCR processing containing text and images.
 * @returns Combined Markdown string for all pages.
 */
function getCombinedMarkdown(ocrResponse: OCRResponse): string {
  return ocrResponse.pages
    .map((page) => {
      const base64ById: { [key: string]: string } = {};
      page.images.forEach((image) => {
        base64ById[image.id] = image.image_base64;
      });
      return replaceImagesInMarkdown(page.markdown, base64ById);
    })
    .join("\n\n");
}
|
@ -9,16 +9,29 @@ import ReactMarkdown, { Components } from "react-markdown";
|
||||
import rehypeRaw from "rehype-raw";
|
||||
import { useTTS } from "./TTSProvider";
|
||||
import rehypeHighlight from "@/lib/utils";
|
||||
import { Database } from "@/utils/supabase/types";
|
||||
|
||||
/**
 * Escapes `text` so it can be embedded in a regular expression and match
 * literally. Every regex metacharacter - and also "-", ",", "#" and any
 * whitespace character - is prefixed with a backslash.
 */
function escapeRegExp(text: string) {
  const needsEscaping = /[-[\]{}()*+?.,\\^$|#\s]/g;
  return text.replace(needsEscaping, (match) => "\\" + match);
}
|
||||
|
||||
/** One processed page as stored in a document's `ocr_data` array. */
export type OCRData = {
  /** Page index; recorded as the `page` of every citation taken from this page. */
  index: number;
  /** Page text in Markdown; the renderer joins all pages with newlines. */
  markdown: string;
  /** Citation texts keyed by the reference label that appears in the page text. */
  citations: Record<string, string>;
  images: string[];
  // NOTE(review): units of width/height are not visible here - confirm against the OCR output.
  dimensions: { dpi: number; width: number; height: number };
};
|
||||
|
||||
export default function MarkdownRenderer({
|
||||
rawContent,
|
||||
document,
|
||||
}: {
|
||||
rawContent: string;
|
||||
document: Database["public"]["Tables"]["documents"]["Row"];
|
||||
}) {
|
||||
// Obtain TTS info from context.
|
||||
// TTSProvider is already wrapping this component higher in the tree.
|
||||
@ -36,6 +49,34 @@ export default function MarkdownRenderer({
|
||||
[textToHighlight]
|
||||
);
|
||||
|
||||
const ocr = document?.ocr_data as OCRData[];
|
||||
|
||||
const rawContent = ocr.map((page) => page.markdown).join("\n") || "";
|
||||
|
||||
const citations: {
|
||||
text: string;
|
||||
page: number;
|
||||
index: string;
|
||||
number: number;
|
||||
}[] = [];
|
||||
const totalPages = ocr.length;
|
||||
const totalSentences = sentences.length;
|
||||
|
||||
let totalCitations = 0;
|
||||
ocr.forEach((page) => {
|
||||
Object.entries(page.citations).forEach(([key, value]) => {
|
||||
if (value) {
|
||||
totalCitations++;
|
||||
citations.push({
|
||||
text: value,
|
||||
page: page.index,
|
||||
index: key,
|
||||
number: Number(totalCitations),
|
||||
});
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
const components: Components = {
|
||||
h1: ({ node, ...props }) => (
|
||||
<h1 className="text-2xl font-semibold mb-4 text-white" {...props} />
|
||||
@ -77,7 +118,24 @@ export default function MarkdownRenderer({
|
||||
{...props}
|
||||
/>
|
||||
),
|
||||
sup: ({ node, ...props }) => (
|
||||
sup: ({ node, ...props }) => {
|
||||
// Check if the text contains a reference number
|
||||
const text = props.children!.toString();
|
||||
|
||||
const referenceNumber = text;
|
||||
if (!referenceNumber) {
|
||||
return <sup {...props} data-error="no reference number" />;
|
||||
}
|
||||
|
||||
const citation = citations.find(
|
||||
(c) => c.index === referenceNumber && c.page === page.index
|
||||
);
|
||||
|
||||
if (!citation) {
|
||||
return <sup {...props} data-error="no citation found" />;
|
||||
}
|
||||
|
||||
return (
|
||||
// TODO: get the references from the document and display them in a popover
|
||||
<Popover>
|
||||
<PopoverTrigger asChild>
|
||||
@ -89,11 +147,12 @@ export default function MarkdownRenderer({
|
||||
<PopoverContent className="w-56 overflow-hidden rounded-lg p-0">
|
||||
<div className="p-4">
|
||||
{/* Replace with actual reference content */}
|
||||
<p>Reference content goes here.</p>
|
||||
<p>{citation.text}</p>
|
||||
</div>
|
||||
</PopoverContent>
|
||||
</Popover>
|
||||
),
|
||||
);
|
||||
},
|
||||
};
|
||||
|
||||
return (
|
||||
|
@ -64,12 +64,6 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
|
||||
eventSource.close();
|
||||
});
|
||||
|
||||
// // Invoke the serverless function
|
||||
// supabase.functions.invoke("process-document", {
|
||||
// body,
|
||||
// method: "POST",
|
||||
// });
|
||||
|
||||
toast.info(
|
||||
"Document is being processed in the background. You will be notified when it's ready."
|
||||
);
|
||||
|
@ -33,7 +33,13 @@ export function AppSidebar({
|
||||
documents,
|
||||
...props
|
||||
}: React.ComponentProps<typeof Sidebar> & {
|
||||
documents?: Array<{ name: string; url: string; emoji?: string }>;
|
||||
documents: Array<{
|
||||
id: string;
|
||||
disabled?: boolean;
|
||||
name: string;
|
||||
url: string;
|
||||
emoji?: string;
|
||||
}>;
|
||||
}) {
|
||||
const data = {
|
||||
navMain: [
|
||||
|
@ -6,6 +6,7 @@ import {
|
||||
Link,
|
||||
LoaderCircle,
|
||||
MoreHorizontal,
|
||||
RefreshCw,
|
||||
StarOff,
|
||||
Trash2,
|
||||
} from "lucide-react";
|
||||
@ -26,11 +27,14 @@ import {
|
||||
SidebarMenuItem,
|
||||
useSidebar,
|
||||
} from "@/components/ui/sidebar";
|
||||
import { createClient } from "@/utils/supabase/client";
|
||||
import { toast } from "sonner";
|
||||
|
||||
export function NavDocuments({
|
||||
documents,
|
||||
}: {
|
||||
documents: {
|
||||
id: string;
|
||||
disabled?: boolean;
|
||||
name: string;
|
||||
url: string;
|
||||
@ -38,13 +42,14 @@ export function NavDocuments({
|
||||
}[];
|
||||
}) {
|
||||
const { isMobile } = useSidebar();
|
||||
const supabase = createClient();
|
||||
|
||||
return (
|
||||
<SidebarGroup className="group-data-[collapsible=icon]:hidden">
|
||||
<SidebarGroupLabel>Documents</SidebarGroupLabel>
|
||||
<SidebarMenu>
|
||||
{documents.map((item) => (
|
||||
<SidebarMenuItem key={item.name} aria-disabled={item.disabled}>
|
||||
<SidebarMenuItem key={item.id} aria-disabled={item.disabled}>
|
||||
<SidebarMenuButton asChild disabled={item.disabled}>
|
||||
<a href={item.url} title={item.name}>
|
||||
{item.disabled ? (
|
||||
@ -67,18 +72,43 @@ export function NavDocuments({
|
||||
side={isMobile ? "bottom" : "right"}
|
||||
align={isMobile ? "end" : "start"}
|
||||
>
|
||||
<DropdownMenuItem>
|
||||
<StarOff className="text-muted-foreground" />
|
||||
<span>Remove from Favorites</span>
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuSeparator />
|
||||
<DropdownMenuItem>
|
||||
<Link className="text-muted-foreground" />
|
||||
<span>Copy Link</span>
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuItem>
|
||||
<ArrowUpRight className="text-muted-foreground" />
|
||||
<span>Open in New Tab</span>
|
||||
<DropdownMenuItem
|
||||
onClick={async () => {
|
||||
const data = new FormData();
|
||||
|
||||
const session = await supabase.auth.getSession();
|
||||
if (!session.data.session) {
|
||||
toast.error("You are not logged in");
|
||||
return;
|
||||
}
|
||||
|
||||
data.append("id", item.id);
|
||||
data.append(
|
||||
"access_token",
|
||||
session.data.session.access_token
|
||||
);
|
||||
data.append(
|
||||
"refresh_token",
|
||||
session.data.session.refresh_token
|
||||
);
|
||||
|
||||
toast.promise(
|
||||
supabase.functions.invoke("process-document", {
|
||||
body: data,
|
||||
}),
|
||||
{
|
||||
loading: "Reprocessing document...",
|
||||
success: "Document reprocessed successfully",
|
||||
error: (err) => {
|
||||
console.error("Error reprocessing document:", err);
|
||||
return "Failed to reprocess document";
|
||||
},
|
||||
}
|
||||
);
|
||||
}}
|
||||
>
|
||||
<RefreshCw className="text-muted-foreground" />
|
||||
<span>Reprocess Document</span>
|
||||
</DropdownMenuItem>
|
||||
<DropdownMenuSeparator />
|
||||
<DropdownMenuItem>
|
||||
|
@ -32,11 +32,16 @@ Return the final result as a text object with the following structure (without c
|
||||
---------
|
||||
|
||||
{
|
||||
"citations": {
|
||||
"1": "Citation text for reference 1",
|
||||
"2": "Citation text for reference 2",
|
||||
// ... more citations
|
||||
"citations": [
|
||||
{
|
||||
"number": 1, // The number as it appears in the text
|
||||
"text": "Citation text 1"
|
||||
},
|
||||
{
|
||||
"number": 2,
|
||||
"text": "Citation text 2"
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
@ -97,12 +102,16 @@ Deno.serve(async (req) => {
|
||||
Deno.env.get("SUPABASE_ANON_KEY")
|
||||
);
|
||||
|
||||
const supabaseServer = createClient(
|
||||
Deno.env.get("SUPABASE_URL"),
|
||||
Deno.env.get("SUPABASE_SERVICE_ROLE_KEY")
|
||||
);
|
||||
|
||||
const formData = await req.formData();
|
||||
const file = formData.get("file");
|
||||
const accessToken = formData.get("access_token");
|
||||
const refreshToken = formData.get("refresh_token");
|
||||
const fileName = file.name;
|
||||
const uuid = crypto.randomUUID();
|
||||
var reprocessing = false;
|
||||
var uuid = crypto.randomUUID();
|
||||
|
||||
const {
|
||||
data: { user },
|
||||
@ -121,6 +130,101 @@ Deno.serve(async (req) => {
|
||||
throw new Error("Setting session failed");
|
||||
}
|
||||
|
||||
if (formData.has("id")) {
|
||||
console.log("Reprocessing document...");
|
||||
reprocessing = true;
|
||||
console.log("File ID found in form data.");
|
||||
sendEvent("status", {
|
||||
message: "File ID found in form data.",
|
||||
});
|
||||
const docId = formData.get("id");
|
||||
|
||||
const { data: documentData, error: documentError } = await supabase
|
||||
.from("documents")
|
||||
.select("*")
|
||||
.eq("id", docId)
|
||||
.single();
|
||||
|
||||
if (documentError) {
|
||||
console.error("Error fetching document record:", documentError);
|
||||
sendEvent("error", {
|
||||
message: "Error fetching document record",
|
||||
error: documentError,
|
||||
});
|
||||
throw new Error("Document record fetch failed");
|
||||
}
|
||||
|
||||
if (documentData) {
|
||||
await supabase
|
||||
.from("documents")
|
||||
.update({
|
||||
is_processing: true,
|
||||
})
|
||||
.eq("id", documentData.id);
|
||||
uuid = documentData.id;
|
||||
} else {
|
||||
console.error("Document record not found.");
|
||||
sendEvent("error", {
|
||||
message: "Document record not found",
|
||||
});
|
||||
throw new Error("Document record not found");
|
||||
}
|
||||
|
||||
const { data: storageData, error: storageError } = await supabaseServer
|
||||
.from("storage.objects")
|
||||
.select("name")
|
||||
.eq("id", documentData.raw_file)
|
||||
.single();
|
||||
|
||||
if (storageError) {
|
||||
console.error("Error fetching file name:", storageError);
|
||||
sendEvent("error", {
|
||||
message: "Error fetching file name",
|
||||
error: storageError,
|
||||
});
|
||||
throw new Error("Storage data fetch failed");
|
||||
}
|
||||
|
||||
const { data: fileData, error: fileError } = await supabase.storage
|
||||
.from("documents")
|
||||
.download(storageData.name);
|
||||
|
||||
if (fileError) {
|
||||
console.error("Error downloading file from storage:", fileError);
|
||||
sendEvent("error", {
|
||||
message: "Error downloading file from storage",
|
||||
error: fileError,
|
||||
});
|
||||
throw new Error("File download failed");
|
||||
}
|
||||
|
||||
console.log("File downloaded from storage:", fileData);
|
||||
sendEvent("status", {
|
||||
message: "File downloaded from storage",
|
||||
fileData,
|
||||
});
|
||||
|
||||
formData.set("file", fileData);
|
||||
}
|
||||
|
||||
if (!formData.has("file")) {
|
||||
console.error("File not found in form data.");
|
||||
sendEvent("error", {
|
||||
message: "File not found in form data",
|
||||
});
|
||||
throw new Error("File not found");
|
||||
}
|
||||
if (!formData.has("access_token") || !formData.has("refresh_token")) {
|
||||
console.error("Access token or refresh token not found in form data.");
|
||||
sendEvent("error", {
|
||||
message: "Access token or refresh token not found in form data",
|
||||
});
|
||||
throw new Error("Tokens not found");
|
||||
}
|
||||
|
||||
const file = formData.get("file") as File;
|
||||
const fileName = file.name;
|
||||
|
||||
console.log("Generated UUID:", uuid);
|
||||
sendEvent("status", {
|
||||
message: "Generated UUID",
|
||||
@ -133,7 +237,9 @@ Deno.serve(async (req) => {
|
||||
user,
|
||||
});
|
||||
|
||||
const { data: storageData, error: storageError } = await supabase.storage
|
||||
if (!reprocessing) {
|
||||
const { data: storageData, error: storageError } =
|
||||
await supabase.storage
|
||||
.from("documents")
|
||||
.upload(`${user.id}/${uuid}.pdf`, file);
|
||||
|
||||
@ -173,6 +279,31 @@ Deno.serve(async (req) => {
|
||||
sendEvent("status", {
|
||||
message: "Document record inserted successfully",
|
||||
});
|
||||
} else {
|
||||
console.log("Reprocessing document...");
|
||||
sendEvent("status", {
|
||||
message: "Reprocessing document",
|
||||
});
|
||||
|
||||
const { error: docError } = await supabase
|
||||
.from("documents")
|
||||
.update({
|
||||
is_processing: true,
|
||||
})
|
||||
.eq("id", uuid);
|
||||
if (docError) {
|
||||
console.error("Error updating document record:", docError);
|
||||
sendEvent("error", {
|
||||
message: "Error updating document record",
|
||||
error: docError,
|
||||
});
|
||||
throw new Error("Document record update failed");
|
||||
}
|
||||
console.log("Document record updated successfully.");
|
||||
sendEvent("status", {
|
||||
message: "Document record updated successfully",
|
||||
});
|
||||
}
|
||||
|
||||
console.log("Uploading file to Mistral...");
|
||||
sendEvent("status", {
|
||||
|
Loading…
x
Reference in New Issue
Block a user