import { createClient } from "@/utils/supabase/server";
import { NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import pLimit from "p-limit";

const apiKey = process.env.MISTRAL_API_KEY;
const client = new Mistral({ apiKey });

const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content.
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.
The Markdown should be human-readable and well-formatted.
`;

export async function POST(request: Request) {
  const supabase = await createClient();
  const formData = await request.formData();
  const file = formData.get("file") as File;
  const fileName = formData.get("fileName") as string;
  const id = formData.get("id") as string;

  // Upload the PDF to Mistral's file storage so the OCR endpoint can read it.
  const uploadedPdf = await client.files.upload({
    file: {
      fileName,
      content: file,
    },
    purpose: "ocr",
  });

  // The OCR endpoint consumes the document via a signed URL.
  const signedUrl = await client.files.getSignedUrl({
    fileId: uploadedPdf.id,
  });

  const ocrResponse = await client.ocr.process({
    model: "mistral-ocr-latest",
    document: {
      type: "document_url",
      documentUrl: signedUrl.url,
    },
  });

  // Post-process each page's raw OCR markdown with a chat model.
  const limit = pLimit(1); // Limit to 1 concurrent request (adjust as needed)
  type ProcessedPage = (typeof ocrResponse.pages)[number] & { markdown: string };
  const promises: Promise<ProcessedPage | undefined>[] = [];

  for (const page of ocrResponse.pages) {
    const pagePromise = limit(async () => {
      const response = await client.chat.complete({
        model: "mistral-small-latest",
        messages: [
          {
            role: "user",
            content: [
              {
                type: "text",
                // Send the page's OCR markdown along with the instructions;
                // without it the model has nothing to clean up.
                text: `${PROCESSING_PROMPT}\n\n${page.markdown}`,
              },
            ],
          },
        ],
      });

      if (!response.choices) {
        console.error("No choices in response");
        return;
      }

      // Collect the page's extracted images, keyed by their placeholder id.
      const imageData: { [key: string]: string } = {};
      for (const img of page.images) {
        imageData[img.id] = img.imageBase64!;
      }

      if (response.choices[0].message.content) {
        const markdown = replaceImagesInMarkdown(
          response.choices[0].message.content.toString(),
          imageData
        );
        return {
          ...page,
          markdown,
        };
      } else {
        console.error("Message content is undefined");
      }
    });
    promises.push(pagePromise);
  }

  // Pages can resolve to undefined on failure; drop those before sorting
  // back into document order.
  const results = await Promise.all(promises);
  const sortedResults = results
    .filter((result): result is NonNullable<typeof result> => result !== undefined)
    .sort((a, b) => a.index - b.index);

  const { error } = await supabase
    .from("documents")
    .update({
      ocr_data: sortedResults,
    })
    .eq("id", id);

  if (error) {
    console.error(error);
    return NextResponse.json({ error: error.message }, { status: 500 });
  }

  return NextResponse.json({
    id,
  });
}

interface OCRResponse {
  pages: {
    markdown: string;
    images: { id: string; image_base64: string }[];
  }[];
}

function replaceImagesInMarkdown(
  markdownStr: string,
  imagesDict: { [key: string]: string }
): string {
  /**
   * Replace image placeholders in markdown with base64-encoded images.
   *
   * Args:
   *   markdownStr: Markdown text containing image placeholders
   *   imagesDict: Dictionary mapping image IDs to base64 strings
   *
   * Returns:
   *   Markdown text with images replaced by base64 data
   */
  for (const [imgName, base64Str] of Object.entries(imagesDict)) {
    markdownStr = markdownStr.replace(
      new RegExp(`!\\[${imgName}\\]\\(${imgName}\\)`, "g"),
      `![${imgName}](${base64Str})`
    );
  }
  return markdownStr;
}
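// Illustrative example of replaceImagesInMarkdown, not called by the route.
// The image id "img-0.jpeg" and the data-URI value are assumptions based on
// typical OCR output, with the base64 payload truncated for readability:
//
//   const cleaned = replaceImagesInMarkdown(
//     "Intro text\n\n![img-0.jpeg](img-0.jpeg)",
//     { "img-0.jpeg": "data:image/jpeg;base64,/9j/4AAQ..." }
//   );
//   // => "Intro text\n\n![img-0.jpeg](data:image/jpeg;base64,/9j/4AAQ...)"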
function getCombinedMarkdown(ocrResponse: OCRResponse): string {
  /**
   * Combine OCR text and images into a single markdown document.
   *
   * Args:
   *   ocrResponse: Response from OCR processing containing text and images
   *
   * Returns:
   *   Combined markdown string with embedded images
   */
  const markdowns: string[] = [];
  for (const page of ocrResponse.pages) {
    // Extract images from page
    const imageData: { [key: string]: string } = {};
    for (const img of page.images) {
      imageData[img.id] = img.image_base64;
    }
    // Replace image placeholders with actual images
    markdowns.push(replaceImagesInMarkdown(page.markdown, imageData));
  }
  return markdowns.join("\n\n");
}
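// A minimal client-side sketch of how this route can be called. The
// "/api/process" path, and the file/documentId variables, are assumptions;
// adjust them to wherever this route file actually lives in your app:
//
//   const formData = new FormData();
//   formData.append("file", file);          // File from an <input type="file">
//   formData.append("fileName", file.name);
//   formData.append("id", documentId);      // id of the row in "documents"
//
//   const res = await fetch("/api/process", { method: "POST", body: formData });
//   const { id } = await res.json();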