// Viewer metadata captured with this file: 199 lines, 5.9 KiB, TypeScript.
import { NextRequest, NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import pLimit from "p-limit";
import { createClient } from "@/utils/supabase/server";

/**
 * CORS headers spread into the preflight (`OPTIONS`) response built by the
 * POST handler: any origin is allowed, along with the listed request headers.
 */
const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers":
    "authorization, x-client-info, apikey, content-type",
};

// Mistral API key taken from the environment. The non-null assertion only
// satisfies the type checker; nothing here verifies that the variable is set.
const apiKey = process.env.MISTRAL_API_KEY!;

// Module-level Mistral SDK client, shared by every request handled in this file
// (file upload, signed URL lookup, OCR and chat completion).
const client = new Mistral({ apiKey });

/**
 * System prompt sent with every OCR page. It asks the model to return the
 * cleaned Markdown, then a `---------` separator, then a JSON object holding
 * the citations that were lifted out of the text. The POST handler splits the
 * model output on that separator.
 *
 * NOTE(review): the prose asks for "an object where the key is the reference
 * number" while the example structure shows an array of `{ number, text }`
 * entries — confirm which shape downstream consumers expect.
 * NOTE(review): "without any new lines" appears to conflict with the request
 * for human-readable, well-formatted Markdown — confirm the intent.
 */
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.

The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.

No data or information should ever be removed, it should only be processed and formatted.

There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into an object where the key is the reference number and the value is the text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.

The Markdown should be human-readable and well-formatted. The markdown string should properly sanitized and should not break a JSON parser when returned as the final format.

Return the final result as a text object with the following structure (without code block formatting):

"""
<processed markdown text>

---------

{
  "citations": [
    {
      "number": 1, // The number as it appears in the text
      "text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
    },
    {
      "number": 2,
      "text": "Citation text 2"
    }
  ]
}
"""

Do not return the text object as a code block, only as a raw string.
`;

/**
 * Parses the citations JSON that the model emits after the `---------`
 * separator and returns its `citations` member.
 *
 * The model output is untrusted text, so every failure mode degrades to an
 * empty object instead of rejecting: malformed JSON, a non-object payload, or
 * a missing/falsy `citations` field.
 *
 * @param citationsStr Raw JSON text, e.g. `{"citations":[{"number":1,"text":"..."}]}`.
 * @returns The parsed `citations` value, or `{}` when none can be extracted.
 */
async function getCitations(citationsStr: string): Promise<unknown> {
  try {
    const parsed: unknown = JSON.parse(citationsStr);

    // `JSON.parse` also accepts primitives and `null`; only objects can carry
    // a `citations` member.
    if (typeof parsed !== "object" || parsed === null) {
      return {};
    }

    const citations = (parsed as { citations?: unknown }).citations;
    return citations || {};
  } catch (error: unknown) {
    // One page with unparsable citations should not fail the whole document.
    console.error("Failed to parse citations:", error);
    return {};
  }
}

/** Separator the system prompt asks the model to place between the Markdown and the citations JSON. */
const OUTPUT_SEPARATOR = "---------";

/**
 * Flattens a chat-completion message `content` value into plain text.
 * Accepts a string, or an array of chunks whose string `text` fields are
 * concatenated; anything else yields an empty string.
 */
function extractMessageText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  return content
    .map((chunk: unknown) => {
      if (typeof chunk !== "object" || chunk === null || !("text" in chunk)) {
        return "";
      }
      const text = (chunk as { text: unknown }).text;
      return typeof text === "string" ? text : "";
    })
    .join("");
}

/**
 * Splits raw model output into the Markdown body and the citations JSON text.
 * The LAST separator is used, and only when what follows looks like a JSON
 * object, so dash runs inside the Markdown itself (table delimiter rows,
 * horizontal rules) do not truncate the page.
 */
function splitProcessedPage(raw: string): {
  markdown: string;
  citationsStr: string;
} {
  const separatorAt = raw.lastIndexOf(OUTPUT_SEPARATOR);
  if (separatorAt === -1) {
    return { markdown: raw.trim(), citationsStr: "{}" };
  }

  // Tolerate a fenced JSON block even though the prompt asks for raw text.
  const tail = raw
    .slice(separatorAt + OUTPUT_SEPARATOR.length)
    .trim()
    .replace(/^```(?:json)?\s*/i, "")
    .replace(/\s*```$/, "")
    .trim();

  if (tail !== "" && !tail.startsWith("{")) {
    // The dashes belong to the document content, not to the output format.
    return { markdown: raw.trim(), citationsStr: "{}" };
  }

  return {
    // Drop dashes directly touching the separator (the model emitted a longer run).
    markdown: raw.slice(0, separatorAt).replace(/-+$/, "").trim(),
    citationsStr: tail || "{}",
  };
}

/**
 * Handles a multipart upload of a document.
 *
 * Expected form fields: `file`, `access_token`, `refresh_token`.
 *
 * Steps: establish the Supabase session from the supplied tokens, store the
 * file in the `documents` bucket, insert a `documents` row flagged
 * `is_processing`, run Mistral OCR on the file, post-process each page with a
 * chat completion (at most two in flight), then save the pages on the row and
 * return them.
 *
 * @returns 400 when a required field is missing or has the wrong type, 500
 *   with `{ error }` when any step fails, otherwise `{ message, results }`.
 */
export async function POST(req: NextRequest) {
  // NOTE(review): Next.js route handlers are dispatched by exported method
  // name, so this branch is presumably unreachable from a POST export —
  // confirm whether a separate OPTIONS export is needed for preflight requests.
  if (req.method === "OPTIONS") {
    return new NextResponse(null, {
      headers: {
        ...corsHeaders,
        "Access-Control-Allow-Methods": "POST, OPTIONS",
      },
    });
  }

  const formData = await req.formData();
  const accessToken = formData.get("access_token");
  const refreshToken = formData.get("refresh_token");
  const file = formData.get("file");

  // FormData values are `File | string | null`; check the runtime type rather
  // than casting, so a text field named "file" is rejected here.
  if (
    !file ||
    typeof file === "string" ||
    typeof accessToken !== "string" ||
    !accessToken ||
    typeof refreshToken !== "string" ||
    !refreshToken
  ) {
    return NextResponse.json(
      {
        error: "Missing required fields: file, access_token, or refresh_token",
      },
      { status: 400 }
    );
  }

  const supabase = await createClient();

  const fileName = file.name;
  const uuid = crypto.randomUUID();
  // Tracks whether the document row exists, so a later failure can clear its
  // processing flag instead of leaving it stuck.
  let documentInserted = false;

  try {
    // Authenticate the user
    const {
      data: { user },
      error: sessionError,
    } = await supabase.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });

    if (sessionError) {
      throw new Error("Failed to set session: " + sessionError.message);
    }

    if (!user) {
      throw new Error("User not authenticated");
    }

    // Upload the file to Supabase storage
    const { data: storageData, error: storageError } = await supabase.storage
      .from("documents")
      .upload(`${user.id}/${uuid}.pdf`, file);

    if (storageError) {
      throw new Error("Failed to upload file: " + storageError.message);
    }

    // Insert document record
    const { error: docError } = await supabase.from("documents").insert({
      id: uuid,
      file_name: fileName,
      owner: user.id,
      raw_file: storageData.id,
      is_processing: true,
    });

    if (docError) {
      throw new Error("Failed to insert document record: " + docError.message);
    }
    documentInserted = true;

    // Upload file to Mistral
    const uploadedPdf = await client.files.upload({
      file: { fileName, content: file },
      purpose: "ocr",
    });

    const signedUrl = await client.files.getSignedUrl({
      fileId: uploadedPdf.id,
    });

    // Process OCR
    const ocrResponse = await client.ocr.process({
      model: "mistral-ocr-latest",
      document: { type: "document_url", documentUrl: signedUrl.url },
    });

    // Post-process pages with at most two chat completions in flight.
    const limit = pLimit(2);
    const results = await Promise.all(
      ocrResponse.pages.map((page, pageIndex) =>
        limit(async () => {
          const response = await client.chat.complete({
            model: "mistral-small-latest",
            messages: [
              {
                role: "system",
                content: [{ type: "text", text: PROCESSING_PROMPT }],
              },
              {
                role: "user",
                content: [{ type: "text", text: page.markdown }],
              },
            ],
          });

          const raw = extractMessageText(
            response.choices?.[0]?.message?.content
          );
          if (!raw) {
            throw new Error(
              `Model returned no content for page ${pageIndex + 1}`
            );
          }

          const { markdown, citationsStr } = splitProcessedPage(raw);
          const citations = await getCitations(citationsStr);

          return {
            ...page,
            markdown,
            citations,
          };
        })
      )
    );

    // Update document record with OCR data
    const { error: updateError } = await supabase
      .from("documents")
      .update({ ocr_data: results, is_processing: false })
      .eq("id", uuid);

    if (updateError) {
      throw new Error(
        "Failed to update document record: " + updateError.message
      );
    }

    return NextResponse.json({
      message: "Document processed successfully",
      results,
    });
  } catch (error: unknown) {
    console.error("Error processing document:", error);

    if (documentInserted) {
      // Best effort: do not leave the row flagged as processing forever.
      try {
        const { error: resetError } = await supabase
          .from("documents")
          .update({ is_processing: false })
          .eq("id", uuid);
        if (resetError) {
          console.error(
            "Failed to clear processing flag:",
            resetError.message
          );
        }
      } catch (resetFailure: unknown) {
        console.error("Failed to clear processing flag:", resetFailure);
      }
    }

    const message = error instanceof Error ? error.message : "Unknown error";
    return NextResponse.json({ error: message }, { status: 500 });
  }
}