306 lines
9.4 KiB
TypeScript

import { NextRequest, NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import pLimit from "p-limit";
import { createClient } from "@/utils/supabase/server";
// CORS headers spread into the response built by the OPTIONS branch of the
// POST handler below. Any origin is allowed; the listed request headers are
// the ones a browser preflight may ask for.
const corsHeaders = {
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Headers":
"authorization, x-client-info, apikey, content-type",
};
// Mistral API key read from the environment at module load. The non-null
// assertion only silences the compiler: nothing here checks that the variable
// is actually set.
const apiKey = process.env.MISTRAL_API_KEY!;
// Shared Mistral SDK client; the POST handler uses it for file upload, signed
// URLs, OCR and chat completion.
const client = new Mistral({ apiKey });
// System prompt sent with every OCR page to the chat model. It asks for the
// cleaned Markdown, then a line of nine dashes, then a JSON object with a
// "citations" array. The page-processing code in this file splits the reply on
// that dash line and hands the JSON part to getCitations, so the delimiter and
// the JSON shape described here must stay in sync with that code.
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.
The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** If there is a title to the document, it should be the first heading.
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.
No data or information should ever be removed, it should only be processed and formatted.
There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into an object where the key is the reference number and the value is the text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.
The Markdown should be human-readable and well-formatted. The markdown string should properly sanitized and should not break a JSON parser when returned as the final format.
Return the final result as a text object with the following structure (without code block formatting):
"""
<processed markdown text>
---------
{
"citations": [
{
"number": 1, // The number as it appears in the text
"text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
},
{
"number": 2,
"text": "Citation text 2"
}
]
}
"""
Do not return the text object as a code block, only as a raw string.
`;
/** Any value `JSON.parse` can produce. */
type JsonValue =
  | string
  | number
  | boolean
  | null
  | JsonValue[]
  | { [key: string]: JsonValue };

/** A parsed JSON object. */
type JsonObject = { [key: string]: JsonValue };

/** A citation as emitted by the model, with `text` known to be a string. */
type SanitizedCitation = JsonObject & { text: string };

/** Narrows a parsed JSON value to a plain (non-array, non-null) object. */
function isJsonObject(value: JsonValue | undefined): value is JsonObject {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/**
 * Percent-encodes a URL without double-encoding escapes it already contains.
 *
 * `encodeURI` escapes `%` itself, so `a%20b` would become `a%2520b`; the
 * follow-up replace restores any `%25XX` that came from an existing `%XX`.
 * `encodeURI` throws `URIError` on lone surrogates, in which case the URL is
 * returned unchanged.
 */
function encodeUrlOnce(url: string): string {
  try {
    return encodeURI(url).replace(/%25([0-9A-Fa-f]{2})/g, "%$1");
  } catch {
    return url;
  }
}

/**
 * Parses the citations JSON produced by the model and percent-encodes every
 * http(s) URL found in each citation's `text`.
 *
 * @param citationsStr JSON text expected to look like
 *   `{"citations":[{"number":1,"text":"..."}]}`. The caller passes `"{}"` when
 *   the model produced no citations section.
 * @returns The citations with sanitized `text`; all other fields are kept as
 *   parsed. Returns `[]` when the input is not valid JSON or has no
 *   `citations` array. Entries that are not objects with a string `text` are
 *   skipped instead of discarding the whole list.
 */
function getCitations(citationsStr: string): SanitizedCitation[] {
  let parsed: JsonValue;
  try {
    console.log("Parsing citations string:", citationsStr);
    parsed = JSON.parse(citationsStr) as JsonValue;
  } catch (err) {
    console.error("Error parsing or sanitizing citations:", err);
    return [];
  }

  const rawCitations = isJsonObject(parsed) ? parsed.citations : undefined;
  if (!Array.isArray(rawCitations)) {
    // Normal for pages without citations ("{}"): nothing to sanitize and
    // nothing worth reporting as an error.
    return [];
  }

  console.log("Sanitizing citations...");
  const sanitizedCitations: SanitizedCitation[] = [];
  for (const entry of rawCitations) {
    const text = isJsonObject(entry) ? entry.text : undefined;
    if (!isJsonObject(entry) || typeof text !== "string") {
      console.warn("Skipping malformed citation:", entry);
      continue;
    }
    sanitizedCitations.push({
      ...entry,
      text: text.replace(/(https?:\/\/[^\s]+)/g, (url: string) =>
        encodeUrlOnce(url)
      ),
    });
  }
  console.log("Sanitized citations:", sanitizedCitations);
  return sanitizedCitations;
}
/** Line the prompt asks the model to put between the Markdown and the citations JSON. */
const CITATION_DELIMITER = "---------";

/** Error text returned with every 400 response of this route. */
const MISSING_FIELDS_MESSAGE =
  "Missing required fields: file, access_token, or refresh_token";

/**
 * Splits one model reply into its Markdown body and its citations JSON text.
 *
 * The citations block comes last, so the LAST delimiter is used: a run of
 * dashes inside the Markdown itself (table rule, setext heading) must not cut
 * the body short. If what follows the last delimiter does not look like a JSON
 * object, the whole reply is treated as Markdown with no citations.
 */
function splitPageOutput(raw: string): { content: string; citationsStr: string } {
  const delimiterAt = raw.lastIndexOf(CITATION_DELIMITER);
  if (delimiterAt === -1) {
    return { content: raw.trim(), citationsStr: "{}" };
  }
  const tail = raw.slice(delimiterAt + CITATION_DELIMITER.length).trim();
  if (tail !== "" && !tail.startsWith("{")) {
    return { content: raw.trim(), citationsStr: "{}" };
  }
  return {
    content: raw.slice(0, delimiterAt).trim(),
    citationsStr: tail === "" ? "{}" : tail,
  };
}

/**
 * Uploads the PDF to Mistral, runs OCR on it and post-processes every page
 * with the chat model (at most two pages in flight at a time).
 *
 * @returns One entry per OCR page: the OCR page fields with `markdown`
 *   replaced by the cleaned text, plus the parsed `citations`.
 * @throws If any Mistral call fails or the chat model returns no text.
 */
async function runOcrPipeline(file: Blob, uuid: string) {
  console.log("Uploading file to Mistral...");
  const uploadedPdf = await client.files.upload({
    file: { fileName: `${uuid}.pdf`, content: file },
    purpose: "ocr",
  });

  console.log("Getting signed URL from Mistral...");
  const signedUrl = await client.files.getSignedUrl({
    fileId: uploadedPdf.id,
  });

  console.log("Processing OCR...");
  const ocrResponse = await client.ocr.process({
    model: "mistral-ocr-latest",
    document: { type: "document_url", documentUrl: signedUrl.url },
    includeImageBase64: true,
  });

  console.log("Processing OCR pages...");
  const limit = pLimit(2);
  const pageJobs = ocrResponse.pages.map((page, pageNumber) =>
    limit(async () => {
      // Log the position only: the page object carries base64 image data.
      console.log("Processing page:", pageNumber);
      const response = await client.chat.complete({
        model: "mistral-small-latest",
        messages: [
          {
            role: "system",
            content: [{ type: "text", text: PROCESSING_PROMPT }],
          },
          { role: "user", content: [{ type: "text", text: page.markdown }] },
        ],
      });

      const raw = response.choices?.[0]?.message.content;
      if (typeof raw !== "string") {
        throw new Error(`Model returned no text for page ${pageNumber}`);
      }

      const { content, citationsStr } = splitPageOutput(raw);
      console.log("Citations string:", citationsStr);
      return {
        ...page,
        markdown: content,
        citations: getCitations(citationsStr),
      };
    })
  );
  return Promise.all(pageJobs);
}

/**
 * Route handler: OCR-processes a PDF and stores the result on its `documents`
 * row.
 *
 * Expects multipart form data with:
 * - `access_token`, `refresh_token`: Supabase session tokens (always required);
 * - `file`: the PDF to process (required for a new document);
 * - `id`: optional id of an existing document to reprocess; its PDF is then
 *   downloaded from storage instead of read from the form.
 *
 * Responses:
 * - 200 `{ message, results }` on success;
 * - 400 `{ error }` when a required field is missing;
 * - 500 `{ error }` for any other failure (authentication, storage, database,
 *   Mistral).
 *
 * While processing, the row has `is_processing: true`; the flag is cleared on
 * success and, best-effort, on failure.
 */
export async function POST(req: NextRequest): Promise<NextResponse> {
  console.log("Received POST request");
  // NOTE(review): this function is exported as the POST handler, so this branch
  // looks unreachable — a CORS preflight would need a separate exported OPTIONS
  // handler. Kept as-is; confirm and move if CORS support is actually required.
  if (req.method === "OPTIONS") {
    console.log("Handling OPTIONS request");
    return new NextResponse(null, {
      headers: {
        ...corsHeaders,
        "Access-Control-Allow-Methods": "POST, OPTIONS",
      },
    });
  }

  try {
    console.log("Parsing form data...");
    const formData = await req.formData();
    const accessToken = formData.get("access_token");
    const refreshToken = formData.get("refresh_token");
    const docIdEntry = formData.get("id");
    const fileEntry = formData.get("file");

    // Tokens must be checked before they are handed to setSession; otherwise a
    // missing token surfaces as a 500 instead of a 400.
    if (
      typeof accessToken !== "string" ||
      typeof refreshToken !== "string" ||
      accessToken === "" ||
      refreshToken === ""
    ) {
      console.error(MISSING_FIELDS_MESSAGE);
      return NextResponse.json({ error: MISSING_FIELDS_MESSAGE }, { status: 400 });
    }

    console.log("Creating Supabase client...");
    const supabase = await createClient();

    console.log("Authenticating user...");
    const {
      data: { user },
      error: sessionError,
    } = await supabase.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });
    if (sessionError) {
      console.error("Failed to set session:", sessionError.message);
      throw new Error("Failed to set session: " + sessionError.message);
    }
    if (!user) {
      console.error("User not authenticated");
      throw new Error("User not authenticated");
    }

    let uuid: string;
    let file: Blob;

    if (docIdEntry !== null) {
      console.log("Reprocessing document...");
      const docId = docIdEntry.toString();
      // Only the id is logged: the form data also carries the session tokens.
      console.log("Document ID:", docId);

      const { data: documentData, error: documentError } = await supabase
        .from("documents")
        .select("*")
        .eq("id", docId)
        .single();
      if (documentError) {
        console.error("Error fetching document record:", documentError);
        throw new Error("Document record fetch failed");
      }
      if (!documentData) {
        console.error("Document record not found.");
        throw new Error("Document record not found");
      }
      uuid = documentData.id;

      const { data: fileData, error: fileError } = await supabase.storage
        .from("documents")
        .download(`${user.id}/${uuid}.pdf`);
      if (fileError) {
        console.error("Error downloading file from storage:", fileError);
        throw new Error("File download failed");
      }
      console.log("File downloaded from storage:", fileData);
      file = fileData;

      // Mark the row only once the PDF is in hand, so a failed download does
      // not leave it flagged as processing.
      const { error: docError } = await supabase
        .from("documents")
        .update({
          is_processing: true,
        })
        .eq("id", uuid);
      if (docError) {
        console.error("Error updating document record:", docError);
        throw new Error("Document record update failed");
      }
      console.log("Document record updated successfully.");
    } else {
      // A text field named "file" is not an upload.
      if (fileEntry === null || typeof fileEntry === "string") {
        console.error(MISSING_FIELDS_MESSAGE);
        return NextResponse.json(
          { error: MISSING_FIELDS_MESSAGE },
          { status: 400 }
        );
      }

      uuid = crypto.randomUUID();
      console.log("Generated UUID for file:", uuid);

      console.log("Uploading file to Supabase storage...");
      const { data: storageData, error: storageError } = await supabase.storage
        .from("documents")
        .upload(`${user.id}/${uuid}.pdf`, fileEntry);
      if (storageError) {
        console.error("Failed to upload file:", storageError.message);
        throw new Error("Failed to upload file: " + storageError.message);
      }

      console.log("Inserting document record...");
      const { error: docError } = await supabase.from("documents").insert({
        id: uuid,
        file_name: fileEntry.name,
        owner: user.id,
        raw_file: storageData.id,
        is_processing: true,
      });
      if (docError) {
        console.error("Failed to insert document record:", docError.message);
        throw new Error(
          "Failed to insert document record: " + docError.message
        );
      }
      file = fileEntry;
    }

    // From here on the row is flagged is_processing: true; any failure must
    // clear the flag or the document stays "processing" forever.
    try {
      const results = await runOcrPipeline(file, uuid);

      console.log("Updating document record with OCR data...");
      const { error: updateError } = await supabase
        .from("documents")
        .update({ ocr_data: results, is_processing: false })
        .eq("id", uuid);
      if (updateError) {
        console.error("Failed to update document record:", updateError.message);
        throw new Error(
          "Failed to update document record: " + updateError.message
        );
      }

      console.log("Document processed successfully");
      return NextResponse.json({
        message: "Document processed successfully",
        results,
      });
    } catch (processingError: unknown) {
      // Best-effort cleanup; the original failure is what gets reported.
      try {
        const { error: resetError } = await supabase
          .from("documents")
          .update({ is_processing: false })
          .eq("id", uuid);
        if (resetError) {
          console.error("Failed to clear is_processing flag:", resetError);
        }
      } catch (resetFailure: unknown) {
        console.error("Failed to clear is_processing flag:", resetFailure);
      }
      throw processingError;
    }
  } catch (error: unknown) {
    console.error("Error processing document:", error);
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}