310 lines
9.6 KiB
TypeScript
310 lines
9.6 KiB
TypeScript
import { NextRequest, NextResponse } from "next/server";
|
|
import { Mistral } from "@mistralai/mistralai";
|
|
import pLimit from "p-limit";
|
|
import { createClient } from "@/utils/supabase/server";
|
|
|
|
/** Request headers advertised as acceptable in the `Access-Control-Allow-Headers` value. */
const CORS_ALLOWED_REQUEST_HEADERS =
  "authorization, x-client-info, apikey, content-type";

/**
 * Base CORS headers shared by responses that need them.
 * Any origin is allowed; the permitted request headers are listed above.
 */
const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers": CORS_ALLOWED_REQUEST_HEADERS,
};
|
|
|
|
// The Mistral API key is read from the environment once, at module load.
// The non-null assertion only satisfies the type checker: if the variable is
// unset, `apiKey` is `undefined` at runtime.
const apiKey = process.env.MISTRAL_API_KEY!;

/** Mistral SDK client shared by every request handled by this module. */
const client = new Mistral({ apiKey: apiKey });
|
|
|
|
/**
 * System prompt sent with each OCR page.
 *
 * The model is asked to answer with cleaned Markdown, then a line of nine
 * hyphens, then a JSON object holding the extracted citations. The JSON
 * example inside the prompt is deliberately strict JSON (no comments) so that
 * a model which mirrors the example still produces parseable output.
 */
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.

The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** If there is a title to the document, it should be the first heading.
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.

No data or information should ever be removed, it should only be processed and formatted.

There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and list each one in the "citations" array shown below, with the reference number in "number" and the citation text in "text". (**Note that there may be multiple citations, usually split by commas. Ensure these are added too.**) If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.

The Markdown should be human-readable and well-formatted. The markdown string should be properly sanitized and should not break a JSON parser when returned as the final format.

Return the final result as a text object with the following structure (without code block formatting):

"""
<processed markdown text>

---------

{
  "citations": [
    {
      "number": 1,
      "text": "Citation text 1"
    },
    {
      "number": 2,
      "text": "Citation text 2"
    }
  ]
}
"""

"number" is the reference number exactly as it appears in the text. The part after the separator line must be strict JSON: no comments and no trailing commas. If the page has no citations, return an empty "citations" array. Use the separator line exactly once, between the Markdown and the JSON.

Do not return the text object as a code block, only as a raw string.
`;
|
|
|
|
/** Matches http(s) URLs up to the next whitespace character. */
const CITATION_URL_PATTERN = /(https?:\/\/[^\s]+)/g;

/**
 * Percent-encodes a URL without corrupting escapes it already contains.
 * `encodeURI` escapes "%" itself, so "%20" would become "%2520"; the second
 * replace restores every pre-existing two-digit escape.
 */
function encodeUrlPreservingEscapes(url: string): string {
  return encodeURI(url).replace(/%25([0-9A-Fa-f]{2})/g, "%$1");
}

/**
 * Parses the citations JSON emitted by the model and URL-encodes any links
 * found in each citation's text.
 *
 * @param citationsStr JSON text expected to look like
 *   `{ "citations": [{ "number": 1, "text": "..." }] }`. A surrounding
 *   Markdown code fence is tolerated.
 * @returns The citation objects with sanitized `text`. Returns an empty array
 *   when the input is not valid JSON or has no `citations` array. Entries that
 *   are not objects are dropped; entries without a string `text` are kept
 *   unchanged so one malformed entry does not discard the rest.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- citation objects pass through with whatever extra fields the model produced
function getCitations(citationsStr: string): Array<Record<string, any>> {
  try {
    console.log("Parsing citations string:", citationsStr);

    // Models sometimes wrap JSON in a code fence despite being told not to.
    const unfenced = citationsStr
      .trim()
      .replace(/^```[a-zA-Z]*\s*/, "")
      .replace(/\s*```$/, "");

    const parsed: unknown = JSON.parse(unfenced);
    const rawCitations = (parsed as { citations?: unknown } | null)?.citations;

    if (!Array.isArray(rawCitations)) {
      console.log("No citations array found.");
      return [];
    }

    console.log("Sanitizing citations...");
    const sanitizedCitations = rawCitations
      .filter(
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        (entry): entry is Record<string, any> =>
          typeof entry === "object" && entry !== null && !Array.isArray(entry)
      )
      .map((citation) =>
        typeof citation.text === "string"
          ? {
              ...citation,
              text: citation.text.replace(
                CITATION_URL_PATTERN,
                encodeUrlPreservingEscapes
              ),
            }
          : { ...citation }
      );

    console.log("Sanitized citations:", sanitizedCitations);
    return sanitizedCitations;
  } catch (err: unknown) {
    console.error("Error parsing or sanitizing citations:", err);
    return [];
  }
}
|
|
|
|
/** Separator the model is instructed to place between the Markdown and the citations JSON. */
const MODEL_OUTPUT_SEPARATOR = "---------";

/** Number of pages sent to the chat model at the same time. */
const PAGE_PROCESSING_CONCURRENCY = 2;

/** Error text returned when the request lacks a file or either session token. */
const MISSING_FIELDS_MESSAGE =
  "Missing required fields: file, access_token, or refresh_token";

/** Supabase client type as produced by the project's server-side factory. */
type SupabaseServerClient = Awaited<ReturnType<typeof createClient>>;

/**
 * Splits a model reply into its Markdown part and its citations JSON part.
 * The LAST separator is used, because the Markdown itself may contain long
 * hyphen runs (table rules, heading underlines) while the JSON comes last.
 * With no separator the whole reply is Markdown and the citations are "{}".
 */
function splitModelOutput(rawOutput: string): {
  markdown: string;
  citationsJson: string;
} {
  let separatorStart = rawOutput.lastIndexOf(MODEL_OUTPUT_SEPARATOR);
  if (separatorStart === -1) {
    return { markdown: rawOutput.trim(), citationsJson: "{}" };
  }

  const separatorEnd = separatorStart + MODEL_OUTPUT_SEPARATOR.length;
  // If the model wrote more hyphens than asked, swallow the whole run.
  while (separatorStart > 0 && rawOutput[separatorStart - 1] === "-") {
    separatorStart -= 1;
  }

  return {
    markdown: rawOutput.slice(0, separatorStart).trim(),
    citationsJson: rawOutput.slice(separatorEnd).trim() || "{}",
  };
}

/**
 * Best-effort reset of `is_processing` after a failed run, so the document is
 * not left marked as processing. Never throws; failures are only logged.
 */
async function clearProcessingFlag(
  supabase: SupabaseServerClient,
  documentId: string
): Promise<void> {
  try {
    const { error } = await supabase
      .from("documents")
      .update({ is_processing: false })
      .eq("id", documentId);

    if (error) {
      console.error("Failed to clear processing flag:", error.message);
    }
  } catch (cleanupError: unknown) {
    console.error("Failed to clear processing flag:", cleanupError);
  }
}

/**
 * Handles a document upload or a reprocessing request.
 *
 * Expected multipart form fields:
 * - `access_token`, `refresh_token`: Supabase session tokens (always required).
 * - `file`: the PDF to process (required unless `id` is given).
 * - `id`: id of an existing document to reprocess; its PDF is downloaded from
 *   the `documents` storage bucket instead of being uploaded.
 *
 * The PDF is run through Mistral OCR, each page is cleaned up by the chat
 * model, and the per-page results are stored in the document's `ocr_data`.
 *
 * Responses: 200 with `{ message, results }`; 400 for missing or invalid
 * fields; 401 when the session cannot be established; 500 with `{ error }`
 * for any other failure. On failure the document's `is_processing` flag is
 * reset if this request had set it.
 */
export async function POST(req: NextRequest): Promise<NextResponse> {
  console.log("Received POST request");

  // NOTE(review): Next.js normally dispatches OPTIONS requests to a separately
  // exported OPTIONS handler, so this branch is presumably never reached from
  // the router — confirm, and consider exporting OPTIONS instead.
  if (req.method === "OPTIONS") {
    console.log("Handling OPTIONS request");
    return new NextResponse(null, {
      headers: {
        ...corsHeaders,
        "Access-Control-Allow-Methods": "POST, OPTIONS",
      },
    });
  }

  // Kept outside the try block so the catch handler can undo the flag.
  let supabase: SupabaseServerClient | undefined;
  let flaggedDocumentId: string | undefined;

  try {
    console.log("Parsing form data...");
    const formData = await req.formData();
    const accessToken = formData.get("access_token");
    const refreshToken = formData.get("refresh_token");
    const docIdEntry = formData.get("id");
    const uploadedEntry = formData.get("file");

    // The tokens are needed for authentication on every path, so check them
    // before they are used rather than after.
    if (
      typeof accessToken !== "string" ||
      accessToken === "" ||
      typeof refreshToken !== "string" ||
      refreshToken === ""
    ) {
      console.error(MISSING_FIELDS_MESSAGE);
      return NextResponse.json(
        { error: MISSING_FIELDS_MESSAGE },
        { status: 400 }
      );
    }

    console.log("Creating Supabase client...");
    const db = await createClient();
    supabase = db;

    console.log("Authenticating user...");
    const {
      data: { user },
      error: sessionError,
    } = await db.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });

    if (sessionError) {
      console.error("Failed to set session:", sessionError.message);
      return NextResponse.json(
        { error: "Failed to set session: " + sessionError.message },
        { status: 401 }
      );
    }

    if (!user) {
      console.error("User not authenticated");
      return NextResponse.json(
        { error: "User not authenticated" },
        { status: 401 }
      );
    }

    let documentId: string;
    let pdf: Blob;

    if (docIdEntry !== null) {
      console.log("Reprocessing document...");

      if (typeof docIdEntry !== "string" || docIdEntry === "") {
        console.error("Invalid document id");
        return NextResponse.json(
          { error: "Invalid document id" },
          { status: 400 }
        );
      }

      // Do not log the form data here: it contains the session tokens.
      console.log("Document ID:", docIdEntry);

      // Scope the lookup to the caller so another user's document can never
      // be flagged as processing by this request.
      const { data: documentData, error: documentError } = await db
        .from("documents")
        .select("*")
        .eq("id", docIdEntry)
        .eq("owner", user.id)
        .single();

      if (documentError) {
        console.error("Error fetching document record:", documentError);
        throw new Error("Document record fetch failed");
      }

      if (!documentData) {
        console.error("Document record not found.");
        throw new Error("Document record not found");
      }

      documentId = documentData.id;

      // Download before flagging, so a missing file does not leave the
      // document marked as processing.
      const { data: fileData, error: fileError } = await db.storage
        .from("documents")
        .download(`${user.id}/${documentId}.pdf`);

      if (fileError) {
        console.error("Error downloading file from storage:", fileError);
        throw new Error("File download failed");
      }

      console.log("File downloaded from storage.");
      pdf = fileData;

      const { error: docError } = await db
        .from("documents")
        .update({ is_processing: true })
        .eq("id", documentId);

      if (docError) {
        console.error("Error updating document record:", docError);
        throw new Error("Document record update failed");
      }

      flaggedDocumentId = documentId;
      console.log("Document record updated successfully.");
    } else {
      // A plain string field is not an uploaded file.
      if (uploadedEntry === null || typeof uploadedEntry === "string") {
        console.error(MISSING_FIELDS_MESSAGE);
        return NextResponse.json(
          { error: MISSING_FIELDS_MESSAGE },
          { status: 400 }
        );
      }

      documentId = crypto.randomUUID();
      pdf = uploadedEntry;
      console.log("Generated UUID for file:", documentId);

      console.log("Uploading file to Supabase storage...");
      const { data: storageData, error: storageError } = await db.storage
        .from("documents")
        .upload(`${user.id}/${documentId}.pdf`, uploadedEntry);

      if (storageError) {
        console.error("Failed to upload file:", storageError.message);
        throw new Error("Failed to upload file: " + storageError.message);
      }

      console.log("Inserting document record...");
      const { error: docError } = await db.from("documents").insert({
        id: documentId,
        file_name: uploadedEntry.name,
        owner: user.id,
        raw_file: storageData.id,
        is_processing: true,
      });

      if (docError) {
        console.error("Failed to insert document record:", docError.message);
        throw new Error(
          "Failed to insert document record: " + docError.message
        );
      }

      flaggedDocumentId = documentId;
    }

    console.log("Uploading file to Mistral...");
    const uploadedPdf = await client.files.upload({
      file: { fileName: `${documentId}.pdf`, content: pdf },
      purpose: "ocr",
    });

    console.log("Getting signed URL from Mistral...");
    const signedUrl = await client.files.getSignedUrl({
      fileId: uploadedPdf.id,
    });

    console.log("Processing OCR...");
    const ocrResponse = await client.ocr.process({
      model: "mistral-ocr-latest",
      document: { type: "document_url", documentUrl: signedUrl.url },
      includeImageBase64: true,
    });

    console.log("Processing OCR pages...");
    const limit = pLimit(PAGE_PROCESSING_CONCURRENCY);
    const results = await Promise.all(
      ocrResponse.pages.map((page) =>
        limit(async () => {
          // Log only the index; the page object carries base64 image data.
          console.log("Processing page:", page.index);

          const response = await client.chat.complete({
            model: "mistral-small-latest",
            messages: [
              {
                role: "system",
                content: [{ type: "text", text: PROCESSING_PROMPT }],
              },
              {
                role: "user",
                content: [{ type: "text", text: page.markdown }],
              },
            ],
          });

          const rawOutput = response.choices?.[0]?.message?.content;
          const { markdown, citationsJson } =
            typeof rawOutput === "string"
              ? splitModelOutput(rawOutput)
              : { markdown: "", citationsJson: "{}" };
          console.log("Citations string:", citationsJson);

          return {
            ...page,
            // Keep the raw OCR text when the model returned nothing usable,
            // rather than blanking the page.
            markdown: markdown || page.markdown,
            citations: getCitations(citationsJson),
          };
        })
      )
    );

    console.log("Updating document record with OCR data...");
    const { error: updateError } = await db
      .from("documents")
      .update({ ocr_data: results, is_processing: false })
      .eq("id", documentId);

    if (updateError) {
      console.error("Failed to update document record:", updateError.message);
      throw new Error(
        "Failed to update document record: " + updateError.message
      );
    }

    console.log("Document processed successfully");
    return NextResponse.json({
      message: "Document processed successfully",
      results,
    });
  } catch (error: unknown) {
    console.error("Error processing document:", error);

    if (supabase && flaggedDocumentId) {
      await clearProcessingFlag(supabase, flaggedDocumentId);
    }

    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
|