Refactor document page processing and enhance upload functionality with improved token handling and error messaging
This commit is contained in:
parent
e84e5cc477
commit
2a9139744a
@ -64,9 +64,7 @@ export default async function DocumentPage(props: { params: { id: string } }) {
|
||||
return <div>Error loading documents.</div>;
|
||||
}
|
||||
|
||||
const pages = (document.ocr_data as any).pages.map(
|
||||
(page: any) => page.markdown
|
||||
);
|
||||
const pages = (document.ocr_data as any).map((page: any) => page.markdown);
|
||||
|
||||
const processedContent = await remark()
|
||||
.use(remarkHtml)
|
||||
@ -110,9 +108,11 @@ export default async function DocumentPage(props: { params: { id: string } }) {
|
||||
text-white
|
||||
prose-h1:font-semibold prose-h1:text-2xl prose-h1:mb-4 prose-h1:text-white
|
||||
prose-h2:font-medium prose-h2:text-xl prose-h2:mb-3 prose-h2:text-white
|
||||
prose-h3:font-medium prose-h3:text-lg prose-h3:mb-2 prose-h3:text-gray-300
|
||||
prose-h4:font-medium prose-h4:text-lg prose-h4:mb-2 prose-h4:text-gray-300
|
||||
prose-a:text-blue-400 hover:prose-a:underline
|
||||
prose-p:leading-7 prose-p:text-gray-200
|
||||
prose-strong:text-gray-200 prose-strong:font-semibold
|
||||
prose-blockquote:italic prose-blockquote:border-l-4 prose-blockquote:pl-4 prose-blockquote:border-gray-600 prose-blockquote:text-gray-300
|
||||
prose-code:bg-gray-800 prose-code:rounded prose-code:px-1 prose-code:py-0.5 prose-code:text-gray-200
|
||||
prose-img:rounded-lg prose-img:shadow-sm"
|
||||
|
@ -24,7 +24,8 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
|
||||
|
||||
const body = new FormData();
|
||||
body.append("file", file);
|
||||
body.append("jwt", data.session?.access_token || "");
|
||||
body.append("access_token", data.session?.access_token || "");
|
||||
body.append("refresh_token", data.session?.refresh_token || "");
|
||||
|
||||
const edgeFunctionUrl = `${process.env.NEXT_PUBLIC_SUPABASE_URL}/functions/v1/process-document`;
|
||||
|
||||
@ -34,29 +35,23 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
|
||||
headers: {
|
||||
apikey: process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY!,
|
||||
Authorization: `Bearer ${process.env.NEXT_PUBLIC_SUPABASE_ANON_KEY}`,
|
||||
"Content-Type": "application/json",
|
||||
// "Content-Type": "multipart/form-data",
|
||||
},
|
||||
});
|
||||
|
||||
eventSource.onmessage = (event) => {
|
||||
const data = JSON.parse(event.data);
|
||||
console.log("SSE Message:", data);
|
||||
|
||||
if (data.message) {
|
||||
setStatus(data.message);
|
||||
}
|
||||
};
|
||||
|
||||
eventSource.addEventListener("status", (event) => {
|
||||
const data = JSON.parse(event.data);
|
||||
console.log("Status Event:", data);
|
||||
supabase.auth.setSession;
|
||||
|
||||
setStatus(data.message);
|
||||
});
|
||||
|
||||
eventSource.addEventListener("error", (event) => {
|
||||
console.error("SSE Error:", event);
|
||||
toast.error("An error occurred while processing the document.");
|
||||
toast.error("An error occurred while processing the document.", {
|
||||
description: event.data || "Unknown error",
|
||||
});
|
||||
setUploading(false);
|
||||
eventSource.close();
|
||||
});
|
||||
@ -69,11 +64,11 @@ export default function UploadZone({ user }: { user?: { id: string } }) {
|
||||
eventSource.close();
|
||||
});
|
||||
|
||||
// Invoke the serverless function
|
||||
supabase.functions.invoke("process-document", {
|
||||
body,
|
||||
method: "POST",
|
||||
});
|
||||
// // Invoke the serverless function
|
||||
// supabase.functions.invoke("process-document", {
|
||||
// body,
|
||||
// method: "POST",
|
||||
// });
|
||||
|
||||
toast.info(
|
||||
"Document is being processed in the background. You will be notified when it's ready."
|
||||
|
@ -2,13 +2,11 @@ import "jsr:@supabase/functions-js/edge-runtime.d.ts";
|
||||
import { createClient } from "jsr:@supabase/supabase-js@2";
|
||||
import { Mistral } from "npm:@mistralai/mistralai";
|
||||
import pLimit from "npm:p-limit";
|
||||
|
||||
export const corsHeaders = {
|
||||
"Access-Control-Allow-Origin": "*",
|
||||
"Access-Control-Allow-Headers":
|
||||
"authorization, x-client-info, apikey, content-type",
|
||||
};
|
||||
|
||||
const apiKey = Deno.env.get("MISTRAL_API_KEY");
|
||||
const client = new Mistral({
|
||||
apiKey: apiKey,
|
||||
@ -20,19 +18,34 @@ The textual page data should only be returned in valid Markdown format. Use prop
|
||||
Any images should be included.
|
||||
Do not return the Markdown as a code block, only as a raw string, without any new lines.
|
||||
|
||||
No data or information should ever be removed, it should only be processed and formatted.
|
||||
|
||||
There are in-text citations/references in the text, remove them from the text and put them into an object where the key is the reference number and the value is the text.
|
||||
|
||||
The Markdown should be human-readable and well-formatted.
|
||||
|
||||
Return the final result as a JSON object with the following structure:
|
||||
{
|
||||
"markdown": "<processed_markdown>",
|
||||
"citations": {
|
||||
"1": "<citation_text_1>",
|
||||
"2": "<citation_text_2>"
|
||||
}
|
||||
}
|
||||
`;
|
||||
|
||||
Deno.serve(async (req) => {
|
||||
console.log("Request received:", req.method);
|
||||
|
||||
if (req.method === "OPTIONS") {
|
||||
return new Response("ok", { headers: corsHeaders });
|
||||
console.log("Handling OPTIONS request...");
|
||||
return new Response(null, {
|
||||
headers: {
|
||||
...corsHeaders,
|
||||
"Access-Control-Allow-Methods": "POST, OPTIONS",
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
if (req.method === "POST") {
|
||||
console.log("Processing POST request...");
|
||||
|
||||
const { body, writable } = new TransformStream();
|
||||
const writer = writable.getWriter();
|
||||
|
||||
@ -44,14 +57,30 @@ Deno.serve(async (req) => {
|
||||
...corsHeaders,
|
||||
});
|
||||
|
||||
const sendEvent = async (event: string, data: any) => {
|
||||
let activeOperations = 0; // Track active operations
|
||||
let streamClosed = false; // Track if the stream is closed
|
||||
|
||||
const sendEvent = async (event, data) => {
|
||||
if (streamClosed) {
|
||||
console.warn("Attempted to write to a closed stream.");
|
||||
return;
|
||||
}
|
||||
const message = `event: ${event}\ndata: ${JSON.stringify(data)}\n\n`;
|
||||
console.log("Sending event:", message);
|
||||
await writer.write(new TextEncoder().encode(message));
|
||||
try {
|
||||
activeOperations++;
|
||||
await writer.write(new TextEncoder().encode(message));
|
||||
} catch (error) {
|
||||
console.error("Error writing to stream:", error);
|
||||
} finally {
|
||||
activeOperations--;
|
||||
}
|
||||
};
|
||||
|
||||
// Start streaming updates
|
||||
sendEvent("status", { message: "Initializing..." });
|
||||
sendEvent("status", {
|
||||
message: "Initializing...",
|
||||
});
|
||||
|
||||
try {
|
||||
const supabase = createClient(
|
||||
@ -61,20 +90,43 @@ Deno.serve(async (req) => {
|
||||
|
||||
const formData = await req.formData();
|
||||
const file = formData.get("file");
|
||||
const jwt = formData.get("jwt");
|
||||
const accessToken = formData.get("access_token");
|
||||
const refreshToken = formData.get("refresh_token");
|
||||
const fileName = file.name;
|
||||
const uuid = crypto.randomUUID();
|
||||
|
||||
console.log("Generated UUID:", uuid);
|
||||
sendEvent("status", { message: "Generated UUID", uuid });
|
||||
const {
|
||||
data: { user },
|
||||
error: sessionError,
|
||||
} = await supabase.auth.setSession({
|
||||
access_token: accessToken,
|
||||
refresh_token: refreshToken,
|
||||
});
|
||||
|
||||
if (sessionError) {
|
||||
console.error("Error setting session:", sessionError);
|
||||
sendEvent("error", {
|
||||
message: "Error setting session",
|
||||
error: sessionError,
|
||||
});
|
||||
throw new Error("Setting session failed");
|
||||
}
|
||||
|
||||
console.log("Generated UUID:", uuid);
|
||||
sendEvent("status", {
|
||||
message: "Generated UUID",
|
||||
uuid,
|
||||
});
|
||||
|
||||
const user = await supabase.auth.getUser(jwt);
|
||||
console.log("Authenticated user:", user);
|
||||
sendEvent("status", { message: "Authenticated user", user });
|
||||
sendEvent("status", {
|
||||
message: "Authenticated user",
|
||||
user,
|
||||
});
|
||||
|
||||
const { data: storageData, error: storageError } = await supabase.storage
|
||||
.from("documents")
|
||||
.upload(`${user!.id}/${uuid}.pdf`, file);
|
||||
.upload(`${user.id}/${uuid}.pdf`, file);
|
||||
|
||||
if (storageError) {
|
||||
console.error("Error uploading file to storage:", storageError);
|
||||
@ -83,18 +135,18 @@ Deno.serve(async (req) => {
|
||||
error: storageError,
|
||||
});
|
||||
throw new Error("File upload failed");
|
||||
} else {
|
||||
console.log("File uploaded to storage:", storageData);
|
||||
sendEvent("status", {
|
||||
message: "File uploaded to storage",
|
||||
storageData,
|
||||
});
|
||||
}
|
||||
|
||||
console.log("File uploaded to storage:", storageData);
|
||||
sendEvent("status", {
|
||||
message: "File uploaded to storage",
|
||||
storageData,
|
||||
});
|
||||
|
||||
const { error: docError } = await supabase.from("documents").insert({
|
||||
id: uuid,
|
||||
file_name: file.name,
|
||||
owner: user!.id,
|
||||
owner: user.id,
|
||||
raw_file: storageData.id,
|
||||
is_processing: true,
|
||||
});
|
||||
@ -106,15 +158,17 @@ Deno.serve(async (req) => {
|
||||
error: docError,
|
||||
});
|
||||
throw new Error("Document record insertion failed");
|
||||
} else {
|
||||
console.log("Document record inserted successfully.");
|
||||
sendEvent("status", {
|
||||
message: "Document record inserted successfully",
|
||||
});
|
||||
}
|
||||
|
||||
console.log("Document record inserted successfully.");
|
||||
sendEvent("status", {
|
||||
message: "Document record inserted successfully",
|
||||
});
|
||||
|
||||
console.log("Uploading file to Mistral...");
|
||||
sendEvent("status", { message: "Uploading file to Mistral..." });
|
||||
sendEvent("status", {
|
||||
message: "Uploading file to Mistral...",
|
||||
});
|
||||
|
||||
const uploaded_pdf = await client.files.upload({
|
||||
file: {
|
||||
@ -123,6 +177,7 @@ Deno.serve(async (req) => {
|
||||
},
|
||||
purpose: "ocr",
|
||||
});
|
||||
|
||||
console.log("File uploaded to Mistral:", uploaded_pdf);
|
||||
sendEvent("status", {
|
||||
message: "File uploaded to Mistral",
|
||||
@ -132,11 +187,17 @@ Deno.serve(async (req) => {
|
||||
const signedUrl = await client.files.getSignedUrl({
|
||||
fileId: uploaded_pdf.id,
|
||||
});
|
||||
|
||||
console.log("Generated signed URL:", signedUrl);
|
||||
sendEvent("status", { message: "Generated signed URL", signedUrl });
|
||||
sendEvent("status", {
|
||||
message: "Generated signed URL",
|
||||
signedUrl,
|
||||
});
|
||||
|
||||
console.log("Processing OCR...");
|
||||
sendEvent("status", { message: "Processing OCR..." });
|
||||
sendEvent("status", {
|
||||
message: "Processing OCR...",
|
||||
});
|
||||
|
||||
const ocrResponse = await client.ocr.process({
|
||||
model: "mistral-ocr-latest",
|
||||
@ -145,15 +206,21 @@ Deno.serve(async (req) => {
|
||||
documentUrl: signedUrl.url,
|
||||
},
|
||||
});
|
||||
console.log("OCR response received:", ocrResponse);
|
||||
sendEvent("status", { message: "OCR response received", ocrResponse });
|
||||
|
||||
const limit = pLimit(1);
|
||||
console.log("OCR response received:", ocrResponse);
|
||||
sendEvent("status", {
|
||||
message: "OCR response received",
|
||||
ocrResponse,
|
||||
});
|
||||
|
||||
const limit = pLimit(2);
|
||||
const promises = [];
|
||||
|
||||
for (const page of ocrResponse.pages) {
|
||||
console.log("Processing page:", page.index);
|
||||
sendEvent("status", { message: `Processing page ${page.index}` });
|
||||
sendEvent("status", {
|
||||
message: `Processing page ${page.index}`,
|
||||
});
|
||||
|
||||
const pagePromise = limit(async () => {
|
||||
console.log(`Processing page ${page.index} with Mistral...`);
|
||||
@ -162,7 +229,7 @@ Deno.serve(async (req) => {
|
||||
model: "mistral-small-latest",
|
||||
messages: [
|
||||
{
|
||||
role: "user",
|
||||
role: "system",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
@ -170,6 +237,15 @@ Deno.serve(async (req) => {
|
||||
},
|
||||
],
|
||||
},
|
||||
{
|
||||
role: "user",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: page.markdown,
|
||||
},
|
||||
],
|
||||
},
|
||||
],
|
||||
});
|
||||
|
||||
@ -200,17 +276,21 @@ Deno.serve(async (req) => {
|
||||
}
|
||||
|
||||
if (response.choices[0].message.content) {
|
||||
const markdownResponse = JSON.parse(
|
||||
response.choices[0].message.content.toString()
|
||||
);
|
||||
const citations = markdownResponse.citations;
|
||||
const markdown = markdownResponse.markdown;
|
||||
console.log("Generating Markdown for page:", page.index);
|
||||
sendEvent("status", {
|
||||
message: `Generating Markdown for page ${page.index}`,
|
||||
});
|
||||
const markdown = replaceImagesInMarkdown(
|
||||
response.choices[0].message.content.toString(),
|
||||
imageData
|
||||
);
|
||||
const markdown = replaceImagesInMarkdown(markdown, imageData);
|
||||
|
||||
return {
|
||||
...page,
|
||||
markdown,
|
||||
citations,
|
||||
};
|
||||
} else {
|
||||
console.error("Message content is undefined for page:", page.index);
|
||||
@ -227,9 +307,14 @@ Deno.serve(async (req) => {
|
||||
sendEvent("status", {
|
||||
message: "Waiting for all pages to be processed...",
|
||||
});
|
||||
|
||||
const results = await Promise.all(promises);
|
||||
|
||||
console.log("All pages processed. Results:", results);
|
||||
sendEvent("status", { message: "All pages processed", results });
|
||||
sendEvent("status", {
|
||||
message: "All pages processed",
|
||||
results,
|
||||
});
|
||||
|
||||
const sortedResults = results.sort((a, b) => a.index - b.index);
|
||||
console.log("Sorted results:", sortedResults);
|
||||
@ -252,18 +337,27 @@ Deno.serve(async (req) => {
|
||||
throw new Error("Document record update failed");
|
||||
}
|
||||
|
||||
console.log("Document record updated successfully.");
|
||||
sendEvent("status", { message: "Document record updated successfully" });
|
||||
sendEvent("status", { completed: true, uuid });
|
||||
console.log("Closing SSE stream...");
|
||||
} catch (error) {
|
||||
console.error("Error during processing:", error);
|
||||
sendEvent("error", { message: "Error during processing", error });
|
||||
sendEvent("error", {
|
||||
message: "Error during processing",
|
||||
error,
|
||||
});
|
||||
} finally {
|
||||
console.log("Closing SSE stream...");
|
||||
await writer.close();
|
||||
// Wait for all active operations to complete before closing the stream
|
||||
const interval = setInterval(() => {
|
||||
if (activeOperations === 0) {
|
||||
clearInterval(interval);
|
||||
streamClosed = true;
|
||||
writer.close();
|
||||
}
|
||||
}, 100); // Check every 100ms
|
||||
}
|
||||
|
||||
return new Response(body, { headers });
|
||||
return new Response(body, {
|
||||
headers,
|
||||
});
|
||||
}
|
||||
|
||||
console.error("Method not allowed:", req.method);
|
||||
@ -271,7 +365,6 @@ Deno.serve(async (req) => {
|
||||
status: 405,
|
||||
});
|
||||
});
|
||||
|
||||
function replaceImagesInMarkdown(markdownStr, imagesDict) {
|
||||
console.log("Replacing images in Markdown...");
|
||||
for (const [imgName, base64Str] of Object.entries(imagesDict)) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user