// neuroread/app/api/process-document/route.ts

import { NextRequest, NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import pLimit from "p-limit";
import { createClient } from "@/utils/supabase/server";

// CORS header set for this route: allows any origin and the listed request
// headers. The handler spreads it into the response it builds for OPTIONS.
const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers":
    "authorization, x-client-info, apikey, content-type",
};

// Mistral API key read from the environment. The `!` only silences the type
// checker; nothing here verifies at runtime that the variable is actually set.
const apiKey = process.env.MISTRAL_API_KEY!;
// Module-level Mistral SDK client, used by the POST handler for file upload,
// OCR and chat completion.
const client = new Mistral({ apiKey });

// System prompt sent alongside each OCR page's Markdown in the POST handler.
// It asks the model to reply with cleaned Markdown, then a line of nine
// dashes, then a JSON object holding a `citations` list; the handler splits
// the reply on that dash separator.
// NOTE(review): the prose asks for "an object where the key is the reference
// number" while the example shows an array of { number, text } entries, and
// the example JSON contains `//` comments that JSON.parse rejects — confirm
// which shape the model actually returns.
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.

The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.**
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.

No data or information should ever be removed, it should only be processed and formatted.

There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into an object where the key is the reference number and the value is the text. If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.

The Markdown should be human-readable and well-formatted. The markdown string should properly sanitized and should not break a JSON parser when returned as the final format.

Return the final result as a text object with the following structure (without code block formatting):

"""
<processed markdown text>

---------

{
  "citations": [
    {
      "number": 1, // The number as it appears in the text
      "text": "Citation text 1" // Ensure any JSON-breaking characters are properly escaped
    },
    {
      "number": 2,
      "text": "Citation text 2"
    }
  ]
}
"""

Do not return the text object as a code block, only as a raw string.
`;

/** One extracted reference: the number shown in the text and the citation body. */
type Citation = { number: number; text: string };

/**
 * Builds a Citation from loosely-typed parts, or returns null when the parts
 * cannot form one. Numeric strings are accepted for the number because object
 * keys always arrive as strings.
 */
function normalizeCitation(number: unknown, text: unknown): Citation | null {
  const numeric =
    typeof number === "string" && number.trim() !== ""
      ? Number(number)
      : number;
  if (
    typeof numeric !== "number" ||
    !Number.isFinite(numeric) ||
    typeof text !== "string"
  ) {
    return null;
  }
  return { number: numeric, text };
}

/**
 * Parses the citations payload found after the separator in the model's reply.
 *
 * Accepts both shapes the prompt describes: `{ "citations": [{ number, text }] }`
 * and `{ "citations": { "<number>": "<text>" } }`. Entries that do not have a
 * finite number and a string text are dropped.
 *
 * @param citationsStr Raw text expected to contain a JSON object.
 * @returns The normalized citations, or an empty array when the input has no
 *          JSON object, fails to parse, or has no usable `citations` value.
 *          Never throws, so one malformed page cannot fail the whole document.
 */
async function getCitations(citationsStr: string): Promise<Citation[]> {
  // Parse only the outermost {...} span so wrappers around the JSON
  // (code fences, quote delimiters, stray dashes) do not break parsing.
  const start = citationsStr.indexOf("{");
  const end = citationsStr.lastIndexOf("}");
  if (start === -1 || end <= start) {
    return [];
  }

  try {
    const parsed: unknown = JSON.parse(citationsStr.slice(start, end + 1));
    if (parsed === null || typeof parsed !== "object") {
      return [];
    }

    const raw = (parsed as { citations?: unknown }).citations;
    let candidates: Array<Citation | null> = [];

    if (Array.isArray(raw)) {
      candidates = raw.map((entry: unknown) =>
        entry !== null && typeof entry === "object"
          ? normalizeCitation(
              (entry as { number?: unknown }).number,
              (entry as { text?: unknown }).text
            )
          : null
      );
    } else if (raw !== null && typeof raw === "object") {
      candidates = Object.entries(raw).map(([key, value]) =>
        normalizeCitation(key, value)
      );
    }

    return candidates.filter((c): c is Citation => c !== null);
  } catch (error) {
    console.error("Failed to parse citations JSON:", error);
    return [];
  }
}

/** Delimiter the prompt tells the model to place between the Markdown and the citations JSON. */
const MODEL_OUTPUT_SEPARATOR = "---------";

/** Builds a JSON response that also carries this route's CORS headers. */
function jsonWithCors(body: unknown, status = 200): NextResponse {
  return NextResponse.json(body, { status, headers: corsHeaders });
}

/**
 * Flattens a chat message's content to plain text. A string is returned as-is;
 * for an array, the string `text` field of each element is concatenated and
 * other elements are skipped; anything else (null/undefined) yields "".
 */
function messageContentToText(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (Array.isArray(content)) {
    return content
      .map((chunk: unknown) =>
        chunk !== null &&
        typeof chunk === "object" &&
        typeof (chunk as { text?: unknown }).text === "string"
          ? (chunk as { text: string }).text
          : ""
      )
      .join("");
  }
  return "";
}

/**
 * Splits the model's reply into the Markdown part and the citations part.
 * The LAST separator is used, so a run of dashes inside the Markdown itself
 * (e.g. a table rule) neither truncates the page nor ends up in the JSON.
 */
function splitModelOutput(raw: string): {
  content: string;
  citationsStr: string;
} {
  const index = raw.lastIndexOf(MODEL_OUTPUT_SEPARATOR);
  if (index === -1) {
    return { content: raw.trim(), citationsStr: "{}" };
  }
  return {
    content: raw.slice(0, index).trim(),
    citationsStr:
      raw.slice(index + MODEL_OUTPUT_SEPARATOR.length).trim() || "{}",
  };
}

/**
 * CORS preflight handler. Route handlers are dispatched by exported method
 * name, so a preflight request never reaches POST and needs its own export.
 */
export async function OPTIONS() {
  return new NextResponse(null, {
    headers: {
      ...corsHeaders,
      "Access-Control-Allow-Methods": "POST, OPTIONS",
    },
  });
}

/**
 * Accepts a form upload with `file`, `access_token` and `refresh_token`,
 * stores the file in Supabase, runs Mistral OCR on it, cleans every page with
 * a chat model and saves the per-page results on the document record.
 *
 * @param req Incoming request whose body is form data.
 * @returns 200 with `{ message, results }` on success, 400 when the body or a
 *          required field is missing/invalid, 500 with `{ error }` when any
 *          later step fails. Every response includes the CORS headers.
 */
export async function POST(req: NextRequest) {
  let formData: FormData;
  try {
    formData = await req.formData();
  } catch {
    return jsonWithCors(
      { error: "Request body could not be parsed as form data" },
      400
    );
  }

  const file = formData.get("file");
  const accessToken = formData.get("access_token");
  const refreshToken = formData.get("refresh_token");

  // FormData values are File | string | null: the file must be a real upload
  // and both tokens must be non-empty strings.
  if (
    file === null ||
    typeof file === "string" ||
    typeof accessToken !== "string" ||
    !accessToken ||
    typeof refreshToken !== "string" ||
    !refreshToken
  ) {
    return jsonWithCors(
      {
        error: "Missing required fields: file, access_token, or refresh_token",
      },
      400
    );
  }

  const fileName = file.name;
  const uuid = crypto.randomUUID();

  try {
    const supabase = await createClient();

    // Authenticate the user
    const {
      data: { user },
      error: sessionError,
    } = await supabase.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });

    if (sessionError) {
      throw new Error("Failed to set session: " + sessionError.message);
    }

    if (!user) {
      throw new Error("User not authenticated");
    }

    // Upload the file to Supabase storage
    const { data: storageData, error: storageError } = await supabase.storage
      .from("documents")
      .upload(`${user.id}/${uuid}.pdf`, file);

    if (storageError) {
      throw new Error("Failed to upload file: " + storageError.message);
    }

    // Insert document record
    // NOTE(review): if a later step throws, this row keeps is_processing: true
    // — confirm whether failures should reset or flag the record.
    const { error: docError } = await supabase.from("documents").insert({
      id: uuid,
      file_name: fileName,
      owner: user.id,
      raw_file: storageData.id,
      is_processing: true,
    });

    if (docError) {
      throw new Error("Failed to insert document record: " + docError.message);
    }

    // Upload file to Mistral
    const uploadedPdf = await client.files.upload({
      file: { fileName, content: file },
      purpose: "ocr",
    });

    const signedUrl = await client.files.getSignedUrl({
      fileId: uploadedPdf.id,
    });

    // Process OCR
    const ocrResponse = await client.ocr.process({
      model: "mistral-ocr-latest",
      document: { type: "document_url", documentUrl: signedUrl.url },
    });

    // At most two chat completions run at the same time.
    const limit = pLimit(2);
    const promises = ocrResponse.pages.map((page) =>
      limit(async () => {
        const response = await client.chat.complete({
          model: "mistral-small-latest",
          messages: [
            {
              role: "system",
              content: [{ type: "text", text: PROCESSING_PROMPT }],
            },
            { role: "user", content: [{ type: "text", text: page.markdown }] },
          ],
        });

        const rawText = messageContentToText(
          response.choices?.[0]?.message?.content
        );
        const { content, citationsStr } = splitModelOutput(rawText);
        const citations = await getCitations(citationsStr);

        return {
          ...page,
          // Keep the OCR text when the model returned nothing, so page
          // content is never dropped.
          markdown: content || page.markdown,
          citations,
        };
      })
    );

    const results = await Promise.all(promises);

    // Update document record with OCR data
    const { error: updateError } = await supabase
      .from("documents")
      .update({ ocr_data: results, is_processing: false })
      .eq("id", uuid);

    if (updateError) {
      throw new Error(
        "Failed to update document record: " + updateError.message
      );
    }

    return jsonWithCors({
      message: "Document processed successfully",
      results,
    });
  } catch (error: unknown) {
    console.error("Error processing document:", error);
    const message = error instanceof Error ? error.message : String(error);
    return jsonWithCors({ error: message }, 500);
  }
}