// neuroread/app/api/process-document/route.ts

import { NextRequest, NextResponse } from "next/server";
import { Mistral } from "@mistralai/mistralai";
import pLimit from "p-limit";
import { createClient } from "@/utils/supabase/server";

/** Request headers that cross-origin callers are allowed to send. */
const ALLOWED_REQUEST_HEADERS =
  "authorization, x-client-info, apikey, content-type";

/**
 * Base CORS headers: any origin is accepted and the request headers listed
 * above are allowed. The allowed methods are added by the code that builds
 * the actual response.
 */
const corsHeaders = {
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Headers": ALLOWED_REQUEST_HEADERS,
};

// Mistral API key read from the MISTRAL_API_KEY environment variable. The
// non-null assertion only satisfies the type checker: nothing here verifies
// that the variable is actually set.
const apiKey = process.env.MISTRAL_API_KEY!;
// Single Mistral SDK client created at module load and shared by the handler
// below for file upload, OCR and chat completion calls.
const client = new Mistral({ apiKey });

/**
 * System prompt sent with every OCR page.
 *
 * The model is asked to answer with the cleaned-up Markdown, a separator line
 * of nine dashes, and then a JSON object holding the extracted citations. The
 * route handler splits the reply on that separator and feeds the tail to
 * `JSON.parse`, so the JSON sample below must itself be strict JSON: comments
 * inside the sample invite the model to echo them back, which makes the reply
 * unparseable and silently drops the citations.
 */
const PROCESSING_PROMPT = `
You are a document processing AI. Your task is to process the Markdown text scanned from a document page and return it in a clean and structured format.

The textual page data should only be returned in valid Markdown format. Use proper headings and subheadings to structure the content. **Do not add headings if they do not exist in the original text.** If there is a title to the document, it should be the first heading.
Any images should be included.
Do not return the Markdown as a code block, only as a raw string, without any new lines.

No data or information should ever be removed, it should only be processed and formatted.

There are in-text citations/references in the text, remove them from the text (**but most importantly, keep the reference number in the text. use a <sup></sup> tag**) and put them into the "citations" array shown below, one entry per reference, where "number" is the reference number as it appears in the text and "text" is the citation text. (**Note that there may be multiple citations, usually split by commas. Ensure these are added too.**) If any citations contain JSON-breaking characters, ensure they are properly escaped. This includes characters like double quotes, backslashes, and newlines.

The Markdown should be human-readable and well-formatted. The markdown string should be properly sanitized and should not break a JSON parser when returned as the final format.

Return the final result as a text object with the following structure (without code block formatting):

"""
<processed markdown text>

---------

{
  "citations": [
    {
      "number": 1,
      "text": "Citation text 1"
    },
    {
      "number": 2,
      "text": "Citation text 2"
    }
  ]
}
"""

The part after the --------- separator must be strict JSON: no comments and no trailing commas. If the page contains no citations, return an empty "citations" array.

Do not return the text object as a code block, only as a raw string.
`;

/** Matches http(s) URLs up to the next whitespace character. */
const CITATION_URL_PATTERN = /(https?:\/\/[^\s]+)/g;

/**
 * Parses the citations payload returned by the model.
 *
 * Strict JSON is tried first. If that fails, the outermost `{ ... }` span is
 * parsed instead, so that a reply wrapped in a Markdown code fence or in
 * quotes is still accepted.
 *
 * @throws SyntaxError when no parseable JSON object can be found.
 */
function parseCitationsJson(raw: string): unknown {
  try {
    return JSON.parse(raw);
  } catch (err) {
    const start = raw.indexOf("{");
    const end = raw.lastIndexOf("}");
    if (start === -1 || end <= start) {
      throw err;
    }
    return JSON.parse(raw.slice(start, end + 1));
  }
}

/**
 * Percent-encodes every URL found in `text`.
 *
 * `encodeURI` also escapes the `%` of sequences that are already encoded
 * (`%20` becomes `%2520`); those are restored afterwards so encoding is
 * idempotent. A URL that `encodeURI` rejects (lone surrogate) is left as is.
 */
function encodeCitationUrls(text: string): string {
  return text.replace(CITATION_URL_PATTERN, (url) => {
    try {
      return encodeURI(url).replace(/%25([0-9A-Fa-f]{2})/g, "%$1");
    } catch {
      return url;
    }
  });
}

/** Type guard: `entry` is an object carrying a string `text` property. */
function hasStringText(
  entry: unknown
): entry is Record<string, unknown> & { text: string } {
  return (
    typeof entry === "object" &&
    entry !== null &&
    typeof (entry as { text?: unknown }).text === "string"
  );
}

/**
 * Extracts and sanitizes the citations from the JSON part of a model reply.
 *
 * @param citationsStr JSON text expected to look like
 *   `{ "citations": [{ "number": 1, "text": "..." }] }`.
 * @returns The citations with every URL in `text` percent-encoded. Returns an
 *   empty array when the payload cannot be parsed or has no `citations`
 *   array. Entries without a string `text` are skipped so that one malformed
 *   entry does not discard the rest. Never throws.
 */
// The element type stays `any` so callers that store the result in untyped
// JSON payloads keep compiling exactly as before.
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function getCitations(citationsStr: string): any[] {
  try {
    console.log("Parsing citations string:", citationsStr);
    const parsed = parseCitationsJson(citationsStr);

    const rawCitations =
      typeof parsed === "object" && parsed !== null
        ? (parsed as { citations?: unknown }).citations
        : undefined;

    // "{}" is what callers pass when the model returned no citations block,
    // so a missing array is a normal outcome rather than an error.
    if (!Array.isArray(rawCitations)) {
      console.log("No citations array found.");
      return [];
    }

    console.log("Sanitizing citations...");
    const wellFormed = rawCitations.filter(hasStringText);
    if (wellFormed.length !== rawCitations.length) {
      console.warn(
        "Skipped malformed citation entries:",
        rawCitations.length - wellFormed.length
      );
    }

    const sanitizedCitations = wellFormed.map((citation) => ({
      ...citation,
      text: encodeCitationUrls(citation.text),
    }));

    console.log("Sanitized citations:", sanitizedCitations);
    return sanitizedCitations;
  } catch (err) {
    console.error("Error parsing or sanitizing citations:", err);
    return [];
  }
}

/**
 * Matches the run of dashes the prompt asks the model to put between the
 * processed Markdown and the citations JSON.
 */
const CITATIONS_SEPARATOR = /-{9,}/g;

/**
 * Splits a model reply into the Markdown body and the citations JSON text.
 *
 * The LAST dash run is used as the separator because the Markdown itself may
 * contain dash runs (table rules, horizontal lines); splitting on the first
 * one would truncate the page. If what follows the last dash run does not
 * look like a JSON object, the whole reply is treated as Markdown.
 */
function splitModelOutput(raw: string): {
  content: string;
  citationsStr: string;
} {
  const separators = Array.from(raw.matchAll(CITATIONS_SEPARATOR));
  const last = separators[separators.length - 1];
  if (last === undefined || last.index === undefined) {
    return { content: raw.trim(), citationsStr: "{}" };
  }

  const head = raw.slice(0, last.index).trim();
  const tail = raw.slice(last.index + last[0].length).trim();
  if (tail === "") {
    return { content: head, citationsStr: "{}" };
  }

  // Tolerate an opening code fence or quotes before the JSON object.
  const looksLikeJson = /^(?:"""|```(?:json)?)?\s*\{/i.test(tail);
  if (!looksLikeJson) {
    return { content: raw.trim(), citationsStr: "{}" };
  }
  return { content: head, citationsStr: tail };
}

/**
 * Uploads the PDF to Mistral, runs OCR on it and post-processes every page
 * with the chat model (at most two pages concurrently).
 *
 * @param file PDF contents.
 * @param documentId Id used to name the uploaded file.
 * @returns The OCR pages with `markdown` replaced by the processed text and a
 *   `citations` array added.
 */
async function runOcrPipeline(file: Blob, documentId: string) {
  console.log("Uploading file to Mistral...");
  const uploadedPdf = await client.files.upload({
    file: { fileName: `${documentId}.pdf`, content: file },
    purpose: "ocr",
  });

  console.log("Getting signed URL from Mistral...");
  const signedUrl = await client.files.getSignedUrl({
    fileId: uploadedPdf.id,
  });

  console.log("Processing OCR...");
  const ocrResponse = await client.ocr.process({
    model: "mistral-ocr-latest",
    document: { type: "document_url", documentUrl: signedUrl.url },
    includeImageBase64: true,
  });

  console.log("Processing OCR pages...");
  const limit = pLimit(2);
  const pageTasks = ocrResponse.pages.map((page, pageNumber) =>
    limit(async () => {
      // Only the position is logged: the page object carries base64 images.
      console.log("Processing page:", pageNumber);
      const response = await client.chat.complete({
        model: "mistral-small-latest",
        messages: [
          {
            role: "system",
            content: [{ type: "text", text: PROCESSING_PROMPT }],
          },
          { role: "user", content: [{ type: "text", text: page.markdown }] },
        ],
      });

      const contentData = response.choices?.[0]?.message?.content;
      const { content, citationsStr } =
        typeof contentData === "string"
          ? splitModelOutput(contentData)
          : { content: "", citationsStr: "{}" };
      console.log("Citations string:", citationsStr);

      if (content === "") {
        console.warn(
          "Model returned no Markdown for page, keeping OCR text:",
          pageNumber
        );
      }

      return {
        ...page,
        // Never lose the page text: fall back to the raw OCR Markdown when
        // the model reply is empty or not a plain string.
        markdown: content === "" ? page.markdown : content,
        citations: getCitations(citationsStr),
      };
    })
  );

  return Promise.all(pageTasks);
}

/**
 * Processes a PDF: stores it, runs OCR plus LLM clean-up and saves the result
 * on the `documents` row.
 *
 * Expected multipart form fields:
 * - `access_token`, `refresh_token`: Supabase session tokens (always required).
 * - `file`: the PDF to process (required unless `id` is given).
 * - `id`: id of an existing document to reprocess; its PDF is downloaded from
 *   storage instead of being uploaded.
 *
 * Responses: 200 with `{ message, results }`, 400 when required fields are
 * missing or malformed, 500 with `{ error }` for any other failure. When
 * processing fails after the row was flagged `is_processing`, the flag is
 * cleared again so the document does not stay stuck.
 */
export async function POST(req: NextRequest) {
  console.log("Received POST request");

  // NOTE(review): Next.js route handlers are dispatched by exported method
  // name, so this branch only runs if this function is also registered for
  // OPTIONS. Note as well that corsHeaders are not attached to the POST
  // responses below. Confirm whether cross-origin callers are expected.
  if (req.method === "OPTIONS") {
    console.log("Handling OPTIONS request");
    return new NextResponse(null, {
      headers: {
        ...corsHeaders,
        "Access-Control-Allow-Methods": "POST, OPTIONS",
      },
    });
  }

  try {
    console.log("Parsing form data...");
    const formData = await req.formData();
    const accessToken = formData.get("access_token");
    const refreshToken = formData.get("refresh_token");
    const docId = formData.get("id");
    const fileEntry = formData.get("file");

    const reprocessing = docId !== null;
    // A plain string in the "file" field is not an uploaded file.
    const uploadedFile =
      fileEntry !== null && typeof fileEntry !== "string" ? fileEntry : null;

    // Validate before touching Supabase so bad requests get a 400 instead of
    // a 500 caused by a failing setSession call.
    if (
      typeof accessToken !== "string" ||
      accessToken === "" ||
      typeof refreshToken !== "string" ||
      refreshToken === "" ||
      (!reprocessing && uploadedFile === null)
    ) {
      const message = reprocessing
        ? "Missing required fields: access_token or refresh_token"
        : "Missing required fields: file, access_token, or refresh_token";
      console.error(message);
      return NextResponse.json({ error: message }, { status: 400 });
    }

    if (docId !== null && typeof docId !== "string") {
      console.error("Invalid document id");
      return NextResponse.json(
        { error: "Invalid document id" },
        { status: 400 }
      );
    }

    console.log("Creating Supabase client...");
    const supabase = await createClient();

    console.log("Authenticating user...");
    const {
      data: { user },
      error: sessionError,
    } = await supabase.auth.setSession({
      access_token: accessToken,
      refresh_token: refreshToken,
    });

    if (sessionError) {
      console.error("Failed to set session:", sessionError.message);
      throw new Error("Failed to set session: " + sessionError.message);
    }

    if (!user) {
      console.error("User not authenticated");
      throw new Error("User not authenticated");
    }

    let documentId: string;
    let file: Blob;

    if (typeof docId === "string") {
      console.log("Reprocessing document...");
      // Log the id only: the form data also contains the session tokens.
      console.log("Document ID:", docId);

      // Scoped to the caller: the PDF is read from the caller's own storage
      // folder below, so only the owner can reprocess a document.
      const { data: documentData, error: documentError } = await supabase
        .from("documents")
        .select("id")
        .eq("id", docId)
        .eq("owner", user.id)
        .single();

      if (documentError) {
        console.error("Error fetching document record:", documentError);
        throw new Error("Document record fetch failed");
      }

      if (!documentData) {
        console.error("Document record not found.");
        throw new Error("Document record not found");
      }

      documentId = documentData.id;

      const { data: fileData, error: fileError } = await supabase.storage
        .from("documents")
        .download(`${user.id}/${documentId}.pdf`);

      if (fileError || !fileData) {
        console.error("Error downloading file from storage:", fileError);
        throw new Error("File download failed");
      }

      console.log("File downloaded from storage, bytes:", fileData.size);

      // Flag the row only once the PDF is in hand, so a failed download
      // cannot leave the document marked as processing.
      const { error: docError } = await supabase
        .from("documents")
        .update({ is_processing: true })
        .eq("id", documentId);

      if (docError) {
        console.error("Error updating document record:", docError);
        throw new Error("Document record update failed");
      }
      console.log("Document record updated successfully.");

      file = fileData;
    } else {
      if (uploadedFile === null) {
        // Unreachable after the validation above; keeps the types honest.
        throw new Error(
          "Missing required fields: file, access_token, or refresh_token"
        );
      }

      documentId = crypto.randomUUID();
      console.log("Generated UUID for file:", documentId);

      console.log("Uploading file to Supabase storage...");
      const { data: storageData, error: storageError } = await supabase.storage
        .from("documents")
        .upload(`${user.id}/${documentId}.pdf`, uploadedFile);

      if (storageError) {
        console.error("Failed to upload file:", storageError.message);
        throw new Error("Failed to upload file: " + storageError.message);
      }

      console.log("Inserting document record...");
      const { error: docError } = await supabase.from("documents").insert({
        id: documentId,
        file_name: uploadedFile.name,
        owner: user.id,
        raw_file: storageData.id,
        is_processing: true,
      });

      if (docError) {
        console.error("Failed to insert document record:", docError.message);
        throw new Error(
          "Failed to insert document record: " + docError.message
        );
      }

      file = uploadedFile;
    }

    // From here on the row is flagged is_processing = true.
    try {
      const results = await runOcrPipeline(file, documentId);

      console.log("Updating document record with OCR data...");
      const { error: updateError } = await supabase
        .from("documents")
        .update({ ocr_data: results, is_processing: false })
        .eq("id", documentId);

      if (updateError) {
        console.error("Failed to update document record:", updateError.message);
        throw new Error(
          "Failed to update document record: " + updateError.message
        );
      }

      console.log("Document processed successfully");
      return NextResponse.json({
        message: "Document processed successfully",
        results,
      });
    } catch (processingError: unknown) {
      // Best effort: clear the flag so the document can be retried. A failure
      // here must not hide the original error.
      try {
        const { error: resetError } = await supabase
          .from("documents")
          .update({ is_processing: false })
          .eq("id", documentId);
        if (resetError) {
          console.error(
            "Failed to clear is_processing flag:",
            resetError.message
          );
        }
      } catch (resetFailure: unknown) {
        console.error("Failed to clear is_processing flag:", resetFailure);
      }
      throw processingError;
    }
  } catch (error: unknown) {
    console.error("Error processing document:", error);
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}