# Source code for metabeeai.llm_review_software.annotator

# Fixed PDF annotator
#
# Execute with:
#   python metabee/annotator.py --basepath data
#
# m.mieskolainen@imperial.ac.uk, 2025

import argparse
import json
import os

import fitz  # PyMuPDF
from termcolor import cprint


def convert_relative_to_absolute(page, rel_box):
    """
    Scale a relative bounding box to absolute coordinates on ``page``.

    ``rel_box`` is a mapping with keys "l", "t", "r" and "b" (relative 0-1 values);
    the horizontal entries are multiplied by the page width and the vertical
    entries by the page height. Returns the result as a ``fitz.Rect``.
    """
    page_rect = page.rect
    page_w = page_rect.width
    page_h = page_rect.height
    return fitz.Rect(
        rel_box["l"] * page_w,
        rel_box["t"] * page_h,
        rel_box["r"] * page_w,
        rel_box["b"] * page_h,
    )



# [docs] (Sphinx viewcode link marker, not part of the module)
def _draw_grounding_boxes(doc, chunk, color, label=None):
    """Draw a rectangle (and, if given, a text label) for every grounding box of ``chunk``."""
    # A missing or null "grounding" entry simply means there is nothing to draw.
    for grounding in chunk.get("grounding") or ():
        page_num = grounding["page"]
        # Reject negative indices as well: PyMuPDF resolves them from the end of the
        # document, which would silently annotate the wrong page.
        if not 0 <= page_num < len(doc):
            continue
        page = doc[page_num]
        rect = convert_relative_to_absolute(page, grounding["box"])
        page.draw_rect(rect, color=color, width=1)
        if label is not None:
            # Place the label slightly above the top-left corner of the box.
            page.insert_text((rect.x0, rect.y0 - 5), label, fontname="helv", fontsize=5, color=color)


def _collect_chunk_fields(answers):
    """Return a dict mapping each chunk_id referenced under answers["QUESTIONS"] to its set of field names."""
    cid_to_fields = {}

    def visit(node, label):
        # A dict carrying "chunk_ids" is a leaf: credit its ids to the enclosing field name.
        if isinstance(node, dict):
            if "chunk_ids" in node:
                for cid in node["chunk_ids"] or ():  # tolerate a null chunk_ids entry
                    cid_to_fields.setdefault(cid, set()).add(label)
            else:
                for key, value in node.items():
                    visit(value, key)
        elif isinstance(node, list):
            for item in node:
                visit(item, label)

    for question_value in answers.get("QUESTIONS", {}).values():
        if isinstance(question_value, dict):
            for field_key, field_value in question_value.items():
                visit(field_value, field_key)
    return cid_to_fields


def annotate_pdf(pdf_path, merged_json_path, output_pdf, answers_json_path=None):
    """
    Annotate a PDF with bounding boxes and save the result.

    First, every chunk of type "question-answer" in the merged JSON gets a red box
    around each of its grounding regions. Then, if an answers.json file is provided
    and exists, the field names referencing each chunk_id are gathered and the
    corresponding chunks are outlined in blue with a combined text label
    ("<chunk_id>: (<field>, <field>, ...)").

    Args:
        pdf_path: Path of the PDF to annotate.
        merged_json_path: Path of the merged JSON; chunks are read from ["data"]["chunks"].
        output_pdf: Path the annotated PDF is written to.
        answers_json_path: Optional path of answers.json; chunk references are read
            from its "QUESTIONS" entry.

    Errors while saving are reported on the console rather than raised. The PDF
    document is always closed before returning.
    """

    # Load merged JSON file.
    with open(merged_json_path, "r", encoding="utf-8") as f:
        merged = json.load(f)
    cprint(f"Loaded merged JSON: {merged_json_path}", "white")

    chunks = merged["data"]["chunks"]

    doc = fitz.open(pdf_path)
    try:
        # Annotate "question-answer" chunks with red rectangles.
        for chunk in chunks:
            if chunk.get("chunk_type") == "question-answer":
                _draw_grounding_boxes(doc, chunk, color=(1, 0, 0))

        # If answers.json is provided, process it.
        if answers_json_path and os.path.isfile(answers_json_path):
            with open(answers_json_path, "r", encoding="utf-8") as f:
                answers = json.load(f)
            cprint(f"Loaded answers JSON: {answers_json_path}", "cyan")

            # Map chunk_id -> chunk for quick lookup (chunks without an id are skipped).
            chunk_dict = {chunk["chunk_id"]: chunk for chunk in chunks if chunk.get("chunk_id")}

            # Annotate each referenced chunk in blue, labelled with its aggregated field names.
            for cid, fields in _collect_chunk_fields(answers).items():
                chunk = chunk_dict.get(cid)
                if chunk is None:
                    cprint(f"Warning: Chunk id {cid} not found in merged JSON", "yellow")
                    continue
                field_text = ", ".join(sorted(fields))
                _draw_grounding_boxes(doc, chunk, color=(0, 0, 1), label=f"{cid}: ({field_text})")
        else:
            cprint("No answers.json found - not processing answer-based annotations", "red")

        # Saving is best-effort: report a failure but do not abort the caller's batch.
        try:
            doc.save(output_pdf)
            cprint(f"Annotated PDF saved as: {output_pdf}", "green")
        except Exception as e:
            cprint(f"Error in saving PDF: {output_pdf}. Exception: {e}", "red")
    finally:
        # Release the underlying file handle even if annotation fails part-way.
        doc.close()



def _paper_sort_key(folder_name):
    """Sort key that orders digit-named folders by numeric value (so "2" precedes "10")."""
    try:
        return (0, int(folder_name), folder_name)
    except ValueError:
        # str.isdigit() accepts some characters (e.g. superscript digits) that int()
        # rejects; keep such folders, ordered by name after the numeric ones.
        return (1, 0, folder_name)


def process_all_papers(base_papers_dir):
    """
    Process each paper folder (names that are digits) in ascending numeric order.

    For every folder ``<id>`` directly under ``base_papers_dir`` the function expects
    ``<id>/<id>_main.pdf`` and ``<id>/pages/merged_v2.json``; folders missing either
    file are reported and skipped. The annotated PDF is written to
    ``<id>/<id>_main_annotated.pdf``. ``<id>/answers.json`` is passed along and is
    optional.
    """
    paper_folders = sorted(
        (
            folder
            for folder in os.listdir(base_papers_dir)
            if os.path.isdir(os.path.join(base_papers_dir, folder)) and folder.isdigit()
        ),
        key=_paper_sort_key,
    )

    for paper_folder in paper_folders:
        paper_path = os.path.join(base_papers_dir, paper_folder)
        pages_dir = os.path.join(paper_path, "pages")

        # Define file paths (adjust filenames as needed).
        original_pdf_path = os.path.join(paper_path, f"{paper_folder}_main.pdf")
        merged_json_path = os.path.join(pages_dir, "merged_v2.json")
        output_pdf = os.path.join(paper_path, f"{paper_folder}_main_annotated.pdf")
        answers_json_path = os.path.join(paper_path, "answers.json")

        # Check if necessary files exist.
        if not os.path.isfile(original_pdf_path):
            cprint(f"Paper {paper_folder}: Missing original PDF: {original_pdf_path}", "red")
            continue
        if not os.path.isfile(merged_json_path):
            cprint(f"Paper {paper_folder}: Missing merged JSON: {merged_json_path}", "red")
            continue

        # Annotate PDF (answers_json_path is optional).
        annotate_pdf(original_pdf_path, merged_json_path, output_pdf, answers_json_path)
        cprint(f"Paper {paper_folder}: Processing complete", "magenta")
        print()


def main():
    """
    Command-line entry point.

    Parses ``--basepath`` (defaulting to the current working directory) and
    processes every paper under ``<basepath>/papers``. If that folder does not
    exist, an error is printed and nothing is processed.
    """
    cli = argparse.ArgumentParser(description="Annotate PDFs using merged JSON and answers.json for papers.")
    cli.add_argument(
        "--basepath",
        type=str,
        default=os.getcwd(),
        help="Base path containing the 'papers' folder. Defaults to the current working directory.",
    )
    options = cli.parse_args()
    base = options.basepath

    papers_dir = os.path.join(base, "papers")
    if os.path.isdir(papers_dir):
        process_all_papers(papers_dir)
    else:
        cprint(f"Error: 'papers' folder not found in {base}", "red")


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()