# Source code for metabeeai.llm_review_software.annotator

# Fixed PDF annotator
#
# Execute with:
#   python metabee/annotator.py --basepath data
#
# m.mieskolainen@imperial.ac.uk, 2025

import argparse
import json
import os

import fitz  # PyMuPDF
from termcolor import cprint


def convert_relative_to_absolute(page, rel_box):
    """Scale a relative bounding box to absolute coordinates on *page*.

    ``rel_box`` is a mapping with the keys "l", "t", "r" and "b" holding
    relative (0-1) coordinates. The horizontal entries are multiplied by the
    page width and the vertical entries by the page height.

    Returns a ``fitz.Rect`` built as (left, top, right, bottom).
    """
    bounds = page.rect
    page_w = bounds.width
    page_h = bounds.height
    # Pair every edge with the page extent it is measured against.
    edge_extents = (("l", page_w), ("t", page_h), ("r", page_w), ("b", page_h))
    x0, y0, x1, y1 = (rel_box[edge] * extent for edge, extent in edge_extents)
    return fitz.Rect(x0, y0, x1, y1)


def _draw_grounding_boxes(doc, chunk, color, label=None):
    """Outline every grounding box of *chunk* on its page, optionally labelled.

    Args:
        doc: Open PyMuPDF document that is being annotated.
        chunk: Chunk dictionary from the merged JSON. Its optional
            "grounding" list holds entries with a "page" index and a
            relative "box".
        color: RGB tuple used for the rectangle outline and the label text.
        label: Text written slightly above the top-left corner of each box,
            or ``None`` to draw the rectangle only.
    """
    for grounding in chunk.get("grounding") or []:
        page_num = grounding["page"]
        # Skip entries that point outside the document. Negative numbers are
        # rejected as well: indexing the document with them would pick a page
        # counted from the end and silently annotate the wrong page.
        if not 0 <= page_num < len(doc):
            continue
        page = doc[page_num]
        rect = convert_relative_to_absolute(page, grounding["box"])
        page.draw_rect(rect, color=color, width=1)
        if label is not None:
            shift = -5  # lift the label above the rectangle's top edge
            page.insert_text(
                (rect.x0, rect.y0 + shift),
                label,
                fontname="helv",
                fontsize=5,
                color=color,
            )


def _collect_chunk_fields(answers):
    """Map each chunk id referenced under "QUESTIONS" to the field names citing it.

    Returns a dict of ``chunk_id -> set of field names``. The label recorded
    for a chunk id is the key of the innermost dictionary entry whose value
    carries the "chunk_ids" list.
    """
    cid_to_fields = {}

    def extract_chunk_ids(node, current_label):
        # Walk nested dicts/lists; a dict with "chunk_ids" ends the descent.
        if isinstance(node, dict):
            if "chunk_ids" in node:
                for cid in node["chunk_ids"]:
                    cid_to_fields.setdefault(cid, set()).add(current_label)
            else:
                for key, value in node.items():
                    extract_chunk_ids(value, key)
        elif isinstance(node, list):
            for item in node:
                extract_chunk_ids(item, current_label)

    for question_value in answers.get("QUESTIONS", {}).values():
        if isinstance(question_value, dict):
            for field_key, field_value in question_value.items():
                extract_chunk_ids(field_value, field_key)
    return cid_to_fields


def annotate_pdf(pdf_path, merged_json_path, output_pdf, answers_json_path=None):
    """
    Annotate a PDF with bounding boxes and save the result.

    First, every "question-answer" chunk of the merged JSON is outlined in
    red. Then, if an answers.json file is provided and exists, the field
    names referring to each chunk_id are gathered and the chunk is outlined
    in blue with a combined "<chunk_id>: (<fields>)" text label.

    Args:
        pdf_path: Path of the PDF to annotate.
        merged_json_path: Path of the merged JSON; chunks are read from
            ``merged["data"]["chunks"]``.
        output_pdf: Path the annotated PDF is written to.
        answers_json_path: Optional path of answers.json. When it is missing
            or not a file, only the red boxes are drawn.

    Raises:
        KeyError: If the merged JSON has no ``data``/``chunks`` entry.

    A failure while saving is reported on the console and not raised.
    """
    # Load merged JSON file.
    with open(merged_json_path, "r", encoding="utf-8") as f:
        merged = json.load(f)
    cprint(f"Loaded merged JSON: {merged_json_path}", "white")
    chunks = merged["data"]["chunks"]

    doc = fitz.open(pdf_path)
    try:
        # Annotate "question-answer" chunks with red rectangles.
        for chunk in chunks:
            if chunk.get("chunk_type") == "question-answer":
                _draw_grounding_boxes(doc, chunk, color=(1, 0, 0))

        # If answers.json is provided, process it.
        if answers_json_path and os.path.isfile(answers_json_path):
            with open(answers_json_path, "r", encoding="utf-8") as f:
                answers = json.load(f)
            cprint(f"Loaded answers JSON: {answers_json_path}", "cyan")

            # Build a dictionary mapping chunk_id to chunk for quick lookup.
            chunk_dict = {}
            for chunk in chunks:
                cid = chunk.get("chunk_id")
                if cid:
                    chunk_dict[cid] = chunk

            # Annotate each referenced chunk with its aggregated field names.
            for cid, fields in _collect_chunk_fields(answers).items():
                if cid not in chunk_dict:
                    cprint(f"Warning: Chunk id {cid} not found in merged JSON", "yellow")
                    continue
                field_text = ", ".join(sorted(fields))
                _draw_grounding_boxes(
                    doc,
                    chunk_dict[cid],
                    color=(0, 0, 1),
                    label=f"{cid}: ({field_text})",
                )
        else:
            cprint("No answers.json found - not processing answer-based annotations", "red")

        # Saving is best effort: report the problem and return so that a batch
        # run can carry on with the next paper.
        try:
            doc.save(output_pdf)
            cprint(f"Annotated PDF saved as: {output_pdf}", "green")
        except Exception as e:
            cprint(f"Error in saving PDF: {output_pdf}. Exception: {e}", "red")
    finally:
        # Always release the document handle, also when annotation fails.
        doc.close()
def _numeric_folder_key(name):
    """Sort key that orders digit-named folders by their numeric value."""
    try:
        return (0, int(name), name)
    except ValueError:
        # str.isdigit() also accepts characters (e.g. superscript digits)
        # that int() rejects; place such names after the numeric ones.
        return (1, 0, name)


def process_all_papers(base_papers_dir):
    """
    Process each paper folder (names that are digits) in numeric order.

    For every folder ``<id>`` the file ``<id>_main.pdf`` is annotated using
    ``pages/merged_v2.json`` and, when present, ``answers.json``; the result
    is written to ``<id>_main_annotated.pdf`` in the same folder. Folders
    lacking the PDF or the merged JSON are reported and skipped.

    Args:
        base_papers_dir: Directory that contains the per-paper folders.
    """
    paper_folders = sorted(
        (
            folder
            for folder in os.listdir(base_papers_dir)
            if os.path.isdir(os.path.join(base_papers_dir, folder)) and folder.isdigit()
        ),
        # Numeric ordering, so that e.g. "2" is handled before "10".
        key=_numeric_folder_key,
    )

    for paper_folder in paper_folders:
        paper_path = os.path.join(base_papers_dir, paper_folder)
        pages_dir = os.path.join(paper_path, "pages")

        # Define file paths (adjust filenames as needed).
        original_pdf_path = os.path.join(paper_path, f"{paper_folder}_main.pdf")
        merged_json_path = os.path.join(pages_dir, "merged_v2.json")
        output_pdf = os.path.join(paper_path, f"{paper_folder}_main_annotated.pdf")
        answers_json_path = os.path.join(paper_path, "answers.json")

        # Check if necessary files exist.
        if not os.path.isfile(original_pdf_path):
            cprint(f"Paper {paper_folder}: Missing original PDF: {original_pdf_path}", "red")
            continue
        if not os.path.isfile(merged_json_path):
            cprint(f"Paper {paper_folder}: Missing merged JSON: {merged_json_path}", "red")
            continue

        # Annotate PDF (answers_json_path is optional).
        annotate_pdf(original_pdf_path, merged_json_path, output_pdf, answers_json_path)
        cprint(f"Paper {paper_folder}: Processing complete", "magenta")
        print()


def main():
    """Parse ``--basepath`` and annotate every paper under ``<basepath>/papers``."""
    parser = argparse.ArgumentParser(description="Annotate PDFs using merged JSON and answers.json for papers.")
    parser.add_argument(
        "--basepath",
        type=str,
        default=os.getcwd(),
        help="Base path containing the 'papers' folder. Defaults to the current working directory.",
    )
    args = parser.parse_args()

    papers_dir = os.path.join(args.basepath, "papers")
    if not os.path.isdir(papers_dir):
        cprint(f"Error: 'papers' folder not found in {args.basepath}", "red")
        return

    process_all_papers(papers_dir)


if __name__ == "__main__":
    main()