Source code for metabeeai.process_pdfs.merger

# Merger tool for split JSON files; handles both single-page and overlapping
# 2-page PDF formats
#
# Execute with:
#   python metabeeai_llm/merger.py --basepath data
#
# m.mieskolainen@imperial.ac.uk, 2025

import argparse
import json
import os
import re

from termcolor import cprint


def detect_page_mode(json_files):
    """
    Classify a list of page JSON files as single-page or overlapping 2-page.

    Only the base name of the first entry is inspected: it counts as the
    overlapping format when it contains both ``main_p`` and a hyphen
    (e.g. ``main_p01-02.pdf.json``); anything else, including an empty
    list, counts as single-page (e.g. ``main_p01.pdf.json``).

    Args:
        json_files: Sequence of JSON file paths.

    Returns:
        str: ``'overlap'`` or ``'single'``.
    """
    if not json_files:
        return "single"

    # Strip directories so a hyphen in a folder name cannot trigger a match.
    leading_name = os.path.basename(json_files[0])
    looks_overlapping = "main_p" in leading_name and "-" in leading_name
    return "overlap" if looks_overlapping else "single"



[docs]
def adjust_and_merge_json(json_files, output_file, filter_types=None):
    """
    Merge per-page JSON files into one file with document-wide page numbers.

    Every input file must hold a chunk list under ``["data"]["chunks"]``.
    A chunk may carry a ``grounding`` list whose entries have a ``page``
    number local to that file. Chunks are concatenated in file order and
    each grounding ``page`` is shifted so numbering continues across files.

    The shift depends on the mode reported by ``detect_page_mode``:

    * ``'single'``: the n-th file (counting from 0) gets n added to its
      page numbers.
    * ``'overlap'``: the first file that has grounded pages keeps its page
      numbers. For every later file the lowest grounded page is treated as
      the page shared with the previous file and is mapped onto the last
      page already merged; its remaining pages continue from there.

    NOTE(review): in overlap mode the page span of a file is inferred from
    its kept grounding entries only, so a file whose shared page has no kept
    chunks would be numbered one page too low -- confirm whether the inputs
    guarantee chunks on every page.

    Args:
        json_files: Paths of the JSON files to merge, already in page order.
        output_file: Path of the merged JSON file to write.
        filter_types: Optional collection of ``chunk_type`` values. Chunks
            whose ``chunk_type`` is in it are left out of the output and are
            ignored when page numbers are worked out.

    Raises:
        KeyError: If a file lacks ``data``/``chunks`` or a grounding entry
            lacks ``page``.
    """
    if filter_types is None:
        filter_types = []

    merged_chunks = []
    page_offset = 0  # next unused document-wide page number
    # Overlap mode only: True once some file has contributed grounded pages.
    # Keyed on content rather than on the file index so that a leading file
    # without kept grounded chunks cannot make the next file's first page
    # map to page -1.
    has_previous_pages = False
    page_mode = detect_page_mode(json_files)

    for file in json_files:
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        kept_chunks = [
            chunk
            for chunk in data["data"]["chunks"]
            if not (filter_types and "chunk_type" in chunk and chunk["chunk_type"] in filter_types)
        ]
        # File-local page numbers, collected before any renumbering.
        local_pages = [g["page"] for chunk in kept_chunks for g in chunk.get("grounding", ())]

        if page_mode == "single":
            # Each file stands for exactly one page.
            shift = page_offset
            page_offset += 1
        elif not local_pages:
            # Nothing to renumber and no page span to infer: offset unchanged.
            shift = 0
        elif not has_previous_pages:
            # No earlier pages exist, so there is no overlap to fold in.
            shift = page_offset
            page_offset = shift + max(local_pages) + 1
            has_previous_pages = True
        else:
            first_local = min(local_pages)
            # The lowest local page lands on the last merged page
            # (page_offset - 1); all other pages follow consecutively.
            shift = page_offset - 1 - first_local
            # Only the pages beyond the shared one are new.
            page_offset += max(local_pages) - first_local

        for chunk in kept_chunks:
            for g in chunk.get("grounding", ()):
                g["page"] += shift
        merged_chunks.extend(kept_chunks)

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump({"data": {"chunks": merged_chunks}}, out, indent=2)




[docs]
def process_all_papers(base_papers_dir, filter_types):
    """
    Merge the page JSON files of every paper folder and print a summary.

    Each sub-directory of ``base_papers_dir`` is visited in sorted order.
    If it has a ``pages`` sub-folder containing files named ``main_*.json``,
    those files are merged into ``pages/merged_v2.json`` with
    ``adjust_and_merge_json``. The merged file is then read back to print
    the chunk count and the page count, where the page count is the highest
    grounded page number plus one (0 when no chunk has grounding).

    Page files are ordered with a natural (numeric-aware) sort, so a name
    such as ``main_p100...`` comes after ``main_p11...`` even when the page
    numbers have outgrown their zero padding.

    Args:
        base_papers_dir: Directory holding one sub-folder per paper.
        filter_types: ``chunk_type`` values to drop, passed through to
            ``adjust_and_merge_json``.
    """
    # Process each paper folder in alphanumeric sorted order
    paper_folders = sorted(
        folder for folder in os.listdir(base_papers_dir) if os.path.isdir(os.path.join(base_papers_dir, folder))
    )

    for paper_folder in paper_folders:
        pages_dir = os.path.join(base_papers_dir, paper_folder, "pages")
        if not os.path.isdir(pages_dir):
            continue

        # Find all JSON files starting with "main_" in the pages subfolder,
        # ordered by page number rather than by raw character order.
        page_names = sorted(
            (name for name in os.listdir(pages_dir) if name.startswith("main_") and name.endswith(".json")),
            key=_natural_sort_key,
        )
        if not page_names:
            continue
        json_files = [os.path.join(pages_dir, name) for name in page_names]

        output_file = os.path.join(pages_dir, "merged_v2.json")
        page_mode = detect_page_mode(json_files)
        mode_desc = "single-page" if page_mode == "single" else "overlapping 2-page"
        adjust_and_merge_json(json_files, output_file, filter_types)
        cprint(f"Paper {paper_folder}: Merged {len(json_files)} files ({mode_desc} mode) into {output_file}", "green")

        # Load the merged file to compute total pages and total chunks.
        with open(output_file, "r", encoding="utf-8") as f:
            merged_data = json.load(f)
        chunks = merged_data["data"]["chunks"]
        total_chunks = len(chunks)

        # Compute unique pages from all grounding entries.
        pages = {g["page"] for chunk in chunks if "grounding" in chunk for g in chunk["grounding"]}
        total_pages = max(pages) + 1 if pages else 0
        print(f"Paper {paper_folder}: Total pages: {total_pages}, Total chunks: {total_chunks}")


def _natural_sort_key(name):
    """Return a sort key that compares the digit runs in *name* as integers."""
    # re.split with a capturing group alternates text, digits, text, ...
    # so odd positions are always digit runs and types never mix per slot.
    parts = re.split(r"([0-9]+)", name)
    numeric_aware = [int(part) if idx % 2 else part for idx, part in enumerate(parts)]
    # The raw name breaks ties such as "p01" versus "p1" deterministically.
    return numeric_aware, name



def main():
    """
    Command-line entry point.

    Reads ``--basepath`` (default: current working directory) and
    ``--filter-chunk-type`` (default: no filtering), then processes the
    ``papers`` folder under the base path. If that folder does not exist,
    an error message is printed and nothing is processed.
    """
    cli = argparse.ArgumentParser(description="Merge JSON files for papers and print page/chunk counts per paper.")
    cli.add_argument(
        "--basepath",
        type=str,
        default=os.getcwd(),
        help="Base path containing the 'papers' folder. Defaults to the current working directory.",
    )
    cli.add_argument(
        "--filter-chunk-type",
        nargs="+",
        default=[],
        help="List of keywords for filtering out chunks based on 'chunk_type' (e.g., marginalia).",
    )
    options = cli.parse_args()

    papers_dir = os.path.join(options.basepath, "papers")
    if os.path.isdir(papers_dir):
        process_all_papers(papers_dir, options.filter_chunk_type)
    else:
        print(f"Error: papers folder not found in {options.basepath}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()