Source code for metabeeai.process_pdfs.merger
# Merger tool for split JSON files; handles both single-page and overlapping
# 2-page PDF formats
#
# Execute with:
# python metabeeai_llm/merger.py --basepath data
#
# m.mieskolainen@imperial.ac.uk, 2025
import argparse
import json
import os
from termcolor import cprint
def detect_page_mode(json_files):
    """
    Classify a set of per-page JSON files as single-page or overlapping 2-page.

    Only the base name of the first entry is inspected; the remaining entries
    are assumed to follow the same naming scheme.

    Args:
        json_files: Sequence of JSON file paths (may be empty).

    Returns:
        str: 'overlap' when the first base name contains both "main_p" and a
        hyphen (e.g. main_p01-02.pdf.json), otherwise 'single'
        (e.g. main_p01.pdf.json, or an empty input).
    """
    if json_files:
        # Directory components are stripped so hyphens in folder names do not count.
        leading_name = os.path.basename(json_files[0])
        has_page_range = "main_p" in leading_name and "-" in leading_name
        if has_page_range:
            return "overlap"
    return "single"
[docs]
def _is_filtered_chunk(chunk, filter_types):
    """Return True when the chunk's "chunk_type" is listed in filter_types."""
    return bool(filter_types) and "chunk_type" in chunk and chunk["chunk_type"] in filter_types


def _iter_grounding(chunks):
    """Yield every grounding entry of every chunk that has a "grounding" key."""
    for chunk in chunks:
        if "grounding" in chunk:
            yield from chunk["grounding"]


def adjust_and_merge_json(json_files, output_file, filter_types=None):
    """
    Merge per-page JSON files into one JSON file with global page numbers.

    Each input file is expected to hold ``{"data": {"chunks": [...]}}``; a
    chunk may carry a "grounding" list whose entries have a "page" number.
    The page numbers are rewritten so they are consistent across the merged
    document, and the result is written to ``output_file`` as
    ``{"data": {"chunks": [...]}}``.

    The page mode is taken from ``detect_page_mode(json_files)``:

    * single: every file counts as exactly one page; grounding pages are
      shifted by the number of files processed so far.
    * overlap: the first file keeps its page numbers. In every later file the
      lowest grounded page is mapped onto the last global page already
      emitted (the shared page) and higher pages continue after it.

    Args:
        json_files: Paths of the JSON files, in page order.
        output_file: Path the merged JSON is written to.
        filter_types: Optional collection of "chunk_type" values; matching
            chunks are dropped and do not contribute to page bookkeeping.

    Raises:
        KeyError: If a file lacks "data"/"chunks" or a grounding entry lacks "page".
    """
    if filter_types is None:
        filter_types = []

    merged = {"data": {"chunks": []}}
    merged_chunks = merged["data"]["chunks"]
    page_offset = 0  # next free global page number
    page_mode = detect_page_mode(json_files)

    for i, file in enumerate(json_files):
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Filtered chunks are removed up front so they influence neither the
        # page range of the file nor the merged output.
        kept_chunks = [chunk for chunk in data["data"]["chunks"] if not _is_filtered_chunk(chunk, filter_types)]
        pages_in_file = [g["page"] for g in _iter_grounding(kept_chunks)]
        file_min_page = min(pages_in_file, default=0)
        file_max_page = max(pages_in_file, default=0)

        if page_mode == "single":
            for g in _iter_grounding(kept_chunks):
                g["page"] += page_offset
            # Each file represents exactly one page.
            page_offset += 1
        elif i == 0:
            # First overlapping file: nothing to de-duplicate yet.
            for g in _iter_grounding(kept_chunks):
                g["page"] += page_offset
            if pages_in_file:
                page_offset += file_max_page + 1
        else:
            # NOTE(review): the shared page is taken to be the lowest page that
            # has (unfiltered) grounding in this file. If the shared page has no
            # grounding at all, the file's next page is treated as the shared
            # one -- confirm whether such inputs can occur.
            for g in _iter_grounding(kept_chunks):
                if g["page"] == file_min_page:
                    # Shared page: same global page as the previous file's last page.
                    g["page"] = page_offset - 1
                else:
                    g["page"] = g["page"] - (file_min_page + 1) + page_offset
            # Advance by the number of new (non-shared) pages in this file.
            page_offset += file_max_page - file_min_page

        merged_chunks.extend(kept_chunks)

    with open(output_file, "w", encoding="utf-8") as out:
        json.dump(merged, out, indent=2)
[docs]
def _natural_sort_key(name):
    """Split name into text and integer runs so that "p100" sorts after "p11"."""
    import re  # function-scope import: the module does not import re at top level

    # re.split with a capturing group alternates text (even index) and digit
    # runs (odd index), so compared keys always have matching element types.
    parts = re.split(r"([0-9]+)", name)
    return [int(part) if idx % 2 else part for idx, part in enumerate(parts)]


def process_all_papers(base_papers_dir, filter_types):
    """
    Merge the per-page JSON files of every paper folder and print a summary.

    For each sub-directory of ``base_papers_dir`` that has a "pages" folder,
    all files named ``main_*.json`` in it are merged into
    ``pages/merged_v2.json`` via ``adjust_and_merge_json``; the merged file
    is then read back to report the total page and chunk counts.

    Page files are ordered with a natural (numeric-aware) sort on their base
    names, so page numbers that outgrow their zero padding (e.g. page 100
    next to page 11) are still merged in page order.

    Args:
        base_papers_dir: Directory containing one folder per paper.
        filter_types: "chunk_type" values to drop, forwarded to
            ``adjust_and_merge_json``.
    """
    # Process each paper folder in sorted order.
    paper_folders = sorted(
        folder for folder in os.listdir(base_papers_dir) if os.path.isdir(os.path.join(base_papers_dir, folder))
    )

    for paper_folder in paper_folders:
        pages_dir = os.path.join(base_papers_dir, paper_folder, "pages")
        if not os.path.isdir(pages_dir):
            continue

        # Sort on base names only, so digits in the directory path cannot
        # affect the order; the raw name breaks ties deterministically.
        page_names = sorted(
            (f for f in os.listdir(pages_dir) if f.startswith("main_") and f.endswith(".json")),
            key=lambda name: (_natural_sort_key(name), name),
        )
        if not page_names:
            continue
        json_files = [os.path.join(pages_dir, name) for name in page_names]

        output_file = os.path.join(pages_dir, "merged_v2.json")
        page_mode = detect_page_mode(json_files)
        mode_desc = "single-page" if page_mode == "single" else "overlapping 2-page"
        adjust_and_merge_json(json_files, output_file, filter_types)
        cprint(f"Paper {paper_folder}: Merged {len(json_files)} files ({mode_desc} mode) into {output_file}", "green")

        # Load the merged file to compute total pages and total chunks.
        with open(output_file, "r", encoding="utf-8") as f:
            merged_data = json.load(f)
        chunks = merged_data["data"]["chunks"]
        total_chunks = len(chunks)

        # Pages are numbered from 0, so the count is the highest page + 1.
        pages = {g["page"] for chunk in chunks if "grounding" in chunk for g in chunk["grounding"]}
        total_pages = max(pages) + 1 if pages else 0
        print(f"Paper {paper_folder}: Total pages: {total_pages}, Total chunks: {total_chunks}")
def main():
    """
    Command-line entry point.

    Reads ``--basepath`` (defaults to the current working directory) and
    ``--filter-chunk-type`` (zero or more chunk types to drop), then merges
    every paper found under ``<basepath>/papers``. If that folder does not
    exist, an error message is printed and nothing is processed.
    """
    cli = argparse.ArgumentParser(description="Merge JSON files for papers and print page/chunk counts per paper.")
    cli.add_argument(
        "--basepath",
        type=str,
        default=os.getcwd(),
        help="Base path containing the 'papers' folder. Defaults to the current working directory.",
    )
    cli.add_argument(
        "--filter-chunk-type",
        nargs="+",
        default=[],
        help="List of keywords for filtering out chunks based on 'chunk_type' (e.g., marginalia).",
    )
    options = cli.parse_args()

    papers_dir = os.path.join(options.basepath, "papers")
    if os.path.isdir(papers_dir):
        process_all_papers(papers_dir, options.filter_chunk_type)
    else:
        print(f"Error: papers folder not found in {options.basepath}")


if __name__ == "__main__":
    main()