Source code for metabeeai.process_pdfs.split_pdf
#!/usr/bin/env python3
import argparse
import os
import PyPDF2
[docs]
def split_pdfs(papers_dir=None, pages_per_split=1):
    """
    Split each paper's main PDF into single-page or overlapping 2-page PDFs.

    For every subfolder ``<name>`` of ``papers_dir`` this reads
    ``<name>/<name>_main.pdf`` and writes the pieces into ``<name>/pages/``
    (created if missing) as ``main_pNN.pdf`` (single-page mode) or
    ``main_pNN-MM.pdf`` (2-page windows that advance one page at a time, so
    consecutive files share a page). Progress and problems are reported with
    ``print``; a failure on one paper does not stop the others, and the
    function always returns ``None``.

    Args:
        papers_dir: Directory containing paper subfolders. When ``None`` it is
            read via ``metabeeai.config.get_config_param("papers_dir")``.
        pages_per_split: Number of pages per split (1 or 2). Default is 1.
            1 = single-page documents
            2 = overlapping 2-page documents. A PDF with exactly one page has
                no 2-page window, so its only page is written as
                ``main_p01.pdf`` instead of being silently dropped.
    """
    # Validate pages_per_split
    if pages_per_split not in (1, 2):
        print(f"Error: pages_per_split must be 1 or 2, got {pages_per_split}")
        return

    # Resolve from config if not provided (imported lazily so the config is
    # only loaded when it is actually needed).
    if papers_dir is None:
        from metabeeai.config import get_config_param

        papers_dir = get_config_param("papers_dir")

    # Validate papers directory
    if not os.path.exists(papers_dir):
        print(f"Error: Directory '{papers_dir}' does not exist")
        return
    # A regular file passes the existence check but would make os.listdir
    # raise NotADirectoryError, so reject it explicitly.
    if not os.path.isdir(papers_dir):
        print(f"Error: '{papers_dir}' is not a directory")
        return

    # Get all subfolders in the specified directory; sorted because
    # os.listdir order is arbitrary and a stable order makes runs reproducible.
    subfolders = sorted(
        entry for entry in os.listdir(papers_dir) if os.path.isdir(os.path.join(papers_dir, entry))
    )
    if not subfolders:
        print(f"No subfolders found in '{papers_dir}'")
        return

    mode = "single-page" if pages_per_split == 1 else "overlapping 2-page"
    print(f"Found {len(subfolders)} subfolders to process in {mode} mode")

    for subfolder in subfolders:
        # Create pages directory if it doesn't exist
        pages_dir = os.path.join(papers_dir, subfolder, "pages")
        os.makedirs(pages_dir, exist_ok=True)

        # Construct path to main PDF using subfolder name
        pdf_path = os.path.join(papers_dir, subfolder, f"{subfolder}_main.pdf")
        if not os.path.exists(pdf_path):
            print(f"PDF file not found at {pdf_path}, skipping...")
            continue

        try:
            print(f"Processing {pdf_path}...")
            pdf_reader = PyPDF2.PdfReader(pdf_path)
            total_pages = len(pdf_reader.pages)

            windows, label = _plan_splits(total_pages, pages_per_split)
            if pages_per_split == 2 and total_pages == 1:
                print(f"Note: {pdf_path} has only one page; writing it as a single-page PDF")

            for window in windows:
                _write_split(pdf_reader, window, pages_dir)

            print(
                f"Successfully processed {subfolder}_main.pdf ({total_pages} pages, "
                f"created {len(windows)} {label} PDFs)"
            )
        except Exception as e:
            # Per-paper boundary: report the failure and carry on with the
            # remaining subfolders rather than aborting the whole batch.
            print(f"Error processing {pdf_path}: {str(e)}")


def _plan_splits(total_pages, pages_per_split):
    """Return (list of 0-based page-index tuples, mode label) for one PDF."""
    if pages_per_split == 2 and total_pages != 1:
        # Sliding window of two pages; stops at the second-to-last page.
        return [(i, i + 1) for i in range(total_pages - 1)], "overlapping 2-page"
    # Single-page mode, or the one-page fallback of 2-page mode.
    return [(i,) for i in range(total_pages)], "single-page"


def _write_split(pdf_reader, window, pages_dir):
    """Write the pages in ``window`` to ``pages_dir`` as main_pNN[-MM].pdf."""
    # File names use 1-based, zero-padded page numbers joined by "-".
    page_label = "-".join(f"{index + 1:02d}" for index in window)
    output_path = os.path.join(pages_dir, f"main_p{page_label}.pdf")

    pdf_writer = PyPDF2.PdfWriter()
    for index in window:
        pdf_writer.add_page(pdf_reader.pages[index])
    with open(output_path, "wb") as output_file:
        pdf_writer.write(output_file)
if __name__ == "__main__":
    # Usage examples appended to --help; RawDescriptionHelpFormatter keeps
    # the line breaks of this text exactly as written.
    usage_examples = """
Examples:
# Split into single-page documents (default)
%(prog)s /path/to/papers
%(prog)s /path/to/papers --pages 1
# Split into overlapping 2-page documents
%(prog)s /path/to/papers --pages 2
"""

    # Describe the command line interface.
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Split PDFs into single-page or overlapping 2-page documents",
        epilog=usage_examples,
    )
    # Optional positional: when omitted, split_pdfs receives None and
    # resolves the directory from the config itself.
    cli.add_argument(
        "directory",
        nargs="?",
        type=str,
        help="Directory containing paper subfolders (defaults to config)",
    )
    cli.add_argument(
        "--pages",
        choices=[1, 2],
        default=1,
        type=int,
        help="Number of pages per split: 1 for single-page (default), 2 for overlapping 2-page",
    )
    cli.add_argument(
        "--config",
        default=None,
        type=str,
        help="Path to config YAML file",
    )

    options = cli.parse_args()

    # Export the config path before split_pdfs runs, since split_pdfs only
    # imports the config module when it is called without a directory.
    config_path = options.config
    if config_path:
        os.environ["METABEEAI_CONFIG_FILE"] = config_path

    # Do the actual work, then report completion.
    split_pdfs(options.directory, pages_per_split=options.pages)
    print("Processing complete!")