# Source code for metabeeai.process_pdfs.va_process_papers
import argparse
import os
import time
from datetime import datetime
import requests
from dotenv import load_dotenv
from metabeeai.config import get_config_param
# [docs]
def process_papers(papers_dir=None, start_folder=None, request_timeout=600):
    """
    Process papers in the specified directory using Vision Agentic Document Analysis.

    Every subfolder of ``papers_dir`` is visited in sorted order. For each PDF in the
    subfolder's ``pages`` directory the file is POSTed to the analysis endpoint and the
    response body is stored next to it as ``<page file name>.json``. Pages that already
    have such a JSON file are skipped, so an interrupted run can simply be restarted.
    Progress and errors are printed and appended to a timestamped log file created
    inside ``papers_dir``.

    Args:
        papers_dir: Directory containing paper subfolders (defaults to config)
        start_folder: Optional folder name to start processing from (alphanumeric ordering)
        request_timeout: Seconds to wait for the API before giving up on a page; passed
            straight to ``requests.post`` (``None`` waits indefinitely)

    Returns:
        None. A failure on a single page is logged and processing continues.
    """
    # Load .env before any config lookup so values defined there are already in the
    # environment (main() hands the config path to lookups through an environment variable).
    load_dotenv()

    # Resolve papers_dir from config if not provided
    if papers_dir is None:
        papers_dir = get_config_param("papers_dir")

    # isdir (not exists) so that a regular file is rejected here instead of crashing listdir
    if not papers_dir or not os.path.isdir(papers_dir):
        print(f"Error: Directory '{papers_dir}' does not exist")
        return

    # Sort alphanumerically (e.g., 6fhek9 comes before 6pafhf)
    subfolders = sorted(
        name for name in os.listdir(papers_dir) if os.path.isdir(os.path.join(papers_dir, name))
    )

    url = "https://api.va.landing.ai/v1/tools/agentic-document-analysis"

    # Create log file with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file_path = os.path.join(papers_dir, f"processing_log_{timestamp}.txt")

    def log_message(message):
        """Write message to both console and log file"""
        print(message)
        # Explicit encoding: messages embed file names and server error text
        with open(log_file_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {message}\n")

    # If start_folder is specified, drop every folder that sorts before it
    if start_folder:
        start_idx = next((i for i, folder in enumerate(subfolders) if folder >= start_folder), len(subfolders))
        if start_idx == len(subfolders):
            log_message(f"Warning: Start folder '{start_folder}' not found or comes after all existing folders")
        subfolders = subfolders[start_idx:]

    log_message(f"Starting processing in directory: {papers_dir}")
    log_message(f"Found {len(subfolders)} folders to process")

    # Built on first use and then reused; the lookup stays inside the per-page
    # try block so a configuration problem is logged rather than raised.
    auth_headers = None

    # Process each subfolder in alphanumeric order
    for subfolder in subfolders:
        log_message(f"\nProcessing subfolder: {subfolder}")
        pages_path = os.path.join(papers_dir, subfolder, "pages")

        if not os.path.isdir(pages_path):
            log_message(f"Pages directory not found at {pages_path}, skipping...")
            continue

        # Handle both single-page (main_p01.pdf) and 2-page (main_p01-02.pdf) formats;
        # names without a page marker sort last instead of aborting the run.
        page_files = sorted(
            (f for f in os.listdir(pages_path) if f.endswith(".pdf")),
            key=_page_sort_key,
        )
        if not page_files:
            log_message(f"No PDF files found in {pages_path}, skipping...")
            continue

        for page_file in page_files:
            json_path = os.path.join(pages_path, f"{page_file}.json")
            if os.path.exists(json_path):
                log_message(f"JSON file already exists for {page_file}, skipping...")
                continue

            file_path = os.path.join(pages_path, page_file)
            start_time = time.time()
            try:
                if auth_headers is None:
                    landing_api_key = get_config_param("landing_api_key")
                    auth_headers = {"Authorization": f"Basic {landing_api_key}"}

                with open(file_path, "rb") as pdf_file:
                    response = requests.post(
                        url,
                        files={"pdf": pdf_file},
                        headers=auth_headers,
                        timeout=request_timeout,  # a stalled connection must not hang the batch
                    )
                response.raise_for_status()

                processing_time = time.time() - start_time

                # Write to a sibling temp file and rename: a half-written JSON at the
                # final path would be mistaken for a finished page on the next run.
                tmp_path = f"{json_path}.tmp"
                with open(tmp_path, "w", encoding="utf-8") as out_file:
                    out_file.write(response.text)
                os.replace(tmp_path, json_path)

                log_message(f"Successfully processed {page_file} in {processing_time:.2f} seconds")
            except Exception as e:
                # Per-page boundary: record the failure and move on to the next page
                processing_time = time.time() - start_time
                log_message(f"Error processing {page_file} after {processing_time:.2f} seconds: {str(e)}")


def _page_sort_key(filename):
    """Return a sort key that orders page PDFs by the number after the last ``_p``.

    ``main_p01.pdf`` and ``main_p01-02.pdf`` both yield page 1. File names without a
    parsable page number sort after all numbered pages, ordered by name.
    """
    # rpartition: use the last "_p" so stems that contain "_p" themselves still parse
    _, marker, tail = filename.rpartition("_p")
    if marker:
        leading = tail.split("-")[0].split(".")[0]
        try:
            return (0, int(leading), filename)
        except ValueError:
            pass
    return (1, 0, filename)
def main():
    """Command-line entry point: parse the options and hand off to process_papers.

    Recognised options are --config (path to a YAML config file), --dir (papers
    directory) and --start (folder name to start from). When --config is given, its
    value is exported as METABEEAI_CONFIG_FILE before processing starts.
    """
    usage_examples = """
Examples:
# Process all papers in default directory (data/papers)
%(prog)s
# Process papers in a specific directory
%(prog)s --dir path/to/papers
# Process starting from a specific folder (alphanumeric)
%(prog)s --start 95UKMIEY
%(prog)s --dir data/papers --start CX9M8HCM
"""
    cli = argparse.ArgumentParser(
        description="Process PDF papers from a specific directory and/or starting folder (alphanumeric ordering)",
        epilog=usage_examples,
        # Raw formatter keeps the line breaks of the examples above
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # All three options are optional strings that default to None
    option_specs = (
        ("--config", "Path to config YAML file"),
        ("--dir", "Papers directory (default: from config)"),
        ("--start", "Starting folder name (alphanumeric, e.g., 95UKMIEY, CX9M8HCM)"),
    )
    for flag, help_text in option_specs:
        cli.add_argument(flag, type=str, default=None, help=help_text)

    options = cli.parse_args()

    # Surface config file to all lookups if provided
    config_path = options.config
    if config_path:
        os.environ["METABEEAI_CONFIG_FILE"] = config_path

    process_papers(options.dir, options.start)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()