From e1b06dd0cf911aa8473d764aa51312b3a1c18b92 Mon Sep 17 00:00:00 2001
From: sh20raj <shaswatraj3@gmail.com>
Date: Sat, 5 Apr 2025 11:30:10 +0530
Subject: [PATCH] Add biomedical multimodal dataset preparation tools for Gemma
 fine-tuning

This commit addresses issue #210 by providing tools for preparing biomedical multimodal datasets with text, images, tables, and formulas for Gemma fine-tuning.

- Add preprocess_pdfs.py for extracting content from PDFs
- Add create_dataset.py for structuring the dataset
- Add finetune_gemma.py for fine-tuning Gemma models
- Add comprehensive documentation and requirements

The solution enables users to convert biomedical PDFs to a format suitable for Gemma fine-tuning while preserving the semantic relationships between text and non-text elements.
---
 BIOMEDICAL_DATASET_README.md | 147 +++++++++++++++++++
 create_dataset.py            | 145 ++++++++++++++++++
 finetune_gemma.py            | 255 ++++++++++++++++++++++++++++++++
 preprocess_pdfs.py           | 277 +++++++++++++++++++++++++++++++++++
 requirements.txt             |  15 ++
 5 files changed, 839 insertions(+)
 create mode 100644 BIOMEDICAL_DATASET_README.md
 create mode 100644 create_dataset.py
 create mode 100644 finetune_gemma.py
 create mode 100644 preprocess_pdfs.py
 create mode 100644 requirements.txt

diff --git a/BIOMEDICAL_DATASET_README.md b/BIOMEDICAL_DATASET_README.md
new file mode 100644
index 00000000..31c9457b
--- /dev/null
+++ b/BIOMEDICAL_DATASET_README.md
@@ -0,0 +1,147 @@
+# Biomedical Multimodal Dataset Preparation for Gemma Fine-tuning
+
+This repository contains tools and scripts for preparing biomedical multimodal datasets (text + images/tables/formulas) for fine-tuning Gemma models.
+
+## Overview
+
+The process involves three main steps:
+
+1. **Preprocessing PDFs**: Extract text, images, tables, and formulas from biomedical PDFs
+2. **Creating a Dataset**: Structure the extracted content into a format suitable for Gemma fine-tuning
+3. **Fine-tuning Gemma**: Fine-tune a Gemma model on the prepared dataset
+
+## Requirements
+
+Install the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## 1. Preprocessing PDFs
+
+The `preprocess_pdfs.py` script extracts content from PDFs and converts it to appropriate formats:
+
+- **Text**: Extracted as Markdown
+- **Images**: Extracted in the format embedded in the PDF (typically PNG or JPEG)
+- **Tables**: Converted to Markdown format
+- **Formulas**: Converted to LaTeX format
+
+### Usage
+
+```bash
+python preprocess_pdfs.py --input_dir /path/to/pdfs --output_dir /path/to/preprocessed
+```
+
+### Output Structure
+
+```
+/path/to/preprocessed/
+├── document1/
+│   ├── document1.md
+│   ├── images/
+│   │   ├── img_1_1.png
+│   │   ├── img_1_2.png
+│   │   └── ...
+│   ├── tables/
+│   │   ├── table_1.md
+│   │   ├── table_2.md
+│   │   └── ...
+│   └── formulas/
+│       ├── formula_1_1.tex
+│       ├── formula_1_1.png
+│       └── ...
+└── document2/
+    └── ...
+```
+
+## 2. Creating a Dataset
+
+The `create_dataset.py` script structures the preprocessed content into a format suitable for Gemma fine-tuning:
+
+### Usage
+
+```bash
+python create_dataset.py --input_dir /path/to/preprocessed --output_dir /path/to/dataset
+```
+
+### Output Structure
+
+```
+/path/to/dataset/
+├── train/
+│   └── data.json
+├── validation/
+│   └── data.json
+└── images/
+    ├── img_1_1.png
+    ├── img_1_2.png
+    └── ...
+```
+
+### Dataset Format
+
+The dataset is structured as a JSON file with the following format:
+
+```json
+[
+  {
+    "input_text": "Text with <start_of_image> tags and LaTeX formulas",
+    "output_text": "Expected output for instruction tuning (create_dataset.py leaves it empty)",
+    "images": ["images/<image1>.png", "images/<image2>.png"]
+  },
+  ...
+]
+```
+
+## 3. Fine-tuning Gemma
+
+The `finetune_gemma.py` script fine-tunes a Gemma model on the prepared dataset:
+
+### Usage
+
+```bash
+python finetune_gemma.py --dataset_dir /path/to/dataset --output_dir /path/to/model
+```
+
+## Best Practices
+
+### Handling Images
+
+- Use `<start_of_image>` tags in the text to indicate where images should appear
+- Ensure images are properly sized (800x800 pixels is recommended)
+- Include descriptive alt text or captions for images
+
+### Handling Tables
+
+- Use Markdown table format to preserve structure
+- Keep tables simple and well-formatted
+- Ensure column headers are clear and descriptive
+
+### Handling Formulas
+
+- Use LaTeX format for mathematical formulas
+- Enclose inline formulas with single dollar signs: `$formula$`
+- Enclose block formulas with double dollar signs: `$$formula$$`
+
+### Dataset Preparation
+
+- Balance the dataset with a variety of content types
+- Ensure high-quality text and image content
+- Provide clear instruction-response pairs for fine-tuning
+- Split the dataset into training and validation sets (90/10 split is recommended)
+
+## Limitations and Considerations
+
+- Complex tables may not be perfectly preserved in Markdown format
+- Formula extraction accuracy depends on the quality of the PDF
+- Very large images may need to be resized or split
+- Some special characters in LaTeX formulas may require escaping
+
+## Requirements File
+
+A `requirements.txt` file is included with all necessary dependencies.
+
+## License
+
+This project is licensed under the Apache License 2.0 - see the LICENSE file for details.
diff --git a/create_dataset.py b/create_dataset.py
new file mode 100644
index 00000000..c0abef67
--- /dev/null
+++ b/create_dataset.py
@@ -0,0 +1,145 @@
+#!/usr/bin/env python3
+"""
+Create a dataset for Gemma fine-tuning from preprocessed biomedical PDFs.
+
+This script creates a dataset structure suitable for Gemma fine-tuning from
+preprocessed biomedical PDFs.
+
+Usage:
+    python create_dataset.py --input_dir /path/to/preprocessed --output_dir /path/to/dataset
+"""
+
+import argparse
+import os
+import json
+import random
+import shutil
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+
+def create_training_example(markdown_path: str, image_map: Dict[str, str]) -> List[Dict[str, Any]]:
+    """
+    Create one training example per page section of a preprocessed markdown file.
+
+    The file is split on the "## Page" headers and every non-empty section becomes
+    an example. Images are attached only to sections holding a "<start_of_image>"
+    tag. An image belongs to a section when its ID occurs in the section text or,
+    since the tags themselves carry no ID, when the ID starts with "img_<N>_" and
+    <N> is the number opening the section (the tail of its "## Page <N>" header).
+
+    Args:
+        markdown_path: Path to the markdown file
+        image_map: Dictionary mapping image IDs to file paths
+
+    Returns:
+        List of dicts with the keys "input_text", "output_text" and "images"
+    """
+    # Read as UTF-8 so non-ASCII symbols do not depend on the platform's locale
+    with open(markdown_path, "r", encoding="utf-8") as md_file:
+        markdown_content = md_file.read()
+
+    examples = []
+    for section in markdown_content.split("## Page"):
+        if not section.strip():
+            continue
+        section_images = []
+        if "<start_of_image>" in section:
+            # First whitespace-separated token; the page number for "## Page <N>"
+            head = section.split(None, 1)[0]
+            page_prefix = f"img_{head}_" if head.isdigit() else None
+            matched_ids = [
+                image_id for image_id in image_map
+                if image_id in section
+                or (page_prefix is not None and image_id.startswith(page_prefix))
+            ]
+            # Shorter IDs first, then alphabetical: img_1_2 precedes img_1_10
+            matched_ids.sort(key=lambda image_id: (len(image_id), image_id))
+            section_images = [image_map[image_id] for image_id in matched_ids]
+
+        examples.append({
+            "input_text": section.strip(),
+            "output_text": "",  # To be filled with the expected output for instruction tuning
+            "images": section_images,
+        })
+
+    return examples
+
+def create_dataset(input_dir: str, output_dir: str, split_ratio: float = 0.9) -> None:
+    """
+    Create a dataset for Gemma fine-tuning.
+
+    Every sub-directory of input_dir that holds a markdown file contributes its
+    examples. Referenced images are copied to <output_dir>/images with the
+    sub-directory name as a prefix, because image file names repeat across
+    documents (each one has its own img_1_1) and would otherwise overwrite
+    each other. The shuffled examples are written to train/data.json and
+    validation/data.json with image paths relative to output_dir.
+
+    Args:
+        input_dir: Directory containing preprocessed content
+        output_dir: Directory to save the dataset
+        split_ratio: Train/validation split ratio
+    """
+    # Create output directories
+    image_out_dir = os.path.join(output_dir, "images")
+    for sub_dir in ("train", "validation", "images"):
+        os.makedirs(os.path.join(output_dir, sub_dir), exist_ok=True)
+
+    all_examples = []
+
+    # Sorted so the processing order does not depend on the file system
+    for dir_name in sorted(os.listdir(input_dir)):
+        dir_path = os.path.join(input_dir, dir_name)
+        if not os.path.isdir(dir_path):
+            continue
+
+        # Find markdown file
+        markdown_files = sorted(f for f in os.listdir(dir_path) if f.endswith(".md"))
+        if not markdown_files:
+            continue
+        markdown_path = os.path.join(dir_path, markdown_files[0])
+
+        # Build the image map (image ID -> path) from the images sub-directory
+        image_dir = os.path.join(dir_path, "images")
+        image_map = {}
+        if os.path.isdir(image_dir):
+            for f in os.listdir(image_dir):
+                if f.endswith((".png", ".jpg", ".jpeg")):
+                    image_map[os.path.splitext(f)[0]] = os.path.join(image_dir, f)
+
+        examples = create_training_example(markdown_path, image_map)
+
+        # Copy this document's images and point the examples at the copies
+        for example in examples:
+            for i, img_path in enumerate(example["images"]):
+                img_filename = f"{dir_name}_{os.path.basename(img_path)}"
+                new_img_path = os.path.join(image_out_dir, img_filename)
+                shutil.copy(img_path, new_img_path)
+                example["images"][i] = os.path.relpath(new_img_path, output_dir)
+        all_examples.extend(examples)
+
+    # Shuffle, then split into train and validation sets
+    random.shuffle(all_examples)
+    split_idx = int(len(all_examples) * split_ratio)
+    train_examples = all_examples[:split_idx]
+    val_examples = all_examples[split_idx:]
+
+    # Save train and validation sets
+    with open(os.path.join(output_dir, "train", "data.json"), "w", encoding="utf-8") as f:
+        json.dump(train_examples, f, indent=2)
+    with open(os.path.join(output_dir, "validation", "data.json"), "w", encoding="utf-8") as f:
+        json.dump(val_examples, f, indent=2)
+
+    print(f"Created dataset with {len(train_examples)} training examples and {len(val_examples)} validation examples")
+
+def main():
+    """Parse the command-line arguments and build the dataset."""
+    cli = argparse.ArgumentParser(description="Create a dataset for Gemma fine-tuning")
+    cli.add_argument("--input_dir", required=True, help="Directory containing preprocessed content")
+    cli.add_argument("--output_dir", required=True, help="Directory to save the dataset")
+    cli.add_argument("--split_ratio", type=float, default=0.9, help="Train/validation split ratio")
+    opts = cli.parse_args()
+    create_dataset(opts.input_dir, opts.output_dir, opts.split_ratio)
+
+if __name__ == "__main__":
+    main()
diff --git a/finetune_gemma.py b/finetune_gemma.py
new file mode 100644
index 00000000..92f79e29
--- /dev/null
+++ b/finetune_gemma.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python3
+"""
+Fine-tune Gemma on a biomedical multimodal dataset.
+
+This script fine-tunes Gemma on a biomedical multimodal dataset with images, tables, and formulas.
+
+Usage:
+    python finetune_gemma.py --dataset_dir /path/to/dataset --output_dir /path/to/output
+"""
+
+import argparse
+import os
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+
+import jax.numpy as jnp
+import numpy as np
+from PIL import Image
+
+from gemma import gm
+from kauldron import kd
+import optax
+
+class BiomedicalDataset(kd.data.Dataset):
+    """Dataset for biomedical multimodal data.
+
+    Each item holds the example's images (resized to 800x800 RGB and stacked)
+    plus either the tokenized model inputs or, in sampling mode, the raw text.
+    """
+
+    def __init__(
+        self,
+        data_path: str,
+        image_dir: str,
+        tokenizer: gm.text.Tokenizer,
+        max_length: int = 512,
+        training: bool = True,
+        sampling: bool = False,
+    ):
+        """Initialize the dataset.
+
+        Args:
+            data_path: Path to the JSON data file
+            image_dir: Directory containing images
+            tokenizer: Tokenizer to use
+            max_length: Maximum sequence length
+            training: Whether this is a training dataset
+            sampling: Whether this is for sampling
+        """
+        self.data_path = data_path
+        self.image_dir = image_dir
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+        self.training = training
+        self.sampling = sampling
+
+        # Load data
+        with open(data_path, "r", encoding="utf-8") as f:
+            self.data = json.load(f)
+
+    def __len__(self):
+        return len(self.data)
+
+    def _resolve_image_path(self, img_path: str):
+        """Return the first existing location of img_path, or None if there is none."""
+        # create_dataset.py stores paths relative to the dataset root ("images/x.png")
+        # while image_dir already points at its images folder, so try both bases.
+        for base_dir in (self.image_dir, os.path.dirname(os.path.normpath(self.image_dir))):
+            candidate = os.path.join(base_dir, img_path)
+            if os.path.exists(candidate):
+                return candidate
+        return None
+
+    def __getitem__(self, idx):
+        """Return the model inputs, or the raw prompt/response when sampling."""
+        example = self.data[idx]
+        input_text = example["input_text"]
+        output_text = example["output_text"]
+
+        # Load the images that can be found on disk
+        images = []
+        for img_path in example["images"]:
+            full_img_path = self._resolve_image_path(img_path)
+            if full_img_path is None:
+                continue
+            # The context manager closes the file; Pillow resizes to 800x800
+            with Image.open(full_img_path) as img_file:
+                images.append(np.array(img_file.convert("RGB").resize((800, 800))))
+
+        # If no images, add a dummy image to maintain batch structure
+        if not images:
+            images = [np.zeros((800, 800, 3), dtype=np.uint8)]
+        # NOTE(review): the leading axis still varies with the image count of the
+        # example - confirm the loader can batch such items.
+        images = np.stack(images, axis=0)
+
+        # If sampling, return the raw text and images
+        if self.sampling:
+            return {"prompt": input_text, "response": output_text, "image": images}
+
+        # Tokenize; the response ends with EOS so the model learns where to stop
+        prompt = self.tokenizer.encode(input_text, add_bos=True)
+        response = self.tokenizer.encode(output_text, add_eos=True)
+
+        # Create the model inputs/targets/loss_mask, padded/truncated to max_length
+        seq2seq_fields = gm.data._functional.make_seq2seq_fields(prompt=prompt, response=response)
+        seq2seq_fields = gm.data._functional.pad(seq2seq_fields, max_length=self.max_length, truncate=True)
+
+        return {
+            "input": seq2seq_fields.input,
+            "target": seq2seq_fields.target,
+            "loss_mask": seq2seq_fields.target_mask,
+            "image": images,
+        }
+
+def get_config(dataset_dir: str, output_dir: str):
+    """Build the trainer that fine-tunes Gemma 3 4B on the prepared dataset.
+
+    Three views of the data are created: the training split, the validation
+    split, and the validation split again in sampling mode (raw prompt and
+    response), each wrapped in its own loader.
+
+    Args:
+        dataset_dir: Directory containing the dataset
+        output_dir: Directory to save the output
+
+    Returns:
+        Training configuration
+    """
+    max_length = 512
+    batch_size = 8  # Smaller batch size due to multimodal data
+
+    # One tokenizer instance is shared by the three datasets
+    tokenizer = gm.text.Gemma3Tokenizer()
+
+    image_dir = os.path.join(dataset_dir, "images")
+    train_data_path = os.path.join(dataset_dir, "train", "data.json")
+    val_data_path = os.path.join(dataset_dir, "validation", "data.json")
+
+    def build_dataset(data_path, **flags):
+        # Only the data file and the training/sampling flags differ per dataset
+        return BiomedicalDataset(
+            data_path=data_path,
+            image_dir=image_dir,
+            tokenizer=tokenizer,
+            max_length=max_length,
+            **flags,
+        )
+
+    def build_loader(dataset, per_batch, shuffle, workers):
+        return kd.data.PyLoader(
+            dataset=dataset,
+            batch_size=per_batch,
+            shuffle=shuffle,
+            num_workers=workers,
+        )
+
+    # Datasets, constructed in the order train, validation, sampling
+    train_ds = build_dataset(train_data_path, training=True)
+    val_ds = build_dataset(val_data_path, training=False)
+    sampling_ds = build_dataset(val_data_path, training=False, sampling=True)
+
+    # Loaders; sampling goes one example at a time with a single worker
+    train_loader = build_loader(train_ds, batch_size, True, 4)
+    val_loader = build_loader(val_ds, batch_size, False, 4)
+    sampling_loader = build_loader(sampling_ds, 1, False, 1)
+
+    # Model definition
+    model = gm.nn.Gemma3_4B(
+        tokens="batch.input",
+        images="batch.image",
+    )
+
+    # Load the weights from the pretrained checkpoint
+    init_transform = gm.ckpts.LoadCheckpoint(
+        path=gm.ckpts.CheckpointPath.GEMMA3_4B_IT,
+    )
+
+    # Cross-entropy of the predicted logits against the targets, with the batch's loss mask
+    train_losses = {
+        "xentropy": kd.losses.SoftmaxCrossEntropyWithIntLabels(
+            logits="preds.logits",
+            labels="batch.target",
+            mask="batch.loss_mask",
+        ),
+    }
+
+    train_summaries = {
+        "image": kd.summaries.ShowImages(images="batch.image", num_images=5),
+    }
+
+    # Lower learning rate for fine-tuning
+    optimizer = optax.adafactor(learning_rate=1e-5)
+
+    checkpointer = kd.ckpts.Checkpointer(
+        save_interval_steps=500,
+        workdir=output_dir,
+    )
+
+    # Evaluation
+    validation_eval = kd.evals.Evaluator(
+        run=kd.evals.EveryNSteps(500),
+        ds=val_loader,
+    )
+
+    # The sampler evaluator runs inference on a few prompts from the
+    # validation set.
+    sampling_eval = gm.evals.SamplerEvaluator(
+        run=kd.evals.EveryNSteps(500),
+        max_new_tokens=100,  # Sampling parameters
+        num_batches=3,
+        ds=sampling_loader,
+        summaries={
+            "image": kd.summaries.ShowImages(
+                images="batch.image", num_images=5
+            ),
+        },
+    )
+
+    return kd.train.Trainer(
+        seed=42,
+        train_ds=train_loader,
+        model=model,
+        init_transform=init_transform,
+        num_train_steps=5000,  # Adjust based on dataset size
+        train_losses=train_losses,
+        train_summaries=train_summaries,
+        optimizer=optimizer,
+        checkpointer=checkpointer,
+        evals={
+            "validation": validation_eval,
+            "sampling": sampling_eval,
+        },
+    )
+
+def main():
+    """Command-line entry point: parse the arguments, build the trainer and run it."""
+    parser = argparse.ArgumentParser(description="Fine-tune Gemma on a biomedical multimodal dataset")
+    parser.add_argument("--dataset_dir", required=True, help="Directory containing the dataset")
+    parser.add_argument("--output_dir", required=True, help="Directory to save the output")
+    args = parser.parse_args()
+
+    # Create output directory
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    trainer = get_config(args.dataset_dir, args.output_dir)
+
+    # get_config returns a ready kd.train.Trainer; its train() method runs the loop
+    print("Starting fine-tuning...")
+    trainer.train()
+    print("Fine-tuning complete!")
+
+if __name__ == "__main__":
+    main()
diff --git a/preprocess_pdfs.py b/preprocess_pdfs.py
new file mode 100644
index 00000000..b8be8f8f
--- /dev/null
+++ b/preprocess_pdfs.py
@@ -0,0 +1,277 @@
+#!/usr/bin/env python3
+"""
+Preprocess biomedical PDFs for Gemma fine-tuning.
+
+This script extracts text, images, tables, and formulas from PDFs and converts them
+to appropriate formats for Gemma fine-tuning.
+
+Usage:
+    python preprocess_pdfs.py --input_dir /path/to/pdfs --output_dir /path/to/output
+"""
+
+import argparse
+import os
+import re
+import uuid
+from pathlib import Path
+from typing import Dict, List, Tuple, Any
+
+import fitz  # PyMuPDF
+import numpy as np
+from PIL import Image
+import camelot
+import pytesseract
+from pix2tex.cli import LatexOCR
+
+# LaTeX OCR model used by extract_formulas; instantiated once when the module is imported
+latex_ocr = LatexOCR()
+
+def extract_images(pdf_path: str, output_dir: str) -> Dict[str, str]:
+    """
+    Extract images from PDF and save them to output directory.
+
+    Every image listed for a page is written to <output_dir>/images with the
+    extension reported for it, named "img_<page>_<index>" (both 1-based).
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save extracted images
+
+    Returns:
+        Dictionary mapping image IDs to file paths
+    """
+    image_dir = os.path.join(output_dir, "images")
+    os.makedirs(image_dir, exist_ok=True)
+
+    image_map = {}
+
+    # The context manager closes the document even if an image fails to extract
+    with fitz.open(pdf_path) as doc:
+        for page_num, page in enumerate(doc, start=1):
+            for img_idx, img_info in enumerate(page.get_images(full=True), start=1):
+                # The first field of an image entry is its xref number
+                xref = img_info[0]
+                base_image = doc.extract_image(xref)
+
+                # The ID encodes page and position, e.g. img_3_2
+                image_id = f"img_{page_num}_{img_idx}"
+                image_filename = f"{image_id}.{base_image['ext']}"
+                image_path = os.path.join(image_dir, image_filename)
+
+                with open(image_path, "wb") as img_file:
+                    img_file.write(base_image["image"])
+
+                image_map[image_id] = image_path
+
+    return image_map
+
+def extract_tables(pdf_path: str, output_dir: str) -> Dict[str, str]:
+    """
+    Extract tables from PDF and convert to Markdown format.
+
+    Tables are detected with Camelot's lattice flavor on all pages, numbered in
+    the order Camelot returns them, and also saved to <output_dir>/tables.
+    Extraction is best effort: on an error it is reported on stdout and the
+    tables converted so far are returned.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save extracted tables
+
+    Returns:
+        Dictionary mapping table IDs to Markdown content
+    """
+    table_dir = os.path.join(output_dir, "tables")
+    os.makedirs(table_dir, exist_ok=True)
+
+    table_map = {}
+
+    try:
+        tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')
+
+        for table_idx, table in enumerate(tables, start=1):
+            table_id = f"table_{table_idx}"
+            markdown_table = table.df.to_markdown(index=False)
+            table_map[table_id] = markdown_table
+
+            # Write as UTF-8: cell text is often non-ASCII and an encoding
+            # error here would abort the extraction of the remaining tables
+            table_path = os.path.join(table_dir, f"{table_id}.md")
+            with open(table_path, "w", encoding="utf-8") as table_file:
+                table_file.write(markdown_table)
+    except Exception as e:
+        print(f"Error extracting tables: {e}")
+
+    return table_map
+
+def extract_formulas(pdf_path: str, output_dir: str) -> Dict[str, str]:
+    """
+    Extract mathematical formulas from PDF and convert to LaTeX.
+
+    Each page is rendered at 2x zoom and OCR'd word by word. A word containing
+    one of the indicator characters is cropped with a small margin and passed
+    to the LaTeX OCR model. This is a simplified heuristic - ordinary words
+    that merely contain "-" or "=" are picked up as well.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save extracted formulas
+
+    Returns:
+        Dictionary mapping formula IDs to LaTeX content
+    """
+    formula_dir = os.path.join(output_dir, "formulas")
+    os.makedirs(formula_dir, exist_ok=True)
+
+    # Characters whose presence marks an OCR word as a potential formula
+    formula_indicators = ("=", "+", "-", "∫", "∑", "∏", "√", "α", "β", "γ", "δ")
+
+    formula_map = {}
+
+    # The context manager closes the document once all pages are processed
+    with fitz.open(pdf_path) as doc:
+        for page_num, page in enumerate(doc, start=1):
+            # Render the page as an image
+            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
+            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+            # Use OCR to identify potential formula regions
+            ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
+
+            potential_formulas = []
+            for i, word in enumerate(ocr["text"]):
+                if not any(indicator in word for indicator in formula_indicators):
+                    continue
+                # Expand the word box by about 20 pixels per side, kept inside the page
+                x = max(0, ocr["left"][i] - 20)
+                y = max(0, ocr["top"][i] - 20)
+                w = min(pix.width - x, ocr["width"][i] + 40)
+                h = min(pix.height - y, ocr["height"][i] + 40)
+                potential_formulas.append(img.crop((x, y, x + w, y + h)))
+
+            # Process potential formulas
+            for idx, formula_img in enumerate(potential_formulas, start=1):
+                try:
+                    # Use LaTeX OCR to convert image to LaTeX
+                    latex = latex_ocr(formula_img)
+
+                    formula_id = f"formula_{page_num}_{idx}"
+                    formula_map[formula_id] = latex
+
+                    # Save the formula as LaTeX (UTF-8, independent of the locale)
+                    formula_path = os.path.join(formula_dir, f"{formula_id}.tex")
+                    with open(formula_path, "w", encoding="utf-8") as formula_file:
+                        formula_file.write(latex)
+
+                    # Also save the formula image for reference
+                    formula_img_path = os.path.join(formula_dir, f"{formula_id}.png")
+                    formula_img.save(formula_img_path)
+                except Exception as e:
+                    print(f"Error processing formula: {e}")
+
+    return formula_map
+
+def extract_text_with_references(pdf_path: str, image_map: Dict[str, str],
+                                table_map: Dict[str, str], formula_map: Dict[str, str]) -> str:
+    """
+    Extract text from PDF with references to images, tables, and formulas.
+
+    Per page: a "## Page N" header, the text, one "<start_of_image>" tag per image and
+    the formulas as "$...$". A table is placed on its page if its ID names the page
+    ("table_<page>_<n>"); other tables are appended once after the last page.
+
+    Args:
+        pdf_path: Path to the PDF file
+        image_map: Dictionary mapping image IDs to file paths
+        table_map: Dictionary mapping table IDs to Markdown content
+        formula_map: Dictionary mapping formula IDs to LaTeX content
+
+    Returns:
+        Markdown text with references to images, tables, and formulas
+    """
+    parts = []
+    placed_tables = set()
+
+    with fitz.open(pdf_path) as doc:
+        for page_num, page in enumerate(doc, start=1):
+            page_tag = f"_{page_num}_"
+
+            # Page header followed by the page text
+            parts.append(f"## Page {page_num}\n\n" + page.get_text("text") + "\n\n")
+
+            # One image tag per image found on this page
+            parts.extend("<start_of_image>\n\n" for image_id in image_map if page_tag in image_id)
+
+            # Tables whose ID carries this page number
+            for table_id, table_content in table_map.items():
+                if page_tag in table_id:
+                    placed_tables.add(table_id)
+                    parts.append(f"\n\n{table_content}\n\n")
+
+            # Formulas found on this page
+            parts.extend(f"\n\n${latex}$\n\n" for fid, latex in formula_map.items() if page_tag in fid)
+
+    # Tables without page information appear once instead of on every page
+    parts.extend(f"\n\n{md}\n\n" for tid, md in table_map.items() if tid not in placed_tables)
+
+    return "".join(parts)
+
+def preprocess_pdf(pdf_path: str, output_dir: str) -> Dict[str, Any]:
+    """
+    Preprocess a single PDF file.
+
+    The results are stored in <output_dir>/<PDF name without extension>/.
+
+    Args:
+        pdf_path: Path to the PDF file
+        output_dir: Directory to save preprocessed content
+
+    Returns:
+        Dictionary with preprocessed content
+    """
+    # Strip only the extension; str.replace would also drop ".pdf" inside the name
+    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
+    pdf_output_dir = os.path.join(output_dir, pdf_name)
+    os.makedirs(pdf_output_dir, exist_ok=True)
+
+    image_map = extract_images(pdf_path, pdf_output_dir)
+    table_map = extract_tables(pdf_path, pdf_output_dir)
+    formula_map = extract_formulas(pdf_path, pdf_output_dir)
+    markdown_text = extract_text_with_references(pdf_path, image_map, table_map, formula_map)
+
+    # Save markdown text as UTF-8 so the result does not depend on the locale
+    markdown_path = os.path.join(pdf_output_dir, f"{pdf_name}.md")
+    with open(markdown_path, "w", encoding="utf-8") as md_file:
+        md_file.write(markdown_text)
+
+    return {
+        "pdf_path": pdf_path,
+        "markdown_path": markdown_path,
+        "image_map": image_map,
+        "table_map": table_map,
+        "formula_map": formula_map,
+    }
+
+def main():
+    """Preprocess every ".pdf" file found directly inside --input_dir."""
+    cli = argparse.ArgumentParser(description="Preprocess biomedical PDFs for Gemma fine-tuning")
+    cli.add_argument("--input_dir", required=True, help="Directory containing PDF files")
+    cli.add_argument("--output_dir", required=True, help="Directory to save preprocessed content")
+    opts = cli.parse_args()
+
+    os.makedirs(opts.output_dir, exist_ok=True)
+
+    # Collect the PDF paths up front, then process them one by one
+    pdf_files = []
+    for entry in os.listdir(opts.input_dir):
+        if entry.endswith(".pdf"):
+            pdf_files.append(os.path.join(opts.input_dir, entry))
+
+    results = []
+    for pdf_path in pdf_files:
+        print(f"Processing {pdf_path}...")
+        results.append(preprocess_pdf(pdf_path, opts.output_dir))
+    print(f"Processed {len(results)} PDF files")
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..ac597cc5
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,15 @@
+gemma>=0.1.0
+kauldron>=0.1.0
+jax>=0.4.20
+jaxlib>=0.4.20
+numpy>=1.24.0
+pillow>=10.0.0
+pymupdf>=1.23.0  # For PDF processing (PyMuPDF)
+camelot-py>=0.11.0  # For table extraction
+pytesseract>=0.3.10  # For OCR
+pix2tex>=0.1.0  # For LaTeX formula extraction
+opencv-python>=4.8.0  # Required by camelot
+ghostscript>=0.7  # Required by camelot
+optax>=0.1.7  # For optimization
+pandas>=2.0.0  # For table handling
+tabulate>=0.9.0  # For Markdown table conversion