From e1b06dd0cf911aa8473d764aa51312b3a1c18b92 Mon Sep 17 00:00:00 2001 From: sh20raj Date: Sat, 5 Apr 2025 11:30:10 +0530 Subject: [PATCH] Add biomedical multimodal dataset preparation tools for Gemma fine-tuning This commit addresses issue #210 by providing tools for preparing biomedical multimodal datasets with text, images, tables, and formulas for Gemma fine-tuning. - Add preprocess_pdfs.py for extracting content from PDFs - Add create_dataset.py for structuring the dataset - Add finetune_gemma.py for fine-tuning Gemma models - Add comprehensive documentation and requirements The solution enables users to convert biomedical PDFs to a format suitable for Gemma fine-tuning while preserving the semantic relationships between text and non-text elements. --- BIOMEDICAL_DATASET_README.md | 147 +++++++++++++++++++ create_dataset.py | 145 ++++++++++++++++++ finetune_gemma.py | 255 ++++++++++++++++++++++++++++++++ preprocess_pdfs.py | 277 +++++++++++++++++++++++++++++++++++ requirements.txt | 15 ++ 5 files changed, 839 insertions(+) create mode 100644 BIOMEDICAL_DATASET_README.md create mode 100644 create_dataset.py create mode 100644 finetune_gemma.py create mode 100644 preprocess_pdfs.py create mode 100644 requirements.txt diff --git a/BIOMEDICAL_DATASET_README.md b/BIOMEDICAL_DATASET_README.md new file mode 100644 index 00000000..31c9457b --- /dev/null +++ b/BIOMEDICAL_DATASET_README.md @@ -0,0 +1,147 @@ +# Biomedical Multimodal Dataset Preparation for Gemma Fine-tuning + +This repository contains tools and scripts for preparing biomedical multimodal datasets (text + images/tables/formulas) for fine-tuning Gemma models. + +## Overview + +The process involves three main steps: + +1. **Preprocessing PDFs**: Extract text, images, tables, and formulas from biomedical PDFs +2. **Creating a Dataset**: Structure the extracted content into a format suitable for Gemma fine-tuning +3. 
**Fine-tuning Gemma**: Fine-tune a Gemma model on the prepared dataset + +## Requirements + +Install the required dependencies: + +```bash +pip install -r requirements.txt +``` + +## 1. Preprocessing PDFs + +The `preprocess_pdfs.py` script extracts content from PDFs and converts it to appropriate formats: + +- **Text**: Extracted as Markdown +- **Images**: Extracted as PNG/JPEG files +- **Tables**: Converted to Markdown format +- **Formulas**: Converted to LaTeX format + +### Usage + +```bash +python preprocess_pdfs.py --input_dir /path/to/pdfs --output_dir /path/to/preprocessed +``` + +### Output Structure + +``` +/path/to/preprocessed/ +├── document1/ +│ ├── document1.md +│ ├── images/ +│ │ ├── img_1_1.png +│ │ ├── img_1_2.png +│ │ └── ... +│ ├── tables/ +│ │ ├── table_1.md +│ │ ├── table_2.md +│ │ └── ... +│ └── formulas/ +│ ├── formula_1_1.tex +│ ├── formula_1_1.png +│ └── ... +└── document2/ + └── ... +``` + +## 2. Creating a Dataset + +The `create_dataset.py` script structures the preprocessed content into a format suitable for Gemma fine-tuning: + +### Usage + +```bash +python create_dataset.py --input_dir /path/to/preprocessed --output_dir /path/to/dataset +``` + +### Output Structure + +``` +/path/to/dataset/ +├── train/ +│ └── data.json +├── validation/ +│ └── data.json +└── images/ + ├── img_1_1.png + ├── img_1_2.png + └── ... +``` + +### Dataset Format + +The dataset is structured as a JSON file with the following format: + +```json +[ + { + "input_text": "Text with tags and LaTeX formulas", + "output_text": "Expected output for instruction tuning", + "images": ["path/to/image1.png", "path/to/image2.png"] + }, + ... +] +``` + +## 3. 
Fine-tuning Gemma + +The `finetune_gemma.py` script fine-tunes a Gemma model on the prepared dataset: + +### Usage + +```bash +python finetune_gemma.py --dataset_dir /path/to/dataset --output_dir /path/to/model +``` + +## Best Practices + +### Handling Images + +- Use `<image>` tags in the text to indicate where images should appear +- Ensure images are properly sized (800x800 pixels is recommended) +- Include descriptive alt text or captions for images + +### Handling Tables + +- Use Markdown table format to preserve structure +- Keep tables simple and well-formatted +- Ensure column headers are clear and descriptive + +### Handling Formulas + +- Use LaTeX format for mathematical formulas +- Enclose inline formulas with single dollar signs: `$formula$` +- Enclose block formulas with double dollar signs: `$$formula$$` + +### Dataset Preparation + +- Balance the dataset with a variety of content types +- Ensure high-quality text and image content +- Provide clear instruction-response pairs for fine-tuning +- Split the dataset into training and validation sets (90/10 split is recommended) + +## Limitations and Considerations + +- Complex tables may not be perfectly preserved in Markdown format +- Formula extraction accuracy depends on the quality of the PDF +- Very large images may need to be resized or split +- Some special characters in LaTeX formulas may require escaping + +## Requirements File + +A `requirements.txt` file is included with all necessary dependencies. + +## License + +This project is licensed under the Apache License 2.0 - see the LICENSE file for details. diff --git a/create_dataset.py b/create_dataset.py new file mode 100644 index 00000000..c0abef67 --- /dev/null +++ b/create_dataset.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 +""" +Create a dataset for Gemma fine-tuning from preprocessed biomedical PDFs. + +This script creates a dataset structure suitable for Gemma fine-tuning from +preprocessed biomedical PDFs. 
def create_training_example(markdown_path: str, image_map: Dict[str, str]) -> List[Dict[str, Any]]:
    """
    Create training examples from a preprocessed markdown file.

    The markdown is split on the literal "## Page" header and one example is
    produced for every non-empty section. An image is attached to a section
    only when its ID appears there as a whole token, so "img_1_1" is not
    attached just because "img_1_10" is mentioned.

    Args:
        markdown_path: Path to the markdown file
        image_map: Dictionary mapping image IDs to file paths

    Returns:
        List of example dictionaries with the keys "input_text" (the stripped
        section text), "output_text" (always an empty string; to be filled in
        for instruction tuning) and "images" (list of image paths)
    """
    # Function-scope import: the module's import block lies outside this block.
    import re

    with open(markdown_path, "r", encoding="utf-8") as md_file:
        markdown_content = md_file.read()

    # One whole-token pattern per ID; a plain substring test would let a short
    # ID match inside a longer one.
    id_patterns = {
        image_id: re.compile(
            r"(?<![A-Za-z0-9_])" + re.escape(image_id) + r"(?![A-Za-z0-9_])"
        )
        for image_id in image_map
    }

    examples = []
    for raw_section in markdown_content.split("## Page"):
        section = raw_section.strip()
        if not section:
            continue

        section_images = [
            image_map[image_id]
            for image_id, pattern in id_patterns.items()
            if pattern.search(section)
        ]

        examples.append({
            "input_text": section,
            "output_text": "",  # This would be filled with expected output for instruction tuning
            "images": section_images,
        })

    return examples


def create_dataset(input_dir: str, output_dir: str, split_ratio: float = 0.9) -> None:
    """
    Create a dataset for Gemma fine-tuning.

    Every sub-directory of ``input_dir`` that contains a markdown file is
    turned into examples. Referenced images are copied to
    ``<output_dir>/images/<document>/`` so that equally named images from
    different documents cannot overwrite each other. The paths stored in the
    examples are relative to ``<output_dir>/images`` and use forward slashes,
    which is the directory finetune_gemma.py joins them onto.

    Args:
        input_dir: Directory containing preprocessed content
        output_dir: Directory to save the dataset
        split_ratio: Train/validation split ratio
    """
    images_root = os.path.join(output_dir, "images")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(os.path.join(output_dir, "train"), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "validation"), exist_ok=True)
    os.makedirs(images_root, exist_ok=True)

    # Sorted so that the document order (and the markdown file picked below)
    # does not depend on the file system's listing order.
    preprocessed_dirs = sorted(
        d for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))
    )

    all_examples = []

    for dir_name in preprocessed_dirs:
        dir_path = os.path.join(input_dir, dir_name)

        markdown_files = sorted(f for f in os.listdir(dir_path) if f.endswith(".md"))
        if not markdown_files:
            continue
        markdown_path = os.path.join(dir_path, markdown_files[0])

        image_dir = os.path.join(dir_path, "images")
        image_map = {}
        if os.path.isdir(image_dir):
            for file_name in sorted(os.listdir(image_dir)):
                if file_name.lower().endswith((".png", ".jpg", ".jpeg")):
                    image_map[os.path.splitext(file_name)[0]] = os.path.join(image_dir, file_name)

        examples = create_training_example(markdown_path, image_map)

        # Copy each referenced image once, into a per-document folder, and
        # rewrite the example paths to the copied location.
        doc_image_dir = os.path.join(images_root, dir_name)
        stored_paths = {}
        for example in examples:
            relocated = []
            for src_path in example["images"]:
                if src_path not in stored_paths:
                    os.makedirs(doc_image_dir, exist_ok=True)
                    dest_path = os.path.join(doc_image_dir, os.path.basename(src_path))
                    shutil.copy(src_path, dest_path)
                    stored_paths[src_path] = os.path.relpath(dest_path, images_root).replace(os.sep, "/")
                relocated.append(stored_paths[src_path])
            example["images"] = relocated

        all_examples.extend(examples)

    random.shuffle(all_examples)

    split_idx = int(len(all_examples) * split_ratio)
    train_examples = all_examples[:split_idx]
    val_examples = all_examples[split_idx:]

    with open(os.path.join(output_dir, "train", "data.json"), "w", encoding="utf-8") as f:
        json.dump(train_examples, f, indent=2)

    with open(os.path.join(output_dir, "validation", "data.json"), "w", encoding="utf-8") as f:
        json.dump(val_examples, f, indent=2)

    print(f"Created dataset with {len(train_examples)} training examples and {len(val_examples)} validation examples")


def main():
    """Parse the command-line arguments and build the dataset."""
    parser = argparse.ArgumentParser(description="Create a dataset for Gemma fine-tuning")
    parser.add_argument("--input_dir", required=True, help="Directory containing preprocessed content")
    parser.add_argument("--output_dir", required=True, help="Directory to save the dataset")
    parser.add_argument("--split_ratio", type=float, default=0.9, help="Train/validation split ratio")
    args = parser.parse_args()

    create_dataset(args.input_dir, args.output_dir, args.split_ratio)
class BiomedicalDataset(kd.data.Dataset):
    """Dataset for biomedical multimodal data.

    Each record of the JSON file is expected to provide the keys
    "input_text", "output_text" and "images" (a list of image paths).
    """

    def __init__(
        self,
        data_path: str,
        image_dir: str,
        tokenizer: gm.text.Tokenizer,
        max_length: int = 512,
        training: bool = True,
        sampling: bool = False,
    ):
        """Initialize the dataset.

        Args:
            data_path: Path to the JSON data file
            image_dir: Directory containing images
            tokenizer: Tokenizer to use
            max_length: Maximum sequence length
            training: Whether this is a training dataset
            sampling: Whether this is for sampling
        """
        self.data_path = data_path
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.training = training
        self.sampling = sampling

        # Load data
        with open(data_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)

    def __len__(self):
        """Return the number of records loaded from the JSON file."""
        return len(self.data)

    def _resolve_image_path(self, img_path: str):
        """Return an existing file path for ``img_path`` or None.

        create_dataset.py stores paths relative to the dataset root
        ("images/<file>"), while ``image_dir`` already points at
        "<dataset>/images". Joining the two directly never finds the file, so
        the path is tried relative to ``image_dir`` first and then relative
        to its parent directory.
        """
        parent_dir = os.path.dirname(os.path.normpath(self.image_dir))
        for base_dir in (self.image_dir, parent_dir):
            candidate = os.path.join(base_dir, img_path)
            if os.path.exists(candidate):
                return candidate
        return None

    def __getitem__(self, idx):
        """Build the model inputs (or the raw sampling record) for one example.

        Args:
            idx: Index of the record in the loaded JSON list

        Returns:
            In sampling mode a dict with "prompt", "response" and "image";
            otherwise a dict with "input", "target", "loss_mask" and "image".
        """
        example = self.data[idx]

        # Get text
        input_text = example["input_text"]
        output_text = example["output_text"]

        # Get images; files that cannot be found are skipped (best effort)
        images = []
        for img_path in example["images"]:
            full_img_path = self._resolve_image_path(img_path)
            if full_img_path is None:
                continue
            # Context manager releases the file handle once the pixels are copied
            with Image.open(full_img_path) as handle:
                img = np.array(handle.convert("RGB"))
            # Resize image to 800x800
            img = kd.data.py.resize_image(img, (800, 800))
            images.append(img)

        # If no images, add a dummy image to maintain batch structure
        if not images:
            images = [np.zeros((800, 800, 3), dtype=np.uint8)]

        # Stack images
        # NOTE(review): the leading dimension equals the number of images of
        # this example, so it varies between examples - confirm the loader can
        # batch that.
        images = np.stack(images, axis=0)

        # If sampling, return the raw text and images
        if self.sampling:
            return {
                "prompt": input_text,
                "response": output_text,
                "image": images,
            }

        # Create model inputs
        prompt = self.tokenizer.encode(input_text, add_bos=True)
        response = self.tokenizer.encode(output_text)

        # Create the model inputs/targets/loss_mask
        seq2seq_fields = gm.data._functional.make_seq2seq_fields(
            prompt=prompt,
            response=response,
        )

        # Add padding
        seq2seq_fields = gm.data._functional.pad(
            seq2seq_fields,
            max_length=self.max_length,
            truncate=True,
        )

        return {
            "input": seq2seq_fields.input,
            "target": seq2seq_fields.target,
            "loss_mask": seq2seq_fields.target_mask,
            "image": images,
        }
def get_config(dataset_dir: str, output_dir: str):
    """Assemble the trainer used for fine-tuning.

    Builds three ``BiomedicalDataset`` views (train, validation and a
    sampling view of the validation file), wraps each in a loader and returns
    the ``kd.train.Trainer`` that ties them to the model, loss, optimizer,
    checkpointer and evaluators.

    Args:
        dataset_dir: Directory containing the dataset
        output_dir: Directory to save the output

    Returns:
        Training configuration
    """
    batch_size = 8  # Smaller batch size due to multimodal data
    max_length = 512
    eval_interval = 500

    tokenizer = gm.text.Gemma3Tokenizer()

    # Locations inside the dataset directory
    train_json = os.path.join(dataset_dir, "train", "data.json")
    validation_json = os.path.join(dataset_dir, "validation", "data.json")
    shared_kwargs = {
        "image_dir": os.path.join(dataset_dir, "images"),
        "tokenizer": tokenizer,
        "max_length": max_length,
    }

    # Datasets
    train_ds = BiomedicalDataset(data_path=train_json, training=True, **shared_kwargs)
    val_ds = BiomedicalDataset(data_path=validation_json, training=False, **shared_kwargs)
    sampling_ds = BiomedicalDataset(
        data_path=validation_json, training=False, sampling=True, **shared_kwargs
    )

    # Loaders
    def build_loader(dataset, size, shuffle, workers):
        return kd.data.PyLoader(
            dataset=dataset,
            batch_size=size,
            shuffle=shuffle,
            num_workers=workers,
        )

    train_loader = build_loader(train_ds, batch_size, True, 4)
    val_loader = build_loader(val_ds, batch_size, False, 4)
    sampling_loader = build_loader(sampling_ds, 1, False, 1)  # one example per sampling batch

    # Model, initialised from the pretrained checkpoint
    model = gm.nn.Gemma3_4B(
        tokens="batch.input",
        images="batch.image",
    )
    init_transform = gm.ckpts.LoadCheckpoint(
        path=gm.ckpts.CheckpointPath.GEMMA3_4B_IT,
    )

    # Objective and training summaries
    train_losses = {
        "xentropy": kd.losses.SoftmaxCrossEntropyWithIntLabels(
            logits="preds.logits",
            labels="batch.target",
            mask="batch.loss_mask",
        ),
    }
    train_summaries = {
        "image": kd.summaries.ShowImages(images="batch.image", num_images=5),
    }

    optimizer = optax.adafactor(learning_rate=1e-5)  # Lower learning rate for fine-tuning
    checkpointer = kd.ckpts.Checkpointer(
        save_interval_steps=500,
        workdir=output_dir,
    )

    # Evaluation: loss on the validation set plus sampled generations
    validation_eval = kd.evals.Evaluator(
        run=kd.evals.EveryNSteps(eval_interval),
        ds=val_loader,
    )
    sampling_eval = gm.evals.SamplerEvaluator(
        run=kd.evals.EveryNSteps(eval_interval),
        max_new_tokens=100,  # Sampling parameters
        num_batches=3,
        ds=sampling_loader,
        summaries={
            "image": kd.summaries.ShowImages(
                images="batch.image", num_images=5
            ),
        },
    )

    return kd.train.Trainer(
        seed=42,
        train_ds=train_loader,
        model=model,
        init_transform=init_transform,
        num_train_steps=5000,  # Adjust based on dataset size
        train_losses=train_losses,
        train_summaries=train_summaries,
        optimizer=optimizer,
        checkpointer=checkpointer,
        evals={
            "validation": validation_eval,
            "sampling": sampling_eval,
        },
    )


def main():
    """Command-line entry point: parse arguments, build the config and train."""
    arg_parser = argparse.ArgumentParser(description="Fine-tune Gemma on a biomedical multimodal dataset")
    arg_parser.add_argument("--dataset_dir", required=True, help="Directory containing the dataset")
    arg_parser.add_argument("--output_dir", required=True, help="Directory to save the output")
    cli = arg_parser.parse_args()

    # The checkpointer writes into this directory, so make sure it exists
    os.makedirs(cli.output_dir, exist_ok=True)

    trainer = get_config(cli.dataset_dir, cli.output_dir)

    print("Starting fine-tuning...")
    kd.main.run(trainer)
    print("Fine-tuning complete!")
def extract_images(pdf_path: str, output_dir: str) -> Dict[str, str]:
    """
    Extract images from PDF and save them to output directory.

    Images are written to ``<output_dir>/images`` and named
    ``img_<page>_<index>.<ext>``, with page and index both counted from 1 and
    the extension taken from the embedded image.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save extracted images

    Returns:
        Dictionary mapping image IDs to file paths
    """
    image_dir = os.path.join(output_dir, "images")
    os.makedirs(image_dir, exist_ok=True)

    image_map: Dict[str, str] = {}
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for img_idx, img_info in enumerate(page.get_images(full=True), start=1):
                xref = img_info[0]  # first field of the image entry is its xref
                base_image = doc.extract_image(xref)

                image_id = f"img_{page_num}_{img_idx}"
                image_path = os.path.join(image_dir, f"{image_id}.{base_image['ext']}")

                with open(image_path, "wb") as img_file:
                    img_file.write(base_image["image"])

                image_map[image_id] = image_path
    finally:
        # Release the document handle even if an image cannot be extracted
        doc.close()

    return image_map
def extract_tables(pdf_path: str, output_dir: str) -> Dict[str, str]:
    """
    Extract tables from PDF and convert to Markdown format.

    Tables are detected with Camelot's "lattice" flavor on all pages and
    written to ``<output_dir>/tables/table_<n>.md`` (n counted from 1).
    Extraction is best effort: on an error a message is printed and the
    tables collected so far are returned.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save extracted tables

    Returns:
        Dictionary mapping table IDs to Markdown content
    """
    table_dir = os.path.join(output_dir, "tables")
    os.makedirs(table_dir, exist_ok=True)

    table_map: Dict[str, str] = {}

    try:
        tables = camelot.read_pdf(pdf_path, pages='all', flavor='lattice')

        for table_idx, table in enumerate(tables, start=1):
            table_id = f"table_{table_idx}"
            markdown_table = table.df.to_markdown(index=False)
            table_map[table_id] = markdown_table

            # Explicit UTF-8: table cells may hold symbols the locale's default
            # encoding cannot represent, which would abort the whole extraction.
            table_path = os.path.join(table_dir, f"{table_id}.md")
            with open(table_path, "w", encoding="utf-8") as table_file:
                table_file.write(markdown_table)
    except Exception as e:
        # Deliberately broad: table extraction must not stop the rest of the
        # PDF from being processed.
        print(f"Error extracting tables: {e}")

    return table_map
def extract_formulas(pdf_path: str, output_dir: str) -> Dict[str, str]:
    """
    Extract mathematical formulas from PDF and convert to LaTeX.

    Every page is rendered at 2x zoom and run through Tesseract. Each OCR
    word containing one of the indicator characters is cropped (with a
    20 pixel margin) and passed to the LaTeX OCR model. Results are saved as
    ``formula_<page>_<index>.tex`` together with the cropped ``.png`` in
    ``<output_dir>/formulas``. This is a simple heuristic: ordinary words
    containing "-" or "=" are treated as candidates too.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save extracted formulas

    Returns:
        Dictionary mapping formula IDs to LaTeX content
    """
    formula_dir = os.path.join(output_dir, "formulas")
    os.makedirs(formula_dir, exist_ok=True)

    # Characters whose presence marks an OCR word as a potential formula
    formula_indicators = ("=", "+", "-", "∫", "∑", "∏", "√", "α", "β", "γ", "δ")

    formula_map: Dict[str, str] = {}
    doc = fitz.open(pdf_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            # Render the page as an image for OCR
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            ocr = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

            potential_formulas = []
            for i, word in enumerate(ocr["text"]):
                if not any(indicator in word for indicator in formula_indicators):
                    continue
                x, y, w, h = ocr["left"][i], ocr["top"][i], ocr["width"][i], ocr["height"][i]
                # Expand the region slightly, clamped to the page bounds
                x = max(0, x - 20)
                y = max(0, y - 20)
                w = min(pix.width - x, w + 40)
                h = min(pix.height - y, h + 40)
                potential_formulas.append(img.crop((x, y, x + w, y + h)))

            for idx, formula_img in enumerate(potential_formulas, start=1):
                try:
                    latex = latex_ocr(formula_img)

                    formula_id = f"formula_{page_num}_{idx}"
                    formula_map[formula_id] = latex

                    # Explicit UTF-8 so non-ASCII output does not depend on the locale
                    formula_path = os.path.join(formula_dir, f"{formula_id}.tex")
                    with open(formula_path, "w", encoding="utf-8") as formula_file:
                        formula_file.write(latex)

                    # Also save the formula image for reference
                    formula_img.save(os.path.join(formula_dir, f"{formula_id}.png"))
                except Exception as e:
                    # Best effort: one unreadable crop must not stop the page
                    print(f"Error processing formula: {e}")
    finally:
        # Release the document handle even if rendering or OCR fails
        doc.close()

    return formula_map
def extract_text_with_references(pdf_path: str, image_map: Dict[str, str],
                                 table_map: Dict[str, str], formula_map: Dict[str, str]) -> str:
    """
    Extract text from PDF with references to images, tables, and formulas.

    Each page becomes a "## Page <n>" section containing the page text,
    followed by one reference tag per image and one tag plus inline LaTeX per
    formula whose ID carries that page number. Table IDs ("table_<n>") carry
    no page number, so the tables are appended once after the last page
    instead of being repeated on every page.

    Args:
        pdf_path: Path to the PDF file
        image_map: Dictionary mapping image IDs to file paths
        table_map: Dictionary mapping table IDs to Markdown content
        formula_map: Dictionary mapping formula IDs to LaTeX content

    Returns:
        Markdown text with references to images, tables, and formulas
    """
    def page_of(ref_id: str):
        # IDs look like "<prefix>_<page>_<index>"; anything else has no page.
        parts = ref_id.rsplit("_", 2)
        if len(parts) == 3 and parts[1].isdigit():
            return int(parts[1])
        return None

    # NOTE(review): the "<kind:id>" tag syntax below is a choice made here so
    # that downstream code can find the ID in the text - confirm it matches
    # the placeholder format the training pipeline expects.
    chunks = []
    doc = fitz.open(pdf_path)
    try:
        for page_number, page in enumerate(doc, start=1):
            chunks.append(f"## Page {page_number}\n\n")
            chunks.append(page.get_text("text") + "\n\n")

            for image_id in image_map:
                if page_of(image_id) == page_number:
                    chunks.append(f"<image:{image_id}>\n\n")

            for formula_id, formula_content in formula_map.items():
                if page_of(formula_id) == page_number:
                    chunks.append(f"<formula:{formula_id}>\n\n${formula_content}$\n\n")
    finally:
        # Release the document handle even if text extraction fails
        doc.close()

    # Tables cannot be attributed to a page, so emit each exactly once
    for table_id, table_content in table_map.items():
        chunks.append(f"<table:{table_id}>\n\n{table_content}\n\n")

    return "".join(chunks)
def preprocess_pdf(pdf_path: str, output_dir: str) -> Dict[str, Any]:
    """
    Preprocess a single PDF file.

    Creates ``<output_dir>/<pdf name without extension>/``, extracts images,
    tables and formulas into it and writes the combined markdown to
    ``<pdf name>.md`` in the same directory.

    Args:
        pdf_path: Path to the PDF file
        output_dir: Directory to save preprocessed content

    Returns:
        Dictionary with preprocessed content
    """
    # splitext removes only the extension; str.replace(".pdf", "") would also
    # remove ".pdf" from the middle of a name and leave ".PDF" in place.
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    pdf_output_dir = os.path.join(output_dir, pdf_name)
    os.makedirs(pdf_output_dir, exist_ok=True)

    # Extract content
    image_map = extract_images(pdf_path, pdf_output_dir)
    table_map = extract_tables(pdf_path, pdf_output_dir)
    formula_map = extract_formulas(pdf_path, pdf_output_dir)

    # Extract text with references
    markdown_text = extract_text_with_references(pdf_path, image_map, table_map, formula_map)

    # Save markdown text; explicit UTF-8 because the extracted text routinely
    # contains symbols outside the locale's default encoding
    markdown_path = os.path.join(pdf_output_dir, f"{pdf_name}.md")
    with open(markdown_path, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_text)

    return {
        "pdf_path": pdf_path,
        "markdown_path": markdown_path,
        "image_map": image_map,
        "table_map": table_map,
        "formula_map": formula_map,
    }


def main():
    """Preprocess every PDF found directly inside ``--input_dir``."""
    parser = argparse.ArgumentParser(description="Preprocess biomedical PDFs for Gemma fine-tuning")
    parser.add_argument("--input_dir", required=True, help="Directory containing PDF files")
    parser.add_argument("--output_dir", required=True, help="Directory to save preprocessed content")
    args = parser.parse_args()

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Case-insensitive match so "REPORT.PDF" is not skipped; sorted for a
    # reproducible processing order
    pdf_files = sorted(
        os.path.join(args.input_dir, f)
        for f in os.listdir(args.input_dir)
        if f.lower().endswith(".pdf")
    )

    # Process each PDF
    results = []
    for pdf_path in pdf_files:
        print(f"Processing {pdf_path}...")
        results.append(preprocess_pdf(pdf_path, args.output_dir))

    print(f"Processed {len(results)} PDF files")
+pillow>=10.0.0 +pymupdf>=1.23.0 # For PDF processing (PyMuPDF) +camelot-py>=0.11.0 # For table extraction +pytesseract>=0.3.10 # For OCR +pix2tex>=0.1.0 # For LaTeX formula extraction +opencv-python>=4.8.0 # Required by camelot +ghostscript>=0.7 # Required by camelot +optax>=0.1.7 # For optimization +pandas>=2.0.0 # For table handling +tabulate>=0.9.0 # For Markdown table conversion