@@ -36,8 +36,20 @@ def tqdm(iterable, *args, **kwargs):
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-if not torch.cuda.is_available():
-    raise Exception("No GPU available")
+# Detect available devices
+def get_device_info():
+    """Detect and return device configuration."""
+    if torch.cuda.is_available():
+        device_count = torch.cuda.device_count()
+        return "cuda", device_count
+    elif torch.backends.mps.is_available():
+        # Apple Metal Performance Shaders (MPS) backend
+        return "mps", 1
+    else:
+        return "cpu", 1
+
+device_type, device_count = get_device_info()
+logger.info(f"Device type: {device_type}, Count: {device_count}")

 experiments = []
 for seed in range(10):
@@ -51,16 +63,26 @@ def tqdm(iterable, *args, **kwargs):
     )


-def run_experiment(exp: Experiment, gpu_queue):
+def run_experiment(exp: Experiment, device_queue, device_type="cuda"):
     try:
         logging.basicConfig(level=logging.INFO)
         logger = logging.getLogger(__name__)

-        # Get an available GPU id
-        gpu_id = gpu_queue.get()
+        # Get an available device id
+        device_id = device_queue.get() if device_queue else 0
         logger.info(f"Starting experiment: {exp.name}")
-        torch.cuda.set_device(gpu_id)
-        device = torch.device("cuda", index=gpu_id)
+
+        # Set up device based on type
+        if device_type == "cuda":
+            torch.cuda.set_device(device_id)
+            device = torch.device("cuda", index=device_id)
+            device_label = f"GPU {device_id}"
+        elif device_type == "mps":
+            device = torch.device("mps")
+            device_label = "MPS"
+        else:
+            device = torch.device("cpu")
+            device_label = "CPU"

         # Initialize tokenizer directly using get_tokenizer
         tokenizer = get_tokenizer(exp.tokenizer_name)
@@ -82,7 +104,7 @@ def run_experiment(exp: Experiment, gpu_queue):
             excluded_train_path=exp.excluded_train_path,
         )
         logger.info(
-            f"[GPU {gpu_id}] Number of training batches: {len(train_dataloader)}"
+            f"[{device_label}] Number of training batches: {len(train_dataloader)}"
         )

         # Set up eval dataloaders
@@ -130,7 +152,7 @@ def run_experiment(exp: Experiment, gpu_queue):
         start_epoch = 0

         logger.info(
-            f"[GPU {gpu_id}] Total number of non-embedding parameters: {count_non_embedding_params(model)}"
+            f"[{device_label}] Total number of non-embedding parameters: {count_non_embedding_params(model)}"
         )

         # Initial evaluation (epochs_complete = 0)
@@ -151,15 +173,16 @@ def run_experiment(exp: Experiment, gpu_queue):
             train_author=exp.train_author,
         )

-        # Set up mixed precision training for memory efficiency
-        scaler = torch.amp.GradScaler('cuda')
+        # Set up mixed precision training if supported
+        use_amp = device_type == "cuda"
+        scaler = torch.amp.GradScaler('cuda') if use_amp else None

         # Enable gradient checkpointing to save memory (if supported)
         try:
             model.gradient_checkpointing_enable()
-            logger.info(f"[GPU {gpu_id}] Gradient checkpointing enabled for memory efficiency")
+            logger.info(f"[{device_label}] Gradient checkpointing enabled for memory efficiency")
         except AttributeError:
-            logger.info(f"[GPU {gpu_id}] Model does not support gradient checkpointing")
+            logger.info(f"[{device_label}] Model does not support gradient checkpointing")

         # Training loop
         for epoch in tqdm(range(start_epoch, max_epochs)):
@@ -171,16 +194,24 @@ def run_experiment(exp: Experiment, gpu_queue):

                 input_ids = batch["input_ids"].to(device)

-                # Forward pass with mixed precision
-                with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
+                # Forward pass with or without mixed precision
+                if use_amp:
+                    with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
+                        outputs = model(input_ids=input_ids, labels=input_ids)
+                        loss = outputs.loss
+                else:
                     outputs = model(input_ids=input_ids, labels=input_ids)
                     loss = outputs.loss

-                # Backward pass with scaled gradients
+                # Backward pass with or without mixed precision
                 optimizer.zero_grad()
-                scaler.scale(loss).backward()
-                scaler.step(optimizer)
-                scaler.update()
+                if use_amp:
+                    scaler.scale(loss).backward()
+                    scaler.step(optimizer)
+                    scaler.update()
+                else:
+                    loss.backward()
+                    optimizer.step()

                 # Accumulate training loss
                 total_train_loss += loss.item()
@@ -230,11 +261,12 @@ def run_experiment(exp: Experiment, gpu_queue):
                 train_author=exp.train_author,
             )

-            # Force memory cleanup between evaluations
-            torch.cuda.empty_cache()
+            # Force memory cleanup between evaluations (CUDA only)
+            if device_type == "cuda":
+                torch.cuda.empty_cache()

             # Build log message for console output
-            log_message = f"[GPU {gpu_id}] Epoch {epochs_completed}/{max_epochs}: training loss = {train_loss:.4f}"
+            log_message = f"[{device_label}] Epoch {epochs_completed}/{max_epochs}: training loss = {train_loss:.4f}"
             for name, loss in eval_losses.items():
                 log_message += f", {name}: {loss:.4f}"
             logger.info(log_message)
@@ -249,13 +281,14 @@ def run_experiment(exp: Experiment, gpu_queue):
             # Early stopping after completing epoch (retain logs and checkpoints)
             if train_loss <= stop_train_loss and min_epochs <= epochs_completed:
                 logger.info(
-                    f"[GPU {gpu_id}] Training loss {train_loss:.4f} below threshold {stop_train_loss}. Stopping training."
+                    f"[{device_label}] Training loss {train_loss:.4f} below threshold {stop_train_loss}. Stopping training."
                 )
                 break
-        logger.info(f"[GPU {gpu_id}] Training complete for {modelname}")
+        logger.info(f"[{device_label}] Training complete for {modelname}")

         # Return the GPU id to the queue
-        gpu_queue.put(gpu_id)
+        if device_queue:
+            device_queue.put(device_id)
     except Exception:
         logger.exception(f"Error in experiment {exp.name}")
         raise
@@ -265,16 +298,29 @@ def run_experiment(exp: Experiment, gpu_queue):
 # Check if we should run sequentially (for subprocess compatibility)
 USE_MULTIPROCESSING = os.environ.get('NO_MULTIPROCESSING', '0') != '1'

-device_count = torch.cuda.device_count()
-gpu_count = min(device_count, 4)
-print(f"Using {gpu_count} GPUs out of {device_count} available")
+# Use already detected device configuration
+if device_type == "cuda":
+    # Check for MAX_GPUS environment variable to optionally limit GPU usage
+    max_gpus = int(os.environ.get('MAX_GPUS', '0')) or device_count
+    gpu_count = min(device_count, max_gpus)
+    if gpu_count < device_count:
+        print(f"Using {gpu_count} GPUs (limited by MAX_GPUS) out of {device_count} available")
+    else:
+        print(f"Using all {gpu_count} available GPUs")
+elif device_type == "mps":
+    gpu_count = 1
+    print("Using Apple Metal Performance Shaders (MPS)")
+else:
+    gpu_count = 1
+    print("Using CPU for training (this will be slow)")

-if USE_MULTIPROCESSING:
+if USE_MULTIPROCESSING and device_type == "cuda" and gpu_count > 1:
+    # Only use multiprocessing for multiple CUDA GPUs
     mp.set_start_method("spawn", force=True)
     manager = mp.Manager()
-    gpu_queue = manager.Queue()
+    device_queue = manager.Queue()
     for gpu in range(gpu_count):
-        gpu_queue.put(gpu)
+        device_queue.put(gpu)

     pool = mp.Pool(processes=gpu_count)
     logger = logging.getLogger(__name__)
@@ -286,22 +332,27 @@ def error_callback(e):

     for exp in experiments:
         pool.apply_async(
-            run_experiment, (exp, gpu_queue), error_callback=error_callback
+            run_experiment, (exp, device_queue, device_type), error_callback=error_callback
         )
     pool.close()
     pool.join()
 else:
-    # Sequential mode for subprocess compatibility
+    # Sequential mode for subprocess compatibility or single device
     print("Running in sequential mode (multiprocessing disabled)")
-    import queue
-    gpu_queue = queue.Queue()
-    for gpu in range(gpu_count):
-        gpu_queue.put(gpu)
+    if device_type == "cuda" and gpu_count > 1:
+        # Multiple GPUs but running sequentially
+        import queue
+        device_queue = queue.Queue()
+        for gpu in range(gpu_count):
+            device_queue.put(gpu)
+    else:
+        # Single device or non-CUDA
+        device_queue = None

     for i, exp in enumerate(experiments):
         print(f"Training model {i + 1}/{len(experiments)}: {exp.name}")
-        run_experiment(exp, gpu_queue)
-        # Put GPU back in queue for next experiment
-        if not gpu_queue.empty():
-            gpu_id = gpu_queue.get()
-            gpu_queue.put(gpu_id)
+        run_experiment(exp, device_queue, device_type)
+        # For multi-GPU sequential mode, rotate through GPUs
+        if device_queue and not device_queue.empty():
+            device_id = device_queue.get()
+            device_queue.put(device_id)
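
For reference, a minimal standalone sketch of the device-selection and CUDA-only mixed-precision fallback pattern this diff introduces. It is illustrative and not part of the repo: the tiny linear model and random data are placeholders, while `get_device_info`, `use_amp`, and the `GradScaler`/`autocast` fallback mirror the changed lines above.

```python
# Illustrative sketch (not from this repo): one training step using the
# CUDA/MPS/CPU detection and CUDA-only AMP fallback shown in the diff.
import torch


def get_device_info():
    """Return (device_type, device_count) for CUDA, MPS, or CPU."""
    if torch.cuda.is_available():
        return "cuda", torch.cuda.device_count()
    if torch.backends.mps.is_available():
        return "mps", 1
    return "cpu", 1


device_type, device_count = get_device_info()
device = torch.device(device_type)

# AMP with GradScaler only on CUDA; MPS and CPU fall back to full precision.
use_amp = device_type == "cuda"
scaler = torch.amp.GradScaler("cuda") if use_amp else None

# Placeholder model and data standing in for the real model and dataloader.
model = torch.nn.Linear(8, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
x = torch.randn(4, 8, device=device)
y = torch.randn(4, 1, device=device)

optimizer.zero_grad()
if use_amp:
    with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
        loss = torch.nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
else:
    loss = torch.nn.functional.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()

print(f"{device_type}: one training step done, loss = {loss.item():.4f}")
```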