Merged
8 changes: 8 additions & 0 deletions README.md
@@ -174,6 +174,7 @@ services:
# LLM_PROVIDER: "ollama"
# LLM_MODEL: "qwen3:8b"
# OLLAMA_HOST: "http://host.docker.internal:11434"
# OLLAMA_CONTEXT_LENGTH: "8192" # Sets Ollama NumCtx (context window)
# TOKEN_LIMIT: 1000 # Recommended for smaller models

# Optional LLM Settings
@@ -547,6 +548,10 @@ For best results with the enhanced OCR features:
| `VISION_LLM_REQUESTS_PER_MINUTE` | Maximum requests per minute for the Vision LLM. Useful for managing API costs or local LLM load. | No | 120 |
| `VISION_LLM_MAX_RETRIES` | Maximum retry attempts for failed Vision LLM requests. | No | 3 |
| `VISION_LLM_BACKOFF_MAX_WAIT` | Maximum wait time between retries for the Vision LLM (e.g., `30s`). | No | 30s |
| `VISION_LLM_MAX_TOKENS` | Maximum tokens for Vision LLM OCR output. | No | |
| `VISION_LLM_TEMPERATURE` | Sampling temperature for Vision OCR generation. Lower is more deterministic. Important: For OpenAI GPT-5 it must be explicitly set to `1.0`. | No | |
| `OLLAMA_CONTEXT_LENGTH` | (Ollama only) Integer. Sets NumCtx (context window) for the Ollama runner. If unset or 0, the model default is used. | No | |
| `OLLAMA_OCR_TOP_K` | (Ollama only) Top-k token sampling for Vision OCR. Lower favors more likely tokens; higher increases diversity. | No | |
| `AZURE_DOCAI_ENDPOINT` | Azure Document Intelligence endpoint. Required if OCR_PROVIDER is `azure`. | Cond. | |
| `AZURE_DOCAI_KEY` | Azure Document Intelligence API key. Required if OCR_PROVIDER is `azure`. | Cond. | |
| `AZURE_DOCAI_MODEL_ID` | Azure Document Intelligence model ID. Optional if using `azure` provider. | No | prebuilt-read |
@@ -886,6 +891,7 @@ When using local LLMs (like those through Ollama), you might need to adjust cert
#### Token Management

- Use `TOKEN_LIMIT` environment variable to control the maximum number of tokens sent to the LLM
- For Ollama, set `OLLAMA_CONTEXT_LENGTH` to control the model's context window (NumCtx). This is independent of `TOKEN_LIMIT` and configures the server-side KV cache size. If unset or 0, the model default is used. Choose a value within the model's supported window (e.g., 8192).
- Smaller models might truncate content unexpectedly if given too much text
- Start with a conservative limit (e.g., 1000 tokens) and adjust based on your model's capabilities
- Set to `0` to disable the limit (use with caution)
@@ -895,13 +901,15 @@ Example configuration for smaller models:
```yaml
environment:
TOKEN_LIMIT: "2000" # Adjust based on your model's context window
OLLAMA_CONTEXT_LENGTH: "4096" # Controls Ollama NumCtx (context window); if unset, model default is used
LLM_PROVIDER: "ollama"
LLM_MODEL: "qwen3:8b" # Or other local model
```

Common issues and solutions:

- If you see truncated or incomplete responses, try lowering the `TOKEN_LIMIT`
- On Ollama, if you hit "context length exceeded" or memory issues, reduce `OLLAMA_CONTEXT_LENGTH` or choose a smaller model/context size.
- If processing is too limited, gradually increase the limit while monitoring performance
- For models with larger context windows, you can increase the limit or disable it entirely

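
A minimal sketch of how the `OLLAMA_CONTEXT_LENGTH` variable documented above maps onto Ollama's `num_ctx` request option. The struct and the use of `os.Getenv` here are illustrative assumptions; the application may read the variable and pass the option through a client library instead.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// ollamaOptions mirrors the "options" object accepted by Ollama's /api/generate endpoint.
type ollamaOptions struct {
	NumCtx int `json:"num_ctx,omitempty"` // context window (NumCtx); omitted when 0 so the model default applies
}

func main() {
	opts := ollamaOptions{}
	// OLLAMA_CONTEXT_LENGTH unset or 0 leaves NumCtx at 0, i.e. the model default is used.
	if n, err := strconv.Atoi(os.Getenv("OLLAMA_CONTEXT_LENGTH")); err == nil && n > 0 {
		opts.NumCtx = n
	}
	fmt.Printf("resolved num_ctx: %d (0 = model default)\n", opts.NumCtx)
}
```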
179 changes: 174 additions & 5 deletions app_http_handlers.go
@@ -1,8 +1,10 @@
package main

import (
"context"
"bytes"
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
@@ -252,11 +254,12 @@ func (app *App) getJobStatusHandler(c *gin.Context) {
}

response := gin.H{
"job_id": job.ID,
"status": job.Status,
"created_at": job.CreatedAt,
"updated_at": job.UpdatedAt,
"pages_done": job.PagesDone,
"job_id": job.ID,
"status": job.Status,
"created_at": job.CreatedAt,
"updated_at": job.UpdatedAt,
"pages_done": job.PagesDone,
"total_pages": job.TotalPages,
}

if job.Status == "completed" {
Expand Down Expand Up @@ -293,6 +296,20 @@ func (app *App) getAllJobsHandler(c *gin.Context) {
c.JSON(http.StatusOK, jobList)
}

// POST /api/ocr/jobs/:job_id/stop
func (app *App) stopOCRJobHandler(c *gin.Context) {
jobID := c.Param("job_id")
jobCancellersMu.Lock()
cancel, exists := jobCancellers[jobID]
jobCancellersMu.Unlock()
if !exists {
c.JSON(http.StatusNotFound, gin.H{"error": "No running job with this ID"})
return
}
cancel()
c.Status(http.StatusNoContent)
}

// getDocumentHandler handles the retrieval of a document by its ID
func (app *App) getDocumentHandler() gin.HandlerFunc {
return func(c *gin.Context) {
@@ -312,6 +329,158 @@ func (app *App) getDocumentHandler() gin.HandlerFunc {
}
}

// getOCRPagesHandler returns per-page OCR results for a document
func (app *App) getOCRPagesHandler(c *gin.Context) {
id := c.Param("id")
parsedID, err := strconv.Atoi(id)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid document ID"})
return
}

dbResults, err := GetOcrPageResults(app.Database, parsedID)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch OCR page results"})
return
}

type OCRPageResult struct {
Text string `json:"text"`
OcrLimitHit bool `json:"ocrLimitHit"`
GenerationInfo map[string]interface{} `json:"generationInfo,omitempty"`
}

var pages []OCRPageResult
for _, res := range dbResults {
var genInfo map[string]interface{}
if res.GenerationInfo != "" {
_ = json.Unmarshal([]byte(res.GenerationInfo), &genInfo)
}
pages = append(pages, OCRPageResult{
Text: res.Text,
OcrLimitHit: res.OcrLimitHit,
GenerationInfo: genInfo,
})
}

c.JSON(http.StatusOK, gin.H{
"pages": pages,
})
}
Comment on lines +333 to +369

⚠️ Potential issue

Add error handling for JSON unmarshaling

The JSON unmarshaling of GenerationInfo at line 290 ignores errors silently. While this may be intentional for backward compatibility, it could hide data corruption issues.

Add error logging for failed JSON unmarshaling:

 		var genInfo map[string]interface{}
 		if res.GenerationInfo != "" {
-			_ = json.Unmarshal([]byte(res.GenerationInfo), &genInfo)
+			if err := json.Unmarshal([]byte(res.GenerationInfo), &genInfo); err != nil {
+				log.Warnf("Failed to unmarshal GenerationInfo for doc %d page %d: %v", parsedID, i, err)
+			}
 		}

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In app_http_handlers.go around lines 266 to 302, the json.Unmarshal call for
res.GenerationInfo ignores errors; update the loop to capture the unmarshaling
error and log it (including enough context such as parsedID and res identifiers)
instead of discarding it. Specifically, change the unmarshal to if err :=
json.Unmarshal([]byte(res.GenerationInfo), &genInfo); err != nil { /* log error
with context using app logger or the standard logger */ } so the handler records
bad/malformed GenerationInfo while continuing to return other page data.
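
The suggested diff above references a loop index `i` that the current `for _, res := range dbResults` loop does not declare. A minimal sketch of the adjusted loop, assuming the range is switched to an indexed form so the page number is available for logging (the other names come from the handler shown above):

```go
// Sketch: use an indexed range so a failed unmarshal can be logged with the page number.
for i, res := range dbResults {
	var genInfo map[string]interface{}
	if res.GenerationInfo != "" {
		if err := json.Unmarshal([]byte(res.GenerationInfo), &genInfo); err != nil {
			// Log and continue; one malformed GenerationInfo should not block the other pages.
			log.Warnf("Failed to unmarshal GenerationInfo for doc %d page %d: %v", parsedID, i, err)
		}
	}
	pages = append(pages, OCRPageResult{
		Text:           res.Text,
		OcrLimitHit:    res.OcrLimitHit,
		GenerationInfo: genInfo,
	})
}
```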


func (app *App) reOCRPageHandler(c *gin.Context) {
id := c.Param("id")
pageIdxStr := c.Param("pageIndex")
parsedID, err := strconv.Atoi(id)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid document ID"})
return
}
pageIdx, err := strconv.Atoi(pageIdxStr)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index"})
return
}

// Download all images for the document, but only process the requested page
imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
if err != nil || pageIdx < 0 || pageIdx >= len(imagePaths) {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index or failed to download images"})
return
}
Comment on lines +386 to +390

🛠️ Refactor suggestion

Verify image index bounds before file access

The code downloads the images and then checks whether pageIdx is within bounds, but a download failure and an out-of-range index are collapsed into a single 400 response, so the error message combines two different error conditions.

Separate the error conditions for clarity:

 	imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
-	if err != nil || pageIdx < 0 || pageIdx >= len(imagePaths) {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index or failed to download images"})
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to download images"})
+		return
+	}
+	if pageIdx < 0 || pageIdx >= len(imagePaths) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Invalid page index %d, document has %d pages", pageIdx, len(imagePaths))})
 		return
 	}
📝 Committable suggestion


Suggested change
-	imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
-	if err != nil || pageIdx < 0 || pageIdx >= len(imagePaths) {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index or failed to download images"})
-		return
-	}
+	imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to download images"})
+		return
+	}
+	if pageIdx < 0 || pageIdx >= len(imagePaths) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Invalid page index %d, document has %d pages", pageIdx, len(imagePaths))})
+		return
+	}
🤖 Prompt for AI Agents
In app_http_handlers.go around lines 319 to 323, the code calls
DownloadDocumentAsImages and then treats a combined error condition (download
error or invalid page index) as a single BadRequest; instead, first check and
handle the download error (return 500 or appropriate server error with the
download error message) before inspecting imagePaths, then separately validate
pageIdx against len(imagePaths) and return a clear 400 BadRequest if the index
is out of bounds; ensure you do not reference imagePaths length when err != nil
and return distinct, descriptive error responses for the two failure modes.

imageContent, err := os.ReadFile(imagePaths[pageIdx])
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read image file"})
return
}

cancelKey := fmt.Sprintf("%d-%d", parsedID, pageIdx)
reOcrCtx, cancelReOcr := context.WithCancel(c.Request.Context())
defer cancelReOcr()

reOcrCancellersMu.Lock()
if existingCancel, ok := reOcrCancellers[cancelKey]; ok {
existingCancel()
}
reOcrCancellers[cancelKey] = cancelReOcr
reOcrCancellersMu.Unlock()

defer func() {
reOcrCancellersMu.Lock()
delete(reOcrCancellers, cancelKey)
reOcrCancellersMu.Unlock()
}()

result, err := app.ocrProvider.ProcessImage(reOcrCtx, imageContent, pageIdx+1)

if err != nil {
if errors.Is(err, context.Canceled) {
log.Infof("Re-OCR for doc %d page %d cancelled.", parsedID, pageIdx)
c.Status(499)
} else {
log.Errorf("Failed to re-OCR doc %d page %d: %v", parsedID, pageIdx, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to re-OCR page"})
}
return
}
if result == nil {
log.Errorf("Re-OCR for doc %d page %d returned nil result.", parsedID, pageIdx)
c.JSON(http.StatusInternalServerError, gin.H{"error": "Re-OCR returned no result"})
return
}

var genInfoJSON string
if result.GenerationInfo != nil {
if b, err := json.Marshal(result.GenerationInfo); err == nil {
genInfoJSON = string(b)
}
}
Comment on lines +432 to +437

⚠️ Potential issue

Add error handling for JSON marshaling

The JSON marshaling at line 367 could fail, but errors are silently ignored, which could result in empty genInfoJSON being saved.

Handle JSON marshaling errors:

 	var genInfoJSON string
 	if result.GenerationInfo != nil {
-		if b, err := json.Marshal(result.GenerationInfo); err == nil {
+		if b, err := json.Marshal(result.GenerationInfo); err != nil {
+			log.Errorf("Failed to marshal GenerationInfo for doc %d page %d: %v", parsedID, pageIdx, err)
+		} else {
 			genInfoJSON = string(b)
 		}
 	}
📝 Committable suggestion


Suggested change
-	var genInfoJSON string
-	if result.GenerationInfo != nil {
-		if b, err := json.Marshal(result.GenerationInfo); err == nil {
-			genInfoJSON = string(b)
-		}
-	}
+	var genInfoJSON string
+	if result.GenerationInfo != nil {
+		if b, err := json.Marshal(result.GenerationInfo); err != nil {
+			log.Errorf("Failed to marshal GenerationInfo for doc %d page %d: %v", parsedID, pageIdx, err)
+		} else {
+			genInfoJSON = string(b)
+		}
+	}
🤖 Prompt for AI Agents
In app_http_handlers.go around lines 365 to 370, the json.Marshal call for
result.GenerationInfo currently ignores errors which can lead to empty
genInfoJSON; change the code to check the error from json.Marshal, and on error
log the marshaling error with context (including any identifiers like request ID
or generation ID) and return or propagate an appropriate error response (e.g.,
HTTP 500 / internal error) instead of proceeding with an empty string; if
returning is not appropriate in this handler, at minimum set genInfoJSON to a
safe default and log the error so the failure is visible.

saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
if saveErr != nil {
log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
}

c.JSON(http.StatusOK, gin.H{
"text": result.Text,
"ocrLimitHit": result.OcrLimitHit,
"generationInfo": result.GenerationInfo,
})
Comment on lines +438 to +447

⚠️ Potential issue | 🟠 Major

Surface DB write failures to the client

If persisting the page result fails, we currently return 200, so the UI shows success even though nothing was saved. Please fail fast and tell the client so they can retry.

	saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
	if saveErr != nil {
		log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
-	}
-
-	c.JSON(http.StatusOK, gin.H{
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to persist re-OCR result"})
+		return
+	}
+
+	c.JSON(http.StatusOK, gin.H{
📝 Committable suggestion


Suggested change
-	saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
-	if saveErr != nil {
-		log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
-	}
-	c.JSON(http.StatusOK, gin.H{
-		"text": result.Text,
-		"ocrLimitHit": result.OcrLimitHit,
-		"generationInfo": result.GenerationInfo,
-	})
+	saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
+	if saveErr != nil {
+		log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to persist re-OCR result"})
+		return
+	}
+	c.JSON(http.StatusOK, gin.H{
+		"text": result.Text,
+		"ocrLimitHit": result.OcrLimitHit,
+		"generationInfo": result.GenerationInfo,
+	})
🤖 Prompt for AI Agents
In app_http_handlers.go around lines 438-447 the handler logs
SaveSingleOcrPageResult failures but still returns HTTP 200; change the control
flow to return an error response when saveErr != nil by responding with a
non-200 status (e.g., http.StatusInternalServerError or http.StatusBadGateway)
and a concise JSON error body (message field), then immediately return so the
client sees the failure and can retry; do not remove the existing log, but
ensure the error response does not leak sensitive internals.

}

// cancelReOCRPageHandler handles the DELETE request to cancel an ongoing re-OCR for a specific page.
func (app *App) cancelReOCRPageHandler(c *gin.Context) {
id := c.Param("id")
pageIdxStr := c.Param("pageIndex")
parsedID, err := strconv.Atoi(id)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid document ID"})
return
}
pageIdx, err := strconv.Atoi(pageIdxStr)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index"})
return
}

cancelKey := fmt.Sprintf("%d-%d", parsedID, pageIdx)

reOcrCancellersMu.Lock()
cancel, exists := reOcrCancellers[cancelKey]
if exists {
delete(reOcrCancellers, cancelKey)
}
reOcrCancellersMu.Unlock()

if exists {
cancel()
log.Infof("Cancellation requested for re-OCR doc %d page %d", parsedID, pageIdx)
c.Status(http.StatusNoContent)
} else {
log.Warnf("No active re-OCR found to cancel for doc %d page %d", parsedID, pageIdx)
c.JSON(http.StatusNotFound, gin.H{"error": "No active re-OCR operation found for this page"})
}
}

// Section for local-db actions

func (app *App) getModificationHistoryHandler(c *gin.Context) {
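
For reference, a short Go sketch of exercising the new stop endpoint added in `app_http_handlers.go` above. The route comes from the `stopOCRJobHandler` comment; the host, port, and job ID are placeholder assumptions for a local deployment.

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical job ID; in practice this comes from the response that started the OCR job.
	jobID := "example-job-id"

	// POST /api/ocr/jobs/:job_id/stop (path taken from the stopOCRJobHandler comment above).
	// localhost:8080 is an assumed address for a local instance.
	url := fmt.Sprintf("http://localhost:8080/api/ocr/jobs/%s/stop", jobID)
	resp, err := http.Post(url, "application/json", nil)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Expect 204 No Content if the job was running, 404 Not Found otherwise.
	fmt.Println(resp.Status)
}
```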
16 changes: 14 additions & 2 deletions background.go
@@ -222,23 +222,35 @@ func (app *App) processAutoOcrTagDocuments(ctx context.Context) (int, error) {
var err error
if app.docProcessor != nil {
// Use injected processor if available
processedDoc, err = app.docProcessor.ProcessDocumentOCR(ctx, document.ID, options)
processedDoc, err = app.docProcessor.ProcessDocumentOCR(ctx, document.ID, options, "")
} else {
// Use the app's own implementation if no processor is injected
processedDoc, err = app.ProcessDocumentOCR(ctx, document.ID, options)
processedDoc, err = app.ProcessDocumentOCR(ctx, document.ID, options, "")
}

if err != nil {
docLogger.Errorf("OCR processing failed: %v", err)
errs = append(errs, fmt.Errorf("document %d OCR error: %w", document.ID, err))
continue
}
if processedDoc == nil {
docLogger.Info("OCR processing skipped for document")
continue
}
docLogger.Debug("OCR processing completed")

documentSuggestion := DocumentSuggestion{
ID: document.ID,
OriginalDocument: document,
SuggestedContent: processedDoc.Text,
RemoveTags: []string{autoOcrTag},
// Add OCR complete tag if tagging is enabled and PDF wasn't uploaded (upload handles tagging)
AddTags: func() []string {
if app.pdfOCRTagging && !options.UploadPDF {
return []string{app.pdfOCRCompleteTag}
}
return nil
}(),
}

if (app.pdfOCRTagging) && app.pdfOCRCompleteTag != "" {
2 changes: 1 addition & 1 deletion background_test.go
@@ -40,7 +40,7 @@ type mockDocumentProcessor struct {
mockText string
}

func (m *mockDocumentProcessor) ProcessDocumentOCR(ctx context.Context, documentID int, options OCROptions) (*ProcessedDocument, error) {
func (m *mockDocumentProcessor) ProcessDocumentOCR(ctx context.Context, documentID int, options OCROptions, jobID string) (*ProcessedDocument, error) {
return &ProcessedDocument{
ID: documentID,
Text: m.mockText,