Merged
8 changes: 8 additions & 0 deletions README.md
@@ -174,6 +174,7 @@ services:
# LLM_PROVIDER: "ollama"
# LLM_MODEL: "qwen3:8b"
# OLLAMA_HOST: "http://host.docker.internal:11434"
# OLLAMA_CONTEXT_LENGTH: "8192" # Sets Ollama NumCtx (context window)
# TOKEN_LIMIT: 1000 # Recommended for smaller models

# Optional LLM Settings
@@ -547,6 +548,10 @@ For best results with the enhanced OCR features:
| `VISION_LLM_REQUESTS_PER_MINUTE` | Maximum requests per minute for the Vision LLM. Useful for managing API costs or local LLM load. | No | 120 |
| `VISION_LLM_MAX_RETRIES` | Maximum retry attempts for failed Vision LLM requests. | No | 3 |
| `VISION_LLM_BACKOFF_MAX_WAIT` | Maximum wait time between retries for the Vision LLM (e.g., `30s`). | No | 30s |
| `VISION_LLM_MAX_TOKENS` | Maximum tokens for Vision LLM OCR output. | No | |
| `VISION_LLM_TEMPERATURE` | Sampling temperature for Vision OCR generation. Lower is more deterministic. Important: For OpenAI GPT-5 it must be explicitly set to `1.0`. | No | |
| `OLLAMA_CONTEXT_LENGTH` | (Ollama only) Integer. Sets NumCtx (context window) for the Ollama runner. If unset or 0, the model default is used. | No | |
| `OLLAMA_OCR_TOP_K` | (Ollama only) Top-k token sampling for Vision OCR. Lower favors more likely tokens; higher increases diversity. | No | |
| `AZURE_DOCAI_ENDPOINT` | Azure Document Intelligence endpoint. Required if OCR_PROVIDER is `azure`. | Cond. | |
| `AZURE_DOCAI_KEY` | Azure Document Intelligence API key. Required if OCR_PROVIDER is `azure`. | Cond. | |
| `AZURE_DOCAI_MODEL_ID` | Azure Document Intelligence model ID. Optional if using `azure` provider. | No | prebuilt-read |
@@ -886,6 +891,7 @@ When using local LLMs (like those through Ollama), you might need to adjust cert
#### Token Management

- Use `TOKEN_LIMIT` environment variable to control the maximum number of tokens sent to the LLM
- For Ollama, set `OLLAMA_CONTEXT_LENGTH` to control the model's context window (NumCtx). This is independent of `TOKEN_LIMIT` and configures the server-side KV cache size. If unset or 0, the model default is used. Choose a value within the model's supported window (e.g., 8192).
- Smaller models might truncate content unexpectedly if given too much text
- Start with a conservative limit (e.g., 1000 tokens) and adjust based on your model's capabilities
- Set to `0` to disable the limit (use with caution)
@@ -895,13 +901,15 @@ Example configuration for smaller models:
```yaml
environment:
TOKEN_LIMIT: "2000" # Adjust based on your model's context window
OLLAMA_CONTEXT_LENGTH: "4096" # Controls Ollama NumCtx (context window); if unset, model default is used
LLM_PROVIDER: "ollama"
LLM_MODEL: "qwen3:8b" # Or other local model
```

Common issues and solutions:

- If you see truncated or incomplete responses, try lowering the `TOKEN_LIMIT`
- On Ollama, if you hit "context length exceeded" or memory issues, reduce `OLLAMA_CONTEXT_LENGTH` or choose a smaller model/context size.
- If processing is too limited, gradually increase the limit while monitoring performance
- For models with larger context windows, you can increase the limit or disable it entirely

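
A minimal sketch of how the `OLLAMA_CONTEXT_LENGTH` variable documented above maps onto Ollama's `num_ctx` request option. The struct and the use of `os.Getenv` here are illustrative assumptions; the application may read the variable and pass the option through a client library instead.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// ollamaOptions mirrors the "options" object accepted by Ollama's /api/generate endpoint.
type ollamaOptions struct {
	NumCtx int `json:"num_ctx,omitempty"` // context window (NumCtx); omitted when 0 so the model default applies
}

func main() {
	opts := ollamaOptions{}
	// OLLAMA_CONTEXT_LENGTH unset or 0 leaves NumCtx at 0, i.e. the model default is used.
	if n, err := strconv.Atoi(os.Getenv("OLLAMA_CONTEXT_LENGTH")); err == nil && n > 0 {
		opts.NumCtx = n
	}
	fmt.Printf("resolved num_ctx: %d (0 = model default)\n", opts.NumCtx)
}
```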
179 changes: 174 additions & 5 deletions app_http_handlers.go
@@ -1,8 +1,10 @@
package main

import (
"context"
"bytes"
"encoding/json"
"errors"
"fmt"
"net/http"
"os"
@@ -252,11 +254,12 @@ func (app *App) getJobStatusHandler(c *gin.Context) {
}

response := gin.H{
"job_id": job.ID,
"status": job.Status,
"created_at": job.CreatedAt,
"updated_at": job.UpdatedAt,
"pages_done": job.PagesDone,
"job_id": job.ID,
"status": job.Status,
"created_at": job.CreatedAt,
"updated_at": job.UpdatedAt,
"pages_done": job.PagesDone,
"total_pages": job.TotalPages,
}

if job.Status == "completed" {
Expand Down Expand Up @@ -293,6 +296,20 @@ func (app *App) getAllJobsHandler(c *gin.Context) {
c.JSON(http.StatusOK, jobList)
}

// POST /api/ocr/jobs/:job_id/stop
func (app *App) stopOCRJobHandler(c *gin.Context) {
jobID := c.Param("job_id")
jobCancellersMu.Lock()
cancel, exists := jobCancellers[jobID]
jobCancellersMu.Unlock()
if !exists {
c.JSON(http.StatusNotFound, gin.H{"error": "No running job with this ID"})
return
}
cancel()
c.Status(http.StatusNoContent)
}

// getDocumentHandler handles the retrieval of a document by its ID
func (app *App) getDocumentHandler() gin.HandlerFunc {
return func(c *gin.Context) {
@@ -312,6 +329,158 @@ func (app *App) getDocumentHandler() gin.HandlerFunc {
}
}

// getOCRPagesHandler returns per-page OCR results for a document
func (app *App) getOCRPagesHandler(c *gin.Context) {
id := c.Param("id")
parsedID, err := strconv.Atoi(id)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid document ID"})
return
}

dbResults, err := GetOcrPageResults(app.Database, parsedID)
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to fetch OCR page results"})
return
}

type OCRPageResult struct {
Text string `json:"text"`
OcrLimitHit bool `json:"ocrLimitHit"`
GenerationInfo map[string]interface{} `json:"generationInfo,omitempty"`
}

var pages []OCRPageResult
for _, res := range dbResults {
var genInfo map[string]interface{}
if res.GenerationInfo != "" {
_ = json.Unmarshal([]byte(res.GenerationInfo), &genInfo)
}
pages = append(pages, OCRPageResult{
Text: res.Text,
OcrLimitHit: res.OcrLimitHit,
GenerationInfo: genInfo,
})
}

c.JSON(http.StatusOK, gin.H{
"pages": pages,
})
}
Comment on lines +333 to +369

⚠️ Potential issue

Add error handling for JSON unmarshaling

The JSON unmarshaling of GenerationInfo at line 290 ignores errors silently. While this may be intentional for backward compatibility, it could hide data corruption issues.

Add error logging for failed JSON unmarshaling:

 		var genInfo map[string]interface{}
 		if res.GenerationInfo != "" {
-			_ = json.Unmarshal([]byte(res.GenerationInfo), &genInfo)
+			if err := json.Unmarshal([]byte(res.GenerationInfo), &genInfo); err != nil {
+				log.Warnf("Failed to unmarshal GenerationInfo for doc %d page %d: %v", parsedID, i, err)
+			}
 		}

Committable suggestion skipped: line range outside the PR's diff.

🤖 Prompt for AI Agents
In app_http_handlers.go around lines 266 to 302, the json.Unmarshal call for
res.GenerationInfo ignores errors; update the loop to capture the unmarshaling
error and log it (including enough context such as parsedID and res identifiers)
instead of discarding it. Specifically, change the unmarshal to if err :=
json.Unmarshal([]byte(res.GenerationInfo), &genInfo); err != nil { /* log error
with context using app logger or the standard logger */ } so the handler records
bad/malformed GenerationInfo while continuing to return other page data.
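
The suggested diff above references a loop index `i` that the current `for _, res := range dbResults` loop does not declare. A minimal sketch of the adjusted loop, assuming the range is switched to an indexed form so the page number is available for logging (the other names come from the handler shown above):

```go
// Sketch: use an indexed range so a failed unmarshal can be logged with the page number.
for i, res := range dbResults {
	var genInfo map[string]interface{}
	if res.GenerationInfo != "" {
		if err := json.Unmarshal([]byte(res.GenerationInfo), &genInfo); err != nil {
			// Log and continue; one malformed GenerationInfo should not block the other pages.
			log.Warnf("Failed to unmarshal GenerationInfo for doc %d page %d: %v", parsedID, i, err)
		}
	}
	pages = append(pages, OCRPageResult{
		Text:           res.Text,
		OcrLimitHit:    res.OcrLimitHit,
		GenerationInfo: genInfo,
	})
}
```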


func (app *App) reOCRPageHandler(c *gin.Context) {
id := c.Param("id")
pageIdxStr := c.Param("pageIndex")
parsedID, err := strconv.Atoi(id)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid document ID"})
return
}
pageIdx, err := strconv.Atoi(pageIdxStr)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index"})
return
}

// Download all images for the document, but only process the requested page
imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
if err != nil || pageIdx < 0 || pageIdx >= len(imagePaths) {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index or failed to download images"})
return
}
Comment on lines +386 to +390

🛠️ Refactor suggestion

Verify image index bounds before file access

The code downloads the images and then checks whether pageIdx is within bounds, but a download failure and an out-of-range index are collapsed into a single 400 response, so the error message combines two different error conditions.

Separate the error conditions for clarity:

 	imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
-	if err != nil || pageIdx < 0 || pageIdx >= len(imagePaths) {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index or failed to download images"})
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to download images"})
+		return
+	}
+	if pageIdx < 0 || pageIdx >= len(imagePaths) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Invalid page index %d, document has %d pages", pageIdx, len(imagePaths))})
 		return
 	}
📝 Committable suggestion


Suggested change
-	imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
-	if err != nil || pageIdx < 0 || pageIdx >= len(imagePaths) {
-		c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index or failed to download images"})
-		return
-	}
+	imagePaths, _, err := app.Client.DownloadDocumentAsImages(c.Request.Context(), parsedID, limitOcrPages)
+	if err != nil {
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to download images"})
+		return
+	}
+	if pageIdx < 0 || pageIdx >= len(imagePaths) {
+		c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("Invalid page index %d, document has %d pages", pageIdx, len(imagePaths))})
+		return
+	}
🤖 Prompt for AI Agents
In app_http_handlers.go around lines 319 to 323, the code calls
DownloadDocumentAsImages and then treats a combined error condition (download
error or invalid page index) as a single BadRequest; instead, first check and
handle the download error (return 500 or appropriate server error with the
download error message) before inspecting imagePaths, then separately validate
pageIdx against len(imagePaths) and return a clear 400 BadRequest if the index
is out of bounds; ensure you do not reference imagePaths length when err != nil
and return distinct, descriptive error responses for the two failure modes.

imageContent, err := os.ReadFile(imagePaths[pageIdx])
if err != nil {
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to read image file"})
return
}

cancelKey := fmt.Sprintf("%d-%d", parsedID, pageIdx)
reOcrCtx, cancelReOcr := context.WithCancel(c.Request.Context())
defer cancelReOcr()

reOcrCancellersMu.Lock()
if existingCancel, ok := reOcrCancellers[cancelKey]; ok {
existingCancel()
}
reOcrCancellers[cancelKey] = cancelReOcr
reOcrCancellersMu.Unlock()

defer func() {
reOcrCancellersMu.Lock()
delete(reOcrCancellers, cancelKey)
reOcrCancellersMu.Unlock()
}()

result, err := app.ocrProvider.ProcessImage(reOcrCtx, imageContent, pageIdx+1)

if err != nil {
if errors.Is(err, context.Canceled) {
log.Infof("Re-OCR for doc %d page %d cancelled.", parsedID, pageIdx)
c.Status(499)
} else {
log.Errorf("Failed to re-OCR doc %d page %d: %v", parsedID, pageIdx, err)
c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to re-OCR page"})
}
return
}
if result == nil {
log.Errorf("Re-OCR for doc %d page %d returned nil result.", parsedID, pageIdx)
c.JSON(http.StatusInternalServerError, gin.H{"error": "Re-OCR returned no result"})
return
}

var genInfoJSON string
if result.GenerationInfo != nil {
if b, err := json.Marshal(result.GenerationInfo); err == nil {
genInfoJSON = string(b)
}
}
Comment on lines +432 to +437

⚠️ Potential issue

Add error handling for JSON marshaling

The JSON marshaling at line 367 could fail, but errors are silently ignored, which could result in empty genInfoJSON being saved.

Handle JSON marshaling errors:

 	var genInfoJSON string
 	if result.GenerationInfo != nil {
-		if b, err := json.Marshal(result.GenerationInfo); err == nil {
+		if b, err := json.Marshal(result.GenerationInfo); err != nil {
+			log.Errorf("Failed to marshal GenerationInfo for doc %d page %d: %v", parsedID, pageIdx, err)
+		} else {
 			genInfoJSON = string(b)
 		}
 	}
📝 Committable suggestion


Suggested change
-	var genInfoJSON string
-	if result.GenerationInfo != nil {
-		if b, err := json.Marshal(result.GenerationInfo); err == nil {
-			genInfoJSON = string(b)
-		}
-	}
+	var genInfoJSON string
+	if result.GenerationInfo != nil {
+		if b, err := json.Marshal(result.GenerationInfo); err != nil {
+			log.Errorf("Failed to marshal GenerationInfo for doc %d page %d: %v", parsedID, pageIdx, err)
+		} else {
+			genInfoJSON = string(b)
+		}
+	}
🤖 Prompt for AI Agents
In app_http_handlers.go around lines 365 to 370, the json.Marshal call for
result.GenerationInfo currently ignores errors which can lead to empty
genInfoJSON; change the code to check the error from json.Marshal, and on error
log the marshaling error with context (including any identifiers like request ID
or generation ID) and return or propagate an appropriate error response (e.g.,
HTTP 500 / internal error) instead of proceeding with an empty string; if
returning is not appropriate in this handler, at minimum set genInfoJSON to a
safe default and log the error so the failure is visible.

saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
if saveErr != nil {
log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
}

c.JSON(http.StatusOK, gin.H{
"text": result.Text,
"ocrLimitHit": result.OcrLimitHit,
"generationInfo": result.GenerationInfo,
})
Comment on lines +438 to +447

⚠️ Potential issue | 🟠 Major

Surface DB write failures to the client

If persisting the page result fails, we currently return 200, so the UI shows success even though nothing was saved. Please fail fast and tell the client so they can retry.

	saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
	if saveErr != nil {
		log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
-	}
-
-	c.JSON(http.StatusOK, gin.H{
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to persist re-OCR result"})
+		return
+	}
+
+	c.JSON(http.StatusOK, gin.H{
📝 Committable suggestion


Suggested change
-	saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
-	if saveErr != nil {
-		log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
-	}
-	c.JSON(http.StatusOK, gin.H{
-		"text": result.Text,
-		"ocrLimitHit": result.OcrLimitHit,
-		"generationInfo": result.GenerationInfo,
-	})
+	saveErr := SaveSingleOcrPageResult(app.Database, parsedID, pageIdx, result.Text, result.OcrLimitHit, genInfoJSON)
+	if saveErr != nil {
+		log.Errorf("Failed to save re-OCR result for doc %d page %d: %v", parsedID, pageIdx, saveErr)
+		c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to persist re-OCR result"})
+		return
+	}
+	c.JSON(http.StatusOK, gin.H{
+		"text": result.Text,
+		"ocrLimitHit": result.OcrLimitHit,
+		"generationInfo": result.GenerationInfo,
+	})
🤖 Prompt for AI Agents
In app_http_handlers.go around lines 438-447 the handler logs
SaveSingleOcrPageResult failures but still returns HTTP 200; change the control
flow to return an error response when saveErr != nil by responding with a
non-200 status (e.g., http.StatusInternalServerError or http.StatusBadGateway)
and a concise JSON error body (message field), then immediately return so the
client sees the failure and can retry; do not remove the existing log, but
ensure the error response does not leak sensitive internals.

}

// cancelReOCRPageHandler handles the DELETE request to cancel an ongoing re-OCR for a specific page.
func (app *App) cancelReOCRPageHandler(c *gin.Context) {
id := c.Param("id")
pageIdxStr := c.Param("pageIndex")
parsedID, err := strconv.Atoi(id)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid document ID"})
return
}
pageIdx, err := strconv.Atoi(pageIdxStr)
if err != nil {
c.JSON(http.StatusBadRequest, gin.H{"error": "Invalid page index"})
return
}

cancelKey := fmt.Sprintf("%d-%d", parsedID, pageIdx)

reOcrCancellersMu.Lock()
cancel, exists := reOcrCancellers[cancelKey]
if exists {
delete(reOcrCancellers, cancelKey)
}
reOcrCancellersMu.Unlock()

if exists {
cancel()
log.Infof("Cancellation requested for re-OCR doc %d page %d", parsedID, pageIdx)
c.Status(http.StatusNoContent)
} else {
log.Warnf("No active re-OCR found to cancel for doc %d page %d", parsedID, pageIdx)
c.JSON(http.StatusNotFound, gin.H{"error": "No active re-OCR operation found for this page"})
}
}

// Section for local-db actions

func (app *App) getModificationHistoryHandler(c *gin.Context) {
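
For reference, a short Go sketch of exercising the new stop endpoint added in `app_http_handlers.go` above. The route comes from the `stopOCRJobHandler` comment; the host, port, and job ID are placeholder assumptions for a local deployment.

```go
package main

import (
	"fmt"
	"net/http"
)

func main() {
	// Hypothetical job ID; in practice this comes from the response that started the OCR job.
	jobID := "example-job-id"

	// POST /api/ocr/jobs/:job_id/stop (path taken from the stopOCRJobHandler comment above).
	// localhost:8080 is an assumed address for a local instance.
	url := fmt.Sprintf("http://localhost:8080/api/ocr/jobs/%s/stop", jobID)
	resp, err := http.Post(url, "application/json", nil)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	// Expect 204 No Content if the job was running, 404 Not Found otherwise.
	fmt.Println(resp.Status)
}
```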
16 changes: 14 additions & 2 deletions background.go
@@ -222,23 +222,35 @@ func (app *App) processAutoOcrTagDocuments(ctx context.Context) (int, error) {
var err error
if app.docProcessor != nil {
// Use injected processor if available
processedDoc, err = app.docProcessor.ProcessDocumentOCR(ctx, document.ID, options)
processedDoc, err = app.docProcessor.ProcessDocumentOCR(ctx, document.ID, options, "")
} else {
// Use the app's own implementation if no processor is injected
processedDoc, err = app.ProcessDocumentOCR(ctx, document.ID, options)
processedDoc, err = app.ProcessDocumentOCR(ctx, document.ID, options, "")
}

if err != nil {
docLogger.Errorf("OCR processing failed: %v", err)
errs = append(errs, fmt.Errorf("document %d OCR error: %w", document.ID, err))
continue
}
if processedDoc == nil {
docLogger.Info("OCR processing skipped for document")
continue
}
docLogger.Debug("OCR processing completed")

documentSuggestion := DocumentSuggestion{
ID: document.ID,
OriginalDocument: document,
SuggestedContent: processedDoc.Text,
RemoveTags: []string{autoOcrTag},
// Add OCR complete tag if tagging is enabled and PDF wasn't uploaded (upload handles tagging)
AddTags: func() []string {
if app.pdfOCRTagging && !options.UploadPDF {
return []string{app.pdfOCRCompleteTag}
}
return nil
}(),
}

if (app.pdfOCRTagging) && app.pdfOCRCompleteTag != "" {
2 changes: 1 addition & 1 deletion background_test.go
@@ -40,7 +40,7 @@ type mockDocumentProcessor struct {
mockText string
}

func (m *mockDocumentProcessor) ProcessDocumentOCR(ctx context.Context, documentID int, options OCROptions) (*ProcessedDocument, error) {
func (m *mockDocumentProcessor) ProcessDocumentOCR(ctx context.Context, documentID int, options OCROptions, jobID string) (*ProcessedDocument, error) {
return &ProcessedDocument{
ID: documentID,
Text: m.mockText,