diff --git a/README.md b/README.md index 3981ae2d..cd37ace3 100644 --- a/README.md +++ b/README.md @@ -111,6 +111,7 @@ https://github.com/user-attachments/assets/bd5d38b9-9309-40b9-93ca-918dfa4f3fd4 - [Working with Local LLMs](#working-with-local-llms) - [Token Management](#token-management) - [PDF Processing Issues](#pdf-processing-issues) + - [Custom Field Generation Issues](#custom-field-generation-issues) - [Contributing](#contributing) - [Support the Project](#support-the-project) - [License](#license) @@ -220,6 +221,10 @@ services: # Option 4: Docling Server # OCR_PROVIDER: 'docling' # Use a Docling server # DOCLING_URL: 'http://your-docling-server:port' # URL of your Docling instance + # DOCLING_IMAGE_EXPORT_MODE: "placeholder" # Optional, defaults to "embedded" + # DOCLING_OCR_PIPELINE: "standard" # Optional, defaults to "vlm" + # DOCLING_OCR_ENGINE: "easyocr" # Optional, defaults to "easyocr" (only used when `DOCLING_OCR_PIPELINE` is set to 'standard') + AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default: paperless-gpt-ocr-auto OCR_LIMIT_PAGES: "5" # Optional, default: 5. Set to 0 for no limit. @@ -364,6 +369,9 @@ paperless-gpt supports four different OCR providers, each with unique strengths ```yaml OCR_PROVIDER: "docling" DOCLING_URL: "http://your-docling-server:port" + DOCLING_IMAGE_EXPORT_MODE: "placeholder" # Optional, defaults to "embedded" + DOCLING_OCR_PIPELINE: "standard" # Optional, defaults to "vlm" + DOCLING_OCR_ENGINE: "macocr" # Optional, defaults to "easyocr" (only used when `DOCLING_OCR_PIPELINE` is set to 'standard') ``` ## OCR Processing Modes @@ -558,6 +566,8 @@ For best results with the enhanced OCR features: | `GOOGLE_APPLICATION_CREDENTIALS` | Path to the mounted Google service account key. Required if OCR_PROVIDER is `google_docai`. | Cond. | | | `DOCLING_URL` | URL of the Docling server instance. Required if OCR_PROVIDER is `docling`. | Cond. | | | `DOCLING_IMAGE_EXPORT_MODE` | Mode for image export. 
Optional; defaults to `embedded` if unset. | No | embedded | +| `DOCLING_OCR_PIPELINE` | Sets the pipeline type. Optional; defaults to `vlm` if unset. | No | vlm | +| `DOCLING_OCR_ENGINE` | Sets the OCR engine used when `DOCLING_OCR_PIPELINE` is set to `standard`. Optional; defaults to `easyocr` if unset. | No | easyocr | | `CREATE_LOCAL_HOCR` | Whether to save hOCR files locally. | No | false | | `LOCAL_HOCR_PATH` | Path where hOCR files will be saved when hOCR generation is enabled. | No | /app/hocr | | `CREATE_LOCAL_PDF` | Whether to save enhanced PDFs locally. | No | false | diff --git a/main.go b/main.go index 3b3e0950..9c5abbf4 100644 --- a/main.go +++ b/main.go @@ -74,6 +74,8 @@ var ( pdfSkipExistingOCR = os.Getenv("PDF_SKIP_EXISTING_OCR") == "true" doclingURL = os.Getenv("DOCLING_URL") doclingImageExportMode = os.Getenv("DOCLING_IMAGE_EXPORT_MODE") + doclingOCRPipeline = os.Getenv("DOCLING_OCR_PIPELINE") + doclingOCREngine = os.Getenv("DOCLING_OCR_ENGINE") // Templates titleTemplate *template.Template @@ -221,6 +223,8 @@ func main() { MistralModel: os.Getenv("MISTRAL_MODEL"), DoclingURL: doclingURL, DoclingImageExportMode: doclingImageExportMode, + DoclingOCRPipeline: doclingOCRPipeline, + DoclingOCREngine: doclingOCREngine, EnableHOCR: true, // Always generate hOCR struct if provider supports it } @@ -547,9 +551,17 @@ func validateOrDefaultEnvVars() { log.Fatal("Please set the DOCLING_URL environment variable for Docling provider") } if doclingImageExportMode == "" { - doclingImageExportMode = "embedded" // Default to PNG + doclingImageExportMode = "embedded" // Defaults to "embedded" log.Infof("DOCLING_IMAGE_EXPORT_MODE not set, defaulting to %s", doclingImageExportMode) } + if doclingOCRPipeline == "" { + doclingOCRPipeline = "vlm" // Defaults to "vlm" + log.Infof("DOCLING_OCR_PIPELINE not set, defaulting to %s", doclingOCRPipeline) + } + if doclingOCRPipeline == "standard" && doclingOCREngine == "" { + doclingOCREngine = "easyocr" + log.Infof("DOCLING_OCR_ENGINE not 
set, defaulting to %s", doclingOCREngine) + } } if llmModel == "" { diff --git a/ocr/docling_provider.go b/ocr/docling_provider.go index ff312085..90913d75 100644 --- a/ocr/docling_provider.go +++ b/ocr/docling_provider.go @@ -18,6 +18,8 @@ import ( type DoclingProvider struct { baseURL string imageExportMode string + pipeline string + ocrEngine string httpClient *retryablehttp.Client } @@ -37,6 +39,8 @@ func newDoclingProvider(config Config) (*DoclingProvider, error) { provider := &DoclingProvider{ baseURL: config.DoclingURL, imageExportMode: config.DoclingImageExportMode, + pipeline: config.DoclingOCRPipeline, + ocrEngine: config.DoclingOCREngine, httpClient: client, } @@ -77,9 +81,14 @@ func (p *DoclingProvider) ProcessImage(ctx context.Context, imageContent []byte, if err := writer.WriteField("do_ocr", "true"); err != nil { return nil, fmt.Errorf("set do_ocr: %w", err) } - if err := writer.WriteField("pipeline", "vlm"); err != nil { + if err := writer.WriteField("pipeline", p.pipeline); err != nil { return nil, fmt.Errorf("set pipeline: %w", err) } + if p.pipeline == "standard" { + if err := writer.WriteField("ocr_engine", p.ocrEngine); err != nil { + return nil, fmt.Errorf("set ocr_engine: %w", err) + } + } if err := writer.WriteField("image_export_mode", p.imageExportMode); err != nil { return nil, fmt.Errorf("set image_export_mode: %w", err) } @@ -91,7 +100,7 @@ func (p *DoclingProvider) ProcessImage(ctx context.Context, imageContent []byte, } // Create HTTP request - requestURL := p.baseURL + "/v1alpha/convert/file" + requestURL := p.baseURL + "/v1/convert/file" req, err := retryablehttp.NewRequestWithContext(ctx, "POST", requestURL, &requestBody) if err != nil { logger.WithError(err).Error("Failed to create HTTP request") @@ -105,7 +114,8 @@ func (p *DoclingProvider) ProcessImage(ctx context.Context, imageContent []byte, logger.WithFields(logrus.Fields{ "to_formats": "md", "do_ocr": "true", - "pipeline": "vlm", + "pipeline": p.pipeline, + "ocr_engine": 
p.ocrEngine, "image_export_mode": p.imageExportMode, }).Debug("Docling request parameters") diff --git a/ocr/docling_provider_test.go b/ocr/docling_provider_test.go index 3c0a5cff..ae0f0d0d 100644 --- a/ocr/docling_provider_test.go +++ b/ocr/docling_provider_test.go @@ -45,7 +45,7 @@ func TestDoclingProvider_ProcessImage(t *testing.T) { { name: "Success Case", mockHandler: func(w http.ResponseWriter, r *http.Request) { - assert.Equal(t, "/v1alpha/convert/file", r.URL.Path) + assert.Equal(t, "/v1/convert/file", r.URL.Path) assert.Equal(t, "POST", r.Method) assert.Contains(t, r.Header.Get("Content-Type"), "multipart/form-data") assert.Equal(t, "application/json", r.Header.Get("Accept")) diff --git a/ocr/provider.go b/ocr/provider.go index acf330ad..84909420 100644 --- a/ocr/provider.go +++ b/ocr/provider.go @@ -3,6 +3,7 @@ package ocr import ( "context" "fmt" + "strings" "github.com/gardar/ocrchestra/pkg/hocr" "github.com/sirupsen/logrus" @@ -56,6 +57,8 @@ type Config struct { // Docling settings DoclingURL string DoclingImageExportMode string + DoclingOCRPipeline string // Optional, defaults to "vlm" + DoclingOCREngine string // Optional, defaults to "easyocr", if DoclingOCRPipeline == "standard" // OCR output options EnableHOCR bool // Whether to generate hOCR data if supported by the provider @@ -97,6 +100,18 @@ func NewProvider(config Config) (Provider, error) { if config.DoclingURL == "" { return nil, fmt.Errorf("missing required Docling configuration (DOCLING_URL)") } + + config.DoclingOCRPipeline = strings.TrimSpace(config.DoclingOCRPipeline) + config.DoclingOCREngine = strings.TrimSpace(config.DoclingOCREngine) + if config.DoclingOCRPipeline == "" { + config.DoclingOCRPipeline = "vlm" + } + if config.DoclingOCRPipeline == "standard" && config.DoclingOCREngine == "" { + config.DoclingOCREngine = "easyocr" + } + if config.DoclingOCRPipeline != "vlm" && config.DoclingOCRPipeline != "standard" { + return nil, fmt.Errorf("unsupported docling pipeline: %q 
(supported: vlm, standard)", config.DoclingOCRPipeline) + } log.WithField("url", config.DoclingURL).Info("Using Docling provider") return newDoclingProvider(config)