Skip to content
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ https://github.com/user-attachments/assets/bd5d38b9-9309-40b9-93ca-918dfa4f3fd4
- [Working with Local LLMs](#working-with-local-llms)
- [Token Management](#token-management)
- [PDF Processing Issues](#pdf-processing-issues)
- [Custom Field Generation Issues](#custom-field-generation-issues)
- [Contributing](#contributing)
- [Support the Project](#support-the-project)
- [License](#license)
Expand Down Expand Up @@ -220,6 +221,10 @@ services:
# Option 4: Docling Server
# OCR_PROVIDER: 'docling' # Use a Docling server
# DOCLING_URL: 'http://your-docling-server:port' # URL of your Docling instance
# DOCLING_IMAGE_EXPORT_MODE: "placeholder" # Optional, defaults to "embedded"
# DOCLING_OCR_PIPELINE: "standard" # Optional, defaults to "vlm"
# DOCLING_OCR_ENGINE: "easyocr" # Optional, defaults to "easyocr" (only used when `DOCLING_OCR_PIPELINE` is set to 'standard')


AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default: paperless-gpt-ocr-auto
OCR_LIMIT_PAGES: "5" # Optional, default: 5. Set to 0 for no limit.
Expand Down Expand Up @@ -364,6 +369,9 @@ paperless-gpt supports four different OCR providers, each with unique strengths
```yaml
OCR_PROVIDER: "docling"
DOCLING_URL: "http://your-docling-server:port"
DOCLING_IMAGE_EXPORT_MODE: "placeholder" # Optional, defaults to "embedded"
DOCLING_OCR_PIPELINE: "standard" # Optional, defaults to "vlm"
DOCLING_OCR_ENGINE: "macocr" # Optional, defaults to "easyocr" (only used when `DOCLING_OCR_PIPELINE` is set to 'standard')
```

## OCR Processing Modes
Expand Down Expand Up @@ -558,6 +566,8 @@ For best results with the enhanced OCR features:
| `GOOGLE_APPLICATION_CREDENTIALS` | Path to the mounted Google service account key. Required if OCR_PROVIDER is `google_docai`. | Cond. | |
| `DOCLING_URL` | URL of the Docling server instance. Required if OCR_PROVIDER is `docling`. | Cond. | |
| `DOCLING_IMAGE_EXPORT_MODE` | Mode for image export. Optional; defaults to `embedded` if unset. | No | embedded |
| `DOCLING_OCR_PIPELINE` | Sets the pipeline type. Optional; defaults to `vlm` if unset. | No | vlm |
| `DOCLING_OCR_ENGINE` | Sets the OCR engine; only used when `DOCLING_OCR_PIPELINE` is set to `standard`. Optional; defaults to `easyocr` if unset. | No | easyocr |
| `CREATE_LOCAL_HOCR` | Whether to save hOCR files locally. | No | false |
| `LOCAL_HOCR_PATH` | Path where hOCR files will be saved when hOCR generation is enabled. | No | /app/hocr |
| `CREATE_LOCAL_PDF` | Whether to save enhanced PDFs locally. | No | false |
Expand Down
14 changes: 13 additions & 1 deletion main.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ var (
pdfSkipExistingOCR = os.Getenv("PDF_SKIP_EXISTING_OCR") == "true"
doclingURL = os.Getenv("DOCLING_URL")
doclingImageExportMode = os.Getenv("DOCLING_IMAGE_EXPORT_MODE")
doclingOCRPipeline = os.Getenv("DOCLING_OCR_PIPELINE")
doclingOCREngine = os.Getenv("DOCLING_OCR_ENGINE")

// Templates
titleTemplate *template.Template
Expand Down Expand Up @@ -221,6 +223,8 @@ func main() {
MistralModel: os.Getenv("MISTRAL_MODEL"),
DoclingURL: doclingURL,
DoclingImageExportMode: doclingImageExportMode,
DoclingOCRPipeline: doclingOCRPipeline,
DoclingOCREngine: doclingOCREngine,
EnableHOCR: true, // Always generate hOCR struct if provider supports it
}

Expand Down Expand Up @@ -547,9 +551,17 @@ func validateOrDefaultEnvVars() {
log.Fatal("Please set the DOCLING_URL environment variable for Docling provider")
}
if doclingImageExportMode == "" {
doclingImageExportMode = "embedded" // Default to PNG
doclingImageExportMode = "embedded" // Defaults to "embedded"
log.Infof("DOCLING_IMAGE_EXPORT_MODE not set, defaulting to %s", doclingImageExportMode)
}
if doclingOCRPipeline == "" {
doclingOCRPipeline = "vlm" // Defaults to "vlm"
log.Infof("DOCLING_OCR_PIPELINE not set, defaulting to %s", doclingOCRPipeline)
}
if doclingOCRPipeline == "standard" && doclingOCREngine == "" {
doclingOCREngine = "easyocr"
log.Infof("DOCLING_OCR_ENGINE not set, defaulting to %s", doclingOCREngine)
}
}

if llmModel == "" {
Expand Down
16 changes: 13 additions & 3 deletions ocr/docling_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ import (
// DoclingProvider holds the settings used to send OCR requests to a Docling
// server. Its fields are populated from Config by newDoclingProvider.
type DoclingProvider struct {
// baseURL is the Docling server URL (Config.DoclingURL); the convert
// endpoint path is appended to it when building the request URL.
baseURL string
// imageExportMode is sent as the "image_export_mode" form field.
imageExportMode string
// pipeline is sent as the "pipeline" form field.
pipeline string
// ocrEngine is sent as the "ocr_engine" form field, but only when
// pipeline is "standard".
ocrEngine string
// httpClient is the retryable HTTP client set by the constructor;
// presumably used to execute requests (call site not visible here).
httpClient *retryablehttp.Client
}

Expand All @@ -37,6 +39,8 @@ func newDoclingProvider(config Config) (*DoclingProvider, error) {
provider := &DoclingProvider{
baseURL: config.DoclingURL,
imageExportMode: config.DoclingImageExportMode,
pipeline: config.DoclingOCRPipeline,
ocrEngine: config.DoclingOCREngine,
httpClient: client,
}
Comment on lines 39 to 45
Copy link
Contributor

@coderabbitai coderabbitai bot Sep 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Default/validate pipeline and engine inside provider.

Protect against empty values when callers omit them. This keeps the ocr package self-contained.

 provider := &DoclingProvider{
   baseURL:         config.DoclingURL,
   imageExportMode: config.DoclingImageExportMode,
-  pipeline:        config.DoclingOCRPipeline,
-  ocrEngine:       config.DoclingOCREngine,
+  pipeline:        config.DoclingOCRPipeline,
+  ocrEngine:       config.DoclingOCREngine,
   httpClient:      client,
 }
+
+// Fallbacks and minimal validation
+if provider.pipeline == "" {
+  provider.pipeline = "vlm"
+}
+if provider.pipeline == "standard" && provider.ocrEngine == "" {
+  provider.ocrEngine = "easyocr"
+}
+if provider.pipeline != "vlm" && provider.pipeline != "standard" {
+  return nil, fmt.Errorf("unsupported docling pipeline: %q (supported: vlm, standard)", provider.pipeline)
+}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
provider := &DoclingProvider{
baseURL: config.DoclingURL,
imageExportMode: config.DoclingImageExportMode,
pipeline: config.DoclingOCRPipeline,
ocrEngine: config.DoclingOCREngine,
httpClient: client,
}
provider := &DoclingProvider{
baseURL: config.DoclingURL,
imageExportMode: config.DoclingImageExportMode,
pipeline: config.DoclingOCRPipeline,
ocrEngine: config.DoclingOCREngine,
httpClient: client,
}
// Fallbacks and minimal validation
if provider.pipeline == "" {
provider.pipeline = "vlm"
}
if provider.pipeline == "standard" && provider.ocrEngine == "" {
provider.ocrEngine = "easyocr"
}
if provider.pipeline != "vlm" && provider.pipeline != "standard" {
return nil, fmt.Errorf(
"unsupported docling pipeline: %q (supported: vlm, standard)",
provider.pipeline,
)
}
🤖 Prompt for AI Agents
In ocr/docling_provider.go around lines 39 to 45, the provider is created
directly from config without validating or defaulting pipeline/engine; update
the constructor to check if config.DoclingOCRPipeline or config.DoclingOCREngine
are empty and replace them with explicit fallback constants (e.g. a named
default pipeline and default engine), or return an error if missing; ensure
defaults live near the provider code (constants) and that the provider fields
are set to the validated/defaulted values so callers can omit those config
fields safely.

Copy link
Contributor Author

@T-Eberle T-Eberle Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point.

However, I would recommend creating an interface for ALL OCR engines with a validate method, as I believe it would simplify the code.

We could then use the interface to add more methods to make it easier to add upcoming OCR methods.

Currently, the validation starts in the provider.go file, in the NewProvider function. I added the validation for Docling in the same function to keep validation concise.

This should be another issue and not be considered in this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@T-Eberle You make an excellent architectural point! A unified validation interface across all OCR providers would indeed be much cleaner and more maintainable than adding validation piecemeal to individual providers.

You're absolutely right that this broader refactoring should be a separate issue rather than expanding the scope of this PR, which is focused on the Docling API version update and new environment variables.

Would you like me to create a follow-up issue to track the OCR provider validation interface improvement? I can outline the proposal for:

  • Creating a common validation interface for all OCR providers
  • Moving validation logic from provider.go's NewProvider into individual provider implementations
  • Establishing consistent patterns for future OCR provider additions

This would ensure the architectural improvement gets proper attention and discussion without delaying your current changes.

🐰
💭 thinking about clean architecture
|
🔧

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. See #624

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@T-Eberle Perfect! Thank you for creating the follow-up issue #624 to track the OCR provider validation interface improvement. This keeps your current PR focused while ensuring the architectural enhancement gets proper attention later.

Your approach of maintaining separation of concerns is spot-on - the Docling API updates in this PR are clean and straightforward without getting mixed up with broader refactoring work.


If you found this review helpful, would you consider giving us a shout-out on X?

Thank you for using CodeRabbit!


Expand Down Expand Up @@ -77,9 +81,14 @@ func (p *DoclingProvider) ProcessImage(ctx context.Context, imageContent []byte,
if err := writer.WriteField("do_ocr", "true"); err != nil {
return nil, fmt.Errorf("set do_ocr: %w", err)
}
if err := writer.WriteField("pipeline", "vlm"); err != nil {
if err := writer.WriteField("pipeline", p.pipeline); err != nil {
return nil, fmt.Errorf("set pipeline: %w", err)
}
if p.pipeline == "standard" {
if err := writer.WriteField("ocr_engine", p.ocrEngine); err != nil {
return nil, fmt.Errorf("set ocr_engine: %w", err)
}
}
if err := writer.WriteField("image_export_mode", p.imageExportMode); err != nil {
return nil, fmt.Errorf("set image_export_mode: %w", err)
}
Expand All @@ -91,7 +100,7 @@ func (p *DoclingProvider) ProcessImage(ctx context.Context, imageContent []byte,
}

// Create HTTP request
requestURL := p.baseURL + "/v1alpha/convert/file"
requestURL := p.baseURL + "/v1/convert/file"
req, err := retryablehttp.NewRequestWithContext(ctx, "POST", requestURL, &requestBody)
if err != nil {
logger.WithError(err).Error("Failed to create HTTP request")
Expand All @@ -105,7 +114,8 @@ func (p *DoclingProvider) ProcessImage(ctx context.Context, imageContent []byte,
logger.WithFields(logrus.Fields{
"to_formats": "md",
"do_ocr": "true",
"pipeline": "vlm",
"pipeline": p.pipeline,
"ocr_engine": p.ocrEngine,
"image_export_mode": p.imageExportMode,
}).Debug("Docling request parameters")

Expand Down
2 changes: 1 addition & 1 deletion ocr/docling_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func TestDoclingProvider_ProcessImage(t *testing.T) {
{
name: "Success Case",
mockHandler: func(w http.ResponseWriter, r *http.Request) {
assert.Equal(t, "/v1alpha/convert/file", r.URL.Path)
assert.Equal(t, "/v1/convert/file", r.URL.Path)
assert.Equal(t, "POST", r.Method)
assert.Contains(t, r.Header.Get("Content-Type"), "multipart/form-data")
assert.Equal(t, "application/json", r.Header.Get("Accept"))
Expand Down
15 changes: 15 additions & 0 deletions ocr/provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package ocr
import (
"context"
"fmt"
"strings"

"github.com/gardar/ocrchestra/pkg/hocr"
"github.com/sirupsen/logrus"
Expand Down Expand Up @@ -56,6 +57,8 @@ type Config struct {
// Docling settings
DoclingURL string
DoclingImageExportMode string
DoclingOCRPipeline string // Optional, defaults to "vlm"
DoclingOCREngine string // Optional, defaults to "easyocr", if DoclingOCRPipeline == "standard"

// OCR output options
EnableHOCR bool // Whether to generate hOCR data if supported by the provider
Expand Down Expand Up @@ -97,6 +100,18 @@ func NewProvider(config Config) (Provider, error) {
if config.DoclingURL == "" {
return nil, fmt.Errorf("missing required Docling configuration (DOCLING_URL)")
}

config.DoclingOCRPipeline = strings.TrimSpace(config.DoclingOCRPipeline)
config.DoclingOCREngine = strings.TrimSpace(config.DoclingOCREngine)
if config.DoclingOCRPipeline == "" {
config.DoclingOCRPipeline = "vlm"
}
if config.DoclingOCRPipeline == "standard" && config.DoclingOCREngine == "" {
config.DoclingOCREngine = "easyocr"
}
if config.DoclingOCRPipeline != "vlm" && config.DoclingOCRPipeline != "standard" {
return nil, fmt.Errorf("unsupported docling pipeline: %q (supported: vlm, standard)", config.DoclingOCRPipeline)
}
log.WithField("url", config.DoclingURL).Info("Using Docling provider")
return newDoclingProvider(config)

Expand Down
Loading