diff --git a/README.md b/README.md index 175bf976..068d3407 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,10 @@ services: # OCR_PROVIDER: 'docling' # Use a Docling server # DOCLING_URL: 'http://your-docling-server:port' # URL of your Docling instance + # Option 5: iOS-OCR-Server + # OCR_PROVIDER: 'ios_ocr' # Use iOS-OCR-Server + # IOS_OCR_SERVER_URL: 'http://your-ios-device-ip:8080' # URL of your iOS device running iOS-OCR-Server + AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default: paperless-gpt-ocr-auto OCR_LIMIT_PAGES: "5" # Optional, default: 5. Set to 0 for no limit. LOG_LEVEL: "info" # Optional: debug, warn, error @@ -363,6 +367,24 @@ paperless-gpt supports four different OCR providers, each with unique strengths DOCLING_URL: "http://your-docling-server:port" ``` +### 5. iOS-OCR-Server + +- **Key Features**: + - Turn your iPhone into a powerful local OCR server using Apple's Vision Framework + - No cloud dependencies, unlimited usage, complete privacy + - High-quality OCR using Apple's native Vision Framework + - Returns both text and bounding box information +- **Best For**: + - Users with an iPhone/iPad who want high-quality OCR + - Privacy-focused environments requiring local processing + - Cost-effective solution for unlimited OCR processing +- **Configuration**: + ```yaml + OCR_PROVIDER: "ios_ocr" + IOS_OCR_SERVER_URL: "http://your-ios-device-ip:8080" + ``` +- **Setup**: Follow the instructions at [iOS-OCR-Server](https://github.com/riddleling/iOS-OCR-Server) to set up the server on your iOS device. + ## OCR Processing Modes paperless-gpt offers different methods for processing documents, giving you flexibility based on your needs and OCR provider capabilities: @@ -555,6 +577,7 @@ For best results with the enhanced OCR features: | `GOOGLE_APPLICATION_CREDENTIALS` | Path to the mounted Google service account key. Required if OCR_PROVIDER is `google_docai`. | Cond. | | | `DOCLING_URL` | URL of the Docling server instance. Required if OCR_PROVIDER is `docling`. | Cond. | | | `DOCLING_IMAGE_EXPORT_MODE` | Mode for image export. Optional; defaults to `embedded` if unset. | No | embedded | +| `IOS_OCR_SERVER_URL` | URL of the iOS-OCR-Server instance. Required if OCR_PROVIDER is `ios_ocr`. | Cond. | | | `CREATE_LOCAL_HOCR` | Whether to save hOCR files locally. | No | false | | `LOCAL_HOCR_PATH` | Path where hOCR files will be saved when hOCR generation is enabled. | No | /app/hocr | | `CREATE_LOCAL_PDF` | Whether to save enhanced PDFs locally. | No | false | diff --git a/main.go b/main.go index 41934556..41449918 100644 --- a/main.go +++ b/main.go @@ -74,6 +74,7 @@ var ( pdfSkipExistingOCR = os.Getenv("PDF_SKIP_EXISTING_OCR") == "true" doclingURL = os.Getenv("DOCLING_URL") doclingImageExportMode = os.Getenv("DOCLING_IMAGE_EXPORT_MODE") + iosOCRServerURL = os.Getenv("IOS_OCR_SERVER_URL") // Templates titleTemplate *template.Template @@ -220,6 +221,7 @@ func main() { MistralModel: os.Getenv("MISTRAL_MODEL"), DoclingURL: doclingURL, DoclingImageExportMode: doclingImageExportMode, + IOSOCRServerURL: iosOCRServerURL, EnableHOCR: true, // Always generate hOCR struct if provider supports it } @@ -455,6 +457,7 @@ func validateOCRProviderModeCompatibility(provider, mode string) error { "google_docai": {"image", "pdf", "whole_pdf"}, // Google Document AI supports all modes "mistral_ocr": {"image", "pdf", "whole_pdf"}, // Mistral OCR supports all modes "docling": {"image"}, // Docling only supports image mode + "ios_ocr": {"image"}, // iOS-OCR-Server only supports image mode } modes, exists := supportedModes[provider] @@ -542,6 +545,12 @@ func validateOrDefaultEnvVars() { } } + if ocrProvider == "ios_ocr" { + if iosOCRServerURL == "" { + log.Fatal("Please set the IOS_OCR_SERVER_URL environment variable for iOS-OCR-Server provider") + } + } + if llmModel == "" { log.Fatal("Please set the LLM_MODEL environment variable.") } diff --git a/ocr/ios_ocr_provider.go b/ocr/ios_ocr_provider.go new file mode 100644 index 00000000..e4199f48 --- /dev/null +++ b/ocr/ios_ocr_provider.go @@ -0,0 +1,173 @@ +package ocr + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "mime/multipart" + "net/http" + "time" + + "github.com/hashicorp/go-retryablehttp" + "github.com/sirupsen/logrus" +) + +// IOSOCRProvider implements OCR using iOS-OCR-Server +type IOSOCRProvider struct { + baseURL string + httpClient *retryablehttp.Client +} + +// newIOSOCRProvider creates a new iOS-OCR-Server provider +func newIOSOCRProvider(config Config) (*IOSOCRProvider, error) { + logger := log.WithFields(logrus.Fields{ + "url": config.IOSOCRServerURL, + }) + logger.Info("Creating new iOS-OCR-Server provider") + + if config.IOSOCRServerURL == "" { + logger.Error("Missing required iOS-OCR-Server URL") + return nil, fmt.Errorf("missing required iOS-OCR-Server URL") + } + + client := retryablehttp.NewClient() + client.RetryMax = 3 + client.RetryWaitMin = 1 * time.Second + client.RetryWaitMax = 10 * time.Second + client.Logger = logger // Use the logger from the ocr package + + provider := &IOSOCRProvider{ + baseURL: config.IOSOCRServerURL, + httpClient: client, + } + + logger.Info("Successfully initialized iOS-OCR-Server provider") + return provider, nil +} + +// ProcessImage sends the image content to the iOS-OCR-Server for OCR +func (p *IOSOCRProvider) ProcessImage(ctx context.Context, imageContent []byte, pageNumber int) (*OCRResult, error) { + logger := log.WithFields(logrus.Fields{ + "provider": "ios_ocr", + "url": p.baseURL, + "page_number": pageNumber, + "data_size": len(imageContent), + }) + logger.Debug("Starting iOS-OCR-Server processing") + + // Prepare multipart request body + var requestBody bytes.Buffer + writer := multipart.NewWriter(&requestBody) + + // Add image file part + part, err := writer.CreateFormFile("file", "document.png") + if err != nil { + logger.WithError(err).Error("Failed to create form file") + return nil, fmt.Errorf("failed to create form file: %w", err) + } + _, err = io.Copy(part, bytes.NewReader(imageContent)) + if err != nil { + logger.WithError(err).Error("Failed to copy image content to form") + return nil, fmt.Errorf("failed to copy image content: %w", err) + } + + // Close the multipart writer + err = writer.Close() + if err != nil { + logger.WithError(err).Error("Failed to close multipart writer") + return nil, fmt.Errorf("failed to close multipart writer: %w", err) + } + + // Create HTTP request + endpoint := p.baseURL + "/ocr" + req, err := retryablehttp.NewRequestWithContext(ctx, "POST", endpoint, &requestBody) + if err != nil { + logger.WithError(err).Error("Failed to create HTTP request") + return nil, fmt.Errorf("failed to create request: %w", err) + } + + // Set headers + req.Header.Set("Content-Type", writer.FormDataContentType()) + + logger.Debug("Sending request to iOS-OCR-Server") + + // Send request + resp, err := p.httpClient.Do(req) + if err != nil { + logger.WithError(err).Error("Failed to send request to iOS-OCR-Server") + return nil, fmt.Errorf("error sending request to iOS-OCR-Server: %w", err) + } + defer resp.Body.Close() + + // Read response body + respBodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + logger.WithError(err).Error("Failed to read iOS-OCR-Server response body") + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + // Check HTTP status + if resp.StatusCode != http.StatusOK { + logger.WithFields(logrus.Fields{ + "status_code": resp.StatusCode, + "response": string(respBodyBytes), + }).Error("iOS-OCR-Server returned non-200 status") + return nil, fmt.Errorf("iOS-OCR-Server returned status %d: %s", resp.StatusCode, string(respBodyBytes)) + } + + // Parse response + var ocrResponse IOSOCRResponse + err = json.Unmarshal(respBodyBytes, &ocrResponse) + if err != nil { + logger.WithError(err).WithField("response", string(respBodyBytes)).Error("Failed to parse iOS-OCR-Server response") + return nil, fmt.Errorf("failed to parse iOS-OCR-Server response: %w", err) + } + + // Check if processing was successful + if !ocrResponse.Success { + logger.Error("iOS-OCR-Server processing failed") + return nil, fmt.Errorf("iOS-OCR-Server processing failed") + } + + logger.WithFields(logrus.Fields{ + "text_length": len(ocrResponse.OCRResult), + "num_boxes": len(ocrResponse.OCRBoxes), + "image_width": ocrResponse.ImageWidth, + "image_height": ocrResponse.ImageHeight, + }).Info("Successfully processed image with iOS-OCR-Server") + + // Create OCR result + result := &OCRResult{ + Text: ocrResponse.OCRResult, + Metadata: make(map[string]string), + } + + // Add metadata + result.Metadata["provider"] = "ios_ocr" + result.Metadata["image_width"] = fmt.Sprintf("%d", ocrResponse.ImageWidth) + result.Metadata["image_height"] = fmt.Sprintf("%d", ocrResponse.ImageHeight) + result.Metadata["num_boxes"] = fmt.Sprintf("%d", len(ocrResponse.OCRBoxes)) + + return result, nil +} + +// IOSOCRResponse represents the response from iOS-OCR-Server +type IOSOCRResponse struct { + Message string `json:"message"` + ImageWidth int `json:"image_width"` + OCRResult string `json:"ocr_result"` + OCRBoxes []IOSOCRBox `json:"ocr_boxes"` + Success bool `json:"success"` + ImageHeight int `json:"image_height"` +} + +// IOSOCRBox represents a text bounding box from iOS-OCR-Server +type IOSOCRBox struct { + Text string `json:"text"` + W float64 `json:"w"` + X float64 `json:"x"` + H float64 `json:"h"` + Y float64 `json:"y"` +} \ No newline at end of file diff --git a/ocr/ios_ocr_provider_test.go b/ocr/ios_ocr_provider_test.go new file mode 100644 index 00000000..9f2a2f82 --- /dev/null +++ b/ocr/ios_ocr_provider_test.go @@ -0,0 +1,190 @@ +package ocr + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "testing" + + "github.com/hashicorp/go-retryablehttp" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestNewIOSOCRProvider(t *testing.T) { + tests := []struct { + name string + config Config + expectError bool + }{ + { + name: "valid config", + config: Config{ + IOSOCRServerURL: "http://localhost:8080", + }, + expectError: false, + }, + { + name: "missing URL", + config: Config{ + IOSOCRServerURL: "", + }, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + provider, err := newIOSOCRProvider(tt.config) + if tt.expectError { + assert.Error(t, err) + assert.Nil(t, provider) + } else { + assert.NoError(t, err) + assert.NotNil(t, provider) + assert.Equal(t, tt.config.IOSOCRServerURL, provider.baseURL) + } + }) + } +} + +func TestIOSOCRProvider_ProcessImage(t *testing.T) { + // Mock successful response + mockResponse := IOSOCRResponse{ + Message: "File uploaded successfully", + ImageWidth: 446, + OCRResult: "Test OCR Result\nMultiple lines of text", + Success: true, + ImageHeight: 408, + OCRBoxes: []IOSOCRBox{ + { + Text: "Test OCR Result", + W: 200, + X: 10, + H: 30, + Y: 5, + }, + { + Text: "Multiple lines of text", + W: 250, + X: 10, + H: 30, + Y: 40, + }, + }, + } + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "POST", r.Method) + assert.Equal(t, "/ocr", r.URL.Path) + assert.Contains(t, r.Header.Get("Content-Type"), "multipart/form-data") + + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(mockResponse) + })) + defer server.Close() + + provider := &IOSOCRProvider{ + baseURL: server.URL, + httpClient: retryablehttp.NewClient(), + } + + // Test data + testImageData := []byte("fake image data") + + result, err := provider.ProcessImage(context.Background(), testImageData, 1) + + require.NoError(t, err) + assert.Equal(t, "Test OCR Result\nMultiple lines of text", result.Text) + assert.NotNil(t, result.Metadata) + assert.Equal(t, "ios_ocr", result.Metadata["provider"]) + assert.Equal(t, "446", result.Metadata["image_width"]) + assert.Equal(t, "408", result.Metadata["image_height"]) + assert.Equal(t, "2", result.Metadata["num_boxes"]) +} + +func TestIOSOCRProvider_ProcessImage_ErrorHandling(t *testing.T) { + tests := []struct { + name string + serverResponse func(w http.ResponseWriter, r *http.Request) + expectError bool + errorContains string + }{ + { + name: "server error", + serverResponse: func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + fmt.Fprintln(w, "Internal server error") + }, + expectError: true, + errorContains: "error sending request to iOS-OCR-Server", + }, + { + name: "invalid JSON response", + serverResponse: func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + fmt.Fprintln(w, "invalid json") + }, + expectError: true, + errorContains: "failed to parse iOS-OCR-Server response", + }, + { + name: "processing failed", + serverResponse: func(w http.ResponseWriter, r *http.Request) { + response := IOSOCRResponse{ + Success: false, + Message: "Processing failed", + } + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(response) + }, + expectError: true, + errorContains: "iOS-OCR-Server processing failed", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(tt.serverResponse)) + defer server.Close() + + client := retryablehttp.NewClient() + client.RetryMax = 0 // Disable retries for error testing + + provider := &IOSOCRProvider{ + baseURL: server.URL, + httpClient: client, + } + + testImageData := []byte("fake image data") + result, err := provider.ProcessImage(context.Background(), testImageData, 1) + + if tt.expectError { + assert.Error(t, err) + assert.Nil(t, result) + assert.Contains(t, err.Error(), tt.errorContains) + } else { + assert.NoError(t, err) + assert.NotNil(t, result) + } + }) + } +} + +func TestIOSOCRProvider_Integration(t *testing.T) { + // Test the full flow with NewProvider + config := Config{ + Provider: "ios_ocr", + IOSOCRServerURL: "http://localhost:8080", + } + + provider, err := NewProvider(config) + require.NoError(t, err) + + // Verify it's the correct type + iosProvider, ok := provider.(*IOSOCRProvider) + require.True(t, ok) + assert.Equal(t, "http://localhost:8080", iosProvider.baseURL) +} \ No newline at end of file diff --git a/ocr/provider.go b/ocr/provider.go index acf330ad..8e464a97 100644 --- a/ocr/provider.go +++ b/ocr/provider.go @@ -57,6 +57,9 @@ type Config struct { DoclingURL string DoclingImageExportMode string + // iOS-OCR-Server settings + IOSOCRServerURL string + // OCR output options EnableHOCR bool // Whether to generate hOCR data if supported by the provider HOCROutputPath string // Where to save hOCR output files @@ -109,6 +112,13 @@ func NewProvider(config Config) (Provider, error) { }).Info("Using Mistral OCR provider") return newMistralOCRProvider(config) + case "ios_ocr": + if config.IOSOCRServerURL == "" { + return nil, fmt.Errorf("missing required iOS-OCR-Server configuration (IOS_OCR_SERVER_URL)") + } + log.WithField("url", config.IOSOCRServerURL).Info("Using iOS-OCR-Server provider") + return newIOSOCRProvider(config) + default: return nil, fmt.Errorf("unsupported OCR provider: %s", config.Provider) } diff --git a/web-app/package-lock.json b/web-app/package-lock.json index b88a0da9..8931aca9 100644 --- a/web-app/package-lock.json +++ b/web-app/package-lock.json @@ -1780,7 +1780,6 @@ "integrity": "sha512-ukd93VGzaNPMAUPy0gRDSC57UuQbnH9Kussp7HBjM06YFi9uZTFhOvMSO2OKqXm1rSgzOE+pVx1k1PYHGwlc8Q==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "csstype": "^3.0.2" } @@ -1883,7 +1882,6 @@ "integrity": "sha512-EHrrEsyhOhxYt8MTg4zTF+DJMuNBzWwgvvOYNj/zm1vnaD/IC5zCXFehZv94Piqa2cRFfXrTFxIvO95L7Qc/cw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.44.1", "@typescript-eslint/types": "8.44.1", @@ -2132,7 +2130,6 @@ "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "dev": true, "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -2573,7 +2570,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001688", "electron-to-chromium": "^1.5.73", @@ -3362,7 +3358,6 @@ "integrity": "sha512-hB4FIzXovouYzwzECDcUkJ4OcfOEkXTv2zRY6B9bkwjx/cprAq0uvm1nl7zvQ0/TsUk0zQiN4uPfJpB9m+rPMQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -4853,7 +4848,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -5132,7 +5126,6 @@ "resolved": "https://registry.npmjs.org/react/-/react-19.1.1.tgz", "integrity": "sha512-w8nqGImo45dmMIfljjMwOGtbmC/mk4CMYhWIicdSflH91J9TyCyczcPFXJzrZ/ZXcgGRFeP6BU0BEJTw6tZdfQ==", "license": "MIT", - "peer": true, "engines": { "node": ">=0.10.0" } @@ -5142,7 +5135,6 @@ "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.1.tgz", "integrity": "sha512-Dlq/5LAZgF0Gaz6yiqZCf6VCcZs1ghAJyrsu84Q/GT0gV+mCxbfmKNoGRKBYMJ8IEdGPqu49YWXD02GCknEDkw==", "license": "MIT", - "peer": true, "dependencies": { "scheduler": "^0.26.0" }, @@ -5920,7 +5912,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" }, @@ -6008,7 +5999,6 @@ "integrity": "sha512-CWBzXQrc/qOkhidw1OzBTQuYRbfyxDXJMVJ1XNwUHGROVmuaeiEm3OslpZ1RV96d7SKKjZKrSJu3+t/xlw3R9A==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -6135,7 +6125,6 @@ "integrity": "sha512-VbA8ScMvAISJNJVbRDTJdCwqQoAareR/wutevKanhR2/1EkoXVZVkkORaYm/tNVCjP/UDTKtcw3bAkwOUdedmA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "esbuild": "^0.25.0", "fdir": "^6.5.0", @@ -6229,7 +6218,6 @@ "integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==", "dev": true, "license": "MIT", - "peer": true, "engines": { "node": ">=12" },