icereed · Copilot · Sep 27, 2025 · Sep 27, 2025 · Sep 27, 2025
diff --git a/README.md b/README.md
@@ -218,6 +218,10 @@ services:
       # OCR_PROVIDER: 'docling'              # Use a Docling server
       # DOCLING_URL: 'http://your-docling-server:port' # URL of your Docling instance
 
+      # Option 5: iOS-OCR-Server
+      # OCR_PROVIDER: 'ios_ocr'              # Use iOS-OCR-Server
+      # IOS_OCR_SERVER_URL: 'http://your-ios-device-ip:8080' # URL of your iOS device running iOS-OCR-Server
+
       AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default: paperless-gpt-ocr-auto
       OCR_LIMIT_PAGES: "5" # Optional, default: 5. Set to 0 for no limit.
       LOG_LEVEL: "info" # Optional: debug, warn, error
@@ -363,6 +367,24 @@ paperless-gpt supports four different OCR providers, each with unique strengths
   DOCLING_URL: "http://your-docling-server:port"
   ```
 
+### 5. iOS-OCR-Server
+
+- **Key Features**:
+  - Turn your iPhone or iPad into a local OCR server powered by Apple's Vision Framework
+  - No cloud dependencies, unlimited usage, complete privacy
+  - High-quality OCR using Apple's native Vision Framework
+  - Returns both text and bounding box information
+- **Best For**:
+  - Users with an iPhone/iPad who want high-quality OCR
+  - Privacy-focused environments requiring local processing
+  - Cost-effective solution for unlimited OCR processing
+- **Configuration**:
+  ```yaml
+  OCR_PROVIDER: "ios_ocr"
+  IOS_OCR_SERVER_URL: "http://your-ios-device-ip:8080"
+  ```
+- **Setup**: Follow the instructions at [iOS-OCR-Server](https://github.com/riddleling/iOS-OCR-Server) to set up the server on your iOS device.
+
 ## OCR Processing Modes
 
 paperless-gpt offers different methods for processing documents, giving you flexibility based on your needs and OCR provider capabilities:
@@ -555,6 +577,7 @@ For best results with the enhanced OCR features:
 | `GOOGLE_APPLICATION_CREDENTIALS`    | Path to the mounted Google service account key. Required if OCR_PROVIDER is `google_docai`.                                                                                                   | Cond.    |                            |
 | `DOCLING_URL`                       | URL of the Docling server instance. Required if OCR_PROVIDER is `docling`.                                                                                                                    | Cond.    |                            |
 | `DOCLING_IMAGE_EXPORT_MODE`         | Mode for image export. Optional; defaults to `embedded` if unset.                                                                                                                             | No       | embedded                   |
+| `IOS_OCR_SERVER_URL`                | URL of the iOS-OCR-Server instance. Required if OCR_PROVIDER is `ios_ocr`.                                                                                                                     | Cond.    |                            |
 | `CREATE_LOCAL_HOCR`                 | Whether to save hOCR files locally.                                                                                                                                                           | No       | false                      |
 | `LOCAL_HOCR_PATH`                   | Path where hOCR files will be saved when hOCR generation is enabled.                                                                                                                          | No       | /app/hocr                  |
 | `CREATE_LOCAL_PDF`                  | Whether to save enhanced PDFs locally.                                                                                                                                                        | No       | false                      |

diff --git a/main.go b/main.go
@@ -74,6 +74,7 @@ var (
 	pdfSkipExistingOCR            = os.Getenv("PDF_SKIP_EXISTING_OCR") == "true"
 	doclingURL                    = os.Getenv("DOCLING_URL")
 	doclingImageExportMode        = os.Getenv("DOCLING_IMAGE_EXPORT_MODE")
+	iosOCRServerURL               = os.Getenv("IOS_OCR_SERVER_URL")
 
 	// Templates
 	titleTemplate         *template.Template
@@ -220,6 +221,7 @@ func main() {
 		MistralModel:             os.Getenv("MISTRAL_MODEL"),
 		DoclingURL:               doclingURL,
 		DoclingImageExportMode:   doclingImageExportMode,
+		IOSOCRServerURL:          iosOCRServerURL,
 		EnableHOCR:               true, // Always generate hOCR struct if provider supports it
 	}
 
@@ -455,6 +457,7 @@ func validateOCRProviderModeCompatibility(provider, mode string) error {
 		"google_docai": {"image", "pdf", "whole_pdf"}, // Google Document AI supports all modes
 		"mistral_ocr":  {"image", "pdf", "whole_pdf"}, // Mistral OCR supports all modes
 		"docling":      {"image"},                     // Docling only supports image mode
+		"ios_ocr":      {"image"},                     // iOS-OCR-Server only supports image mode
 	}
 
 	modes, exists := supportedModes[provider]
@@ -542,6 +545,12 @@ func validateOrDefaultEnvVars() {
 		}
 	}
 
+	if ocrProvider == "ios_ocr" {
+		if iosOCRServerURL == "" {
+			log.Fatal("Please set the IOS_OCR_SERVER_URL environment variable for iOS-OCR-Server provider")
+		}
+	}
+
 	if llmModel == "" {
 		log.Fatal("Please set the LLM_MODEL environment variable.")
 	}

diff --git a/ocr/ios_ocr_provider.go b/ocr/ios_ocr_provider.go
@@ -0,0 +1,173 @@
+package ocr
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"time"
+
+	"github.com/hashicorp/go-retryablehttp"
+	"github.com/sirupsen/logrus"
+)
+
+// IOSOCRProvider implements OCR using iOS-OCR-Server.
+//
+// Instances are created by newIOSOCRProvider, which validates the configured
+// server URL and sets up the retrying HTTP client.
+type IOSOCRProvider struct {
+	// baseURL is the iOS-OCR-Server URL taken from Config.IOSOCRServerURL;
+	// ProcessImage derives the OCR endpoint from it.
+	baseURL    string
+	// httpClient is the retrying HTTP client used for every OCR request.
+	httpClient *retryablehttp.Client
+}
+
+// newIOSOCRProvider builds an IOSOCRProvider from config.
+//
+// It returns an error when config.IOSOCRServerURL is empty. Otherwise it
+// prepares a retryablehttp client (RetryMax 3, wait between 1s and 10s) that
+// logs through the same logrus entry used here, and returns the provider.
+func newIOSOCRProvider(config Config) (*IOSOCRProvider, error) {
+	serverURL := config.IOSOCRServerURL
+	entry := log.WithFields(logrus.Fields{"url": serverURL})
+	entry.Info("Creating new iOS-OCR-Server provider")
+
+	// Without a server URL there is nothing to talk to.
+	if serverURL == "" {
+		entry.Error("Missing required iOS-OCR-Server URL")
+		return nil, fmt.Errorf("missing required iOS-OCR-Server URL")
+	}
+
+	// Retrying client; its log output is routed through the entry above.
+	retryClient := retryablehttp.NewClient()
+	retryClient.Logger = entry
+	retryClient.RetryMax = 3
+	retryClient.RetryWaitMin = time.Second
+	retryClient.RetryWaitMax = 10 * time.Second
+
+	entry.Info("Successfully initialized iOS-OCR-Server provider")
+	return &IOSOCRProvider{
+		baseURL:    serverURL,
+		httpClient: retryClient,
+	}, nil
+}
+
+// ProcessImage uploads imageContent to the iOS-OCR-Server as a multipart form
+// (field "file") and converts the JSON reply into an OCRResult.
+//
+// ctx is attached to the HTTP request. pageNumber is only used for logging.
+// On success the result holds the recognized text plus "provider",
+// "image_width", "image_height" and "num_boxes" metadata; the bounding boxes
+// themselves are not copied into the result.
+//
+// An error is returned when the request cannot be built or sent, the response
+// body cannot be read, the server answers with a non-200 status, the body is
+// not valid JSON, or the server reports success=false (the server-provided
+// message, if any, is included in the error).
+func (p *IOSOCRProvider) ProcessImage(ctx context.Context, imageContent []byte, pageNumber int) (*OCRResult, error) {
+	logger := log.WithFields(logrus.Fields{
+		"provider":    "ios_ocr",
+		"url":         p.baseURL,
+		"page_number": pageNumber,
+		"data_size":   len(imageContent),
+	})
+	logger.Debug("Starting iOS-OCR-Server processing")
+
+	// Prepare multipart request body
+	var requestBody bytes.Buffer
+	writer := multipart.NewWriter(&requestBody)
+
+	// Add image file part
+	part, err := writer.CreateFormFile("file", "document.png")
+	if err != nil {
+		logger.WithError(err).Error("Failed to create form file")
+		return nil, fmt.Errorf("failed to create form file: %w", err)
+	}
+	if _, err = part.Write(imageContent); err != nil {
+		logger.WithError(err).Error("Failed to copy image content to form")
+		return nil, fmt.Errorf("failed to copy image content: %w", err)
+	}
+
+	// Closing the writer appends the terminating multipart boundary.
+	if err = writer.Close(); err != nil {
+		logger.WithError(err).Error("Failed to close multipart writer")
+		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
+	}
+
+	// Tolerate a base URL configured with trailing slashes
+	// ("http://host:8080/"), which would otherwise yield ".../​/ocr".
+	// NOTE(review): confirm the "/ocr" path against the iOS-OCR-Server README.
+	baseURL := p.baseURL
+	for len(baseURL) > 0 && baseURL[len(baseURL)-1] == '/' {
+		baseURL = baseURL[:len(baseURL)-1]
+	}
+	endpoint := baseURL + "/ocr"
+
+	// Create HTTP request
+	req, err := retryablehttp.NewRequestWithContext(ctx, http.MethodPost, endpoint, &requestBody)
+	if err != nil {
+		logger.WithError(err).Error("Failed to create HTTP request")
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+
+	// Set headers. The response is parsed as JSON below, so ask for it explicitly.
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("Accept", "application/json")
+
+	logger.Debug("Sending request to iOS-OCR-Server")
+
+	// Send request
+	resp, err := p.httpClient.Do(req)
+	if err != nil {
+		logger.WithError(err).Error("Failed to send request to iOS-OCR-Server")
+		return nil, fmt.Errorf("error sending request to iOS-OCR-Server: %w", err)
+	}
+	defer resp.Body.Close()
+
+	// Read the whole body up front: it is needed both for error reporting on
+	// non-200 responses and for JSON decoding on success.
+	respBodyBytes, err := io.ReadAll(resp.Body)
+	if err != nil {
+		logger.WithError(err).Error("Failed to read iOS-OCR-Server response body")
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	// Check HTTP status
+	if resp.StatusCode != http.StatusOK {
+		logger.WithFields(logrus.Fields{
+			"status_code": resp.StatusCode,
+			"response":    string(respBodyBytes),
+		}).Error("iOS-OCR-Server returned non-200 status")
+		return nil, fmt.Errorf("iOS-OCR-Server returned status %d: %s", resp.StatusCode, string(respBodyBytes))
+	}
+
+	// Parse response
+	var ocrResponse IOSOCRResponse
+	if err = json.Unmarshal(respBodyBytes, &ocrResponse); err != nil {
+		logger.WithError(err).WithField("response", string(respBodyBytes)).Error("Failed to parse iOS-OCR-Server response")
+		return nil, fmt.Errorf("failed to parse iOS-OCR-Server response: %w", err)
+	}
+
+	// A 200 response can still report a failed OCR run; surface the server's
+	// own message so the cause is visible to the caller.
+	if !ocrResponse.Success {
+		logger.WithField("message", ocrResponse.Message).Error("iOS-OCR-Server processing failed")
+		if ocrResponse.Message != "" {
+			return nil, fmt.Errorf("iOS-OCR-Server processing failed: %s", ocrResponse.Message)
+		}
+		return nil, fmt.Errorf("iOS-OCR-Server processing failed")
+	}
+
+	logger.WithFields(logrus.Fields{
+		"text_length":  len(ocrResponse.OCRResult),
+		"num_boxes":    len(ocrResponse.OCRBoxes),
+		"image_width":  ocrResponse.ImageWidth,
+		"image_height": ocrResponse.ImageHeight,
+	}).Info("Successfully processed image with iOS-OCR-Server")
+
+	// Create OCR result with its metadata
+	result := &OCRResult{
+		Text: ocrResponse.OCRResult,
+		Metadata: map[string]string{
+			"provider":     "ios_ocr",
+			"image_width":  fmt.Sprintf("%d", ocrResponse.ImageWidth),
+			"image_height": fmt.Sprintf("%d", ocrResponse.ImageHeight),
+			"num_boxes":    fmt.Sprintf("%d", len(ocrResponse.OCRBoxes)),
+		},
+	}
+
+	return result, nil
+}
+
+// IOSOCRResponse represents the JSON response from iOS-OCR-Server as decoded
+// by ProcessImage.
+type IOSOCRResponse struct {
+	// Message is decoded from "message"; presumably a human-readable status
+	// text from the server — confirm against the server documentation.
+	Message     string      `json:"message"`
+	// ImageWidth is decoded from "image_width" and reported as result metadata.
+	ImageWidth  int         `json:"image_width"`
+	// OCRResult is the recognized text; it becomes OCRResult.Text.
+	OCRResult   string      `json:"ocr_result"`
+	// OCRBoxes holds the per-text bounding boxes from "ocr_boxes".
+	OCRBoxes    []IOSOCRBox `json:"ocr_boxes"`
+	// Success must be true; ProcessImage treats false as a processing failure.
+	Success     bool        `json:"success"`
+	// ImageHeight is decoded from "image_height" and reported as result metadata.
+	ImageHeight int         `json:"image_height"`
+}
+
+// IOSOCRBox represents a text bounding box from iOS-OCR-Server.
+//
+// NOTE(review): the units and origin of X/Y/W/H are not visible here
+// (presumably pixels relative to the submitted image) — confirm against the
+// server documentation before using them for layout.
+type IOSOCRBox struct {
+	// Text is the recognized text for this box ("text").
+	Text string  `json:"text"`
+	// W is decoded from "w"; presumably the box width.
+	W    float64 `json:"w"`
+	// X is decoded from "x"; presumably the horizontal position of the box.
+	X    float64 `json:"x"`
+	// H is decoded from "h"; presumably the box height.
+	H    float64 `json:"h"`
+	// Y is decoded from "y"; presumably the vertical position of the box.
+	Y    float64 `json:"y"`
+}