Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,10 @@ services:
# OCR_PROVIDER: 'docling' # Use a Docling server
# DOCLING_URL: 'http://your-docling-server:port' # URL of your Docling instance

# Option 5: iOS-OCR-Server
# OCR_PROVIDER: 'ios_ocr' # Use iOS-OCR-Server
# IOS_OCR_SERVER_URL: 'http://your-ios-device-ip:8080' # URL of your iOS device running iOS-OCR-Server

AUTO_OCR_TAG: "paperless-gpt-ocr-auto" # Optional, default: paperless-gpt-ocr-auto
OCR_LIMIT_PAGES: "5" # Optional, default: 5. Set to 0 for no limit.
LOG_LEVEL: "info" # Optional: debug, warn, error
Expand Down Expand Up @@ -363,6 +367,24 @@ paperless-gpt supports four different OCR providers, each with unique strengths
DOCLING_URL: "http://your-docling-server:port"
```

### 5. iOS-OCR-Server

- **Key Features**:
- Turn your iPhone or iPad into a powerful local OCR server using Apple's Vision Framework
- No cloud dependencies, unlimited usage, complete privacy
- High-quality OCR using Apple's native Vision Framework
- Returns both text and bounding box information
- **Best For**:
- Users with an iPhone/iPad who want high-quality OCR
- Privacy-focused environments requiring local processing
- Cost-effective solution for unlimited OCR processing
- **Configuration**:
```yaml
OCR_PROVIDER: "ios_ocr"
IOS_OCR_SERVER_URL: "http://your-ios-device-ip:8080"
```
- **Setup**: Follow the instructions at [iOS-OCR-Server](https://github.com/riddleling/iOS-OCR-Server) to set up the server on your iOS device. Note that this provider only supports the `image` OCR processing mode.

## OCR Processing Modes

paperless-gpt offers different methods for processing documents, giving you flexibility based on your needs and OCR provider capabilities:
Expand Down Expand Up @@ -555,6 +577,7 @@ For best results with the enhanced OCR features:
| `GOOGLE_APPLICATION_CREDENTIALS` | Path to the mounted Google service account key. Required if OCR_PROVIDER is `google_docai`. | Cond. | |
| `DOCLING_URL` | URL of the Docling server instance. Required if OCR_PROVIDER is `docling`. | Cond. | |
| `DOCLING_IMAGE_EXPORT_MODE` | Mode for image export. Optional; defaults to `embedded` if unset. | No | embedded |
| `IOS_OCR_SERVER_URL` | URL of the iOS-OCR-Server instance. Required if OCR_PROVIDER is `ios_ocr`. | Cond. | |
| `CREATE_LOCAL_HOCR` | Whether to save hOCR files locally. | No | false |
| `LOCAL_HOCR_PATH` | Path where hOCR files will be saved when hOCR generation is enabled. | No | /app/hocr |
| `CREATE_LOCAL_PDF` | Whether to save enhanced PDFs locally. | No | false |
Expand Down
9 changes: 9 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ var (
pdfSkipExistingOCR = os.Getenv("PDF_SKIP_EXISTING_OCR") == "true"
doclingURL = os.Getenv("DOCLING_URL")
doclingImageExportMode = os.Getenv("DOCLING_IMAGE_EXPORT_MODE")
iosOCRServerURL = os.Getenv("IOS_OCR_SERVER_URL")

// Templates
titleTemplate *template.Template
Expand Down Expand Up @@ -220,6 +221,7 @@ func main() {
MistralModel: os.Getenv("MISTRAL_MODEL"),
DoclingURL: doclingURL,
DoclingImageExportMode: doclingImageExportMode,
IOSOCRServerURL: iosOCRServerURL,
EnableHOCR: true, // Always generate hOCR struct if provider supports it
}

Expand Down Expand Up @@ -455,6 +457,7 @@ func validateOCRProviderModeCompatibility(provider, mode string) error {
"google_docai": {"image", "pdf", "whole_pdf"}, // Google Document AI supports all modes
"mistral_ocr": {"image", "pdf", "whole_pdf"}, // Mistral OCR supports all modes
"docling": {"image"}, // Docling only supports image mode
"ios_ocr": {"image"}, // iOS-OCR-Server only supports image mode
}

modes, exists := supportedModes[provider]
Expand Down Expand Up @@ -542,6 +545,12 @@ func validateOrDefaultEnvVars() {
}
}

if ocrProvider == "ios_ocr" {
if iosOCRServerURL == "" {
log.Fatal("Please set the IOS_OCR_SERVER_URL environment variable for iOS-OCR-Server provider")
}
}

if llmModel == "" {
log.Fatal("Please set the LLM_MODEL environment variable.")
}
Expand Down
173 changes: 173 additions & 0 deletions ocr/ios_ocr_provider.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package ocr

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"mime/multipart"
	"net/http"
	"strings"
	"time"

	"github.com/hashicorp/go-retryablehttp"
	"github.com/sirupsen/logrus"
)

// IOSOCRProvider implements OCR using iOS-OCR-Server.
// Images are sent to the server over HTTP by ProcessImage.
type IOSOCRProvider struct {
	// baseURL is the server address taken from Config.IOSOCRServerURL;
	// ProcessImage appends the request path to it.
	baseURL string
	// httpClient is the retrying HTTP client used for every request to the server.
	httpClient *retryablehttp.Client
}

// newIOSOCRProvider creates a new iOS-OCR-Server provider from config.
//
// config.IOSOCRServerURL is required. Trailing slashes are removed from it
// before it is stored, because ProcessImage builds the endpoint by appending
// "/ocr" to the base URL and "http://host:8080/" would otherwise turn into
// "http://host:8080//ocr".
//
// The returned provider uses a retrying HTTP client (up to 3 retries, waiting
// between 1s and 10s) that logs through the ocr package logger.
//
// An error is returned when the URL is empty, or empty once the trailing
// slashes have been removed.
func newIOSOCRProvider(config Config) (*IOSOCRProvider, error) {
	logger := log.WithFields(logrus.Fields{
		"url": config.IOSOCRServerURL,
	})
	logger.Info("Creating new iOS-OCR-Server provider")

	// Normalize once here so every request path can be appended verbatim.
	baseURL := strings.TrimRight(config.IOSOCRServerURL, "/")
	if baseURL == "" {
		logger.Error("Missing required iOS-OCR-Server URL")
		return nil, fmt.Errorf("missing required iOS-OCR-Server URL")
	}

	client := retryablehttp.NewClient()
	client.RetryMax = 3
	client.RetryWaitMin = 1 * time.Second
	client.RetryWaitMax = 10 * time.Second
	client.Logger = logger // Use the logger from the ocr package

	provider := &IOSOCRProvider{
		baseURL:    baseURL,
		httpClient: client,
	}

	logger.Info("Successfully initialized iOS-OCR-Server provider")
	return provider, nil
}

// ProcessImage sends the image content to the iOS-OCR-Server for OCR and
// returns the recognized text.
//
// The image is uploaded as the "file" part (file name "document.png") of a
// multipart/form-data POST to <baseURL>/ocr. pageNumber is used for logging
// only. ctx is attached to the HTTP request.
//
// On success the result carries the recognized text plus metadata entries
// "provider", "image_width", "image_height" and "num_boxes". The bounding
// boxes themselves are only counted, not returned.
//
// An error is returned when the request cannot be built or sent, when the
// server responds with a status other than 200, when the response body cannot
// be decoded as IOSOCRResponse, or when the server reports success=false. In
// the last case the server-supplied message, if any, is included in the error.
func (p *IOSOCRProvider) ProcessImage(ctx context.Context, imageContent []byte, pageNumber int) (*OCRResult, error) {
	logger := log.WithFields(logrus.Fields{
		"provider":    "ios_ocr",
		"url":         p.baseURL,
		"page_number": pageNumber,
		"data_size":   len(imageContent),
	})
	logger.Debug("Starting iOS-OCR-Server processing")

	// Prepare multipart request body
	var requestBody bytes.Buffer
	writer := multipart.NewWriter(&requestBody)

	// Add image file part
	part, err := writer.CreateFormFile("file", "document.png")
	if err != nil {
		logger.WithError(err).Error("Failed to create form file")
		return nil, fmt.Errorf("failed to create form file: %w", err)
	}
	if _, err := part.Write(imageContent); err != nil {
		logger.WithError(err).Error("Failed to copy image content to form")
		return nil, fmt.Errorf("failed to copy image content: %w", err)
	}

	// Close writes the terminating boundary; the body is incomplete without it.
	if err := writer.Close(); err != nil {
		logger.WithError(err).Error("Failed to close multipart writer")
		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
	}

	// Create HTTP request
	// NOTE(review): confirm the "/ocr" path against the iOS-OCR-Server API documentation.
	endpoint := p.baseURL + "/ocr"
	req, err := retryablehttp.NewRequestWithContext(ctx, "POST", endpoint, &requestBody)
	if err != nil {
		logger.WithError(err).Error("Failed to create HTTP request")
		return nil, fmt.Errorf("failed to create request: %w", err)
	}

	// The content type carries the multipart boundary generated by the writer.
	req.Header.Set("Content-Type", writer.FormDataContentType())

	logger.Debug("Sending request to iOS-OCR-Server")

	// Send request
	resp, err := p.httpClient.Do(req)
	if err != nil {
		logger.WithError(err).Error("Failed to send request to iOS-OCR-Server")
		return nil, fmt.Errorf("error sending request to iOS-OCR-Server: %w", err)
	}
	defer resp.Body.Close()

	// Read the body before checking the status so it can be reported on failure.
	respBodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		logger.WithError(err).Error("Failed to read iOS-OCR-Server response body")
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}

	// Check HTTP status
	if resp.StatusCode != http.StatusOK {
		logger.WithFields(logrus.Fields{
			"status_code": resp.StatusCode,
			"response":    string(respBodyBytes),
		}).Error("iOS-OCR-Server returned non-200 status")
		return nil, fmt.Errorf("iOS-OCR-Server returned status %d: %s", resp.StatusCode, string(respBodyBytes))
	}

	// Parse response
	var ocrResponse IOSOCRResponse
	if err := json.Unmarshal(respBodyBytes, &ocrResponse); err != nil {
		logger.WithError(err).WithField("response", string(respBodyBytes)).Error("Failed to parse iOS-OCR-Server response")
		return nil, fmt.Errorf("failed to parse iOS-OCR-Server response: %w", err)
	}

	// A 200 response can still report a failure; surface the server's message
	// so the caller can see why processing failed.
	if !ocrResponse.Success {
		logger.WithField("message", ocrResponse.Message).Error("iOS-OCR-Server processing failed")
		if ocrResponse.Message != "" {
			return nil, fmt.Errorf("iOS-OCR-Server processing failed: %s", ocrResponse.Message)
		}
		return nil, fmt.Errorf("iOS-OCR-Server processing failed")
	}

	logger.WithFields(logrus.Fields{
		"text_length":  len(ocrResponse.OCRResult),
		"num_boxes":    len(ocrResponse.OCRBoxes),
		"image_width":  ocrResponse.ImageWidth,
		"image_height": ocrResponse.ImageHeight,
	}).Info("Successfully processed image with iOS-OCR-Server")

	// Create OCR result with its metadata
	result := &OCRResult{
		Text: ocrResponse.OCRResult,
		Metadata: map[string]string{
			"provider":     "ios_ocr",
			"image_width":  fmt.Sprintf("%d", ocrResponse.ImageWidth),
			"image_height": fmt.Sprintf("%d", ocrResponse.ImageHeight),
			"num_boxes":    fmt.Sprintf("%d", len(ocrResponse.OCRBoxes)),
		},
	}

	return result, nil
}

// IOSOCRResponse represents the JSON response from iOS-OCR-Server.
// ProcessImage decodes the response body into this type.
type IOSOCRResponse struct {
	// Message is the "message" field of the response.
	Message string `json:"message"`
	// ImageWidth is the "image_width" field of the response.
	// NOTE(review): presumably the width of the processed image in pixels — confirm against the server docs.
	ImageWidth int `json:"image_width"`
	// OCRResult is the recognized text; ProcessImage returns it as OCRResult.Text.
	OCRResult string `json:"ocr_result"`
	// OCRBoxes lists the text bounding boxes reported by the server.
	OCRBoxes []IOSOCRBox `json:"ocr_boxes"`
	// Success is false when the server reports a failure; ProcessImage then returns an error.
	Success bool `json:"success"`
	// ImageHeight is the "image_height" field of the response.
	// NOTE(review): presumably the height of the processed image in pixels — confirm against the server docs.
	ImageHeight int `json:"image_height"`
}

// IOSOCRBox represents a text bounding box from iOS-OCR-Server.
// NOTE(review): the coordinate origin and units (pixels vs. normalized) are
// not visible here — confirm against the server docs before using them.
type IOSOCRBox struct {
	// Text is the text reported for this box.
	Text string `json:"text"`
	// W is the "w" field, presumably the box width.
	W float64 `json:"w"`
	// X is the "x" field, presumably the horizontal position of the box.
	X float64 `json:"x"`
	// H is the "h" field, presumably the box height.
	H float64 `json:"h"`
	// Y is the "y" field, presumably the vertical position of the box.
	Y float64 `json:"y"`
}
Loading
Loading