diff --git a/app_llm.go b/app_llm.go index 2d3fe9ec..62b7986d 100644 --- a/app_llm.go +++ b/app_llm.go @@ -165,17 +165,34 @@ func (app *App) getSuggestedTags( } // getSuggestedTitle generates a suggested title for a document using the LLM -func (app *App) getSuggestedTitle(ctx context.Context, content string, originalTitle string, logger *logrus.Entry) (string, error) { +func (app *App) getSuggestedTitle(ctx context.Context, documentID int, content string, originalTitle string, logger *logrus.Entry) (string, error) { likelyLanguage := getLikelyLanguage() + // Fetch similar documents to help with title consistency + var similarDocumentTitles []string + similarDocs, err := app.Client.GetSimilarDocuments(ctx, documentID, 5) // Get up to 5 similar documents + if err != nil { + // Log the error but don't fail the title generation + logger.Debugf("Failed to fetch similar documents for title consistency: %v", err) + } else { + // Extract titles from similar documents + for _, doc := range similarDocs { + if doc.Title != "" && doc.Title != originalTitle { + similarDocumentTitles = append(similarDocumentTitles, doc.Title) + } + } + logger.Debugf("Found %d similar documents for title consistency", len(similarDocumentTitles)) + } + templateMutex.RLock() defer templateMutex.RUnlock() // Get available tokens for content templateData := map[string]interface{}{ - "Language": likelyLanguage, - "Content": content, - "Title": originalTitle, + "Language": likelyLanguage, + "Content": content, + "Title": originalTitle, + "SimilarDocumentTitles": similarDocumentTitles, } availableTokens, err := getAvailableTokensForContent(titleTemplate, templateData) @@ -448,7 +465,7 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque var suggestedCustomFields []CustomFieldSuggestion if suggestionRequest.GenerateTitles { - suggestedTitle, err = app.getSuggestedTitle(ctx, content, suggestedTitle, docLogger) + suggestedTitle, err = app.getSuggestedTitle(ctx, documentID, 
content, suggestedTitle, docLogger) if err != nil { mu.Lock() errorsList = append(errorsList, fmt.Errorf("Document %d: %v", documentID, err)) diff --git a/app_llm_test.go b/app_llm_test.go index 57833be1..b9b97f1c 100644 --- a/app_llm_test.go +++ b/app_llm_test.go @@ -104,8 +104,10 @@ func TestPromptTokenLimits(t *testing.T) { // Create a test app with mock LLM mockLLM := &mockLLM{} + mockClient := &mockPaperlessClient{} app := &App{ - LLM: mockLLM, + LLM: mockLLM, + Client: mockClient, } // Set up test template @@ -157,7 +159,7 @@ Content: {{.Content}} // Test with the app's LLM ctx := context.Background() - _, err = app.getSuggestedTitle(ctx, truncatedContent, "Test Title", testLogger) + _, err = app.getSuggestedTitle(ctx, 1, truncatedContent, "Test Title", testLogger) require.NoError(t, err) // Verify truncation @@ -269,8 +271,10 @@ func TestTokenLimitInTitleGeneration(t *testing.T) { // Create a test app with mock LLM mockLLM := &mockLLM{} + mockClient := &mockPaperlessClient{} app := &App{ - LLM: mockLLM, + LLM: mockLLM, + Client: mockClient, } // Test content that would exceed reasonable token limits @@ -284,7 +288,7 @@ func TestTokenLimitInTitleGeneration(t *testing.T) { // Call getSuggestedTitle ctx := context.Background() - _, err := app.getSuggestedTitle(ctx, longContent, "Original Title", testLogger) + _, err := app.getSuggestedTitle(ctx, 1, longContent, "Original Title", testLogger) require.NoError(t, err) // Verify the final prompt size @@ -360,8 +364,9 @@ func TestStripReasoning(t *testing.T) { // mockPaperlessClient is a mock implementation of the ClientInterface for testing. 
type mockPaperlessClient struct {
-	CustomFields []CustomField
-	Error        error
+	CustomFields     []CustomField
+	SimilarDocuments []Document
+	Error            error
 }
 
 func (m *mockPaperlessClient) GetCustomFields(ctx context.Context) ([]CustomField, error) {
@@ -406,6 +411,13 @@ func (m *mockPaperlessClient) GetTaskStatus(ctx context.Context, taskID string)
 	return nil, nil
 }
 func (m *mockPaperlessClient) DeleteDocument(ctx context.Context, documentID int) error { return nil }
+// GetSimilarDocuments returns m.Error when it is set, otherwise the canned m.SimilarDocuments; documentID and count are ignored.
+func (m *mockPaperlessClient) GetSimilarDocuments(ctx context.Context, documentID int, count int) ([]Document, error) {
+	if m.Error != nil {
+		return nil, m.Error
+	}
+	return m.SimilarDocuments, nil
+}
 
 func TestGetSuggestedCustomFields(t *testing.T) {
 	// 1. Setup
@@ -484,3 +496,135 @@ func findFieldByID(fields []CustomFieldSuggestion, id int) (CustomFieldSuggestio
 	}
 	return CustomFieldSuggestion{}, false
 }
+
+// similarTitlesPromptTemplate is the title prompt shared by the
+// TestGetSuggestedTitle_* tests; the similar-titles section is rendered
+// only when SimilarDocumentTitles is non-empty.
+const similarTitlesPromptTemplate = `I will provide you with the content of a document.
+Your task is to find a suitable document title.
+{{if .SimilarDocumentTitles}}I have found some similar documents with the following titles:
+{{range .SimilarDocumentTitles}}- {{.}}
+{{end}}Please try to be consistent with the naming pattern.{{end}}
+
+{{.Title}}
+{{.Content}}`
+
+// useTitleTemplate parses tmplText, installs it as the package-level
+// titleTemplate for the current test, and restores the previous template
+// when the test finishes so later tests are unaffected.
+func useTitleTemplate(t *testing.T, tmplText string) {
+	t.Helper()
+	parsed, err := template.New("title").Parse(tmplText)
+	require.NoError(t, err)
+
+	templateMutex.Lock()
+	previous := titleTemplate
+	titleTemplate = parsed
+	templateMutex.Unlock()
+
+	t.Cleanup(func() {
+		templateMutex.Lock()
+		titleTemplate = previous
+		templateMutex.Unlock()
+	})
+}
+
+func TestGetSuggestedTitle_WithSimilarDocuments(t *testing.T) {
+	testLogger := logrus.WithField("test", "test")
+
+	// Set higher token limit for this test. The reset is registered before
+	// t.Setenv so that it runs after the variable has been restored
+	// (cleanups run last-in, first-out).
+	t.Cleanup(func() { resetTokenLimit() })
+	t.Setenv("TOKEN_LIMIT", "200")
+	resetTokenLimit()
+
+	// Create a mock client that returns similar documents
+	mockClient := &mockPaperlessClient{
+		SimilarDocuments: []Document{
+			{ID: 2, Title: "Invoice January 2023 - Company ABC"},
+			{ID: 3, Title: "Invoice February 2023 - Company ABC"},
+			{ID: 4, Title: "Receipt March 2023 - Company XYZ"},
+		},
+	}
+
+	mockLLM := &mockLLM{Response: "Invoice March 2023 - Company ABC"}
+	app := &App{
+		LLM:    mockLLM,
+		Client: mockClient,
+	}
+
+	useTitleTemplate(t, similarTitlesPromptTemplate)
+
+	ctx := context.Background()
+	content := "This is an invoice from Company ABC for March 2023 services."
+	originalTitle := "document.pdf"
+
+	title, err := app.getSuggestedTitle(ctx, 1, content, originalTitle, testLogger)
+	require.NoError(t, err)
+	assert.Equal(t, "Invoice March 2023 - Company ABC", title)
+
+	// Verify that the prompt included similar document titles
+	assert.Contains(t, mockLLM.lastPrompt, "Invoice January 2023 - Company ABC")
+	assert.Contains(t, mockLLM.lastPrompt, "Invoice February 2023 - Company ABC")
+	assert.Contains(t, mockLLM.lastPrompt, "Receipt March 2023 - Company XYZ")
+	assert.Contains(t, mockLLM.lastPrompt, "Please try to be consistent with the naming pattern")
+}
+
+func TestGetSuggestedTitle_NoSimilarDocuments(t *testing.T) {
+	testLogger := logrus.WithField("test", "test")
+
+	// Create a mock client that returns no similar documents
+	mockClient := &mockPaperlessClient{
+		SimilarDocuments: []Document{},
+	}
+
+	mockLLM := &mockLLM{Response: "Contract Agreement 2023"}
+	app := &App{
+		LLM:    mockLLM,
+		Client: mockClient,
+	}
+
+	useTitleTemplate(t, similarTitlesPromptTemplate)
+
+	ctx := context.Background()
+	content := "This is a contract agreement document."
+	originalTitle := "document.pdf"
+
+	title, err := app.getSuggestedTitle(ctx, 1, content, originalTitle, testLogger)
+	require.NoError(t, err)
+	assert.Equal(t, "Contract Agreement 2023", title)
+
+	// Verify that the prompt did not include the similar documents section
+	assert.NotContains(t, mockLLM.lastPrompt, "I have found some similar documents")
+	assert.NotContains(t, mockLLM.lastPrompt, "Please try to be consistent with the naming pattern")
+}
+
+func TestGetSuggestedTitle_SimilarDocumentsError(t *testing.T) {
+	testLogger := logrus.WithField("test", "test")
+
+	// Create a mock client that returns an error for similar documents
+	mockClient := &mockPaperlessClient{
+		Error: fmt.Errorf("API error"),
+	}
+
+	mockLLM := &mockLLM{Response: "Generated Title"}
+	app := &App{
+		LLM:    mockLLM,
+		Client: mockClient,
+	}
+
+	useTitleTemplate(t, similarTitlesPromptTemplate)
+
+	ctx := context.Background()
+	content := "This is a document."
+	originalTitle := "document.pdf"
+
+	// Should still work even if similar documents lookup fails
+	title, err := app.getSuggestedTitle(ctx, 1, content, originalTitle, testLogger)
+	require.NoError(t, err)
+	assert.Equal(t, "Generated Title", title)
+
+	// Verify that the prompt did not include the similar documents section
+	assert.NotContains(t, mockLLM.lastPrompt, "I have found some similar documents")
+}
diff --git a/default_prompts/title_prompt.tmpl b/default_prompts/title_prompt.tmpl
index df6c7f16..d128ccee 100644
--- a/default_prompts/title_prompt.tmpl
+++ b/default_prompts/title_prompt.tmpl
@@ -1,6 +1,9 @@
 I will provide you with the content of a document that has been partially read by OCR (so it may contain errors).
 Your task is to find a suitable document title that I can use as the title in the paperless-ngx program.
 If the original title is already adding value and not just a technical filename you can use it as extra information to enhance your suggestion.
+{{if .SimilarDocumentTitles}}I have found some similar documents in the database with the following titles that might help you maintain consistency:
+{{range .SimilarDocumentTitles}}- {{.}}
+{{end}}Please try to be consistent with the naming pattern of these similar documents if they provide informative titles.{{end}}
 Respond only with the title, without any additional information. The content is likely in {{.Language}}.
The data will be provided using an XML-like format for clarity:
diff --git a/paperless.go b/paperless.go
index fd4b91f2..e7335290 100644
--- a/paperless.go
+++ b/paperless.go
@@ -395,6 +395,116 @@ func (client *PaperlessClient) GetDocument(ctx context.Context, documentID int)
 	}, nil
 }
 
+// GetSimilarDocuments asks paperless-ngx for the documents most similar to
+// documentID (more_like_id search ordered by score) and returns at most
+// maxResults of them with tag and correspondent IDs resolved to names.
+//
+// Documents tagged with manualTag or autoTag are excluded from the search,
+// and documentID itself is dropped if the API returns it. A tag ID without a
+// known name resolves to "" so Tags keeps one entry per ID.
+func (client *PaperlessClient) GetSimilarDocuments(ctx context.Context, documentID int, maxResults int) ([]Document, error) {
+	allTags, err := client.GetAllTags(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get tags for exclusion: %w", err)
+	}
+
+	// Look the paperless-gpt tags up by name instead of ranging over the map:
+	// map iteration order is random, which made the query string differ
+	// between otherwise identical calls.
+	var excludeTagIDs []string
+	for _, tagName := range []string{manualTag, autoTag} {
+		tagID, ok := allTags[tagName]
+		if !ok {
+			continue
+		}
+		idStr := fmt.Sprintf("%d", tagID)
+		// manualTag and autoTag may be configured to the same name.
+		if len(excludeTagIDs) == 0 || excludeTagIDs[0] != idStr {
+			excludeTagIDs = append(excludeTagIDs, idStr)
+		}
+	}
+
+	path := fmt.Sprintf("api/documents/?ordering=-score&truncate_content=true&more_like_id=%d&page_size=%d", documentID, maxResults)
+	if len(excludeTagIDs) > 0 {
+		path += "&tags__id__none=" + strings.Join(excludeTagIDs, ",")
+	}
+
+	resp, err := client.Do(ctx, "GET", path, nil)
+	if err != nil {
+		return nil, fmt.Errorf("HTTP request failed in GetSimilarDocuments: %w", err)
+	}
+	defer resp.Body.Close()
+
+	bodyBytes, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		log.WithFields(logrus.Fields{
+			"status_code": resp.StatusCode,
+			"path":        path,
+			"response":    string(bodyBytes),
+			"headers":     resp.Header,
+		}).Error("Error response from server in GetSimilarDocuments")
+		return nil, fmt.Errorf("error searching similar documents: status=%d, body=%s", resp.StatusCode, string(bodyBytes))
+	}
+
+	var documentsResponse GetDocumentsApiResponse
+	if err := json.Unmarshal(bodyBytes, &documentsResponse); err != nil {
+		log.WithFields(logrus.Fields{
+			"response_body": string(bodyBytes),
+			"error":         err,
+		}).Error("Failed to parse JSON response in GetSimilarDocuments")
+		return nil, fmt.Errorf("failed to parse JSON response: %w", err)
+	}
+
+	allCorrespondents, err := client.GetAllCorrespondents(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get correspondents: %w", err)
+	}
+
+	// Invert the name->ID maps once so each result is resolved with direct
+	// lookups instead of a scan over every tag and correspondent.
+	tagNamesByID := make(map[int]string, len(allTags))
+	for tagName, tagID := range allTags {
+		tagNamesByID[tagID] = tagName
+	}
+	correspondentNamesByID := make(map[int]string, len(allCorrespondents))
+	for name, id := range allCorrespondents {
+		correspondentNamesByID[id] = name
+	}
+
+	documents := make([]Document, 0, len(documentsResponse.Results))
+	for _, result := range documentsResponse.Results {
+		// The API may return the reference document itself; skip it.
+		if result.ID == documentID {
+			continue
+		}
+
+		tagNames := make([]string, len(result.Tags))
+		for i, tagID := range result.Tags {
+			tagNames[i] = tagNamesByID[tagID]
+		}
+
+		correspondentName := ""
+		if result.Correspondent != 0 {
+			correspondentName = correspondentNamesByID[result.Correspondent]
+		}
+
+		documents = append(documents, Document{
+			ID:            result.ID,
+			Title:         result.Title,
+			Content:       result.Content,
+			Correspondent: correspondentName,
+			Tags:          tagNames,
+			CreatedDate:   result.CreatedDate,
+		})
+	}
+
+	return documents, nil
+}
+
 // UpdateDocuments updates the specified documents with suggested changes
 func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []DocumentSuggestion, db *gorm.DB, isUndo bool) error {
 	availableTags, err := client.GetAllTags(ctx)
@@ -441,7 +551,7 @@ func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []
 
 		if !hasSameTags(originalDoc.Tags, finalTagNames) {
 			originalFields["tags"] = originalDoc.Tags
-			var newTagIDs []int
+			var newTagIDs []int = []int{}
 			for _, tagName := range finalTagNames {
 				if tagID, exists := availableTags[tagName]; exists {
 					newTagIDs = append(newTagIDs, tagID)
diff --git a/paperless_test.go b/paperless_test.go
index ad4e595a..105ca36a 100644
--- a/paperless_test.go
+++ 
b/paperless_test.go
@@ -8,6 +8,7 @@ import (
 	"net/http"
 	"net/http/httptest"
 	"os"
+	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
@@ -476,3 +477,260 @@ func TestDownloadDocumentAsPDF(t *testing.T) {
 
 	// Testing with splitting=true would be more complex so we'll skip that for simplicity
 }
+
+func TestGetSimilarDocuments(t *testing.T) {
+	env := newTestEnv(t)
+	defer env.teardown()
+
+	// Mock response for similar documents API
+	similarDocs := []GetDocumentApiResponseResult{
+		{
+			ID:    2,
+			Title: "Invoice January 2023 - Company ABC",
+		},
+		{
+			ID:    3,
+			Title: "Invoice February 2023 - Company ABC",
+		},
+		{
+			ID:    4,
+			Title: "Receipt March 2023 - Company XYZ",
+		},
+	}
+
+	response := GetDocumentsApiResponse{
+		Count:   3,
+		Results: similarDocs,
+	}
+
+	env.mockResponses["/api/documents/"] = func(w http.ResponseWriter, r *http.Request) {
+		// Verify query parameters
+		assert.Equal(t, "GET", r.Method)
+		assert.Equal(t, "-score", r.URL.Query().Get("ordering"))
+		assert.Equal(t, "true", r.URL.Query().Get("truncate_content"))
+		assert.Equal(t, "1", r.URL.Query().Get("more_like_id"))
+		assert.Equal(t, "5", r.URL.Query().Get("page_size"))
+
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(response)
+	}
+
+	// Add required mocks for tags and correspondents that GetSimilarDocuments calls
+	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{"id": 1, "name": "tag1"},
+			},
+			"next": nil,
+		})
+	})
+
+	// Test successful case
+	ctx := context.Background()
+	documents, err := env.client.GetSimilarDocuments(ctx, 1, 5)
+	require.NoError(t, err)
+	assert.Len(t, documents, 3)
+	assert.Equal(t, "Invoice January 2023 - Company ABC", documents[0].Title)
+	assert.Equal(t, "Invoice February 2023 - Company ABC", documents[1].Title)
+	assert.Equal(t, "Receipt March 2023 - Company XYZ", documents[2].Title)
+}
+
+func TestGetSimilarDocuments_NoResults(t *testing.T) {
+	env := newTestEnv(t)
+	defer env.teardown()
+
+	// Mock response with no results
+	response := GetDocumentsApiResponse{
+		Count:   0,
+		Results: []GetDocumentApiResponseResult{},
+	}
+
+	env.mockResponses["/api/documents/"] = func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(response)
+	}
+
+	// Add required mocks for tags and correspondents
+	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{},
+			"next":    nil,
+		})
+	})
+
+	ctx := context.Background()
+	documents, err := env.client.GetSimilarDocuments(ctx, 1, 5)
+	require.NoError(t, err)
+	assert.Len(t, documents, 0)
+}
+
+func TestGetSimilarDocuments_Error(t *testing.T) {
+	env := newTestEnv(t)
+	defer env.teardown()
+
+	// Add required mocks for tags (since GetSimilarDocuments calls GetAllTags first)
+	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{},
+			"next":    nil,
+		})
+	})
+
+	env.mockResponses["/api/documents/"] = func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+		w.Write([]byte("Internal Server Error"))
+	}
+
+	ctx := context.Background()
+	documents, err := env.client.GetSimilarDocuments(ctx, 1, 5)
+	assert.Error(t, err)
+	assert.Nil(t, documents)
+	assert.Contains(t, err.Error(), "error searching similar documents")
+}
+
+func TestGetSimilarDocuments_TagsError(t *testing.T) {
+	env := newTestEnv(t)
+	defer env.teardown()
+
+	// Mock tags endpoint to return an error
+	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusInternalServerError)
+		w.Write([]byte("Tags API Error"))
+	})
+
+	ctx := context.Background()
+	documents, err := env.client.GetSimilarDocuments(ctx, 1, 5)
+	assert.Error(t, err)
+	assert.Nil(t, documents)
+	assert.Contains(t, err.Error(), "failed to get tags for exclusion")
+}
+
+// pinPaperlessGPTTags sets MANUAL_TAG/AUTO_TAG and the manualTag/autoTag
+// package globals to fixed values for the current test and restores the
+// previous environment and globals when the test finishes.
+func pinPaperlessGPTTags(t *testing.T) {
+	t.Helper()
+	prevManualTag, prevAutoTag := manualTag, autoTag
+	t.Cleanup(func() { manualTag, autoTag = prevManualTag, prevAutoTag })
+	t.Setenv("MANUAL_TAG", "paperless-gpt")
+	t.Setenv("AUTO_TAG", "paperless-gpt-auto")
+	manualTag = "paperless-gpt"
+	autoTag = "paperless-gpt-auto"
+}
+
+func TestGetSimilarDocuments_ExcludesPaperlessGPTTags(t *testing.T) {
+	// Pin the paperless-gpt tag names; everything is restored on cleanup
+	pinPaperlessGPTTags(t)
+
+	env := newTestEnv(t)
+	defer env.teardown()
+
+	// Mock similar documents
+	similarDocs := []GetDocumentApiResponseResult{
+		{
+			ID:    2,
+			Title: "Test Document 1",
+		},
+	}
+
+	response := GetDocumentsApiResponse{
+		Count:   1,
+		Results: similarDocs,
+	}
+
+	// Track the received query parameters
+	var receivedQuery string
+	env.mockResponses["/api/documents/"] = func(w http.ResponseWriter, r *http.Request) {
+		receivedQuery = r.URL.RawQuery
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(response)
+	}
+
+	// Add required mocks for tags (include paperless-gpt tags)
+	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{"id": 1, "name": "regular-tag"},
+				{"id": 2, "name": "paperless-gpt"},      // manualTag
+				{"id": 3, "name": "paperless-gpt-auto"}, // autoTag
+			},
+			"next": nil,
+		})
+	})
+
+	ctx := context.Background()
+	documents, err := env.client.GetSimilarDocuments(ctx, 1, 5)
+	require.NoError(t, err)
+	assert.Len(t, documents, 1)
+
+	// Verify that the query excludes the paperless-gpt tags
+	assert.Contains(t, receivedQuery, "ordering=-score")
+	assert.Contains(t, receivedQuery, "truncate_content=true")
+	assert.Contains(t, receivedQuery, "more_like_id=1")
+	assert.Contains(t, receivedQuery, "page_size=5")
+	// Check that tag exclusion is present (order may vary)
+	assert.True(t,
+		strings.Contains(receivedQuery, "tags__id__none=2,3") || strings.Contains(receivedQuery, "tags__id__none=3,2"),
+		"Should exclude paperless-gpt tags with IDs 2 and 3 (in any order), got: %s", receivedQuery)
+}
+
+func TestGetSimilarDocuments_NoPaperlessGPTTagsToExclude(t *testing.T) {
+	// Pin the paperless-gpt tag names; everything is restored on cleanup
+	pinPaperlessGPTTags(t)
+
+	env := newTestEnv(t)
+	defer env.teardown()
+
+	// Mock similar documents
+	similarDocs := []GetDocumentApiResponseResult{
+		{
+			ID:    2,
+			Title: "Test Document 1",
+		},
+	}
+
+	response := GetDocumentsApiResponse{
+		Count:   1,
+		Results: similarDocs,
+	}
+
+	// Track the received query parameters
+	var receivedQuery string
+	env.mockResponses["/api/documents/"] = func(w http.ResponseWriter, r *http.Request) {
+		receivedQuery = r.URL.RawQuery
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(response)
+	}
+
+	// Add required mocks for tags (no paperless-gpt tags this time)
+	env.setMockResponse("/api/tags/", func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		json.NewEncoder(w).Encode(map[string]interface{}{
+			"results": []map[string]interface{}{
+				{"id": 1, "name": "regular-tag"},
+				{"id": 2, "name": "other-tag"},
+			},
+			"next": nil,
+		})
+	})
+
+	ctx := context.Background()
+	documents, err := env.client.GetSimilarDocuments(ctx, 1, 5)
+	require.NoError(t, err)
+	assert.Len(t, documents, 1)
+
+	// Verify that the query does not include tag exclusions when no paperless-gpt tags exist
+	assert.Contains(t, receivedQuery, "ordering=-score")
+	assert.Contains(t, receivedQuery, "truncate_content=true")
+	assert.Contains(t, receivedQuery, "more_like_id=1")
+	assert.Contains(t, receivedQuery, "page_size=5")
+	assert.NotContains(t, receivedQuery, "tags__id__none", "Should not include tag exclusions when no paperless-gpt tags exist")
+}
diff --git a/types.go b/types.go
index ffeec83c..fa17700c 100644
--- a/types.go
+++ b/types.go
@@ -161,6 +161,7 @@ type ClientInterface interface {
 	UploadDocument(ctx context.Context, data []byte, filename string, metadata map[string]interface{}) (string, error)
 	GetTaskStatus(ctx context.Context, taskID string) (map[string]interface{}, error)
 	DeleteDocument(ctx context.Context, documentID int) error
+	GetSimilarDocuments(ctx context.Context, documentID int, count int) ([]Document, error)
 }
 
 // DocumentProcessor defines the interface for processing documents with OCR