1
1
import logging
2
- from collections import defaultdict
3
2
from typing import Collection , Dict , List , Set , Type
4
3
from uuid import UUID
5
4
9
8
from qdrant_client .http .models import models
10
9
11
10
from welearn_datastack .data .db_models import DocumentSlice
12
- from welearn_datastack .exceptions import (
13
- ErrorWhileDeletingChunks ,
14
- )
11
+ from welearn_datastack .exceptions import ErrorWhileDeletingChunks
15
12
16
13
logger = logging .getLogger (__name__ )
17
14
@@ -31,30 +28,30 @@ def classify_documents_per_collection(
31
28
"""
32
29
tmp_collections_names_in_qdrant = qdrant_connector .get_collections ().collections
33
30
collections_names_in_qdrant = [c .name for c in tmp_collections_names_in_qdrant ]
34
- model_name_collection_name = {}
35
- for x in collections_names_in_qdrant :
36
- parts = x .split ("_" )
37
- if len (parts ) >= 4 :
38
- model_name_collection_name [parts [3 ]] = x
39
- else :
40
- logger .warning (
41
- "Collection name '%s' does not follow the expected format" , x
42
- )
43
31
44
- ret : Dict [str , Set [UUID ]] = defaultdict ( set )
32
+ ret : Dict [str , Set [UUID ]] = {}
45
33
for dslice in slices :
46
- model_name = dslice .embedding_model .title
47
- try :
48
- collection_name = model_name_collection_name [model_name ]
49
- ret [collection_name ].add (dslice .document_id ) # type: ignore
50
- except KeyError :
51
- logger .warning (
52
- "No collection found for model %s, document %s" ,
53
- model_name ,
54
- dslice .document_id ,
34
+ lang = dslice .document .lang
35
+ model = dslice .embedding_model .title
36
+ collection_name = None
37
+ multilingual_collection = f"collection_welearn_mul_{ model } "
38
+ mono_collection = f"collection_welearn_{ lang } _{ model } "
39
+
40
+ # Prefer the multilingual collection; fall back to the monolingual one
41
+ if multilingual_collection in collections_names_in_qdrant :
42
+ collection_name = multilingual_collection
43
+ elif mono_collection in collections_names_in_qdrant :
44
+ collection_name = mono_collection
45
+ else :
46
+ logger .error (
47
+ f"Collection { collection_name } not found in Qdrant, slice { dslice .id } ignored" ,
55
48
)
56
49
continue
57
50
51
+ if collection_name not in ret :
52
+ ret [collection_name ] = set ()
53
+ ret [collection_name ].add (dslice .document_id ) # type: ignore
54
+
58
55
return ret
59
56
60
57
@@ -73,7 +70,6 @@ def delete_points_related_to_document(
73
70
"""
74
71
logger .info ("Deletion started" )
75
72
logger .debug (f"Deleting points related to { documents_ids } in { collection_name } " )
76
- op_res = None
77
73
78
74
try :
79
75
op_res = qdrant_connector .delete (
0 commit comments