1
1
import heapq
2
2
from functools import reduce
3
- from typing import Iterator , Iterable , Union
3
+ from operator import itemgetter
4
+ from typing import Iterator , Iterable , Union , Sequence , Optional
4
5
5
6
import deprecation
6
7
11
12
12
13
from skmine .itemsets import LCM
13
14
from bitarray import bitarray , frozenbitarray as fbarray
14
- from bitarray .util import zeros as bazeros , subset as basubset
15
+ from bitarray .util import zeros as bazeros , subset as basubset , count_and
15
16
from collections import deque
16
17
from tqdm .auto import tqdm
17
18
@@ -188,18 +189,10 @@ def iter_equivalence_class(attribute_extents: list[fbarray], intent: fbarray = N
188
189
"""
189
190
N_OBJS , N_ATTRS = len (attribute_extents [0 ]), len (attribute_extents )
190
191
191
- intent = bazeros (N_ATTRS ) if intent is None else intent
192
+ intent = ~ bazeros (N_ATTRS ) if intent is None else intent
192
193
193
- def conjunct_extent (premise : fbarray ) -> fbarray :
194
- res = ~ bazeros (N_OBJS )
195
- for m in premise .itersearch (True ):
196
- res &= attribute_extents [m ]
197
- if not res .any ():
198
- break
194
+ total_extent = extension (intent , attribute_extents )
199
195
200
- return fbarray (res )
201
-
202
- total_extent = conjunct_extent (intent )
203
196
stack = [[m ] for m in intent .itersearch (True )][::- 1 ]
204
197
205
198
yield intent
@@ -212,7 +205,7 @@ def conjunct_extent(premise: fbarray) -> fbarray:
212
205
attrs_to_eval [m ] = False
213
206
attrs_to_eval = fbarray (attrs_to_eval )
214
207
215
- conj = conjunct_extent (attrs_to_eval )
208
+ conj = extension (attrs_to_eval , attribute_extents )
216
209
if conj != total_extent :
217
210
continue
218
211
@@ -221,6 +214,62 @@ def conjunct_extent(premise: fbarray) -> fbarray:
221
214
stack += [attrs_to_remove + [m ] for m in intent .itersearch (True ) if m > last_attr ][::- 1 ]
222
215
223
216
217
def iter_equivalence_class_levelwise(
        attribute_extents: list[fbarray], intent: fbarray = None,
        presort_output: bool = True
) -> Iterator[fbarray]:
    """Iterate subsets of attributes from an equivalence class using a level-wise (breadth-first) technique

    The output equivalence class goes from the maximal subsets of attributes to the smallest ones.
    Equivalent subsets of attributes are the ones that describe the same subset of objects.

    Parameters
    ----------
    attribute_extents:
        The list of objects described by each specific attribute (converted to bitarrays)
    intent:
        Intent to compute equivalence class for. If None is passed, iterate equivalence class of all attributes
    presort_output:
        Modify the order of the output descriptions to make it match the output of `iter_equivalence_class` function.

    Returns
    -------
    Iterator[frozenbitarray]:
        Iterator over bitarrays representing equivalent subsets of attributes

    """
    N_OBJS, N_ATTRS = len(attribute_extents[0]), len(attribute_extents)

    # Work with the intent as a set of attribute indices rather than a bitarray
    intent = set(range(N_ATTRS)) if intent is None else set(intent.search(True))
    if not intent:
        # The empty intent is equivalent only to itself
        yield fbarray(bazeros(N_ATTRS))
        return

    total_extent = extension(intent, attribute_extents)

    # antigenerator: s.t. ext(intent\antigenerator) = ext(intent)
    antigenerator, next_antigenerators = None, deque([tuple()])
    for level in range(0, len(intent) + 1):
        # `level` = number of attributes removed from the intent on this iteration
        antigenerators, next_antigenerators = next_antigenerators, deque()
        for antigenerator in antigenerators:
            generator = intent - set(antigenerator)

            extent = extension(generator, attribute_extents)
            if extent == total_extent:
                # `generator` describes the same objects as the whole intent,
                # so it belongs to the equivalence class; keep its antigenerator for growing
                yield next(isets2bas([generator], N_ATTRS))
                next_antigenerators.append(antigenerator)

        if not next_antigenerators:
            # No antigenerator of this size preserved the extent => no larger one can either
            break

        # Grow antigenerators by one attribute (Apriori-style join of same-size tuples);
        # the first level is seeded with every single attribute of the intent.
        # NOTE(review): the join assumes the antigenerator tuples are sorted ascending — confirm
        # that set-iteration order of `intent` guarantees this on the seeding level.
        next_antigenerators = map(itemgetter(0), generate_next_level_descriptions(next_antigenerators))\
            if level else [(attr_id,) for attr_id in intent]

        if presort_output:
            next_antigenerators = sorted(next_antigenerators, reverse=True)
224
273
def list_keys_via_eqclass (equiv_class : Iterable [fbarray ]) -> list [fbarray ]:
225
274
"""List minimal subsets from given equivalence class"""
226
275
potent_keys = []
@@ -605,3 +654,213 @@ def list_stable_extents_via_gsofia(
605
654
stable_extents = dict (most_stable_extents )
606
655
607
656
return set (stable_extents )
657
+
658
+
659
def generate_next_level_descriptions(
        same_level_descriptions: Sequence[tuple[int, ...]],
        attribute_extents: "Sequence[fbarray]" = None,
        n_attributes: int = None
) -> Iterator[tuple[tuple[int, ...], Optional[int]]]:
    """Generate the next level descriptions from the given ones

    Descriptions (i.e. sets of attributes/items) belong to the same level when they have the same length.
    A description of n+1 attributes will only be generated
    if all its subdescriptions of n attributes can be found in `same_level_descriptions`.

    Parameters
    ----------
    same_level_descriptions:
        Sequence of descriptions (as sorted tuples of indices of their attributes) of the same length
    attribute_extents:
        Sequence of extents of attributes.
        Every extent is a set of objects described by an attribute and represented with a bitarray.
        The parameter is optional, and it provides a slight optimisation
        for support computations of generated descriptions
    n_attributes:
        Number of attributes. The parameter is only required when no `attribute_extents` are provided
        and the `same_level_descriptions` contain one empty-set description.

    Returns
    -------
    Iterator of pairs (next_level_description, next_level_description_support) where
    next_level_description: tuple[int, ...]
        Next-level description (as a tuple of indices of its attributes) composed of the given
        `same_level_descriptions`.
    next_level_description_support: int | None
        The support of the corresponding next_level_description (if `attribute_extents` is provided, else None).
        Support of a description is the number of objects it describes.

    """
    provide_support = attribute_extents is not None

    # Guard against empty input: probing the first element of an empty iterable would raise
    # StopIteration, which PEP 479 turns into a RuntimeError inside a generator function.
    if not same_level_descriptions:
        return
    # `next(iter(...))` (not indexing) keeps dict inputs working: callers pass {description: support} dicts
    zero_level = len(next(iter(same_level_descriptions))) == 0

    n_attributes = len(attribute_extents) if attribute_extents is not None else n_attributes
    if n_attributes is None:
        if not zero_level:
            n_attributes = max(max(descr) for descr in same_level_descriptions) + 1
        else:
            raise ValueError('Provide `n_attributes` parameter to `generate_next_level_descriptions` functions. '
                             'As it is not deducible from the values of the other parameters.')

    if zero_level:
        # The only zero-level description is the empty set: the next level is every single attribute
        for next_attr in range(n_attributes):
            yield (next_attr,), attribute_extents[next_attr].count() if provide_support else None
        return

    # Group descriptions by their (n-1)-prefix; the grouped last attributes are the candidate extensions
    possible_suffixes: dict[tuple[int, ...], list[int]] = {}
    for description in same_level_descriptions:
        possible_suffixes.setdefault(description[:-1], []).append(description[-1])
    possible_suffixes = {description: set(suffixes) for description, suffixes in possible_suffixes.items()}

    for description in same_level_descriptions:
        subdescriptions = [description[:i] + description[i + 1:] for i in range(len(description))]
        # Downward-closure check: every (n-1)-subset must appear as a prefix on the current level
        if any(subdescription not in possible_suffixes for subdescription in subdescriptions):
            continue

        extent = extension(description, attribute_extents) if provide_support else None
        next_attributes = reduce(set.intersection, (possible_suffixes[subgen] for subgen in subdescriptions))
        for next_attr in next_attributes:
            if next_attr <= description[-1]:
                # Only extend with strictly greater attributes, so every set is generated once
                continue
            next_support = count_and(extent, attribute_extents[next_attr]) if provide_support else None
            yield description + (next_attr, ), next_support
729
+
730
+
731
def iter_minimal_rare_itemsets_via_mrgexp(
        attribute_extents: list[fbarray], max_support: int,
        max_length: int = None
) -> Iterator[fbarray]:
    """List minimal rare itemsets using MRG-Exp (aka Carpathia-G-Rare) algorithm

    A minimal rare itemset (or a minimal rare description) is a minimal subset of attributes
    that describes less than (or equal to) `max_support` objects.
    Minimality here means that any subset of a minimal rare itemset describes more than `max_support` objects.

    Parameters
    ----------
    attribute_extents:
        Sequence extents of attributes.
        Every extent is a set of objects described by an attribute and represented with a bitarray.
    max_support:
        Maximal number of objects that should be described by an itemset (aka a description).
    max_length:
        Maximum size of a rare itemset.
        Default value: the number of attributes: len(attribute_extents).

    Returns
    -------
    minimal_rare_itemsets:
        Minimal rare itemsets found by the algorithm.
        The itemsets are placed in the order of increasing sizes:
        the first itemset contains the fewer attributes, the latter contains the maximal number of attributes.

    Notes
    -----

    The algorithm is introduced in Szathmary, L., Napoli, A., & Valtchev, P. (2007, October). Towards rare itemset mining.
    In 19th IEEE international conference on tools with artificial intelligence (ICTAI 2007) (Vol. 1, pp. 305-312). IEEE.
    """
    n_attrs = len(attribute_extents)
    if max_length is None:
        max_length = n_attrs
    # Extent of the empty itemset: the set of all objects
    full_extent = attribute_extents[0] | ~attribute_extents[0]

    # Supports of the frequent generators found on the previous level,
    # seeded with the empty itemset whose support is the number of objects
    frequent_generators = {tuple(): full_extent.count()}
    for size in range(1, max_length + 1):
        if not frequent_generators:
            break

        next_frequent = {}
        candidates = generate_next_level_descriptions(frequent_generators, attribute_extents)
        for candidate, support in candidates:
            # A candidate is a generator iff each of its (size-1)-subsets is a frequent generator
            # with strictly greater support
            is_generator = all(
                candidate[:i] + candidate[i + 1:] in frequent_generators
                and support != frequent_generators[candidate[:i] + candidate[i + 1:]]
                for i in range(size)
            )
            if not is_generator:
                continue

            if support <= max_support:
                # Rare, and minimal since every proper subset is frequent
                yield next(isets2bas([candidate], n_attrs))
            else:
                next_frequent[candidate] = support

        frequent_generators = next_frequent
791
+
792
+
793
def iter_minimal_broad_clusterings_via_mrgexp(
        attribute_extents: list[fbarray], min_coverage: int,
        max_length: int = None,
        min_added_coverage: int = 1
) -> Iterator[fbarray]:
    """Iterate minimal broad clusterings using an analogue of MRG-Exp algorithm for minimal rare itemsets mining

    A minimal broad clustering is a minimal subset of attributes
    that, together, cover more than (or equal to) `min_coverage` objects.
    Minimality here means that any subset of a minimal broad clustering describes less than `min_coverage` objects.

    Coverage of a clustering is the number of objects lying in the union of all the clusters
    (here 'a cluster' is a synonym of 'an attribute').

    Parameters
    ----------
    attribute_extents:
        Sequence extents of attributes.
        Every extent is a set of objects described by an attribute and represented with a bitarray.
    min_coverage:
        Minimal number of objects that should be covered by all the clusters (attributes) together
    max_length:
        Maximum size of a clustering.
        Default value: the number of attributes: len(attribute_extents).
    min_added_coverage:
        Minimal number of objects that a cluster (i.e. an attribute) should bring to a clustering.
        For example, for a clustering {a, b, c}, its every subset ({a,b}, {a, c}, {b, c}) should cover
        less than `coverage({a,b,c}) - min_added_coverage` objects.

    Returns
    -------
    minimal_broad_clusterings:
        Minimal broad clusterings found by the algorithm.
        The clusterings are placed in the order of increasing sizes:
        the first clustering contains the fewer attributes, the latter contains the maximal number of attributes.

    Notes
    -----

    The algorithm was introduced in:
    E.Dudyrev et al. "Clustering with Stable Pattern Concepts"
    Published in Amedeo Napoli and Sebastian Rudolph (Eds.):
    The 12th International Workshop "What can FCA do for Artificial Intelligence?",
    FCA4AI 2024, co-located with ECAI 2024, October 19 2024, Santiago de Compostela, Spain.
    """
    n_objs, n_attrs = len(attribute_extents[0]), len(attribute_extents)
    max_length = n_attrs if max_length is None else max_length
    empty_extent = attribute_extents[0] & ~attribute_extents[0]
    # Complemented extents: for each attribute, the objects it does NOT cover.
    # Mining "rare itemsets" over these complements is equivalent to mining broad clusterings.
    leftovers = [~extent for extent in attribute_extents]

    # Coverages of the generators found on the previous level,
    # seeded with the empty clustering whose coverage is 0 (empty_extent.count())
    prev_level_generators, cur_level_gens = None, {tuple(): empty_extent.count()}
    for level in range(1, max_length + 1):
        prev_level_generators, cur_level_gens = cur_level_gens, {}
        if not prev_level_generators:
            break

        new_candidates = generate_next_level_descriptions(prev_level_generators, leftovers)
        for new_generator, new_leftovers_support in new_candidates:
            # Support w.r.t. `leftovers` counts the uncovered objects, so coverage is its complement
            new_coverage = n_objs - new_leftovers_support
            sub_generators = (new_generator[:i] + new_generator[i + 1:] for i in range(level))

            # Prune candidates whose coverage does not grow by at least `min_added_coverage`
            # over every one of their (level-1)-subsets
            not_a_generator = any(
                sub_gen not in prev_level_generators
                or new_coverage < prev_level_generators[sub_gen] + min_added_coverage
                for sub_gen in sub_generators
            )
            if not_a_generator:
                continue

            if new_coverage >= min_coverage:
                # Broad enough, and minimal since every proper subset is below `min_coverage`
                yield next(isets2bas([new_generator], n_attrs))
                continue

            cur_level_gens[new_generator] = new_coverage
0 commit comments