Skip to content

Commit 0e6e985

Browse files
authored
Merge pull request #28 from EgorDudyrev/feature/rare_itemset_mining
Feature/rare itemset mining
2 parents 91536f7 + 48f7446 commit 0e6e985

File tree

3 files changed

+346
-16
lines changed

3 files changed

+346
-16
lines changed

caspailleur/base_functions.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from functools import reduce
22
from itertools import chain, combinations
3-
from typing import Iterable, Iterator, Union, Any
3+
from typing import Iterable, Iterator, Union, Any, Sequence
44

55
import deprecation
66
from bitarray import frozenbitarray as fbarray, bitarray
@@ -32,7 +32,7 @@ def is_psubset_of(A: Union[set[int], fbarray], B: Union[set[int], fbarray]) -> b
3232
return (A & B == A) and A != B
3333

3434

35-
def maximal_extent(crosses_per_columns: Union[list[set], list[bitarray]]) -> Union[set, bitarray]:
35+
def maximal_extent(crosses_per_columns: Union[Sequence[set], Sequence[bitarray]]) -> Union[set, bitarray]:
3636
"""Return the whole set of objects from `crosses_per_columns` data representation"""
3737
first_column = crosses_per_columns[0]
3838
if isinstance(first_column, bitarray):
@@ -45,7 +45,7 @@ def maximal_extent(crosses_per_columns: Union[list[set], list[bitarray]]) -> Uni
4545
return type(first_column)(all_attrs)
4646

4747

48-
def extension(description: Union[Iterable[int], bitarray], crosses_per_columns: Union[list[set], list[bitarray]])\
48+
def extension(description: Union[Iterable[int], bitarray], crosses_per_columns: Union[Sequence[set], Sequence[bitarray]])\
4949
-> Union[set[int], bitarray]:
5050
"""Select the indices of rows described by `description`"""
5151
column_type = type(crosses_per_columns[0])

caspailleur/mine_equivalence_classes.py

+272-13
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import heapq
22
from functools import reduce
3-
from typing import Iterator, Iterable, Union
3+
from operator import itemgetter
4+
from typing import Iterator, Iterable, Union, Sequence, Optional
45

56
import deprecation
67

@@ -11,7 +12,7 @@
1112

1213
from skmine.itemsets import LCM
1314
from bitarray import bitarray, frozenbitarray as fbarray
14-
from bitarray.util import zeros as bazeros, subset as basubset
15+
from bitarray.util import zeros as bazeros, subset as basubset, count_and
1516
from collections import deque
1617
from tqdm.auto import tqdm
1718

@@ -188,18 +189,10 @@ def iter_equivalence_class(attribute_extents: list[fbarray], intent: fbarray = N
188189
"""
189190
N_OBJS, N_ATTRS = len(attribute_extents[0]), len(attribute_extents)
190191

191-
intent = bazeros(N_ATTRS) if intent is None else intent
192+
intent = ~bazeros(N_ATTRS) if intent is None else intent
192193

193-
def conjunct_extent(premise: fbarray) -> fbarray:
194-
res = ~bazeros(N_OBJS)
195-
for m in premise.itersearch(True):
196-
res &= attribute_extents[m]
197-
if not res.any():
198-
break
194+
total_extent = extension(intent, attribute_extents)
199195

200-
return fbarray(res)
201-
202-
total_extent = conjunct_extent(intent)
203196
stack = [[m] for m in intent.itersearch(True)][::-1]
204197

205198
yield intent
@@ -212,7 +205,7 @@ def conjunct_extent(premise: fbarray) -> fbarray:
212205
attrs_to_eval[m] = False
213206
attrs_to_eval = fbarray(attrs_to_eval)
214207

215-
conj = conjunct_extent(attrs_to_eval)
208+
conj = extension(attrs_to_eval, attribute_extents)
216209
if conj != total_extent:
217210
continue
218211

@@ -221,6 +214,62 @@ def conjunct_extent(premise: fbarray) -> fbarray:
221214
stack += [attrs_to_remove+[m] for m in intent.itersearch(True) if m > last_attr][::-1]
222215

223216

217+
def iter_equivalence_class_levelwise(
        attribute_extents: list[fbarray], intent: fbarray = None,
        presort_output: bool = True
) -> Iterator[fbarray]:
    """Iterate subsets of attributes from an equivalence class using the level-wise iteration technique

    The output equivalence class goes from the maximal subsets of attributes to the smallest ones.
    Equivalent subsets of attributes are the ones that describe the same subset of objects.

    Parameters
    ----------
    attribute_extents:
        The list of objects described by each specific attribute (converted to bitarrays)
    intent:
        Intent to compute the equivalence class for.
        If None is passed, iterate the equivalence class of the set of all attributes.
    presort_output:
        Modify the order of the output descriptions to make it match the output
        of the `iter_equivalence_class` function.

    Returns
    -------
    Iterator[frozenbitarray]:
        Iterator over bitarrays representing equivalent subsets of attributes

    """
    N_OBJS, N_ATTRS = len(attribute_extents[0]), len(attribute_extents)

    # Work with the intent as a set of attribute indices.
    intent = set(range(N_ATTRS)) if intent is None else set(intent.search(True))
    if not intent:
        # The empty intent is only equivalent to itself.
        yield fbarray(bazeros(N_ATTRS))
        return

    total_extent = extension(intent, attribute_extents)

    # antigenerator: a subset of attributes s.t. ext(intent \ antigenerator) = ext(intent)
    antigenerator, next_antigenerators = None, deque([tuple()])
    for level in range(0, len(intent) + 1):
        antigenerators, next_antigenerators = next_antigenerators, deque()
        for antigenerator in antigenerators:
            generator = intent - set(antigenerator)

            extent = extension(generator, attribute_extents)
            if extent == total_extent:
                # `generator` describes the same objects as `intent`, so it belongs to the class.
                yield next(isets2bas([generator], N_ATTRS))
                next_antigenerators.append(antigenerator)

        if not next_antigenerators:
            break

        # Grow antigenerators level-wise: singletons at level 0, Apriori-style joins afterwards.
        next_antigenerators = map(itemgetter(0), generate_next_level_descriptions(next_antigenerators))\
            if level else [(attr_id,) for attr_id in intent]

        if presort_output:
            next_antigenerators = sorted(next_antigenerators, reverse=True)
271+
272+
224273
def list_keys_via_eqclass(equiv_class: Iterable[fbarray]) -> list[fbarray]:
225274
"""List minimal subsets from given equivalence class"""
226275
potent_keys = []
@@ -605,3 +654,213 @@ def list_stable_extents_via_gsofia(
605654
stable_extents = dict(most_stable_extents)
606655

607656
return set(stable_extents)
657+
658+
659+
def generate_next_level_descriptions(
        same_level_descriptions: Sequence[tuple[int, ...]],
        attribute_extents: Sequence['fbarray'] = None,
        n_attributes: int = None
) -> Iterator[tuple[tuple[int, ...], Optional[int]]]:
    """Generate the next-level descriptions from the given ones

    Descriptions (i.e. sets of attributes/items) belong to the same level when they have the same length.
    A description of n+1 attributes will only be generated
    if all its sub-descriptions of n attributes can be found in `same_level_descriptions`.

    Parameters
    ----------
    same_level_descriptions:
        Sequence of descriptions (as tuples of indices of their attributes) of the same length.
    attribute_extents:
        Sequence of extents of attributes.
        Every extent is a set of objects described by an attribute and represented with a bitarray.
        The parameter is optional, and it provides a slight optimisation
        for support computations of the generated descriptions.
    n_attributes:
        Number of attributes. The parameter is only required when no `attribute_extents` are provided
        and `same_level_descriptions` contain one empty-set description.

    Returns
    -------
    Iterator of pairs (next_level_description, next_level_description_support) where
    next_level_description: tuple[int, ...]
        Next-level description (as a tuple of indices of its attributes) composed of the given
        `same_level_descriptions`.
    next_level_description_support: int | None
        The support of the corresponding next_level_description (if `attribute_extents` is provided, else None).
        Support of a description is the number of objects it describes.

    """
    provide_support = attribute_extents is not None

    # Guard against empty input: a bare next() inside a generator would raise
    # StopIteration and be converted to RuntimeError (PEP 479).
    first_description = next(iter(same_level_descriptions), None)
    if first_description is None:
        return
    zero_level = len(first_description) == 0

    n_attributes = len(attribute_extents) if attribute_extents is not None else n_attributes
    if n_attributes is None:
        if not zero_level:
            # Deduce the number of attributes from the maximal attribute index in use.
            n_attributes = max(max(descr) for descr in same_level_descriptions) + 1
        else:
            raise ValueError('Provide `n_attributes` parameter to `generate_next_level_descriptions` functions. '
                             'As it is not deducible from the values of the other parameters.')

    if zero_level:
        # The only zero-level description is the empty set, so the next level is all the singletons.
        for next_attr in range(n_attributes):
            yield (next_attr,), attribute_extents[next_attr].count() if provide_support else None
        return

    # Group descriptions by their prefix: prefix -> set of possible last attributes.
    possible_suffixes: dict[tuple[int, ...], set[int]] = {}
    for description in same_level_descriptions:
        possible_suffixes.setdefault(description[:-1], set()).add(description[-1])

    for description in same_level_descriptions:
        subdescriptions = [description[:i] + description[i+1:] for i in range(len(description))]
        # Apriori-style pruning: every sub-description has to be present on the current level.
        if any(subdescription not in possible_suffixes for subdescription in subdescriptions):
            continue

        extent = extension(description, attribute_extents) if provide_support else None
        next_attributes = reduce(set.intersection, (possible_suffixes[subgen] for subgen in subdescriptions))
        for next_attr in next_attributes:
            if next_attr <= description[-1]:
                continue  # keep attributes of a description sorted to avoid generating duplicates
            next_support = count_and(extent, attribute_extents[next_attr]) if provide_support else None
            yield description + (next_attr, ), next_support
729+
730+
731+
def iter_minimal_rare_itemsets_via_mrgexp(
        attribute_extents: list[fbarray], max_support: int,
        max_length: int = None
) -> Iterator[fbarray]:
    """List minimal rare itemsets using MRG-Exp (aka Carpathia-G-Rare) algorithm

    A minimal rare itemset (or a minimal rare description) is a minimal subset of attributes
    that describes less than (or equal to) `max_support` objects.
    Minimality here means that any subset of a minimal rare itemset describes more than `max_support` objects.

    Parameters
    ----------
    attribute_extents:
        Sequence of extents of attributes.
        Every extent is a set of objects described by an attribute and represented with a bitarray.
    max_support:
        Maximal number of objects that should be described by an itemset (aka a description).
    max_length:
        Maximum size of a rare itemset.
        Default value: the number of attributes: len(attribute_extents).

    Returns
    -------
    minimal_rare_itemsets:
        Minimal rare itemsets found by the algorithm.
        The itemsets are placed in the order of increasing sizes:
        the first itemset contains the fewest attributes, the latter contains the maximal number of attributes.

    Notes
    -----
    The algorithm is introduced in Szathmary, L., Napoli, A., & Valtchev, P. (2007, October).
    Towards rare itemset mining.
    In 19th IEEE international conference on tools with artificial intelligence (ICTAI 2007)
    (Vol. 1, pp. 305-312). IEEE.
    """
    n_attrs = len(attribute_extents)
    max_length = n_attrs if max_length is None else max_length
    # Extent of the empty itemset: the set of all the objects.
    total_extent = attribute_extents[0] | ~attribute_extents[0]

    # Frequent generators of the current level, mapped to their supports.
    # A generator is a minimal description of its own extent.
    prev_level_generators, cur_level_gens = None, {tuple(): total_extent.count()}
    for level in range(1, max_length + 1):
        prev_level_generators, cur_level_gens = cur_level_gens, {}
        if not prev_level_generators:
            break

        new_candidates = generate_next_level_descriptions(prev_level_generators, attribute_extents)
        for new_generator, new_support in new_candidates:
            sub_generators = (new_generator[:i] + new_generator[i + 1:] for i in range(level))

            # A candidate is not a generator when one of its subsets is not a generator itself,
            # or has the same support (i.e. describes the very same objects).
            not_a_generator = any(
                sub_gen not in prev_level_generators or new_support == prev_level_generators[sub_gen]
                for sub_gen in sub_generators
            )
            if not_a_generator:
                continue

            if new_support <= max_support:
                # A rare generator is minimal rare: all its subsets are frequent generators.
                yield next(isets2bas([new_generator], n_attrs))
                continue

            cur_level_gens[new_generator] = new_support
791+
792+
793+
def iter_minimal_broad_clusterings_via_mrgexp(
        attribute_extents: list[fbarray], min_coverage: int,
        max_length: int = None,
        min_added_coverage: int = 1
) -> Iterator[fbarray]:
    """Iterate minimal broad clusterings using an analogue of MRG-Exp algorithm for minimal rare itemsets mining

    A minimal broad clustering is a minimal subset of attributes
    that, together, cover more than (or equal to) `min_coverage` objects.
    Minimality here means that any subset of a minimal broad clustering describes less than `min_coverage` objects.

    Coverage of a clustering is the number of objects lying in the union of all the clusters
    (here 'a cluster' is a synonym of 'an attribute').

    Parameters
    ----------
    attribute_extents:
        Sequence of extents of attributes.
        Every extent is a set of objects described by an attribute and represented with a bitarray.
    min_coverage:
        Minimal number of objects that should be covered by all the clusters (attributes) together
    max_length:
        Maximum size of a clustering.
        Default value: the number of attributes: len(attribute_extents).
    min_added_coverage:
        Minimal number of objects that a cluster (i.e. an attribute) should bring to a clustering.
        For example, for a clustering {a, b, c}, its every subset ({a, b}, {a, c}, {b, c}) should cover
        less than `coverage({a,b,c}) - min_added_coverage` objects.

    Returns
    -------
    minimal_broad_clusterings:
        Minimal broad clusterings found by the algorithm.
        The clusterings are placed in the order of increasing sizes:
        the first clustering contains the fewest attributes, the latter contains the maximal number of attributes.

    Notes
    -----
    The algorithm was introduced in:
    E.Dudyrev et al. "Clustering with Stable Pattern Concepts"
    Published in Amedeo Napoli and Sebastian Rudolph (Eds.):
    The 12th International Workshop "What can FCA do for Artificial Intelligence?",
    FCA4AI 2024, co-located with ECAI 2024, October 19 2024, Santiago de Compostela, Spain.
    """
    n_objs, n_attrs = len(attribute_extents[0]), len(attribute_extents)
    max_length = n_attrs if max_length is None else max_length
    empty_extent = attribute_extents[0] & ~attribute_extents[0]
    # Work on the complements of the extents: covering many objects means leaving few objects
    # uncovered, so broad clusterings are mined the same way as rare itemsets over `leftovers`.
    leftovers = [~extent for extent in attribute_extents]

    # Generators of the current level, mapped to their coverages (starting from the empty clustering).
    prev_level_generators, cur_level_gens = None, {tuple(): empty_extent.count()}
    for level in range(1, max_length + 1):
        prev_level_generators, cur_level_gens = cur_level_gens, {}
        if not prev_level_generators:
            break

        new_candidates = generate_next_level_descriptions(prev_level_generators, leftovers)
        for new_generator, new_leftovers_support in new_candidates:
            # Coverage = objects NOT lying in the intersection of the complements.
            new_coverage = n_objs - new_leftovers_support
            sub_generators = (new_generator[:i] + new_generator[i + 1:] for i in range(level))

            # Drop a candidate when one of its subsets is not a generator,
            # or when some cluster adds fewer than `min_added_coverage` objects to the coverage.
            not_a_generator = any(
                sub_gen not in prev_level_generators
                or new_coverage < prev_level_generators[sub_gen] + min_added_coverage
                for sub_gen in sub_generators
            )
            if not_a_generator:
                continue

            if new_coverage >= min_coverage:
                # A broad generator is minimal: all its subsets cover too few objects.
                yield next(isets2bas([new_generator], n_attrs))
                continue

            cur_level_gens[new_generator] = new_coverage

0 commit comments

Comments
 (0)