diff --git a/filefisher/_filefinder.py b/filefisher/_filefinder.py
index 1bb63fe..fcdfd2d 100644
--- a/filefisher/_filefinder.py
+++ b/filefisher/_filefinder.py
@@ -739,6 +739,66 @@ def search_single(self, **query) -> "FileContainer":
             raise ValueError(msg)
 
         return fc
+
+    def search_intersection(self, search_key, intersect_key) -> "FileContainer":
+        """subset paths whose `intersect_key` value occurs for every `search_key` value
+
+        The container is grouped by the distinct values of `search_key`. Only
+        paths whose `intersect_key` value is present in every group are kept.
+
+        Parameters
+        ----------
+        search_key : str
+            Key along which to search for intersecting values of `intersect_key`.
+        intersect_key : str
+            Key whose values are intersected between the groups formed by the
+            distinct values of `search_key`.
+
+        Returns
+        -------
+        search_result : FileContainer
+            New container holding only the paths with a shared `intersect_key`.
+
+        Raises
+        ------
+        ValueError
+            If `search_key` or `intersect_key` is not a column of the DataFrame,
+            or if no value of `intersect_key` is shared along `search_key`.
+
+        Examples
+        --------
+        >>> fc = FileContainer(pd.DataFrame({
+        ...     "path": ["./folder1/file_1.txt", "./folder1/file_2.txt", "./folder2/file_1.txt"],
+        ...     "folder": ["folder1", "folder1", "folder2"],
+        ...     "file": ["file_1.txt", "file_2.txt", "file_1.txt"],
+        ... }).set_index("path"))
+        >>> fc.search_intersection(search_key="folder", intersect_key="file").df
+                               folder        file
+        path
+        ./folder1/file_1.txt  folder1  file_1.txt
+        ./folder2/file_1.txt  folder2  file_1.txt
+        """
+
+        for key in (search_key, intersect_key):
+            if key not in self.df.columns:
+                msg = f"'{key}' is not a column of the DataFrame."
+                raise ValueError(msg)
+
+        # query every distinct value only once - the column usually repeats them
+        value_sets = [
+            set(self.search(**{search_key: value}).df[intersect_key].values)
+            for value in self.df[search_key].unique()
+        ]
+
+        # set.intersection needs at least one operand: guard the empty container
+        intersection = set.intersection(*value_sets) if value_sets else set()
+
+        if not intersection:
+            msg = f"No intersecting values of '{intersect_key}' found along '{search_key}'."
+            raise ValueError(msg)
+
+        df = self._get_subset(**{intersect_key: list(intersection)})
+        return type(self)(df)
 
     def concat(self, other, drop_duplicates=True):
         """concatenate two FileContainers
diff --git a/filefisher/tests/test_filecontainer.py b/filefisher/tests/test_filecontainer.py
index 99f18a5..9bb8cf0 100644
--- a/filefisher/tests/test_filecontainer.py
+++ b/filefisher/tests/test_filecontainer.py
@@ -187,6 +187,41 @@ def test_filecontainer_search_single(example_df, example_fc):
     pd.testing.assert_frame_equal(result.df, expected)
 
 
+def test_filecontainer_search_intersection():
+    fc = FileContainer(pd.DataFrame({
+        "path": ["folder1/file_1.txt", "folder1/file_2.txt", "folder2/file_1.txt"],
+        "folder": ["folder1", "folder1", "folder2"],
+        "file": ["file_1.txt", "file_2.txt", "file_1.txt"],
+    }).set_index("path"))
+
+    result = fc.search_intersection(search_key="folder", intersect_key="file")
+    expected = pd.DataFrame({
+        "folder": ["folder1", "folder2"],
+        "file": ["file_1.txt", "file_1.txt"],
+    }, index=pd.Index(["folder1/file_1.txt", "folder2/file_1.txt"], name="path"))
+
+    pd.testing.assert_frame_equal(result.df, expected)
+
+
+def test_filecontainer_search_intersection_error():
+    fc = FileContainer(pd.DataFrame({
+        "path": ["folder1/file_1.txt", "folder2/file_2.txt"],
+        "folder": ["folder1", "folder2"],
+        "file": ["file_1.txt", "file_2.txt"],
+    }).set_index("path"))
+
+    with pytest.raises(ValueError, match="No intersecting values of 'file' found along 'folder'."):
+        fc.search_intersection(search_key="folder", intersect_key="file")
+
+
+def test_filecontainer_search_intersection_unknown_key():
+    fc = FileContainer(pd.DataFrame({
+        "path": ["folder1/file_1.txt", "folder2/file_1.txt"],
+        "folder": ["folder1", "folder2"],
+        "file": ["file_1.txt", "file_1.txt"],
+    }).set_index("path"))
+
+    with pytest.raises(ValueError, match="'missing' is not a column"):
+        fc.search_intersection(search_key="missing", intersect_key="file")
+
+    with pytest.raises(ValueError, match="'missing' is not a column"):
+        fc.search_intersection(search_key="folder", intersect_key="missing")
+
+
 def test_filecontainer_concat(example_fc):
 
     with pytest.raises(ValueError, match="Can only concatenate two FileContainers."):