From f353e711a63cbdd7ef8d2722facf1c9567551ff7 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 10:27:30 -0800 Subject: [PATCH 01/13] comment out two unused fns --- saspt/.DS_Store | Bin 0 -> 6148 bytes saspt/dataset.py | 90 ++++++++++++++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 32 deletions(-) create mode 100644 saspt/.DS_Store diff --git a/saspt/.DS_Store b/saspt/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..7ef7938a030a1076fdc267a4bd2a254fcf6d0a32 GIT binary patch literal 6148 zcmeHK%}T>S5T3QwrWBzEg&r5Y7OX{U#Y?F51&ruHr6#0kFwK@GwTDv3SzpK}@p+ut z-GJ2|Jc-yD*!^bbXE*af_6Gn&ZyFu|)B(W3Mkq*GA!J_Z+OWZdLeCMy5Yk|lj)Qo` zM1Rpl-(GFwPK7GFm(DN3I<0#Eqt#`3eDsOI8oQhL(?z~4e^D;l1ragahL!(Qj z;-Jv|;3^u<2KAjYm1KUD3@17v3WpeSdmSaAn)TE)2~(Zx8HZDKs)KrCK5uu9nzDV| zT{PwVq}^)DPWN!Js5-lQ`==L!r+Ad87el2$*0pR~EZ`NDFI7EzlO$Hj1A2ofVwge0ibTY^wpbS>rv z(SssPDxygh_K6`(I@+a;b1miuO*#m@GR|XHE*>vJua0)9!$G(Pxn~BLfn^3tW>}~5 z{~Ujrm5==86do}H%)mcmK$QDlzlTNHv-Mkfbk<62_t;1%E++*A^`%Py4rm`~Xs7W@ b(lO4pm>Z;7$ga~7`68eR;f@*j1qMC pd.DataFrame: self._marginal_posterior_occs_dataframe = df return self._marginal_posterior_occs_dataframe - def infer_posterior_by_condition(self, col: str, normalize: bool=False - ) -> Tuple[np.ndarray, List[str]]: - """ Aggregate trajectories across files by grouping on an arbitrary - column in *self.paths*. Run state array inference on each group. - - args - ---- - col : str, a column in *self.paths* to group by - normalize : bool, normalize posterior occupations after running - - returns - ------- - ( - 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), - posterior occupations for each condition marginalized - on diffusion coefficient; - - list of str of length n_conditions, the names of the conditions - corresponding to the first axis - ) - """ - posterior_occs, conditions = self.apply_by(col, - self.calc_marginal_posterior_occs, is_variadic=True) - posterior_occs = np.asarray(posterior_occs) - if normalize: - posterior_occs = normalize_2d(posterior_occs, axis=1) - return posterior_occs, conditions - ############# ## METHODS ## ############# @@ -399,7 +371,7 @@ def calc_naive_occs(self, *track_paths: str) -> np.ndarray: total number of jumps observed for each SPT experiment """ SA = self._init_state_array(*track_paths) - return SA.naive_occs + return SA.n_jumps * SA.naive_occs def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ @@ -415,7 +387,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - + ''' def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. @@ -450,6 +422,60 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: """ return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) + ''' + def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: + """ Wrapper to calculate naive and posterior occupations + and processed track statistics for a set of trajectories. + This allows us to subsample the same trajectories to get + these three attributes. 
+ + args + ---- + track_paths : paths to files with trajectories readable + by saspt.utils.load_detections + + returns + ------- + ( + numpy.ndarray of shape *n_diff_coefs*, occupations scaled + by the total number of jumps observed in this set of + trajectories; + + pandas.DataFrame, statistics on the preprocessed trajectories + ) + """ + SA = self._init_state_array(*track_paths) + naive_occs = SA.n_jumps * SA.naive_occs + posterior_occs = SA.n_jumps * SA.posterior_occs + return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics + + def infer_posterior_by_condition(self, col: str, normalize: bool=False + ) -> Tuple[np.ndarray, List[str]]: + """ Aggregate trajectories across files by grouping on an arbitrary + column in *self.paths*. Run state array inference on each group. + + args + ---- + col : str, a column in *self.paths* to group by + normalize : bool, normalize posterior occupations after running + + returns + ------- + ( + 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), + posterior occupations for each condition marginalized + on diffusion coefficient; + + list of str of length n_conditions, the names of the conditions + corresponding to the first axis + ) + """ + posterior_occs, conditions = self.apply_by(col, + self.calc_marginal_posterior_occs, is_variadic=True) + posterior_occs = np.asarray(posterior_occs) + if normalize: + posterior_occs = normalize_2d(posterior_occs, axis=1) + return posterior_occs, conditions ############## ## PLOTTING ## @@ -664,8 +690,8 @@ def g(filepath: str) -> dict: return result def parallel_map(self, func, args, msg: str=None, progress_bar: bool=False): - """ Parallelize a function across multiple arguments using a process-based - dask scheduler. + """ Parallelize a function across multiple arguments using a + process-based dask scheduler. args ---- From 8fe635bd4dc4c27e48a6576b21309fccbbf2f7e0 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 10:29:26 -0800 Subject: [PATCH 02/13] Revert "comment out two unused fns" This reverts commit f353e711a63cbdd7ef8d2722facf1c9567551ff7. --- saspt/.DS_Store | Bin 6148 -> 0 bytes saspt/dataset.py | 90 +++++++++++++++++------------------------------ 2 files changed, 32 insertions(+), 58 deletions(-) delete mode 100644 saspt/.DS_Store diff --git a/saspt/.DS_Store b/saspt/.DS_Store deleted file mode 100644 index 7ef7938a030a1076fdc267a4bd2a254fcf6d0a32..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5T3QwrWBzEg&r5Y7OX{U#Y?F51&ruHr6#0kFwK@GwTDv3SzpK}@p+ut z-GJ2|Jc-yD*!^bbXE*af_6Gn&ZyFu|)B(W3Mkq*GA!J_Z+OWZdLeCMy5Yk|lj)Qo` zM1Rpl-(GFwPK7GFm(DN3I<0#Eqt#`3eDsOI8oQhL(?z~4e^D;l1ragahL!(Qj z;-Jv|;3^u<2KAjYm1KUD3@17v3WpeSdmSaAn)TE)2~(Zx8HZDKs)KrCK5uu9nzDV| zT{PwVq}^)DPWN!Js5-lQ`==L!r+Ad87el2$*0pR~EZ`NDFI7EzlO$Hj1A2ofVwge0ibTY^wpbS>rv z(SssPDxygh_K6`(I@+a;b1miuO*#m@GR|XHE*>vJua0)9!$G(Pxn~BLfn^3tW>}~5 z{~Ujrm5==86do}H%)mcmK$QDlzlTNHv-Mkfbk<62_t;1%E++*A^`%Py4rm`~Xs7W@ b(lO4pm>Z;7$ga~7`68eR;f@*j1qMC pd.DataFrame: self._marginal_posterior_occs_dataframe = df return self._marginal_posterior_occs_dataframe + def infer_posterior_by_condition(self, col: str, normalize: bool=False + ) -> Tuple[np.ndarray, List[str]]: + """ Aggregate trajectories across files by grouping on an arbitrary + column in *self.paths*. Run state array inference on each group. 
+ + args + ---- + col : str, a column in *self.paths* to group by + normalize : bool, normalize posterior occupations after running + + returns + ------- + ( + 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), + posterior occupations for each condition marginalized + on diffusion coefficient; + + list of str of length n_conditions, the names of the conditions + corresponding to the first axis + ) + """ + posterior_occs, conditions = self.apply_by(col, + self.calc_marginal_posterior_occs, is_variadic=True) + posterior_occs = np.asarray(posterior_occs) + if normalize: + posterior_occs = normalize_2d(posterior_occs, axis=1) + return posterior_occs, conditions + ############# ## METHODS ## ############# @@ -371,7 +399,7 @@ def calc_naive_occs(self, *track_paths: str) -> np.ndarray: total number of jumps observed for each SPT experiment """ SA = self._init_state_array(*track_paths) - return SA.n_jumps * SA.naive_occs + return SA.naive_occs def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ @@ -387,7 +415,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - ''' + def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. @@ -422,60 +450,6 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: """ return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) - ''' - def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: - """ Wrapper to calculate naive and posterior occupations - and processed track statistics for a set of trajectories. - This allows us to subsample the same trajectories to get - these three attributes. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections - - returns - ------- - ( - numpy.ndarray of shape *n_diff_coefs*, occupations scaled - by the total number of jumps observed in this set of - trajectories; - - pandas.DataFrame, statistics on the preprocessed trajectories - ) - """ - SA = self._init_state_array(*track_paths) - naive_occs = SA.n_jumps * SA.naive_occs - posterior_occs = SA.n_jumps * SA.posterior_occs - return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics - - def infer_posterior_by_condition(self, col: str, normalize: bool=False - ) -> Tuple[np.ndarray, List[str]]: - """ Aggregate trajectories across files by grouping on an arbitrary - column in *self.paths*. Run state array inference on each group. 
- - args - ---- - col : str, a column in *self.paths* to group by - normalize : bool, normalize posterior occupations after running - - returns - ------- - ( - 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), - posterior occupations for each condition marginalized - on diffusion coefficient; - - list of str of length n_conditions, the names of the conditions - corresponding to the first axis - ) - """ - posterior_occs, conditions = self.apply_by(col, - self.calc_marginal_posterior_occs, is_variadic=True) - posterior_occs = np.asarray(posterior_occs) - if normalize: - posterior_occs = normalize_2d(posterior_occs, axis=1) - return posterior_occs, conditions ############## ## PLOTTING ## @@ -690,8 +664,8 @@ def g(filepath: str) -> dict: return result def parallel_map(self, func, args, msg: str=None, progress_bar: bool=False): - """ Parallelize a function across multiple arguments using a - process-based dask scheduler. + """ Parallelize a function across multiple arguments using a process-based + dask scheduler. args ---- From 40da44290acfff5b61c4c76f78feb257890563c8 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 10:31:41 -0800 Subject: [PATCH 03/13] comment out two unused fns --- saspt/dataset.py | 94 +++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 518c8b1..1a04e0b 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -348,34 +348,6 @@ def marginal_posterior_occs_dataframe(self) -> pd.DataFrame: self._marginal_posterior_occs_dataframe = df return self._marginal_posterior_occs_dataframe - def infer_posterior_by_condition(self, col: str, normalize: bool=False - ) -> Tuple[np.ndarray, List[str]]: - """ Aggregate trajectories across files by grouping on an arbitrary - column in *self.paths*. Run state array inference on each group. - - args - ---- - col : str, a column in *self.paths* to group by - normalize : bool, normalize posterior occupations after running - - returns - ------- - ( - 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), - posterior occupations for each condition marginalized - on diffusion coefficient; - - list of str of length n_conditions, the names of the conditions - corresponding to the first axis - ) - """ - posterior_occs, conditions = self.apply_by(col, - self.calc_marginal_posterior_occs, is_variadic=True) - posterior_occs = np.asarray(posterior_occs) - if normalize: - posterior_occs = normalize_2d(posterior_occs, axis=1) - return posterior_occs, conditions - ############# ## METHODS ## ############# @@ -399,7 +371,7 @@ def calc_naive_occs(self, *track_paths: str) -> np.ndarray: total number of jumps observed for each SPT experiment """ SA = self._init_state_array(*track_paths) - return SA.naive_occs + return SA.n_jumps * SA.naive_occs def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ @@ -415,7 +387,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - + ''' def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. 
@@ -450,6 +422,62 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: """ return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) + ''' + def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: + """ Wrapper to calculate naive and posterior occupations + and processed track statistics for a set of trajectories. + This allows us to subsample the same trajectories to get + these three attributes. + + args + ---- + track_paths : paths to files with trajectories readable + by saspt.utils.load_detections + + returns + ------- + ( + numpy.ndarray of shape *n_diff_coefs*, occupations scaled + by the total number of jumps observed in this set of + trajectories; + + pandas.DataFrame, statistics on the preprocessed trajectories + ) + """ + pass + ''' + SA = self._init_state_array(*track_paths) + naive_occs = SA.n_jumps * SA.naive_occs + posterior_occs = SA.n_jumps * SA.posterior_occs + return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics''' + + def infer_posterior_by_condition(self, col: str, normalize: bool=False + ) -> Tuple[np.ndarray, List[str]]: + """ Aggregate trajectories across files by grouping on an arbitrary + column in *self.paths*. Run state array inference on each group. + + args + ---- + col : str, a column in *self.paths* to group by + normalize : bool, normalize posterior occupations after running + + returns + ------- + ( + 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), + posterior occupations for each condition marginalized + on diffusion coefficient; + + list of str of length n_conditions, the names of the conditions + corresponding to the first axis + ) + """ + posterior_occs, conditions = self.apply_by(col, + self.calc_marginal_posterior_occs, is_variadic=True) + posterior_occs = np.asarray(posterior_occs) + if normalize: + posterior_occs = normalize_2d(posterior_occs, axis=1) + return posterior_occs, conditions ############## ## PLOTTING ## @@ -664,8 +692,8 @@ def g(filepath: str) -> dict: return result def parallel_map(self, func, args, msg: str=None, progress_bar: bool=False): - """ Parallelize a function across multiple arguments using a process-based - dask scheduler. + """ Parallelize a function across multiple arguments using a + process-based dask scheduler. 
args ---- @@ -730,4 +758,4 @@ def apply_by(self, col: str, func: Callable, is_variadic: bool=False, else: result = self.parallel_map(lambda paths: func(paths, **kwargs), file_groups) - return result, conditions + return result, conditions \ No newline at end of file From a082232e594e9bff8d8c6eba27d44274ace835ef Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:15:28 -0800 Subject: [PATCH 04/13] bundle and parallelize posterior occs, naive occs, processed stats --- saspt/dataset.py | 59 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 1a04e0b..a97ff07 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -234,11 +234,12 @@ def naive_occs(self) -> np.ndarray: """ if not hasattr(self, "_naive_occs"): if self.n_files > 0: - self._naive_occs = np.asarray(self.parallel_map( + self.calc_occs_and_stats_parallelized() + '''self._naive_occs = np.asarray(self.parallel_map( self.calc_naive_occs, self.paths[self.path_col], progress_bar=self.progress_bar, - )) + ))''' else: self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._naive_occs @@ -257,11 +258,12 @@ def posterior_occs(self) -> np.ndarray: """ if not hasattr(self, "_posterior_occs"): if self.n_files > 0: - self._posterior_occs = np.asarray(self.parallel_map( + self.calc_occs_and_stats_parallelized() + '''self._posterior_occs = np.asarray(self.parallel_map( self.calc_posterior_occs, self.paths[self.path_col], progress_bar=self.progress_bar, - )) + ))''' else: self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._posterior_occs @@ -357,7 +359,7 @@ def clear(self): for attr in ["_n_files", "_naive_occs", "_posterior_occs"]: if hasattr(self, attr): delattr(self, attr) - + ''' def calc_naive_occs(self, *track_paths: str) -> np.ndarray: """ args @@ -387,7 +389,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - ''' + def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. @@ -423,11 +425,12 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) ''' - def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: - """ Wrapper to calculate naive and posterior occupations - and processed track statistics for a set of trajectories. - This allows us to subsample the same trajectories to get - these three attributes. + def calc_occs_and_stats_parallelized(self, *track_paths: str + ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]: + """ Calculate naive occupations, posterior occupations, + and processed track statistics, parallelized for a set + of trajectories. This allows us to subsample the same + trajectories (if needed) to get these three attributes. 
args ---- @@ -444,12 +447,34 @@ def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pandas.DataFrame, statistics on the preprocessed trajectories ) """ - pass - ''' - SA = self._init_state_array(*track_paths) - naive_occs = SA.n_jumps * SA.naive_occs - posterior_occs = SA.n_jumps * SA.posterior_occs - return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics''' + @dask.delayed + def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: + SA = self._init_state_array(filepath) + naive_occs = SA.n_jumps * SA.naive_occs + posterior_occs = SA.n_jumps * SA.posterior_occs + stats = SA.trajectories.processed_track_statistics + stats[self.path_col] = filepath + return (naive_occs, posterior_occs, stats) + naive_occs, posterior_occs, stats = self.parallel_map( + g, self.paths[self.path_col]) + + # Test for empty stats dict + if not stats: + self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) + self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) + self._processed_track_statistics = pd.DataFrame( + columns=TrajectoryGroup.statistic_names + [self.path_col]) + return + + # Put stats into DF and sanity check + stats = pd.DataFrame(stats) + assert (stats[self.path_col] == self.paths[self.path_col]).all() + + # Map all metadata from the input paths DataFrame to the track statistics dataframe + for c in filter(lambda c: c!=self.path_col, self.paths.columns): + stats[c] = self.paths[c] + + self._processed_track_statistics = stats def infer_posterior_by_condition(self, col: str, normalize: bool=False ) -> Tuple[np.ndarray, List[str]]: From f05d68ed739af2cb0a414e9cd198ae429a0c4459 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:16:57 -0800 Subject: [PATCH 05/13] actually assign some values to attrs --- saspt/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/saspt/dataset.py b/saspt/dataset.py index a97ff07..f667ce1 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -475,6 +475,8 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: stats[c] = self.paths[c] self._processed_track_statistics = stats + self._naive_occs = np.asarray(naive_occs) + self._posterior_occs = np.asarray(posterior_occs) def infer_posterior_by_condition(self, col: str, normalize: bool=False ) -> Tuple[np.ndarray, List[str]]: From 46e40a531f120740e52a296438e2b5d077083655 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:49:19 -0800 Subject: [PATCH 06/13] clean up --- saspt/dataset.py | 144 ++++++----------------------------------------- 1 file changed, 18 insertions(+), 126 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index f667ce1..c039ad1 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -217,7 +217,11 @@ def processed_track_statistics(self) -> pd.DataFrame: pandas.DataFrame, where each row corresponds to one file """ if not hasattr(self, "_processed_track_statistics"): - self._processed_track_statistics = self._get_processed_track_statistics() + if self.n_files > 0: + self.calc_occs_and_stats_parallelized() + else: + self._processed_track_statistics = pd.DataFrame( + columns=TrajectoryGroup.statistic_names + [self.path_col]) return self._processed_track_statistics @property @@ -235,11 +239,6 @@ def naive_occs(self) -> np.ndarray: if not hasattr(self, "_naive_occs"): if self.n_files > 0: self.calc_occs_and_stats_parallelized() - '''self._naive_occs = np.asarray(self.parallel_map( - self.calc_naive_occs, - self.paths[self.path_col], - 
progress_bar=self.progress_bar, - ))''' else: self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._naive_occs @@ -259,11 +258,6 @@ def posterior_occs(self) -> np.ndarray: if not hasattr(self, "_posterior_occs"): if self.n_files > 0: self.calc_occs_and_stats_parallelized() - '''self._posterior_occs = np.asarray(self.parallel_map( - self.calc_posterior_occs, - self.paths[self.path_col], - progress_bar=self.progress_bar, - ))''' else: self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._posterior_occs @@ -359,74 +353,9 @@ def clear(self): for attr in ["_n_files", "_naive_occs", "_posterior_occs"]: if hasattr(self, attr): delattr(self, attr) - ''' - def calc_naive_occs(self, *track_paths: str) -> np.ndarray: - """ - args - ---- - track_paths : paths to files with trajectories, readable by - saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *self.shape*, occupations scaled by the - total number of jumps observed for each SPT experiment - """ - SA = self._init_state_array(*track_paths) - return SA.n_jumps * SA.naive_occs - - def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: - """ - args - ---- - track_paths : paths to files with trajectories, readable by - saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *self.shape*, mean posterior occupations - scaled by the total number of jumps observed for each SPT experiment - """ - SA = self._init_state_array(*track_paths) - return SA.n_jumps * SA.posterior_occs - def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: - """ Calculate the likelihood function for a particular set of - trajectories, marginalized on the diffusion coefficient. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *n_diff_coefs*, occupations scaled by the - total number of jumps observed in these trajectories - """ - return self.likelihood.marginalize_on_diff_coef( - self.calc_naive_occs(*track_paths)) - - def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: - """ Calculate the posterior mean state occupations for a particular - set of trajectories, marginalized on diffusion coefficient. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *n_diff_coefs*, occupations scaled - by the total number of jumps observed in this set of - trajectories - """ - return self.likelihood.marginalize_on_diff_coef( - self.calc_posterior_occs(*track_paths)) - ''' - def calc_occs_and_stats_parallelized(self, *track_paths: str - ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]: + def calc_occs_and_stats_parallelized(self) -> Tuple[ + np.ndarray, np.ndarray, pd.DataFrame]: """ Calculate naive occupations, posterior occupations, and processed track statistics, parallelized for a set of trajectories. 
This allows us to subsample the same @@ -436,16 +365,6 @@ def calc_occs_and_stats_parallelized(self, *track_paths: str ---- track_paths : paths to files with trajectories readable by saspt.utils.load_detections - - returns - ------- - ( - numpy.ndarray of shape *n_diff_coefs*, occupations scaled - by the total number of jumps observed in this set of - trajectories; - - pandas.DataFrame, statistics on the preprocessed trajectories - ) """ @dask.delayed def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: @@ -454,10 +373,14 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: posterior_occs = SA.n_jumps * SA.posterior_occs stats = SA.trajectories.processed_track_statistics stats[self.path_col] = filepath - return (naive_occs, posterior_occs, stats) - naive_occs, posterior_occs, stats = self.parallel_map( - g, self.paths[self.path_col]) - + return naive_occs, posterior_occs, stats + + result = self.parallel_map( + g, self.paths[self.path_col], progress_bar=self.progress_bar) + naive_occs = np.asarray([r[0] for r in result]) + posterior_occs = np.asarray([r[1] for r in result]) + stats = [r[2] for r in result] + # Test for empty stats dict if not stats: self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) @@ -465,12 +388,13 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: self._processed_track_statistics = pd.DataFrame( columns=TrajectoryGroup.statistic_names + [self.path_col]) return - + # Put stats into DF and sanity check stats = pd.DataFrame(stats) assert (stats[self.path_col] == self.paths[self.path_col]).all() - # Map all metadata from the input paths DataFrame to the track statistics dataframe + # Map all metadata from the input paths DataFrame + # to the track statistics dataframe for c in filter(lambda c: c!=self.path_col, self.paths.columns): stats[c] = self.paths[c] @@ -654,38 +578,6 @@ def _init_state_array(self, *track_paths: str) -> StateArray: StateArray over them """ return StateArray(self._load_tracks(*track_paths), self.likelihood, self.params) - def _get_processed_track_statistics(self) -> pd.DataFrame: - """ Calculate some statistics on the preprocessed trajectories for each - file in this StateArrayDataset. - - returns - ------- - pandas.DataFrame with each row corresponding to one file. Columns - correspond to different statistics - """ - @dask.delayed - def g(filepath: str) -> dict: - T = self._load_tracks(filepath) - stats = T.processed_track_statistics - stats[self.path_col] = filepath - return stats - result = pd.DataFrame(self.parallel_map(g, self.paths[self.path_col])) - - # Conceivable that there are zero files in this dataset - if len(result) == 0: - result[self.path_col] = self.paths[self.path_col] - for stat in TrajectoryGroup.statistic_names: - result[stat] = pd.Series([], dtype=np.float64) - - # Sanity check - assert (result[self.path_col] == self.paths[self.path_col]).all() - - # Map all metadata from the input paths DataFrame to the track statistics dataframe - for c in filter(lambda c: c!=self.path_col, self.paths.columns): - result[c] = self.paths[c] - - return result - def _get_raw_track_statistics(self) -> pd.DataFrame: """ Calculated some statistics on the raw trajectories for each file in this StateArrayDataset. 
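
Patches 04-06 converge on one pattern: a single dask-delayed function per file returns the bundled tuple (naive_occs, posterior_occs, stats), and one parallel pass fills all three caches, so every attribute reflects the same (possibly subsampled) trajectories. Below is a minimal, self-contained sketch of that bundling pattern, not saspt itself: the body of per_file is a dummy stand-in for the StateArray work, the input paths are hypothetical, and a threaded scheduler is used only so the snippet runs anywhere (saspt's parallel_map uses a process-based dask scheduler).

    import dask
    import numpy as np
    import pandas as pd

    @dask.delayed
    def per_file(path: str):
        # Dummy stand-in for: SA = self._init_state_array(path), then reading
        # SA.n_jumps * SA.naive_occs, SA.n_jumps * SA.posterior_occs, and
        # SA.trajectories.processed_track_statistics.
        n_jumps = 100.0
        naive = n_jumps * np.full((4, 5), 1.0 / 20)    # occupations sum to n_jumps
        posterior = n_jumps * np.full((4, 5), 1.0 / 20)
        stats = {"filepath": path, "n_tracks": 10}
        return naive, posterior, stats

    paths = ["rep1.csv", "rep2.csv"]                   # hypothetical inputs
    results = dask.compute(*(per_file(p) for p in paths), scheduler="threads")

    # Unzip the bundled per-file results, as patch 06's parallel_map call does:
    naive_occs = np.asarray([r[0] for r in results])   # shape (n_files, 4, 5)
    posterior_occs = np.asarray([r[1] for r in results])
    stats = pd.DataFrame([r[2] for r in results])      # one row per file

The explicit unzip is the point of patch 06: parallel_map returns one (naive, posterior, stats) tuple per file, so patch 04's attempt to unpack its return value directly into three names only succeeds when there happen to be exactly three files, and even then binds each name to one file's tuple rather than to the per-quantity arrays.
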
From 5d8f439c301d44827d1749a5b95c6d1758956118 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:51:16 -0800 Subject: [PATCH 07/13] add a check for no filepaths _get_raw_track_statistics as well --- saspt/dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index c039ad1..c668f7f 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -201,7 +201,11 @@ def raw_track_statistics(self) -> pd.DataFrame: pandas.DataFrame, where each row corresponds to one file """ if not hasattr(self, "_raw_track_statistics"): - self._raw_track_statistics = self._get_raw_track_statistics() + if self.n_files > 0: + self._raw_track_statistics = self._get_raw_track_statistics() + else: + self._raw_track_statistics = pd.DataFrame( + columns=TrajectoryGroup.statistic_names + [self.path_col]) return self._raw_track_statistics @property From 86894760e9f7283042346f989c160c2b67dbea7a Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 14:27:56 -0800 Subject: [PATCH 08/13] add expected columns to track stats --- saspt/dataset.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index c668f7f..cd7ed6a 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -204,8 +204,11 @@ def raw_track_statistics(self) -> pd.DataFrame: if self.n_files > 0: self._raw_track_statistics = self._get_raw_track_statistics() else: + # Set empty stats with expected columns and metadata self._raw_track_statistics = pd.DataFrame( - columns=TrajectoryGroup.statistic_names + [self.path_col]) + columns=TrajectoryGroup.statistic_names) + for c in self.paths.columns: + self._raw_track_statistics[c] = self.paths[c] return self._raw_track_statistics @property @@ -225,7 +228,9 @@ def processed_track_statistics(self) -> pd.DataFrame: self.calc_occs_and_stats_parallelized() else: self._processed_track_statistics = pd.DataFrame( - columns=TrajectoryGroup.statistic_names + [self.path_col]) + columns=TrajectoryGroup.statistic_names) + for c in self.paths.columns: + self._processed_track_statistics[c] = self.paths[c] return self._processed_track_statistics @property @@ -387,10 +392,14 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: # Test for empty stats dict if not stats: + # Set empty occs self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) + # Set empty stats with expected columns and metadata self._processed_track_statistics = pd.DataFrame( - columns=TrajectoryGroup.statistic_names + [self.path_col]) + columns=TrajectoryGroup.statistic_names) + for c in self.paths.columns: + self._raw_track_statistics[c] = self.paths[c] return # Put stats into DF and sanity check @@ -406,6 +415,25 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: self._naive_occs = np.asarray(naive_occs) self._posterior_occs = np.asarray(posterior_occs) + def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: + """ Calculate the posterior mean state occupations for a particular + set of trajectories, marginalized on diffusion coefficient. 
+ + args + ---- + track_paths : paths to files with trajectories readable + by saspt.utils.load_detections + + returns + ------- + numpy.ndarray of shape *n_diff_coefs*, occupations scaled + by the total number of jumps observed in this set of + trajectories + """ + SA = self._init_state_array(*track_paths) + return self.likelihood.marginalize_on_diff_coef( + SA.n_jumps * SA.posterior_occs) + def infer_posterior_by_condition(self, col: str, normalize: bool=False ) -> Tuple[np.ndarray, List[str]]: """ Aggregate trajectories across files by grouping on an arbitrary From ad6d2c2f522bb8b2e0d08197b63e95c1413caca2 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 15:02:47 -0800 Subject: [PATCH 09/13] fix test for marginalized naive occs --- tests/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d80b60b..d6f1d14 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -98,7 +98,7 @@ def test_marginal_naive_occs(self): ML = D.marginal_naive_occs assert isinstance(ML, np.ndarray) assert ML.shape == (len(self.paths), len(self.likelihood.diff_coefs)) - assert (np.abs(ML.sum(axis=1) - 1.0) < 1.0e-6).all() + assert (np.abs(ML.sum(axis=1) - D.jumps_per_file) < 1.0e-6).all() # Make sure StateArrayDataset.clear works D.clear() From 7b61467005c7e90112618b1ec8e5dfa535993547 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 15:21:28 -0800 Subject: [PATCH 10/13] add subsampling test within SADs --- tests/test_dataset.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d6f1d14..490727f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -214,3 +214,31 @@ def test_posterior_line_plot(self): condition_col=self.condition_col) self.check_plot_func(D.posterior_line_plot, "_out_test_posterior_line_plot.png") + + def test_subsampling(self): + # New params with a smaller sample size + sample_size = 10 + params = StateArrayParameters( + pixel_size_um=0.16, + frame_interval=0.01, + focal_depth=0.7, + splitsize=10, + sample_size=sample_size, + start_frame=0, + max_iter=10, + conc_param=1.0, + progress_bar=False, + num_workers=2, + ) + self.params = params + D = StateArrayDataset(self.paths, self.likelihood, + params=self.params, path_col=self.path_col, + condition_col=self.condition_col) + + # Check that jumps_per_file and implied jumps are correct + assert np.allclose(D.jumps_per_file.astype(float), D.posterior_occs.sum(axis=(1,2))) + assert np.allclose(D.jumps_per_file.astype(float), D.naive_occs.sum(axis=(1,2))) + + # Check that subsampling actually worked + n_trajs = D.processed_track_statistics['n_tracks'] + assert (n_trajs <= sample_size).all() From 8699ade8a987830912460461a9bccce876e096c6 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 16:09:42 -0800 Subject: [PATCH 11/13] processed track stats must be cleared; add test --- saspt/dataset.py | 3 ++- tests/test_dataset.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index cd7ed6a..8ca0ec8 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -359,7 +359,8 @@ def marginal_posterior_occs_dataframe(self) -> pd.DataFrame: def clear(self): """ Delete expensive cached attributes """ - for attr in ["_n_files", "_naive_occs", "_posterior_occs"]: + for attr in ["_n_files", "_naive_occs", "_posterior_occs", + "_processed_track_statistics"]: if hasattr(self, attr): 
delattr(self, attr) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 490727f..d181c3f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -242,3 +242,14 @@ def test_subsampling(self): # Check that subsampling actually worked n_trajs = D.processed_track_statistics['n_tracks'] assert (n_trajs <= sample_size).all() + + # Clear and repeat tests + D.clear() + + # Check that jumps_per_file and implied jumps are correct + assert np.allclose(D.jumps_per_file.astype(float), D.posterior_occs.sum(axis=(1,2))) + assert np.allclose(D.jumps_per_file.astype(float), D.naive_occs.sum(axis=(1,2))) + + # Check that subsampling actually worked + n_trajs = D.processed_track_statistics['n_tracks'] + assert (n_trajs <= sample_size).all() From c23fe21d5702cc35e66986a94274953f0eced01b Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 16:21:30 -0800 Subject: [PATCH 12/13] fix inaccurate docstring --- saspt/dataset.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 8ca0ec8..2b54dc8 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -370,11 +370,6 @@ def calc_occs_and_stats_parallelized(self) -> Tuple[ and processed track statistics, parallelized for a set of trajectories. This allows us to subsample the same trajectories (if needed) to get these three attributes. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections """ @dask.delayed def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: From 9847c3533dccadd723b09e944a7db062b8ce682b Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Tue, 5 Mar 2024 12:22:13 -0800 Subject: [PATCH 13/13] need to also clear jumps --- saspt/dataset.py | 2 +- tests/test_dataset.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 2b54dc8..79cab2b 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -360,7 +360,7 @@ def marginal_posterior_occs_dataframe(self) -> pd.DataFrame: def clear(self): """ Delete expensive cached attributes """ for attr in ["_n_files", "_naive_occs", "_posterior_occs", - "_processed_track_statistics"]: + "_processed_track_statistics", "_jumps_per_file"]: if hasattr(self, attr): delattr(self, attr) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d181c3f..7815680 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -245,8 +245,6 @@ def test_subsampling(self): # Clear and repeat tests D.clear() - - # Check that jumps_per_file and implied jumps are correct assert np.allclose(D.jumps_per_file.astype(float), D.posterior_occs.sum(axis=(1,2))) assert np.allclose(D.jumps_per_file.astype(float), D.naive_occs.sum(axis=(1,2)))
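
Taken together, the series changes StateArrayDataset's caching contract: naive_occs, posterior_occs, jumps_per_file, and processed_track_statistics are now populated by one parallel pass over the input files, and clear() must drop every one of those caches (patches 11 and 13) so that a later access recomputes them against a fresh subsample. Below is a sketch of the resulting call pattern, mirroring test_subsampling: the file names are hypothetical, and the make_likelihood/RBME import and factory signature are assumptions about saspt's public API, since likelihood construction never appears in this series.

    import numpy as np
    import pandas as pd
    from saspt import (StateArrayDataset, StateArrayParameters,
        make_likelihood, RBME)  # make_likelihood/RBME assumed, not shown above

    # Parameters copied from test_subsampling; sample_size=10 caps the number
    # of trajectories drawn from each file.
    params = StateArrayParameters(
        pixel_size_um=0.16, frame_interval=0.01, focal_depth=0.7,
        splitsize=10, sample_size=10, start_frame=0, max_iter=10,
        conc_param=1.0, progress_bar=False, num_workers=2,
    )
    likelihood = make_likelihood(RBME, pixel_size_um=0.16,
        frame_interval=0.01, focal_depth=0.7)  # assumed factory signature

    paths = pd.DataFrame({
        "filepath": ["tracks_rep1.csv", "tracks_rep2.csv"],  # hypothetical files
        "condition": ["control", "treated"],
    })
    D = StateArrayDataset(paths, likelihood, params=params,
        path_col="filepath", condition_col="condition")

    # A single parallel pass now fills all of these caches together:
    posterior = D.posterior_occs              # shape (n_files, *D.shape)
    stats = D.processed_track_statistics      # one row per file, plus metadata
    assert (stats["n_tracks"] <= 10).all()    # subsampling respected (sample_size=10)

    # Invariant from the new tests: occupations are scaled to sum to the
    # number of jumps observed in each (subsampled) file.
    assert np.allclose(D.jumps_per_file.astype(float), posterior.sum(axis=(1, 2)))

    # After patch 13, clear() also drops _jumps_per_file, so recomputation
    # with a fresh subsample preserves the invariant:
    D.clear()
    assert np.allclose(D.jumps_per_file.astype(float),
                       D.posterior_occs.sum(axis=(1, 2)))

    # Aggregation by condition still works on top of the new caching path:
    occs_by_cond, conditions = D.infer_posterior_by_condition(
        "condition", normalize=True)
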