From f353e711a63cbdd7ef8d2722facf1c9567551ff7 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 10:27:30 -0800 Subject: [PATCH 01/13] comment out two unused fns --- saspt/.DS_Store | Bin 0 -> 6148 bytes saspt/dataset.py | 90 ++++++++++++++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 32 deletions(-) create mode 100644 saspt/.DS_Store diff --git a/saspt/.DS_Store b/saspt/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..7ef7938a030a1076fdc267a4bd2a254fcf6d0a32 GIT binary patch literal 6148 zcmeHK%}T>S5T3QwrWBzEg&r5Y7OX{U#Y?F51&ruHr6#0kFwK@GwTDv3SzpK}@p+ut z-GJ2|Jc-yD*!^bbXE*af_6Gn&ZyFu|)B(W3Mkq*GA!J_Z+OWZdLeCMy5Yk|lj)Qo` zM1Rpl-(GFwPK7GFm(DN3I<0#Eqt#`3eDsOI8oQhL(?z~4e^D;l1ragahL!(Qj z;-Jv|;3^u<2KAjYm1KUD3@17v3WpeSdmSaAn)TE)2~(Zx8HZDKs)KrCK5uu9nzDV| zT{PwVq}^)DPWN!Js5-lQ`==L!r+Ad87el2$*0pR~EZ`NDFI7EzlO$Hj1A2ofVwge0ibTY^wpbS>rv z(SssPDxygh_K6`(I@+a;b1miuO*#m@GR|XHE*>vJua0)9!$G(Pxn~BLfn^3tW>}~5 z{~Ujrm5==86do}H%)mcmK$QDlzlTNHv-Mkfbk<62_t;1%E++*A^`%Py4rm`~Xs7W@ b(lO4pm>Z;7$ga~7`68eR;f@*j1qMC pd.DataFrame: self._marginal_posterior_occs_dataframe = df return self._marginal_posterior_occs_dataframe - def infer_posterior_by_condition(self, col: str, normalize: bool=False - ) -> Tuple[np.ndarray, List[str]]: - """ Aggregate trajectories across files by grouping on an arbitrary - column in *self.paths*. Run state array inference on each group. - - args - ---- - col : str, a column in *self.paths* to group by - normalize : bool, normalize posterior occupations after running - - returns - ------- - ( - 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), - posterior occupations for each condition marginalized - on diffusion coefficient; - - list of str of length n_conditions, the names of the conditions - corresponding to the first axis - ) - """ - posterior_occs, conditions = self.apply_by(col, - self.calc_marginal_posterior_occs, is_variadic=True) - posterior_occs = np.asarray(posterior_occs) - if normalize: - posterior_occs = normalize_2d(posterior_occs, axis=1) - return posterior_occs, conditions - ############# ## METHODS ## ############# @@ -399,7 +371,7 @@ def calc_naive_occs(self, *track_paths: str) -> np.ndarray: total number of jumps observed for each SPT experiment """ SA = self._init_state_array(*track_paths) - return SA.naive_occs + return SA.n_jumps * SA.naive_occs def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ @@ -415,7 +387,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - + ''' def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. @@ -450,6 +422,60 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: """ return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) + ''' + def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: + """ Wrapper to calculate naive and posterior occupations + and processed track statistics for a set of trajectories. + This allows us to subsample the same trajectories to get + these three attributes. 
+ + args + ---- + track_paths : paths to files with trajectories readable + by saspt.utils.load_detections + + returns + ------- + ( + numpy.ndarray of shape *n_diff_coefs*, occupations scaled + by the total number of jumps observed in this set of + trajectories; + + pandas.DataFrame, statistics on the preprocessed trajectories + ) + """ + SA = self._init_state_array(*track_paths) + naive_occs = SA.n_jumps * SA.naive_occs + posterior_occs = SA.n_jumps * SA.posterior_occs + return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics + + def infer_posterior_by_condition(self, col: str, normalize: bool=False + ) -> Tuple[np.ndarray, List[str]]: + """ Aggregate trajectories across files by grouping on an arbitrary + column in *self.paths*. Run state array inference on each group. + + args + ---- + col : str, a column in *self.paths* to group by + normalize : bool, normalize posterior occupations after running + + returns + ------- + ( + 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), + posterior occupations for each condition marginalized + on diffusion coefficient; + + list of str of length n_conditions, the names of the conditions + corresponding to the first axis + ) + """ + posterior_occs, conditions = self.apply_by(col, + self.calc_marginal_posterior_occs, is_variadic=True) + posterior_occs = np.asarray(posterior_occs) + if normalize: + posterior_occs = normalize_2d(posterior_occs, axis=1) + return posterior_occs, conditions ############## ## PLOTTING ## @@ -664,8 +690,8 @@ def g(filepath: str) -> dict: return result def parallel_map(self, func, args, msg: str=None, progress_bar: bool=False): - """ Parallelize a function across multiple arguments using a process-based - dask scheduler. + """ Parallelize a function across multiple arguments using a + process-based dask scheduler. args ---- From 8fe635bd4dc4c27e48a6576b21309fccbbf2f7e0 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 10:29:26 -0800 Subject: [PATCH 02/13] Revert "comment out two unused fns" This reverts commit f353e711a63cbdd7ef8d2722facf1c9567551ff7. --- saspt/.DS_Store | Bin 6148 -> 0 bytes saspt/dataset.py | 90 +++++++++++++++++------------------------------ 2 files changed, 32 insertions(+), 58 deletions(-) delete mode 100644 saspt/.DS_Store diff --git a/saspt/.DS_Store b/saspt/.DS_Store deleted file mode 100644 index 7ef7938a030a1076fdc267a4bd2a254fcf6d0a32..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHK%}T>S5T3QwrWBzEg&r5Y7OX{U#Y?F51&ruHr6#0kFwK@GwTDv3SzpK}@p+ut z-GJ2|Jc-yD*!^bbXE*af_6Gn&ZyFu|)B(W3Mkq*GA!J_Z+OWZdLeCMy5Yk|lj)Qo` zM1Rpl-(GFwPK7GFm(DN3I<0#Eqt#`3eDsOI8oQhL(?z~4e^D;l1ragahL!(Qj z;-Jv|;3^u<2KAjYm1KUD3@17v3WpeSdmSaAn)TE)2~(Zx8HZDKs)KrCK5uu9nzDV| zT{PwVq}^)DPWN!Js5-lQ`==L!r+Ad87el2$*0pR~EZ`NDFI7EzlO$Hj1A2ofVwge0ibTY^wpbS>rv z(SssPDxygh_K6`(I@+a;b1miuO*#m@GR|XHE*>vJua0)9!$G(Pxn~BLfn^3tW>}~5 z{~Ujrm5==86do}H%)mcmK$QDlzlTNHv-Mkfbk<62_t;1%E++*A^`%Py4rm`~Xs7W@ b(lO4pm>Z;7$ga~7`68eR;f@*j1qMC pd.DataFrame: self._marginal_posterior_occs_dataframe = df return self._marginal_posterior_occs_dataframe + def infer_posterior_by_condition(self, col: str, normalize: bool=False + ) -> Tuple[np.ndarray, List[str]]: + """ Aggregate trajectories across files by grouping on an arbitrary + column in *self.paths*. Run state array inference on each group. 
+ + args + ---- + col : str, a column in *self.paths* to group by + normalize : bool, normalize posterior occupations after running + + returns + ------- + ( + 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), + posterior occupations for each condition marginalized + on diffusion coefficient; + + list of str of length n_conditions, the names of the conditions + corresponding to the first axis + ) + """ + posterior_occs, conditions = self.apply_by(col, + self.calc_marginal_posterior_occs, is_variadic=True) + posterior_occs = np.asarray(posterior_occs) + if normalize: + posterior_occs = normalize_2d(posterior_occs, axis=1) + return posterior_occs, conditions + ############# ## METHODS ## ############# @@ -371,7 +399,7 @@ def calc_naive_occs(self, *track_paths: str) -> np.ndarray: total number of jumps observed for each SPT experiment """ SA = self._init_state_array(*track_paths) - return SA.n_jumps * SA.naive_occs + return SA.naive_occs def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ @@ -387,7 +415,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - ''' + def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. @@ -422,60 +450,6 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: """ return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) - ''' - def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: - """ Wrapper to calculate naive and posterior occupations - and processed track statistics for a set of trajectories. - This allows us to subsample the same trajectories to get - these three attributes. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections - - returns - ------- - ( - numpy.ndarray of shape *n_diff_coefs*, occupations scaled - by the total number of jumps observed in this set of - trajectories; - - pandas.DataFrame, statistics on the preprocessed trajectories - ) - """ - SA = self._init_state_array(*track_paths) - naive_occs = SA.n_jumps * SA.naive_occs - posterior_occs = SA.n_jumps * SA.posterior_occs - return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics - - def infer_posterior_by_condition(self, col: str, normalize: bool=False - ) -> Tuple[np.ndarray, List[str]]: - """ Aggregate trajectories across files by grouping on an arbitrary - column in *self.paths*. Run state array inference on each group. 
- - args - ---- - col : str, a column in *self.paths* to group by - normalize : bool, normalize posterior occupations after running - - returns - ------- - ( - 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), - posterior occupations for each condition marginalized - on diffusion coefficient; - - list of str of length n_conditions, the names of the conditions - corresponding to the first axis - ) - """ - posterior_occs, conditions = self.apply_by(col, - self.calc_marginal_posterior_occs, is_variadic=True) - posterior_occs = np.asarray(posterior_occs) - if normalize: - posterior_occs = normalize_2d(posterior_occs, axis=1) - return posterior_occs, conditions ############## ## PLOTTING ## @@ -690,8 +664,8 @@ def g(filepath: str) -> dict: return result def parallel_map(self, func, args, msg: str=None, progress_bar: bool=False): - """ Parallelize a function across multiple arguments using a - process-based dask scheduler. + """ Parallelize a function across multiple arguments using a process-based + dask scheduler. args ---- From 40da44290acfff5b61c4c76f78feb257890563c8 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 10:31:41 -0800 Subject: [PATCH 03/13] comment out two unused fns --- saspt/dataset.py | 94 +++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 33 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 518c8b1..1a04e0b 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -348,34 +348,6 @@ def marginal_posterior_occs_dataframe(self) -> pd.DataFrame: self._marginal_posterior_occs_dataframe = df return self._marginal_posterior_occs_dataframe - def infer_posterior_by_condition(self, col: str, normalize: bool=False - ) -> Tuple[np.ndarray, List[str]]: - """ Aggregate trajectories across files by grouping on an arbitrary - column in *self.paths*. Run state array inference on each group. - - args - ---- - col : str, a column in *self.paths* to group by - normalize : bool, normalize posterior occupations after running - - returns - ------- - ( - 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), - posterior occupations for each condition marginalized - on diffusion coefficient; - - list of str of length n_conditions, the names of the conditions - corresponding to the first axis - ) - """ - posterior_occs, conditions = self.apply_by(col, - self.calc_marginal_posterior_occs, is_variadic=True) - posterior_occs = np.asarray(posterior_occs) - if normalize: - posterior_occs = normalize_2d(posterior_occs, axis=1) - return posterior_occs, conditions - ############# ## METHODS ## ############# @@ -399,7 +371,7 @@ def calc_naive_occs(self, *track_paths: str) -> np.ndarray: total number of jumps observed for each SPT experiment """ SA = self._init_state_array(*track_paths) - return SA.naive_occs + return SA.n_jumps * SA.naive_occs def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ @@ -415,7 +387,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - + ''' def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. 
@@ -450,6 +422,62 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: """ return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) + ''' + def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: + """ Wrapper to calculate naive and posterior occupations + and processed track statistics for a set of trajectories. + This allows us to subsample the same trajectories to get + these three attributes. + + args + ---- + track_paths : paths to files with trajectories readable + by saspt.utils.load_detections + + returns + ------- + ( + numpy.ndarray of shape *n_diff_coefs*, occupations scaled + by the total number of jumps observed in this set of + trajectories; + + pandas.DataFrame, statistics on the preprocessed trajectories + ) + """ + pass + ''' + SA = self._init_state_array(*track_paths) + naive_occs = SA.n_jumps * SA.naive_occs + posterior_occs = SA.n_jumps * SA.posterior_occs + return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics''' + + def infer_posterior_by_condition(self, col: str, normalize: bool=False + ) -> Tuple[np.ndarray, List[str]]: + """ Aggregate trajectories across files by grouping on an arbitrary + column in *self.paths*. Run state array inference on each group. + + args + ---- + col : str, a column in *self.paths* to group by + normalize : bool, normalize posterior occupations after running + + returns + ------- + ( + 2D numpy.ndarray of shape (n_conditions, n_diff_coefs), + posterior occupations for each condition marginalized + on diffusion coefficient; + + list of str of length n_conditions, the names of the conditions + corresponding to the first axis + ) + """ + posterior_occs, conditions = self.apply_by(col, + self.calc_marginal_posterior_occs, is_variadic=True) + posterior_occs = np.asarray(posterior_occs) + if normalize: + posterior_occs = normalize_2d(posterior_occs, axis=1) + return posterior_occs, conditions ############## ## PLOTTING ## @@ -664,8 +692,8 @@ def g(filepath: str) -> dict: return result def parallel_map(self, func, args, msg: str=None, progress_bar: bool=False): - """ Parallelize a function across multiple arguments using a process-based - dask scheduler. + """ Parallelize a function across multiple arguments using a + process-based dask scheduler. 
args ---- @@ -730,4 +758,4 @@ def apply_by(self, col: str, func: Callable, is_variadic: bool=False, else: result = self.parallel_map(lambda paths: func(paths, **kwargs), file_groups) - return result, conditions + return result, conditions \ No newline at end of file From a082232e594e9bff8d8c6eba27d44274ace835ef Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:15:28 -0800 Subject: [PATCH 04/13] bundle and parallelize posterior occs, naive occs, processed stats --- saspt/dataset.py | 59 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 1a04e0b..a97ff07 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -234,11 +234,12 @@ def naive_occs(self) -> np.ndarray: """ if not hasattr(self, "_naive_occs"): if self.n_files > 0: - self._naive_occs = np.asarray(self.parallel_map( + self.calc_occs_and_stats_parallelized() + '''self._naive_occs = np.asarray(self.parallel_map( self.calc_naive_occs, self.paths[self.path_col], progress_bar=self.progress_bar, - )) + ))''' else: self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._naive_occs @@ -257,11 +258,12 @@ def posterior_occs(self) -> np.ndarray: """ if not hasattr(self, "_posterior_occs"): if self.n_files > 0: - self._posterior_occs = np.asarray(self.parallel_map( + self.calc_occs_and_stats_parallelized() + '''self._posterior_occs = np.asarray(self.parallel_map( self.calc_posterior_occs, self.paths[self.path_col], progress_bar=self.progress_bar, - )) + ))''' else: self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._posterior_occs @@ -357,7 +359,7 @@ def clear(self): for attr in ["_n_files", "_naive_occs", "_posterior_occs"]: if hasattr(self, attr): delattr(self, attr) - + ''' def calc_naive_occs(self, *track_paths: str) -> np.ndarray: """ args @@ -387,7 +389,7 @@ def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: """ SA = self._init_state_array(*track_paths) return SA.n_jumps * SA.posterior_occs - ''' + def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: """ Calculate the likelihood function for a particular set of trajectories, marginalized on the diffusion coefficient. @@ -423,11 +425,12 @@ def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: return self.likelihood.marginalize_on_diff_coef( self.calc_posterior_occs(*track_paths)) ''' - def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pd.DataFrame]: - """ Wrapper to calculate naive and posterior occupations - and processed track statistics for a set of trajectories. - This allows us to subsample the same trajectories to get - these three attributes. + def calc_occs_and_stats_parallelized(self, *track_paths: str + ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]: + """ Calculate naive occupations, posterior occupations, + and processed track statistics, parallelized for a set + of trajectories. This allows us to subsample the same + trajectories (if needed) to get these three attributes. 
args ---- @@ -444,12 +447,34 @@ def calc_occs_and_processed_stats(self, *track_paths: str) -> Tuple[np.ndarray, pandas.DataFrame, statistics on the preprocessed trajectories ) """ - pass - ''' - SA = self._init_state_array(*track_paths) - naive_occs = SA.n_jumps * SA.naive_occs - posterior_occs = SA.n_jumps * SA.posterior_occs - return SA.n_jumps * SA.posterior_occs, SA.T.processed_track_statistics''' + @dask.delayed + def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: + SA = self._init_state_array(filepath) + naive_occs = SA.n_jumps * SA.naive_occs + posterior_occs = SA.n_jumps * SA.posterior_occs + stats = SA.trajectories.processed_track_statistics + stats[self.path_col] = filepath + return (naive_occs, posterior_occs, stats) + naive_occs, posterior_occs, stats = self.parallel_map( + g, self.paths[self.path_col]) + + # Test for empty stats dict + if not stats: + self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) + self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) + self._processed_track_statistics = pd.DataFrame( + columns=TrajectoryGroup.statistic_names + [self.path_col]) + return + + # Put stats into DF and sanity check + stats = pd.DataFrame(stats) + assert (stats[self.path_col] == self.paths[self.path_col]).all() + + # Map all metadata from the input paths DataFrame to the track statistics dataframe + for c in filter(lambda c: c!=self.path_col, self.paths.columns): + stats[c] = self.paths[c] + + self._processed_track_statistics = stats def infer_posterior_by_condition(self, col: str, normalize: bool=False ) -> Tuple[np.ndarray, List[str]]: From f05d68ed739af2cb0a414e9cd198ae429a0c4459 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:16:57 -0800 Subject: [PATCH 05/13] actually assign some values to attrs --- saspt/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/saspt/dataset.py b/saspt/dataset.py index a97ff07..f667ce1 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -475,6 +475,8 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: stats[c] = self.paths[c] self._processed_track_statistics = stats + self._naive_occs = np.asarray(naive_occs) + self._posterior_occs = np.asarray(posterior_occs) def infer_posterior_by_condition(self, col: str, normalize: bool=False ) -> Tuple[np.ndarray, List[str]]: From 46e40a531f120740e52a296438e2b5d077083655 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:49:19 -0800 Subject: [PATCH 06/13] clean up --- saspt/dataset.py | 144 ++++++----------------------------------------- 1 file changed, 18 insertions(+), 126 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index f667ce1..c039ad1 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -217,7 +217,11 @@ def processed_track_statistics(self) -> pd.DataFrame: pandas.DataFrame, where each row corresponds to one file """ if not hasattr(self, "_processed_track_statistics"): - self._processed_track_statistics = self._get_processed_track_statistics() + if self.n_files > 0: + self.calc_occs_and_stats_parallelized() + else: + self._processed_track_statistics = pd.DataFrame( + columns=TrajectoryGroup.statistic_names + [self.path_col]) return self._processed_track_statistics @property @@ -235,11 +239,6 @@ def naive_occs(self) -> np.ndarray: if not hasattr(self, "_naive_occs"): if self.n_files > 0: self.calc_occs_and_stats_parallelized() - '''self._naive_occs = np.asarray(self.parallel_map( - self.calc_naive_occs, - self.paths[self.path_col], - 
progress_bar=self.progress_bar, - ))''' else: self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._naive_occs @@ -259,11 +258,6 @@ def posterior_occs(self) -> np.ndarray: if not hasattr(self, "_posterior_occs"): if self.n_files > 0: self.calc_occs_and_stats_parallelized() - '''self._posterior_occs = np.asarray(self.parallel_map( - self.calc_posterior_occs, - self.paths[self.path_col], - progress_bar=self.progress_bar, - ))''' else: self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) return self._posterior_occs @@ -359,74 +353,9 @@ def clear(self): for attr in ["_n_files", "_naive_occs", "_posterior_occs"]: if hasattr(self, attr): delattr(self, attr) - ''' - def calc_naive_occs(self, *track_paths: str) -> np.ndarray: - """ - args - ---- - track_paths : paths to files with trajectories, readable by - saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *self.shape*, occupations scaled by the - total number of jumps observed for each SPT experiment - """ - SA = self._init_state_array(*track_paths) - return SA.n_jumps * SA.naive_occs - - def calc_posterior_occs(self, *track_paths: str) -> np.ndarray: - """ - args - ---- - track_paths : paths to files with trajectories, readable by - saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *self.shape*, mean posterior occupations - scaled by the total number of jumps observed for each SPT experiment - """ - SA = self._init_state_array(*track_paths) - return SA.n_jumps * SA.posterior_occs - def calc_marginal_naive_occs(self, *track_paths: str) -> np.ndarray: - """ Calculate the likelihood function for a particular set of - trajectories, marginalized on the diffusion coefficient. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *n_diff_coefs*, occupations scaled by the - total number of jumps observed in these trajectories - """ - return self.likelihood.marginalize_on_diff_coef( - self.calc_naive_occs(*track_paths)) - - def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: - """ Calculate the posterior mean state occupations for a particular - set of trajectories, marginalized on diffusion coefficient. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections - - returns - ------- - numpy.ndarray of shape *n_diff_coefs*, occupations scaled - by the total number of jumps observed in this set of - trajectories - """ - return self.likelihood.marginalize_on_diff_coef( - self.calc_posterior_occs(*track_paths)) - ''' - def calc_occs_and_stats_parallelized(self, *track_paths: str - ) -> Tuple[np.ndarray, np.ndarray, pd.DataFrame]: + def calc_occs_and_stats_parallelized(self) -> Tuple[ + np.ndarray, np.ndarray, pd.DataFrame]: """ Calculate naive occupations, posterior occupations, and processed track statistics, parallelized for a set of trajectories. 
This allows us to subsample the same @@ -436,16 +365,6 @@ def calc_occs_and_stats_parallelized(self, *track_paths: str ---- track_paths : paths to files with trajectories readable by saspt.utils.load_detections - - returns - ------- - ( - numpy.ndarray of shape *n_diff_coefs*, occupations scaled - by the total number of jumps observed in this set of - trajectories; - - pandas.DataFrame, statistics on the preprocessed trajectories - ) """ @dask.delayed def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: @@ -454,10 +373,14 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: posterior_occs = SA.n_jumps * SA.posterior_occs stats = SA.trajectories.processed_track_statistics stats[self.path_col] = filepath - return (naive_occs, posterior_occs, stats) - naive_occs, posterior_occs, stats = self.parallel_map( - g, self.paths[self.path_col]) - + return naive_occs, posterior_occs, stats + + result = self.parallel_map( + g, self.paths[self.path_col], progress_bar=self.progress_bar) + naive_occs = np.asarray([r[0] for r in result]) + posterior_occs = np.asarray([r[1] for r in result]) + stats = [r[2] for r in result] + # Test for empty stats dict if not stats: self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) @@ -465,12 +388,13 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: self._processed_track_statistics = pd.DataFrame( columns=TrajectoryGroup.statistic_names + [self.path_col]) return - + # Put stats into DF and sanity check stats = pd.DataFrame(stats) assert (stats[self.path_col] == self.paths[self.path_col]).all() - # Map all metadata from the input paths DataFrame to the track statistics dataframe + # Map all metadata from the input paths DataFrame + # to the track statistics dataframe for c in filter(lambda c: c!=self.path_col, self.paths.columns): stats[c] = self.paths[c] @@ -654,38 +578,6 @@ def _init_state_array(self, *track_paths: str) -> StateArray: StateArray over them """ return StateArray(self._load_tracks(*track_paths), self.likelihood, self.params) - def _get_processed_track_statistics(self) -> pd.DataFrame: - """ Calculate some statistics on the preprocessed trajectories for each - file in this StateArrayDataset. - - returns - ------- - pandas.DataFrame with each row corresponding to one file. Columns - correspond to different statistics - """ - @dask.delayed - def g(filepath: str) -> dict: - T = self._load_tracks(filepath) - stats = T.processed_track_statistics - stats[self.path_col] = filepath - return stats - result = pd.DataFrame(self.parallel_map(g, self.paths[self.path_col])) - - # Conceivable that there are zero files in this dataset - if len(result) == 0: - result[self.path_col] = self.paths[self.path_col] - for stat in TrajectoryGroup.statistic_names: - result[stat] = pd.Series([], dtype=np.float64) - - # Sanity check - assert (result[self.path_col] == self.paths[self.path_col]).all() - - # Map all metadata from the input paths DataFrame to the track statistics dataframe - for c in filter(lambda c: c!=self.path_col, self.paths.columns): - result[c] = self.paths[c] - - return result - def _get_raw_track_statistics(self) -> pd.DataFrame: """ Calculated some statistics on the raw trajectories for each file in this StateArrayDataset. 
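
Patches 04-06 converge on one pattern: a single dask-delayed function per file returns the bundled tuple (naive_occs, posterior_occs, stats), and one parallel pass fills all three caches, so every attribute reflects the same (possibly subsampled) trajectories. Below is a minimal, self-contained sketch of that bundling pattern, not saspt itself: the body of per_file is a dummy stand-in for the StateArray work, the input paths are hypothetical, and a threaded scheduler is used only so the snippet runs anywhere (saspt's parallel_map uses a process-based dask scheduler).

    import dask
    import numpy as np
    import pandas as pd

    @dask.delayed
    def per_file(path: str):
        # Dummy stand-in for: SA = self._init_state_array(path), then reading
        # SA.n_jumps * SA.naive_occs, SA.n_jumps * SA.posterior_occs, and
        # SA.trajectories.processed_track_statistics.
        n_jumps = 100.0
        naive = n_jumps * np.full((4, 5), 1.0 / 20)    # occupations sum to n_jumps
        posterior = n_jumps * np.full((4, 5), 1.0 / 20)
        stats = {"filepath": path, "n_tracks": 10}
        return naive, posterior, stats

    paths = ["rep1.csv", "rep2.csv"]                   # hypothetical inputs
    results = dask.compute(*(per_file(p) for p in paths), scheduler="threads")

    # Unzip the bundled per-file results, as patch 06's parallel_map call does:
    naive_occs = np.asarray([r[0] for r in results])   # shape (n_files, 4, 5)
    posterior_occs = np.asarray([r[1] for r in results])
    stats = pd.DataFrame([r[2] for r in results])      # one row per file

The explicit unzip is the point of patch 06: parallel_map returns one (naive, posterior, stats) tuple per file, so patch 04's attempt to unpack its return value directly into three names only succeeds when there happen to be exactly three files, and even then binds each name to one file's tuple rather than to the per-quantity arrays.
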
From 5d8f439c301d44827d1749a5b95c6d1758956118 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 13:51:16 -0800 Subject: [PATCH 07/13] add a check for no filepaths _get_raw_track_statistics as well --- saspt/dataset.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index c039ad1..c668f7f 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -201,7 +201,11 @@ def raw_track_statistics(self) -> pd.DataFrame: pandas.DataFrame, where each row corresponds to one file """ if not hasattr(self, "_raw_track_statistics"): - self._raw_track_statistics = self._get_raw_track_statistics() + if self.n_files > 0: + self._raw_track_statistics = self._get_raw_track_statistics() + else: + self._raw_track_statistics = pd.DataFrame( + columns=TrajectoryGroup.statistic_names + [self.path_col]) return self._raw_track_statistics @property From 86894760e9f7283042346f989c160c2b67dbea7a Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 14:27:56 -0800 Subject: [PATCH 08/13] add expected columns to track stats --- saspt/dataset.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index c668f7f..cd7ed6a 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -204,8 +204,11 @@ def raw_track_statistics(self) -> pd.DataFrame: if self.n_files > 0: self._raw_track_statistics = self._get_raw_track_statistics() else: + # Set empty stats with expected columns and metadata self._raw_track_statistics = pd.DataFrame( - columns=TrajectoryGroup.statistic_names + [self.path_col]) + columns=TrajectoryGroup.statistic_names) + for c in self.paths.columns: + self._raw_track_statistics[c] = self.paths[c] return self._raw_track_statistics @property @@ -225,7 +228,9 @@ def processed_track_statistics(self) -> pd.DataFrame: self.calc_occs_and_stats_parallelized() else: self._processed_track_statistics = pd.DataFrame( - columns=TrajectoryGroup.statistic_names + [self.path_col]) + columns=TrajectoryGroup.statistic_names) + for c in self.paths.columns: + self._processed_track_statistics[c] = self.paths[c] return self._processed_track_statistics @property @@ -387,10 +392,14 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: # Test for empty stats dict if not stats: + # Set empty occs self._naive_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) self._posterior_occs = np.zeros((self.n_files, *self.shape), dtype=np.float64) + # Set empty stats with expected columns and metadata self._processed_track_statistics = pd.DataFrame( - columns=TrajectoryGroup.statistic_names + [self.path_col]) + columns=TrajectoryGroup.statistic_names) + for c in self.paths.columns: + self._raw_track_statistics[c] = self.paths[c] return # Put stats into DF and sanity check @@ -406,6 +415,25 @@ def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: self._naive_occs = np.asarray(naive_occs) self._posterior_occs = np.asarray(posterior_occs) + def calc_marginal_posterior_occs(self, *track_paths: str) -> np.ndarray: + """ Calculate the posterior mean state occupations for a particular + set of trajectories, marginalized on diffusion coefficient. 
+ + args + ---- + track_paths : paths to files with trajectories readable + by saspt.utils.load_detections + + returns + ------- + numpy.ndarray of shape *n_diff_coefs*, occupations scaled + by the total number of jumps observed in this set of + trajectories + """ + SA = self._init_state_array(*track_paths) + return self.likelihood.marginalize_on_diff_coef( + SA.n_jumps * SA.posterior_occs) + def infer_posterior_by_condition(self, col: str, normalize: bool=False ) -> Tuple[np.ndarray, List[str]]: """ Aggregate trajectories across files by grouping on an arbitrary From ad6d2c2f522bb8b2e0d08197b63e95c1413caca2 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 15:02:47 -0800 Subject: [PATCH 09/13] fix test for marginalized naive occs --- tests/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d80b60b..d6f1d14 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -98,7 +98,7 @@ def test_marginal_naive_occs(self): ML = D.marginal_naive_occs assert isinstance(ML, np.ndarray) assert ML.shape == (len(self.paths), len(self.likelihood.diff_coefs)) - assert (np.abs(ML.sum(axis=1) - 1.0) < 1.0e-6).all() + assert (np.abs(ML.sum(axis=1) - D.jumps_per_file) < 1.0e-6).all() # Make sure StateArrayDataset.clear works D.clear() From 7b61467005c7e90112618b1ec8e5dfa535993547 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 15:21:28 -0800 Subject: [PATCH 10/13] add subsampling test within SADs --- tests/test_dataset.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d6f1d14..490727f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -214,3 +214,31 @@ def test_posterior_line_plot(self): condition_col=self.condition_col) self.check_plot_func(D.posterior_line_plot, "_out_test_posterior_line_plot.png") + + def test_subsampling(self): + # New params with a smaller sample size + sample_size = 10 + params = StateArrayParameters( + pixel_size_um=0.16, + frame_interval=0.01, + focal_depth=0.7, + splitsize=10, + sample_size=sample_size, + start_frame=0, + max_iter=10, + conc_param=1.0, + progress_bar=False, + num_workers=2, + ) + self.params = params + D = StateArrayDataset(self.paths, self.likelihood, + params=self.params, path_col=self.path_col, + condition_col=self.condition_col) + + # Check that jumps_per_file and implied jumps are correct + assert np.allclose(D.jumps_per_file.astype(float), D.posterior_occs.sum(axis=(1,2))) + assert np.allclose(D.jumps_per_file.astype(float), D.naive_occs.sum(axis=(1,2))) + + # Check that subsampling actually worked + n_trajs = D.processed_track_statistics['n_tracks'] + assert (n_trajs <= sample_size).all() From 8699ade8a987830912460461a9bccce876e096c6 Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 16:09:42 -0800 Subject: [PATCH 11/13] processed track stats must be cleared; add test --- saspt/dataset.py | 3 ++- tests/test_dataset.py | 11 +++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index cd7ed6a..8ca0ec8 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -359,7 +359,8 @@ def marginal_posterior_occs_dataframe(self) -> pd.DataFrame: def clear(self): """ Delete expensive cached attributes """ - for attr in ["_n_files", "_naive_occs", "_posterior_occs"]: + for attr in ["_n_files", "_naive_occs", "_posterior_occs", + "_processed_track_statistics"]: if hasattr(self, attr): 
delattr(self, attr) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 490727f..d181c3f 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -242,3 +242,14 @@ def test_subsampling(self): # Check that subsampling actually worked n_trajs = D.processed_track_statistics['n_tracks'] assert (n_trajs <= sample_size).all() + + # Clear and repeat tests + D.clear() + + # Check that jumps_per_file and implied jumps are correct + assert np.allclose(D.jumps_per_file.astype(float), D.posterior_occs.sum(axis=(1,2))) + assert np.allclose(D.jumps_per_file.astype(float), D.naive_occs.sum(axis=(1,2))) + + # Check that subsampling actually worked + n_trajs = D.processed_track_statistics['n_tracks'] + assert (n_trajs <= sample_size).all() From c23fe21d5702cc35e66986a94274953f0eced01b Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Mon, 4 Mar 2024 16:21:30 -0800 Subject: [PATCH 12/13] fix inaccurate docstring --- saspt/dataset.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 8ca0ec8..2b54dc8 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -370,11 +370,6 @@ def calc_occs_and_stats_parallelized(self) -> Tuple[ and processed track statistics, parallelized for a set of trajectories. This allows us to subsample the same trajectories (if needed) to get these three attributes. - - args - ---- - track_paths : paths to files with trajectories readable - by saspt.utils.load_detections """ @dask.delayed def g(filepath: str) -> Tuple[np.ndarray, np.ndarray, dict]: From 9847c3533dccadd723b09e944a7db062b8ce682b Mon Sep 17 00:00:00 2001 From: Vinson Fan Date: Tue, 5 Mar 2024 12:22:13 -0800 Subject: [PATCH 13/13] need to also clear jumps --- saspt/dataset.py | 2 +- tests/test_dataset.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/saspt/dataset.py b/saspt/dataset.py index 2b54dc8..79cab2b 100644 --- a/saspt/dataset.py +++ b/saspt/dataset.py @@ -360,7 +360,7 @@ def marginal_posterior_occs_dataframe(self) -> pd.DataFrame: def clear(self): """ Delete expensive cached attributes """ for attr in ["_n_files", "_naive_occs", "_posterior_occs", - "_processed_track_statistics"]: + "_processed_track_statistics", "_jumps_per_file"]: if hasattr(self, attr): delattr(self, attr) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index d181c3f..7815680 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -245,8 +245,6 @@ def test_subsampling(self): # Clear and repeat tests D.clear() - - # Check that jumps_per_file and implied jumps are correct assert np.allclose(D.jumps_per_file.astype(float), D.posterior_occs.sum(axis=(1,2))) assert np.allclose(D.jumps_per_file.astype(float), D.naive_occs.sum(axis=(1,2)))
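
Taken together, the series changes StateArrayDataset's caching contract: naive_occs, posterior_occs, jumps_per_file, and processed_track_statistics are now populated by one parallel pass over the input files, and clear() must drop every one of those caches (patches 11 and 13) so that a later access recomputes them against a fresh subsample. Below is a sketch of the resulting call pattern, mirroring test_subsampling: the file names are hypothetical, and the make_likelihood/RBME import and factory signature are assumptions about saspt's public API, since likelihood construction never appears in this series.

    import numpy as np
    import pandas as pd
    from saspt import (StateArrayDataset, StateArrayParameters,
        make_likelihood, RBME)  # make_likelihood/RBME assumed, not shown above

    # Parameters copied from test_subsampling; sample_size=10 caps the number
    # of trajectories drawn from each file.
    params = StateArrayParameters(
        pixel_size_um=0.16, frame_interval=0.01, focal_depth=0.7,
        splitsize=10, sample_size=10, start_frame=0, max_iter=10,
        conc_param=1.0, progress_bar=False, num_workers=2,
    )
    likelihood = make_likelihood(RBME, pixel_size_um=0.16,
        frame_interval=0.01, focal_depth=0.7)  # assumed factory signature

    paths = pd.DataFrame({
        "filepath": ["tracks_rep1.csv", "tracks_rep2.csv"],  # hypothetical files
        "condition": ["control", "treated"],
    })
    D = StateArrayDataset(paths, likelihood, params=params,
        path_col="filepath", condition_col="condition")

    # A single parallel pass now fills all of these caches together:
    posterior = D.posterior_occs              # shape (n_files, *D.shape)
    stats = D.processed_track_statistics      # one row per file, plus metadata
    assert (stats["n_tracks"] <= 10).all()    # subsampling respected (sample_size=10)

    # Invariant from the new tests: occupations are scaled to sum to the
    # number of jumps observed in each (subsampled) file.
    assert np.allclose(D.jumps_per_file.astype(float), posterior.sum(axis=(1, 2)))

    # After patch 13, clear() also drops _jumps_per_file, so recomputation
    # with a fresh subsample preserves the invariant:
    D.clear()
    assert np.allclose(D.jumps_per_file.astype(float),
                       D.posterior_occs.sum(axis=(1, 2)))

    # Aggregation by condition still works on top of the new caching path:
    occs_by_cond, conditions = D.infer_posterior_by_condition(
        "condition", normalize=True)
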