Merge pull request #215 from AlexandrovLab/spm-i213

mdbarnesUCSD · web-flow · commit c945e1df72ab · 2025-03-13T14:13:26.000-07:00
Spm i213
diff --git a/.travis.yml b/.travis.yml
@@ -7,7 +7,7 @@ python:
 
 before_install:
   - pip install --upgrade pip setuptools packaging
-  - if ! [ -f ./src/GRCh37.tar.gz ]; then wget ftp://alexandrovlab-ftp.ucsd.edu/pub/tools/SigProfilerMatrixGenerator/GRCh37.tar.gz -P ./src/; fi
+  - if ! [ -f ./src/GRCh37.tar.gz ]; then wget --connect-timeout=10 --tries=20 ftp://alexandrovlab-ftp.ucsd.edu/pub/tools/SigProfilerMatrixGenerator/GRCh37.tar.gz -P ./src/; fi
 
 install:
   - pip install .[tests]
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+## [1.3.1] - 2025-03-13
+
+### Fixed
+- SV matrix generation compatibility: Replaced the remaining pre-2.0.0 Pandas and NumPy syntax (`DataFrame.append`, `np.NAN`/`np.NaN`) with `pd.concat` and `np.nan` to ensure compatibility with Pandas and NumPy >= 2.0.0.
+- Indexing error in reference genome handling: Edge cases where BED file indices extended beyond the reference genome range caused crashes. Index accesses are now restricted to valid ranges, and cases where the genomic context extends beyond valid positions are skipped.
+- Uninitialized variable: The dinuc_mat variable was accessed before being initialized, causing a runtime error. It is now explicitly initialized to None before use.
+
 ## [1.3.0] - 2025-02-11
 
 ### Changed
diff --git a/SigProfilerMatrixGenerator/scripts/SVMatrixGenerator.py b/SigProfilerMatrixGenerator/scripts/SVMatrixGenerator.py
@@ -46,7 +46,7 @@ def calcIntermutDist2(subs_type, first_chrom_na=False):
         ]
 
         if first_chrom_na:
-            prevPos_arr_c = np.hstack((np.NAN, pos_array_im_c.flatten()[:-1]))
+            prevPos_arr_c = np.hstack((np.nan, pos_array_im_c.flatten()[:-1]))
         else:
             prevPos_arr_c = np.hstack((0, pos_array_im_c.flatten()[:-1]))
         distPrev_arr_c = pos_array_im_c - prevPos_arr_c
@@ -69,7 +69,7 @@ def calcIntermutDist(subs_type, first_chrom_na=False):
         subs_type_chrom = subs_type[subs_type["chr"] == c].sort_values("position")
         if first_chrom_na:
             subs_type_chrom["prevPos"] = np.hstack(
-                (np.NAN, subs_type_chrom["position"].values.flatten()[:-1])
+                (np.nan, subs_type_chrom["position"].values.flatten()[:-1])
             )
         else:
             subs_type_chrom["prevPos"] = np.hstack(
@@ -292,7 +292,7 @@ def calcIntermutDist(subs_type, first_chrom_na=False):
         subs_type_chrom = subs_type[subs_type["chr"] == c].sort_values("position")
         if first_chrom_na:
             subs_type_chrom["prevPos"] = np.hstack(
-                (np.NAN, subs_type_chrom["position"].values.flatten()[:-1])
+                (np.nan, subs_type_chrom["position"].values.flatten()[:-1])
             )
         else:
             subs_type_chrom["prevPos"] = np.hstack(
@@ -324,7 +324,7 @@ def calcIntermutDist2(subs_type, first_chrom_na=False):
         ]
 
         if first_chrom_na:
-            prevPos_arr_c = np.hstack((np.NAN, pos_array_im_c.flatten()[:-1]))
+            prevPos_arr_c = np.hstack((np.nan, pos_array_im_c.flatten()[:-1]))
         else:
             prevPos_arr_c = np.hstack((0, pos_array_im_c.flatten()[:-1]))
         distPrev_arr_c = pos_array_im_c - prevPos_arr_c
@@ -815,8 +815,8 @@ def annotateBedpe(sv_bedpe):
 
     sample_bps = pd.DataFrame(columns=cncd.columns)
     for chromi in unique_py(cncd["chr"]):
-        sample_bps = sample_bps.append(
-            cncd[cncd["chr"] == chromi].sort_values("position", kind="mergesort"),
+        sample_bps = pd.concat(
+            [sample_bps, cncd[cncd["chr"] == chromi].sort_values("position", kind="mergesort")],
             ignore_index=True,
         )
 
@@ -829,7 +829,7 @@ def annotateBedpe(sv_bedpe):
     exp_dist = genome_size / len(sample_bps)
     gamma_sdev = 25  #
     PEAK_FACTOR = 10
-    thresh_dist = np.NaN
+    thresh_dist = np.nan
 
     if logScale:
         sample_bps["intermut_dist"] = np.log10(
@@ -844,15 +844,15 @@ def annotateBedpe(sv_bedpe):
         if np.isnan(thresh_dist):
             thresh_dist = exp_dist / PEAK_FACTOR
 
-    gamma = np.NaN
+    gamma = np.nan
     if np.isnan(gamma) & ~np.isnan(gamma_sdev):
         # compute the mean absolute deviation
         sdev = getMad(sample_bps["intermut_dist"].values)
         gamma = gamma_sdev * sdev
 
     sample_bps["is_clustered_single"] = False
     all_kat_regions = pd.DataFrame()
-    sample_bps["mean_intermut_dist"] = np.NaN
+    sample_bps["mean_intermut_dist"] = np.nan
     for chrom in unique_py(sample_bps["chr"]):  # loop over chromosomes
         sample_bps_flag = (
             sample_bps["chr"] == chrom
diff --git a/SigProfilerMatrixGenerator/scripts/SigProfilerMatrixGeneratorFunc.py b/SigProfilerMatrixGenerator/scripts/SigProfilerMatrixGeneratorFunc.py
@@ -1415,6 +1415,9 @@ def SigProfilerMatrixGeneratorFunc(
         # Sorts files based on chromosome, sample, and start position
         if not chrom_based:
             chrom_start = None
+
+        # Initialize dinuc_mat up front so later accesses do not fail if it is never assigned below
+        dinuc_mat = None
         if i != 1:
             for file in vcf_files:
                 if reference_genome.lower() != "ebv":
@@ -2817,6 +2820,16 @@ def SigProfilerMatrixGeneratorFunc(
                 )
 
             if not chrom_based:
+                # Ensure every required key in mutation_ID holds a non-empty DataFrame; otherwise fall back to an all-zero matrix
+                for key, indel_set in [
+                    ("ID", indel_types),
+                    ("simple", indel_types_simple),
+                    ("tsb", indel_types),
+                    ("complete", indel_types)
+                ]:
+                    if key not in mutation_ID or not isinstance(mutation_ID[key], pd.DataFrame) or mutation_ID[key].empty:
+                        mutation_ID[key] = pd.DataFrame(0, index=indel_set, columns=samples)
+
                 matGen.matrix_generator_INDEL(
                     output_matrix,
                     samples,
diff --git a/SigProfilerMatrixGenerator/scripts/save_context_distribution.py b/SigProfilerMatrixGenerator/scripts/save_context_distribution.py
@@ -125,7 +125,6 @@ def context_distribution(
                 for l in range(i, i + context, 1):
                     nuc += tsb_ref[chromosome[l]][1]
                 base = nuc[int(context / 2)]
-
                 count += 1
                 if count == 1000000:
                     print(i)
@@ -394,17 +393,27 @@ def context_distribution_BED(
 
             if chrom == chrom_initial:
                 chrom_length += end - start
-                for i in range(start, end + 1 - context, 1):
+                for i in range(start, min(end + 1 - context, len(chromosome))):
                     nuc = ""
-                    for l in range(i, i + context, 1):
+                    for l in range(i, min(i + context, len(chromosome))):
                         nuc += tsb_ref[chromosome[l]][1]
-                    base = nuc[int(context / 2)]
+
+                    # Skip incomplete windows
+                    if len(nuc) != context:
+                        overlap = context - len(nuc)
+                        print(f"Skipping window at index {i}: expected context length {context}, but got {len(nuc)}. "
+                            f"Missing {overlap} base(s) due to boundary at the end of the chromosome.")
+                        continue
 
                     # Skip the base if unknown
                     if "N" in nuc:
                         pass
-
                     else:
+                        # Assign `base` to the middle nucleotide of the context
+                        if len(nuc) >= int(context / 2) + 1:  # Prevent IndexError
+                            base = nuc[int(context / 2)]
+                        else:
+                            continue  # Skip if `nuc` is too short
                         if context_input != "DINUC" and context_input != "DBS186":
                             # Only save the pyrimidine context (canonical)
                             if base == "A" or base == "G":
diff --git a/setup.py b/setup.py
@@ -3,7 +3,7 @@
 
 from setuptools import setup
 
-VERSION = "1.3.0"
+VERSION = "1.3.1"
 
 # remove the dist folder first if exists
 if os.path.exists("dist"):
@@ -23,7 +23,7 @@ def write_version_py(filename="SigProfilerMatrixGenerator/version.py"):
 # THIS FILE IS GENERATED FROM SIGPROFILEMATRIXGENERATOR SETUP.PY
 short_version = '%(version)s'
 version = '%(version)s'
-Update = 'v1.3.0: Require Pandas and Numpy >= 2.0.0 and Python >= 3.9'
+Update = 'v1.3.1: Update SV matrix calls to pandas and numpy to use >= 2.0.0 syntax'
 
 	"""
     fh = open(filename, "w")
@@ -52,7 +52,7 @@ def write_version_py(filename="SigProfilerMatrixGenerator/version.py"):
     python_requires=">=3.9",
     install_requires=[
         "matplotlib>=2.2.2",
-        "sigProfilerPlotting>=1.4.0",
+        "sigProfilerPlotting>=1.4.1",
         "statsmodels>=0.9.0",
         "numpy>=2.0.0",
         "pandas>=2.0.0",