Skip to content

Commit 65912f9

Browse files
authored
Zed styling (#149)
* initial pass * ruff
1 parent e740d6b commit 65912f9

34 files changed

+9790
-6332
lines changed

.github/workflows/qiita-plugin-ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ jobs:
156156
lint:
157157
runs-on: ubuntu-latest
158158
steps:
159-
- name: flake8
159+
- name: ruff
160160
uses: actions/setup-python@v2
161161
with:
162162
python-version: 3.9
@@ -166,5 +166,5 @@ jobs:
166166
uses: actions/checkout@v2
167167
- name: lint
168168
run: |
169-
pip install -q flake8
170-
flake8 .
169+
pip install -q ruff
170+
ruff check .

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ git clone https://github.com/biocore/mg-scripts.git
1414
Create a Python3 Conda environment in which to run the notebook:
1515

1616
```bash
17-
conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose pep8 flake8 matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair
17+
conda create --yes -n spp python='python=3.9' scikit-learn pandas numpy nose ruff matplotlib jupyter notebook 'seaborn>=0.7.1' pip openpyxl 'seqtk>=1.4' click scipy fastq-pair
1818
```
1919

2020
Activate the Conda environment:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ dependencies = [
4343
"pandas",
4444
"lxml",
4545
'requests',
46-
'flake8',
46+
'ruff',
4747
'nose',
4848
'coverage',
4949
'pgzip',

src/sequence_processing_pipeline/Commands.py

Lines changed: 44 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,35 @@
11
import glob
22
import gzip
33
import os
4+
45
import click
5-
from sequence_processing_pipeline.util import (iter_paired_files,
6-
determine_orientation)
6+
7+
from sequence_processing_pipeline.util import determine_orientation, iter_paired_files
78

89

9-
def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
10-
batch_prefix, allow_fwd_only=False):
11-
'''Partitions input fastqs to coarse bins
10+
def split_similar_size_bins(
11+
data_location_path, max_file_list_size_in_gb, batch_prefix, allow_fwd_only=False
12+
):
13+
"""Partitions input fastqs to coarse bins
1214
1315
:param data_location_path: Path to the ConvertJob directory.
1416
:param max_file_list_size_in_gb: Upper threshold for file-size.
1517
:param batch_prefix: Path + file-name prefix for output-files.
1618
:param allow_fwd_only: ignore rev match, helpful for long reads.
1719
:return: The number of output-files created, size of largest bin.
18-
'''
20+
"""
1921
# to prevent issues w/filenames like the ones below from being mistaken
2022
# for R1 or R2 files, use determine_orientation().
2123
# LS_8_22_2014_R2_SRE_S2_L007_I1_001.fastq.gz
2224
# LS_8_22_2014_R1_SRE_S3_L007_I1_001.fastq.gz
2325

2426
# since the names of all fastq files are being scanned for orientation,
2527
# collect all of them instead of mistakenly pre-filtering some files.
26-
fastq_paths = glob.glob(data_location_path + '/*/*.fastq.gz')
27-
fastq_paths = [x for x in fastq_paths
28-
if determine_orientation(x) in ['R1', 'R2']]
28+
fastq_paths = glob.glob(data_location_path + "/*/*.fastq.gz")
29+
fastq_paths = [x for x in fastq_paths if determine_orientation(x) in ["R1", "R2"]]
2930

3031
# convert from GB and halve as we sum R1
31-
max_size = (int(max_file_list_size_in_gb) * (2 ** 30) / 2)
32+
max_size = int(max_file_list_size_in_gb) * (2**30) / 2
3233

3334
split_offset = 0
3435

@@ -42,7 +43,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
4243
if allow_fwd_only:
4344
for a in fastq_paths:
4445
r1_size = os.stat(a).st_size
45-
output_base = os.path.dirname(a).split('/')[-1]
46+
output_base = os.path.dirname(a).split("/")[-1]
4647
if current_size + r1_size > max_size:
4748
# bucket is full.
4849
if bucket_size > max_bucket_size:
@@ -56,7 +57,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
5657

5758
split_offset += 1
5859
current_size = r1_size
59-
fp = open(batch_prefix + '-%d' % split_offset, 'w')
60+
fp = open(batch_prefix + "-%d" % split_offset, "w")
6061
else:
6162
# add to bucket_size
6263
bucket_size += r1_size
@@ -68,7 +69,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
6869
r1_size = os.stat(a).st_size
6970
r2_size = os.stat(b).st_size
7071

71-
output_base = os.path.dirname(a).split('/')[-1]
72+
output_base = os.path.dirname(a).split("/")[-1]
7273
if current_size + r1_size > max_size:
7374
# bucket is full.
7475
if bucket_size > max_bucket_size:
@@ -82,7 +83,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
8283

8384
split_offset += 1
8485
current_size = r1_size
85-
fp = open(batch_prefix + '-%d' % split_offset, 'w')
86+
fp = open(batch_prefix + "-%d" % split_offset, "w")
8687
else:
8788
# add to bucket_size
8889
bucket_size += r1_size + r2_size
@@ -93,9 +94,10 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
9394
if fp is not None:
9495
fp.close()
9596

96-
code_dir = 'qp-knight-lab-processing/tests/test_output/'
97+
code_dir = "qp-knight-lab-processing/tests/test_output/"
9798
is_test = data_location_path.endswith(
98-
(f'{code_dir}ConvertJob', f'{code_dir}TRIntegrateJob/integrated'))
99+
(f"{code_dir}ConvertJob", f"{code_dir}TRIntegrateJob/integrated")
100+
)
99101

100102
if split_offset == 0 and not is_test:
101103
raise ValueError("No splits made")
@@ -104,23 +106,23 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
104106

105107

106108
def demux_cmd(id_map_fp, fp_fp, out_d, task, maxtask):
107-
with open(id_map_fp, 'r') as f:
109+
with open(id_map_fp, "r") as f:
108110
id_map = f.readlines()
109-
id_map = [line.strip().split('\t') for line in id_map]
111+
id_map = [line.strip().split("\t") for line in id_map]
110112

111113
# fp needs to be an open file handle.
112114
# ensure task and maxtask are proper ints when coming from cmd-line.
113-
with open(fp_fp, 'r') as fp:
115+
with open(fp_fp, "r") as fp:
114116
demux(id_map, fp, out_d, int(task), int(maxtask))
115117

116118

117119
def demux(id_map, fp, out_d, task, maxtask):
118120
"""Split infile data based in provided map"""
119-
delimiter = '::MUX::'
120-
mode = 'wt'
121-
ext = '.fastq.gz'
122-
sep = '/'
123-
rec = '@'
121+
delimiter = "::MUX::"
122+
mode = "wt"
123+
ext = ".fastq.gz"
124+
sep = "/"
125+
rec = "@"
124126

125127
openfps = {}
126128

@@ -142,7 +144,7 @@ def demux(id_map, fp, out_d, task, maxtask):
142144
pass
143145
current_fp_r1 = gzip.open(fullname_r1, mode)
144146
current_fp_r2 = gzip.open(fullname_r2, mode)
145-
current_fp = {'1': current_fp_r1, '2': current_fp_r2}
147+
current_fp = {"1": current_fp_r1, "2": current_fp_r2}
146148
openfps[idx] = current_fp
147149

148150
# setup a parser
@@ -181,7 +183,7 @@ def demux(id_map, fp, out_d, task, maxtask):
181183
# no '\n'
182184
orientation = sid[-1]
183185
# hexdump confirms separator is ' ', not '\t'
184-
sid = rec + sid + ' ' + metadata + '\n'
186+
sid = rec + sid + " " + metadata + "\n"
185187
else:
186188
raise ValueError(f"'{sid}' is not a recognized form")
187189

@@ -201,29 +203,29 @@ def cli():
201203

202204

203205
@cli.command()
204-
@click.option('--id-map', type=click.Path(exists=True), required=True)
205-
@click.option('--infile', type=click.Path(exists=True), required=True)
206-
@click.option('--output', type=click.Path(exists=True), required=True)
207-
@click.option('--task', type=int, required=True)
208-
@click.option('--maxtask', type=int, required=True)
206+
@click.option("--id-map", type=click.Path(exists=True), required=True)
207+
@click.option("--infile", type=click.Path(exists=True), required=True)
208+
@click.option("--output", type=click.Path(exists=True), required=True)
209+
@click.option("--task", type=int, required=True)
210+
@click.option("--maxtask", type=int, required=True)
209211
def demux_just_fwd(id_map, infile, output, task, maxtask):
210-
with open(id_map, 'r') as f:
212+
with open(id_map, "r") as f:
211213
id_map = f.readlines()
212-
id_map = [line.strip().split('\t') for line in id_map]
214+
id_map = [line.strip().split("\t") for line in id_map]
213215

214216
# fp needs to be an open file handle.
215217
# ensure task and maxtask are proper ints when coming from cmd-line.
216-
with open(infile, 'r') as fp:
218+
with open(infile, "r") as fp:
217219
demux_just_fwd_processing(id_map, fp, output, int(task), int(maxtask))
218220

219221

220222
def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
221223
"""Split infile data based in provided map"""
222-
delimiter = '::MUX::'
223-
mode = 'wt'
224-
ext = '.fastq.gz'
225-
sep = '/'
226-
rec = '@'
224+
delimiter = "::MUX::"
225+
mode = "wt"
226+
ext = ".fastq.gz"
227+
sep = "/"
228+
rec = "@"
227229

228230
openfps = {}
229231

@@ -243,7 +245,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
243245
except FileExistsError:
244246
pass
245247
current_fp_r1 = gzip.open(fullname_r1, mode)
246-
current_fp = {'1': current_fp_r1}
248+
current_fp = {"1": current_fp_r1}
247249
openfps[idx] = current_fp
248250

249251
# setup a parser
@@ -253,7 +255,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
253255
qual = iter(fp)
254256

255257
# there is only fwd so the orientation is always '1'
256-
orientation = '1'
258+
orientation = "1"
257259

258260
for i, s, d, q in zip(seq_id, seq, dumb, qual):
259261
# '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1'
@@ -270,7 +272,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
270272

271273
current_fp = openfps[fname_encoded]
272274

273-
current_fp[orientation].write(f'{rec}{sid}')
275+
current_fp[orientation].write(f"{rec}{sid}")
274276
current_fp[orientation].write(s)
275277
current_fp[orientation].write(d)
276278
current_fp[orientation].write(q)

0 commit comments

Comments
 (0)