11import glob
22import gzip
33import os
4+
45import click
5- from sequence_processing_pipeline . util import ( iter_paired_files ,
6- determine_orientation )
6+
7+ from sequence_processing_pipeline . util import determine_orientation , iter_paired_files
78
89
9- def split_similar_size_bins (data_location_path , max_file_list_size_in_gb ,
10- batch_prefix , allow_fwd_only = False ):
11- '''Partitions input fastqs to coarse bins
10+ def split_similar_size_bins (
11+ data_location_path , max_file_list_size_in_gb , batch_prefix , allow_fwd_only = False
12+ ):
13+ """Partitions input fastqs to coarse bins
1214
1315 :param data_location_path: Path to the ConvertJob directory.
1416 :param max_file_list_size_in_gb: Upper threshold for file-size.
1517 :param batch_prefix: Path + file-name prefix for output-files.
1618 :param allow_fwd_only: ignore rev match, helpful for long reads.
1719 :return: The number of output-files created, size of largest bin.
18- '''
20+ """
1921 # to prevent issues w/filenames like the ones below from being mistaken
2022 # for R1 or R2 files, use determine_orientation().
2123 # LS_8_22_2014_R2_SRE_S2_L007_I1_001.fastq.gz
2224 # LS_8_22_2014_R1_SRE_S3_L007_I1_001.fastq.gz
2325
2426 # since the names of all fastq files are being scanned for orientation,
2527 # collect all of them instead of mistakenly pre-filtering some files.
26- fastq_paths = glob .glob (data_location_path + '/*/*.fastq.gz' )
27- fastq_paths = [x for x in fastq_paths
28- if determine_orientation (x ) in ['R1' , 'R2' ]]
28+ fastq_paths = glob .glob (data_location_path + "/*/*.fastq.gz" )
29+ fastq_paths = [x for x in fastq_paths if determine_orientation (x ) in ["R1" , "R2" ]]
2930
3031 # convert from GB and halve as we sum R1
31- max_size = ( int (max_file_list_size_in_gb ) * (2 ** 30 ) / 2 )
32+ max_size = int (max_file_list_size_in_gb ) * (2 ** 30 ) / 2
3233
3334 split_offset = 0
3435
@@ -42,7 +43,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
4243 if allow_fwd_only :
4344 for a in fastq_paths :
4445 r1_size = os .stat (a ).st_size
45- output_base = os .path .dirname (a ).split ('/' )[- 1 ]
46+ output_base = os .path .dirname (a ).split ("/" )[- 1 ]
4647 if current_size + r1_size > max_size :
4748 # bucket is full.
4849 if bucket_size > max_bucket_size :
@@ -56,7 +57,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
5657
5758 split_offset += 1
5859 current_size = r1_size
59- fp = open (batch_prefix + ' -%d' % split_offset , 'w' )
60+ fp = open (batch_prefix + " -%d" % split_offset , "w" )
6061 else :
6162 # add to bucket_size
6263 bucket_size += r1_size
@@ -68,7 +69,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
6869 r1_size = os .stat (a ).st_size
6970 r2_size = os .stat (b ).st_size
7071
71- output_base = os .path .dirname (a ).split ('/' )[- 1 ]
72+ output_base = os .path .dirname (a ).split ("/" )[- 1 ]
7273 if current_size + r1_size > max_size :
7374 # bucket is full.
7475 if bucket_size > max_bucket_size :
@@ -82,7 +83,7 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
8283
8384 split_offset += 1
8485 current_size = r1_size
85- fp = open (batch_prefix + ' -%d' % split_offset , 'w' )
86+ fp = open (batch_prefix + " -%d" % split_offset , "w" )
8687 else :
8788 # add to bucket_size
8889 bucket_size += r1_size + r2_size
@@ -93,9 +94,10 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
9394 if fp is not None :
9495 fp .close ()
9596
96- code_dir = ' qp-knight-lab-processing/tests/test_output/'
97+ code_dir = " qp-knight-lab-processing/tests/test_output/"
9798 is_test = data_location_path .endswith (
98- (f'{ code_dir } ConvertJob' , f'{ code_dir } TRIntegrateJob/integrated' ))
99+ (f"{ code_dir } ConvertJob" , f"{ code_dir } TRIntegrateJob/integrated" )
100+ )
99101
100102 if split_offset == 0 and not is_test :
101103 raise ValueError ("No splits made" )
@@ -104,23 +106,23 @@ def split_similar_size_bins(data_location_path, max_file_list_size_in_gb,
104106
105107
106108def demux_cmd (id_map_fp , fp_fp , out_d , task , maxtask ):
107- with open (id_map_fp , 'r' ) as f :
109+ with open (id_map_fp , "r" ) as f :
108110 id_map = f .readlines ()
109- id_map = [line .strip ().split (' \t ' ) for line in id_map ]
111+ id_map = [line .strip ().split (" \t " ) for line in id_map ]
110112
111113 # fp needs to be an open file handle.
112114 # ensure task and maxtask are proper ints when coming from cmd-line.
113- with open (fp_fp , 'r' ) as fp :
115+ with open (fp_fp , "r" ) as fp :
114116 demux (id_map , fp , out_d , int (task ), int (maxtask ))
115117
116118
117119def demux (id_map , fp , out_d , task , maxtask ):
118120 """Split infile data based in provided map"""
119- delimiter = ' ::MUX::'
120- mode = 'wt'
121- ext = ' .fastq.gz'
122- sep = '/'
123- rec = '@'
121+ delimiter = " ::MUX::"
122+ mode = "wt"
123+ ext = " .fastq.gz"
124+ sep = "/"
125+ rec = "@"
124126
125127 openfps = {}
126128
@@ -142,7 +144,7 @@ def demux(id_map, fp, out_d, task, maxtask):
142144 pass
143145 current_fp_r1 = gzip .open (fullname_r1 , mode )
144146 current_fp_r2 = gzip .open (fullname_r2 , mode )
145- current_fp = {'1' : current_fp_r1 , '2' : current_fp_r2 }
147+ current_fp = {"1" : current_fp_r1 , "2" : current_fp_r2 }
146148 openfps [idx ] = current_fp
147149
148150 # setup a parser
@@ -181,7 +183,7 @@ def demux(id_map, fp, out_d, task, maxtask):
181183 # no '\n'
182184 orientation = sid [- 1 ]
183185 # hexdump confirms separator is ' ', not '\t'
184- sid = rec + sid + ' ' + metadata + ' \n '
186+ sid = rec + sid + " " + metadata + " \n "
185187 else :
186188 raise ValueError (f"'{ sid } ' is not a recognized form" )
187189
@@ -201,29 +203,29 @@ def cli():
201203
202204
203205@cli .command ()
204- @click .option (' --id-map' , type = click .Path (exists = True ), required = True )
205- @click .option (' --infile' , type = click .Path (exists = True ), required = True )
206- @click .option (' --output' , type = click .Path (exists = True ), required = True )
207- @click .option (' --task' , type = int , required = True )
208- @click .option (' --maxtask' , type = int , required = True )
206+ @click .option (" --id-map" , type = click .Path (exists = True ), required = True )
207+ @click .option (" --infile" , type = click .Path (exists = True ), required = True )
208+ @click .option (" --output" , type = click .Path (exists = True ), required = True )
209+ @click .option (" --task" , type = int , required = True )
210+ @click .option (" --maxtask" , type = int , required = True )
209211def demux_just_fwd (id_map , infile , output , task , maxtask ):
210- with open (id_map , 'r' ) as f :
212+ with open (id_map , "r" ) as f :
211213 id_map = f .readlines ()
212- id_map = [line .strip ().split (' \t ' ) for line in id_map ]
214+ id_map = [line .strip ().split (" \t " ) for line in id_map ]
213215
214216 # fp needs to be an open file handle.
215217 # ensure task and maxtask are proper ints when coming from cmd-line.
216- with open (infile , 'r' ) as fp :
218+ with open (infile , "r" ) as fp :
217219 demux_just_fwd_processing (id_map , fp , output , int (task ), int (maxtask ))
218220
219221
220222def demux_just_fwd_processing (id_map , fp , out_d , task , maxtask ):
221223 """Split infile data based in provided map"""
222- delimiter = ' ::MUX::'
223- mode = 'wt'
224- ext = ' .fastq.gz'
225- sep = '/'
226- rec = '@'
224+ delimiter = " ::MUX::"
225+ mode = "wt"
226+ ext = " .fastq.gz"
227+ sep = "/"
228+ rec = "@"
227229
228230 openfps = {}
229231
@@ -243,7 +245,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
243245 except FileExistsError :
244246 pass
245247 current_fp_r1 = gzip .open (fullname_r1 , mode )
246- current_fp = {'1' : current_fp_r1 }
248+ current_fp = {"1" : current_fp_r1 }
247249 openfps [idx ] = current_fp
248250
249251 # setup a parser
@@ -253,7 +255,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
253255 qual = iter (fp )
254256
255257 # there is only fwd so the orientation is always '1'
256- orientation = '1'
258+ orientation = "1"
257259
258260 for i , s , d , q in zip (seq_id , seq , dumb , qual ):
259261 # '@1', 'LH00444:84:227CNHLT4:7:1101:41955:2443/1'
@@ -270,7 +272,7 @@ def demux_just_fwd_processing(id_map, fp, out_d, task, maxtask):
270272
271273 current_fp = openfps [fname_encoded ]
272274
273- current_fp [orientation ].write (f' { rec } { sid } ' )
275+ current_fp [orientation ].write (f" { rec } { sid } " )
274276 current_fp [orientation ].write (s )
275277 current_fp [orientation ].write (d )
276278 current_fp [orientation ].write (q )
0 commit comments