Commit 57b5e0d

Switch to using dnaio for simpler install requirements

1 parent 975b46f
4 files changed (+42, -36 lines)

countess/core/plugins.py (2 additions, 1 deletion)

@@ -339,7 +339,7 @@ def execute(
             self.progress = 100
             return self.combine(ddbc, [ddbc.table(tablename)])
         else:
-            row_limit_per_file = (row_limit // len(filenames_and_params)) if row_limit else None
+            row_limit_per_file = int(row_limit // len(filenames_and_params)) if row_limit else None
             progress_per_file = 100 / len(filenames_and_params)

             def _load(x):
@@ -353,6 +353,7 @@ def _load(x):
                 logger.debug("DuckdbParallelLoadFilePlugin.execute _load table %s %s", tablename, repr(filename))
                 self.load_file_wrapper(cursor, filename, file_param, row_limit_per_file).create(tablename)
                 self.progress += progress_per_file / 2
+                logger.debug("DuckdbParallelLoadFilePlugin.execute _load table %s done", tablename)
                 return tablename

             # run a bunch of _loads in parallel threads, collecting them in whatever order they return.
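The int() wrapper matters because row_limit can arrive as a float (see the preview_row_limit change below), and Python's floor division on a float still returns a float. A minimal illustration with made-up values:

    # Hypothetical values showing the per-file row-limit split.
    row_limit = 80000.0  # may be a float if derived from available memory
    filenames = ["a.fastq", "b.fastq", "c.fastq"]

    print(row_limit // len(filenames))       # 26666.0  (float)
    print(int(row_limit // len(filenames)))  # 26666    (int, as the commit now uses)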

countess/gui/main.py (1 addition, 1 deletion)

@@ -31,7 +31,7 @@
     info_button,
 )

-preview_row_limit: Optional[int] = (psutil.virtual_memory().available/1024/1024/1024)*10000
+preview_row_limit: Optional[int] = int(psutil.virtual_memory().available / 1024 / 1024 / 1024) * 10000

 usage = """usage: countess_gui [--log LEVEL] [INIFILE]
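The module-level preview_row_limit scales the preview to available RAM, roughly 10,000 rows per GiB free, and wrapping the GiB figure in int() makes the Optional[int] annotation true in practice. A rough worked example, assuming the machine reports about 8 GiB available:

    import psutil

    # e.g. ~8 GiB available -> int(8.x) * 10000 = 80000 preview rows
    available_gib = psutil.virtual_memory().available / 1024 / 1024 / 1024
    preview_row_limit = int(available_gib) * 10000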

countess/plugins/fastq.py (38 additions, 34 deletions)

@@ -34,33 +34,43 @@ def load_file(
     ) -> duckdb.DuckDBPyRelation:
         # Open the file, convert it to a RecordBatchReader and then
         # wrap that up as a DuckDBPyRelation so we can filter it.
-        fastq_iter = dnaio.open(filename, open_threads=1)
+        logger.debug("Loading file %s row_limit %s", filename, row_limit)
+
+        # Take up to row_limit records from this file
+        fastq_iter = itertools.islice(dnaio.open(filename, open_threads=1), row_limit)
+
+        def _record_to_dict(record):
+            d = {"sequence": record.sequence}
+            if self.header_column:
+                d["header"] = record.name
+            return d
+
+        def _avg_quality(record):
+            return sum(ord(c) for c in record.qualities) / len(record.qualities) - 33
+
+        pyarrow_schema = pyarrow.schema([pyarrow.field("sequence", pyarrow.string())])
+        if self.header_column:
+            pyarrow_schema.append(pyarrow.field("header", pyarrow.string()))
+
+        # Generator which batches records 5000 at a time into RecordBatches
         record_batch_iter = (
-            pyarrow.RecordBatch.from_pylist([{'sequence': z.sequence, 'quality_scores': z.qualities} for z in y])
-            for y in itertools.batched(fastq_iter, 5000)
-        )
-        rel = cursor.from_arrow(
-            pyarrow.RecordBatchReader.from_batches(
-                pyarrow.schema({'sequence': 'str', 'quality_scores': 'str'}),
-                record_batch_iter
+            pyarrow.RecordBatch.from_pylist(
+                [
+                    _record_to_dict(record)
+                    for record in batch
+                    if self.min_avg_quality <= 0 or self.min_avg_quality <= _avg_quality(record)
+                ]
             )
+            for batch in itertools.batched(fastq_iter, 5000)
         )
-        if row_limit is not None:
-            pass
-            #rel = rel.limit(row_limit)
-
-        if self.min_avg_quality > 0:
-            rel = rel.filter(
-                "list_aggregate(list_transform(string_split(quality_scores, ''), x -> ord(x)), 'avg') - 33 >= %f"
-                % self.min_avg_quality.value
-            )
+
+        # We can turn that generator of RecordBatches into a temporary table
+        rel = cursor.from_arrow(pyarrow.RecordBatchReader.from_batches(pyarrow_schema, record_batch_iter))

         if self.group:
             rel = rel.aggregate("sequence, count(*) as count")
-        elif self.header_column:
-            rel = rel.project("sequence, name || ' ' || description as header")
-        else:
-            rel = rel.project("sequence")
+
+        logger.debug("Loading file %s row_limit %s done", filename, row_limit)
         return rel

     def combine(
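The minimum-quality filter moves from a DuckDB string expression into plain Python: FASTQ quality strings are Phred+33 encoded, so the mean quality is the mean of ord(c) minus 33. A small worked check of that formula, using a made-up quality string:

    # Hypothetical quality string: 'I' encodes Phred 40, '5' encodes Phred 20 (Phred+33).
    qualities = "II55"
    avg = sum(ord(c) for c in qualities) / len(qualities) - 33
    print(avg)  # (73 + 73 + 53 + 53) / 4 - 33 = 63.0 - 33 = 30.0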
@@ -83,23 +93,17 @@ class LoadFastaPlugin(DuckdbLoadFileWithTheLotPlugin):

     file_types = [("FASTA", [".fasta", ".fa", ".fasta.gz", ".fa.gz", ".fasta.bz2", ".fa.bz2"])]

-    sequence_column = StringParam("Sequence Column", "sequence")
-    header_column = StringParam("Header Column", "header")
-
     def load_file(
         self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
     ) -> duckdb.DuckDBPyRelation:
-        fasta_iter = dnaio.open(filename, open_threads=1)
+        pyarrow_schema = pyarrow.schema(
+            [pyarrow.field("sequence", pyarrow.string()), pyarrow.field("header", pyarrow.string())]
+        )
+
+        fasta_iter = itertools.islice(dnaio.open(filename, open_threads=1), row_limit)
         record_batch_iter = (
-            pyarrow.RecordBatch.from_pylist([{'seq': z.sequence, 'qual': z.qualities} for z in y])
+            pyarrow.RecordBatch.from_pylist([{"sequence": z.sequence, "header": z.name} for z in y])
             for y in itertools.batched(fasta_iter, 5000)
         )
-        rel = cursor.from_arrow(
-            pyarrow.RecordBatchReader.from_batches(
-                pyarrow.schema({'seq': 'str', 'qual': 'str'}),
-                record_batch_iter
-            )
-        )
-        if row_limit is not None:
-            rel = rel.limit(row_limit)
+        rel = cursor.from_arrow(pyarrow.RecordBatchReader.from_batches(pyarrow_schema, record_batch_iter))
         return rel
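The same pattern underlies both loaders: stream records with dnaio, batch them into Arrow RecordBatches, and hand a RecordBatchReader to DuckDB. A minimal standalone sketch of that pipeline, assuming Python 3.12+ (for itertools.batched) and a placeholder file path:

    import itertools

    import dnaio
    import duckdb
    import pyarrow

    schema = pyarrow.schema(
        [pyarrow.field("sequence", pyarrow.string()), pyarrow.field("header", pyarrow.string())]
    )

    def load_fasta(cursor: duckdb.DuckDBPyConnection, filename: str, row_limit=None) -> duckdb.DuckDBPyRelation:
        # islice(it, None) passes everything through, so row_limit=None means "no limit"
        records = itertools.islice(dnaio.open(filename), row_limit)
        batches = (
            pyarrow.RecordBatch.from_pylist(
                [{"sequence": r.sequence, "header": r.name} for r in batch], schema=schema
            )
            for batch in itertools.batched(records, 5000)
        )
        return cursor.from_arrow(pyarrow.RecordBatchReader.from_batches(schema, batches))

    con = duckdb.connect()
    rel = load_fasta(con, "example.fasta", row_limit=1000)  # "example.fasta" is a placeholder path
    print(rel.aggregate("count(*) as n"))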

pyproject.toml (1 addition, 0 deletions)

@@ -46,6 +46,7 @@ dev = [
     'twine~=6.1.0',
     'packaging~=25.0',
     'pandas-stubs~=2.1.0',
+    'pyarrow-stubs~=20.0.0',
     'pytest~=7.2',
     'pytest-socket~=0.6.0',
     'requests-mock~=1.11.0',
