Skip to content

Commit 975b46f

Browse files
committed
try moving away from biobear due to rust-related installation issues on some platforms
1 parent f0d98ec commit 975b46f

File tree

3 files changed

+32
-10
lines changed

3 files changed

+32
-10
lines changed

countess/plugins/fastq.py

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import itertools
12
import logging
23
from typing import Iterable, Optional
34

4-
import biobear
5+
import dnaio
56
import duckdb
7+
import pyarrow
68

79
from countess import VERSION
810
from countess.core.parameters import BaseParam, BooleanParam, FloatParam, StringParam
@@ -32,10 +34,20 @@ def load_file(
3234
) -> duckdb.DuckDBPyRelation:
3335
# Open the file, convert it to a RecordBatchReader and then
3436
# wrap that up as a DuckDBPyRelation so we can filter it.
35-
reader = biobear.connect().read_fastq_file(filename)
36-
rel = cursor.from_arrow(reader.to_arrow_record_batch_reader())
37+
fastq_iter = dnaio.open(filename, open_threads=1)
38+
record_batch_iter = (
39+
pyarrow.RecordBatch.from_pylist([{'sequence': z.sequence, 'quality_scores': z.qualities} for z in y])
40+
for y in itertools.batched(fastq_iter, 5000)
41+
)
42+
rel = cursor.from_arrow(
43+
pyarrow.RecordBatchReader.from_batches(
44+
pyarrow.schema({'sequence': 'str', 'quality_scores': 'str'}),
45+
record_batch_iter
46+
)
47+
)
3748
if row_limit is not None:
38-
rel = rel.limit(row_limit)
49+
pass
50+
#rel = rel.limit(row_limit)
3951

4052
if self.min_avg_quality > 0:
4153
rel = rel.filter(
@@ -77,8 +89,17 @@ class LoadFastaPlugin(DuckdbLoadFileWithTheLotPlugin):
7789
def load_file(
7890
self, cursor: duckdb.DuckDBPyConnection, filename: str, file_param: BaseParam, row_limit: Optional[int] = None
7991
) -> duckdb.DuckDBPyRelation:
80-
reader = biobear.connect().read_fasta_file(filename)
81-
rel = cursor.from_arrow(reader.to_arrow_record_batch_reader())
92+
fasta_iter = dnaio.open(filename, open_threads=1)
93+
record_batch_iter = (
94+
pyarrow.RecordBatch.from_pylist([{'seq': z.sequence, 'qual': z.qualities} for z in y])
95+
for y in itertools.batched(fasta_iter, 5000)
96+
)
97+
rel = cursor.from_arrow(
98+
pyarrow.RecordBatchReader.from_batches(
99+
pyarrow.schema({'seq': 'str', 'qual': 'str'}),
100+
record_batch_iter
101+
)
102+
)
82103
if row_limit is not None:
83104
rel = rel.limit(row_limit)
84105
return rel

docs/installing-countess/index.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,10 @@ we use [homebrew](https://brew.sh/) to install more recent versions.
9797
* Run `rustup-init.exe` to install rust and cargo.
9898

9999
* If you haven't already got Visual Studio installed, select 1 to install it.
100-
Once it is installed, rustup will ask you to reboot.
101-
Once your machine has rebooted run `rustup-init.exe` again to finish
102-
installation of rust and cargo.
100+
101+
* Ignore the prompt to log in to Visual Studio and just let rustup finish.
102+
* Once that installation has finished, run `rustup-init.exe` again and select 1.
103+
* This time it should say "Rust is installed now. Great!"
103104

104105
* Now open `cmd.exe` and run `pip install countess`.
105106

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ classifiers = [
1818
'Topic :: Scientific/Engineering :: Bio-Informatics',
1919
]
2020
dependencies = [
21-
'biobear~=0.23.2',
21+
'dnaio~=1.2.3',
2222
'duckdb>=1.3.1',
2323
'fqfa~=1.3.1',
2424
'more_itertools~=9.1.0',

0 commit comments

Comments
 (0)