-
Notifications
You must be signed in to change notification settings - Fork 0
Sourcery refactored master branch #1
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,18 +21,13 @@ def __init__(self, database: str, host: Tuple[str] = ('localhost',), **kwargs): | |
|
||
def get_item(self, item: str) -> Union[SeqRecord, None]: | ||
t = self.loop.run_until_complete(self.col.find_one({'$or': [{i: item} for i in self.ids]})) | ||
if t is None: | ||
return None | ||
r = self._extract_seqrecord(t['raw_record']) | ||
return r | ||
return None if t is None else self._extract_seqrecord(t['raw_record']) | ||
|
||
def get_iter(self) -> Generator[SeqRecord, None, None]: | ||
q = asyncio.Queue() | ||
self.loop.create_task(self._get_iter(q)) | ||
r = self.loop.run_until_complete(q.get()) | ||
while r: | ||
while r := self.loop.run_until_complete(q.get()): | ||
yield r | ||
r = self.loop.run_until_complete(q.get()) | ||
Comment on lines
-32
to
-35
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
async def _get_iter(self, q: asyncio.Queue) -> None: | ||
async for entry in self.col.find({'_id': {'$exists': True}}): | ||
|
@@ -42,10 +37,8 @@ async def _get_iter(self, q: asyncio.Queue) -> None: | |
def get_iterkeys(self) -> Generator[str, None, None]: | ||
q = asyncio.Queue() | ||
self.loop.create_task(self._get_iterkeys(q)) | ||
r = self.loop.run_until_complete(q.get()) | ||
while r: | ||
while r := self.loop.run_until_complete(q.get()): | ||
yield r | ||
r = self.loop.run_until_complete(q.get()) | ||
Comment on lines
-45
to
-48
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
async def _get_iterkeys(self, q: asyncio.Queue) -> None: | ||
async for i in self.col.find({'_id': {'$exists': True}}, {'_id': 1}): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,7 +30,6 @@ def __init__(self, database: str, host: Union[tuple, str], | |
self.create_protein_func = partial(create_protein_func, compressor=self.compressor) | ||
from UniprotDB._utils import _extract_seqrecord | ||
self._extract_seqrecord = partial(_extract_seqrecord, decompressor=self.decompressor) | ||
pass | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def initialize(self, seq_handles: Iterable, | ||
filter_fn: Callable[[bytes], bool] = None, | ||
|
@@ -91,9 +90,7 @@ def update(self, handles: Iterable, | |
def add_record(self, raw_record: bytes, test: str = None, test_attr: str = None) -> bool: | ||
protein = self.create_protein_func(raw_record) | ||
if test: | ||
good = False | ||
if test == protein['_id']: | ||
good = True | ||
good = test == protein['_id'] | ||
Comment on lines
-94
to
+93
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
if not good: | ||
for ref in ([test_attr] if test_attr else self.ids): | ||
if test in protein.get(ref, []): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,15 +64,20 @@ def _setup_dbs(self) -> None: | |
|
||
self.db: Dict[str] = {} | ||
for i in range(self.db_splits): | ||
self.db[str(i)] = lmdb.open(os.path.join(self.host, str(i) + '.lmdb'), | ||
map_size=self.map_size / self.db_splits, | ||
writemap=True, map_async=True, readahead=False) | ||
self.db[str(i)] = lmdb.open( | ||
os.path.join(self.host, f'{str(i)}.lmdb'), | ||
map_size=self.map_size / self.db_splits, | ||
writemap=True, | ||
map_async=True, | ||
readahead=False, | ||
) | ||
|
||
if self.has_index: | ||
self.index_dbs: Dict[str] = {} | ||
for index in self.indices: | ||
for i in range(self.index_db_splits): | ||
self.index_dbs[index + str(i)] = \ | ||
lmdb.open(os.path.join(self.host, index + str(i) + '.lmdb'), | ||
lmdb.open(os.path.join(self.host, index + str(i) + '.lmdb'), | ||
Comment on lines
-67
to
+80
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
map_size=self.map_size / self.index_db_splits, | ||
writemap=True, map_async=True, readahead=False) | ||
with open(os.path.join(self.host, 'db_info.json'), 'w') as o: | ||
|
@@ -109,9 +114,7 @@ def get_item(self, item: str) -> Union[SeqRecord, None]: | |
with self.db[self._get_subdb(t.decode())].begin() as txn: | ||
t = txn.get(t) | ||
break | ||
if t is None: | ||
return None | ||
return self._extract_seqrecord(t) | ||
return None if t is None else self._extract_seqrecord(t) | ||
Comment on lines
-112
to
+117
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def get_iter(self) -> Generator[SeqRecord, None, None]: | ||
for i in range(self.db_splits): | ||
|
@@ -148,8 +151,7 @@ def get_by(self, attr: str, value: str) -> List[SeqRecord]: | |
db = self.index_dbs[subdb].open_db(dupsort=True) | ||
cur = txn.cursor(db=db) | ||
if cur.set_key(value.encode()): | ||
for i in cur.iternext_dup(): | ||
ret.append(self.get_item(i.decode())) | ||
ret.extend(self.get_item(i.decode()) for i in cur.iternext_dup()) | ||
Comment on lines
-151
to
+154
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
return ret | ||
|
||
def _create_indices(self, background: bool = False) -> None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,10 +18,7 @@ def __init__(self, database: str, host: Union[str, tuple] = ('localhost',), **kw | |
|
||
def get_item(self, item: str) -> Union[SeqRecord, None]: | ||
t = self.col.find_one({'$or': [{i: item} for i in self.ids]}, {'raw_record': True}) | ||
if t is None: | ||
return None | ||
r = self._extract_seqrecord(t['raw_record']) | ||
return r | ||
return None if t is None else self._extract_seqrecord(t['raw_record']) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def get_iter(self) -> Generator[SeqRecord, None, None]: | ||
for entry in self.col.find({}, {'raw_record': True}): | ||
|
@@ -38,11 +35,8 @@ def length(self) -> int: | |
return self.col.count_documents({}) | ||
|
||
def get_by(self, attr: str, value: str) -> List[SeqRecord]: | ||
ret = [] | ||
res = self.col.find({attr: value}, {'raw_record': True}) | ||
for i in res: | ||
ret.append(self._extract_seqrecord(i['raw_record'])) | ||
return ret | ||
return [self._extract_seqrecord(i['raw_record']) for i in res] | ||
Comment on lines
-41
to
+39
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
def _reset(self) -> None: | ||
self.client[self.database].proteins.drop() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,7 @@ def _get_record(handle: BinaryIO, ignore: Collection[bytes] = (b'R', b'C')): | |
""" | ||
lines = [] | ||
for line in handle: | ||
if not line[0] in ignore: | ||
if line[0] not in ignore: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
lines.append(line) | ||
if line.startswith(b'//'): | ||
yield b''.join(lines) | ||
|
@@ -22,8 +22,7 @@ def filter_proks(record: bytes): | |
good_taxa = {b'Archaea', b'Bacteria', } | ||
taxa = re.search(b'OC.*\n', record).group()[5:] | ||
base_taxa = taxa.split(b'; ')[0] | ||
good = base_taxa in good_taxa | ||
return good | ||
return base_taxa in good_taxa | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
|
||
def parse_raw_swiss(handle: BinaryIO, filter_fn: Callable[[bytes], bool] = None): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -101,7 +101,7 @@ def update_trembl_taxa(self, taxa: Iterable, filter_fn: Callable[[bytes], bool] | |
import urllib.request | ||
for taxon in taxa: | ||
taxon_handle = gzip.open(urllib.request.urlopen(trembl_taxa_prefix.format(taxon))) | ||
print("Updating {}".format(taxon)) | ||
print(f"Updating {taxon}") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.update([taxon_handle], filter_fn, loud, workers=workers) | ||
taxon_handle.close() | ||
|
||
|
@@ -130,7 +130,12 @@ def create_index(flatfiles: Iterable, host: Union[str, tuple] = (), | |
host/filename + dbtype, fill the database with the protein entries and returns a SeqDB object. | ||
""" | ||
from .data_loader import process_main | ||
s = process_main(flatfiles, host, | ||
dbtype=dbtype, initialize=True, verbose=True, n_jobs=n_jobs, **kwargs) | ||
|
||
return s | ||
return process_main( | ||
flatfiles, | ||
host, | ||
dbtype=dbtype, | ||
initialize=True, | ||
verbose=True, | ||
n_jobs=n_jobs, | ||
**kwargs | ||
) | ||
Comment on lines
-133
to
+141
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -78,7 +78,7 @@ def _extract_seqrecord(raw_record: bytes, decompressor: zstd.ZstdDecompressor) - | |
|
||
def search_uniprot(value: str, retries: int = 3) -> Generator[bytes, None, None]: | ||
possible_ids = [] | ||
for x in range(retries): | ||
for _ in range(retries): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
try: | ||
possible_ids = requests.get(query_req.format(value)).content.split() | ||
break | ||
|
@@ -87,7 +87,7 @@ def search_uniprot(value: str, retries: int = 3) -> Generator[bytes, None, None] | |
|
||
raw_record = None | ||
for pid in possible_ids[:5]: | ||
for x in range(retries): | ||
for _ in range(retries): | ||
try: | ||
raw_record = requests.get(fetch_req.format(pid.decode())).content | ||
break | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,10 +162,7 @@ def process_main(dats: Iterable[str], | |
|
||
if verbose: | ||
from tqdm import tqdm | ||
if num_seqs: | ||
pbar = tqdm(total=num_seqs) | ||
else: | ||
pbar = tqdm() | ||
pbar = tqdm(total=num_seqs) if num_seqs else tqdm() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
current = len(seqdb) | ||
|
||
with tempfile.TemporaryDirectory() as directory: | ||
|
@@ -180,7 +177,7 @@ def process_main(dats: Iterable[str], | |
logging.debug('Started async processing') | ||
logging.debug(f'Opening fifos {fifos}') | ||
output_handles = [open(f, 'wb') for f in fifos] | ||
logging.debug(f'Opening fifos') | ||
logging.debug('Opening fifos') | ||
feed_steps = feed_files(fh, output_handles) | ||
all_fed = False | ||
logging.debug(f'Starting fifo feeding from {dat}') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,14 +62,14 @@ def test_update(self): | |
with gzip.open('TestFiles/testbig.dat.gz', 'rb') as h: | ||
self.db.update([h]) | ||
with gzip.open('TestFiles/testbig.dat.gz', 'rb') as h: | ||
ids = set(line.split()[1].decode() for line in h if line.startswith(b'ID')) | ||
inserted_ids = set(e.name for e in self.db) | ||
ids = {line.split()[1].decode() for line in h if line.startswith(b'ID')} | ||
inserted_ids = {e.name for e in self.db} | ||
Comment on lines
-65
to
+66
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.assertEqual(inserted_ids, ids) | ||
|
||
def test_update_filtered(self): | ||
with gzip.open('TestFiles/testbig.dat.gz', 'rb') as h: | ||
self.db.update([h], filter_fn=filter_proks) | ||
self.assertEqual(len(set(e.name for e in self.db)), 70) | ||
self.assertEqual(len({e.name for e in self.db}), 70) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
|
||
@unittest.skipUnless(HAS_MONGO, "requires pymongo") | ||
|
@@ -79,8 +79,7 @@ def setUp(self): | |
self.database = 'test_uni2' | ||
|
||
import os | ||
db_host = os.environ.get('TEST_DB_HOST') | ||
if db_host: | ||
if db_host := os.environ.get('TEST_DB_HOST'): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.db = UniprotDB.create_index(['TestFiles/test.dat.bgz'], | ||
host=(db_host,), | ||
database=self.database, | ||
|
@@ -101,9 +100,7 @@ def setUp(self): | |
self.database = 'test_uni2' | ||
|
||
import os | ||
db_host = os.environ.get('TEST_DB_HOST') | ||
|
||
if db_host: | ||
if db_host := os.environ.get('TEST_DB_HOST'): | ||
Comment on lines
-104
to
+103
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
self.db = UniprotDB.create_index(['TestFiles/test.dat.bgz'], host=(db_host,), | ||
database=self.database, dbtype='mongoasync') | ||
else: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function MongoDatabase.get_item refactored with the following changes:
- inline-immediately-returned-variable
- reintroduce-else
- assign-if-exp