Skip to content
This repository was archived by the owner on Jan 7, 2024. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions UniprotDB/AsyncMongoDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,13 @@ def __init__(self, database: str, host: Tuple[str] = ('localhost',), **kwargs):

def get_item(self, item: str) -> Union[SeqRecord, None]:
t = self.loop.run_until_complete(self.col.find_one({'$or': [{i: item} for i in self.ids]}))
if t is None:
return None
r = self._extract_seqrecord(t['raw_record'])
return r
return None if t is None else self._extract_seqrecord(t['raw_record'])
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

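Function MongoDatabase.get_item refactored with the following changes:

  • Collapse the `if t is None: return None` guard and the immediately returned temporary `r` into a single conditional-expression `return`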


def get_iter(self) -> Generator[SeqRecord, None, None]:
q = asyncio.Queue()
self.loop.create_task(self._get_iter(q))
r = self.loop.run_until_complete(q.get())
while r:
while r := self.loop.run_until_complete(q.get()):
yield r
r = self.loop.run_until_complete(q.get())
Comment on lines -32 to -35
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

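Function MongoDatabase.get_iter refactored with the following changes:

  • Use an assignment expression (`:=`, Python 3.8+) as the `while` condition, so `self.loop.run_until_complete(q.get())` is written once instead of before the loop and again at the end of each iteration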


async def _get_iter(self, q: asyncio.Queue) -> None:
async for entry in self.col.find({'_id': {'$exists': True}}):
Expand All @@ -42,10 +37,8 @@ async def _get_iter(self, q: asyncio.Queue) -> None:
def get_iterkeys(self) -> Generator[str, None, None]:
q = asyncio.Queue()
self.loop.create_task(self._get_iterkeys(q))
r = self.loop.run_until_complete(q.get())
while r:
while r := self.loop.run_until_complete(q.get()):
yield r
r = self.loop.run_until_complete(q.get())
Comment on lines -45 to -48
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

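Function MongoDatabase.get_iterkeys refactored with the following changes:

  • Use an assignment expression (`:=`, Python 3.8+) as the `while` condition, so `self.loop.run_until_complete(q.get())` is written once instead of before the loop and again at the end of each iteration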


async def _get_iterkeys(self, q: asyncio.Queue) -> None:
async for i in self.col.find({'_id': {'$exists': True}}, {'_id': 1}):
Expand Down
5 changes: 1 addition & 4 deletions UniprotDB/BaseDatabase.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ def __init__(self, database: str, host: Union[tuple, str],
self.create_protein_func = partial(create_protein_func, compressor=self.compressor)
from UniprotDB._utils import _extract_seqrecord
self._extract_seqrecord = partial(_extract_seqrecord, decompressor=self.decompressor)
pass
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function BaseDatabase.__init__ refactored with the following changes:

  • Remove the redundant trailing `pass` statement


def initialize(self, seq_handles: Iterable,
filter_fn: Callable[[bytes], bool] = None,
Expand Down Expand Up @@ -91,9 +90,7 @@ def update(self, handles: Iterable,
def add_record(self, raw_record: bytes, test: str = None, test_attr: str = None) -> bool:
protein = self.create_protein_func(raw_record)
if test:
good = False
if test == protein['_id']:
good = True
good = test == protein['_id']
Comment on lines -94 to +93
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

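Function BaseDatabase.add_record refactored with the following changes:

  • Assign the comparison result directly (`good = test == protein['_id']`) instead of initialising `good = False` and setting it to `True` inside an `if`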

if not good:
for ref in ([test_attr] if test_attr else self.ids):
if test in protein.get(ref, []):
Expand Down
20 changes: 11 additions & 9 deletions UniprotDB/LMDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,20 @@ def _setup_dbs(self) -> None:

self.db: Dict[str] = {}
for i in range(self.db_splits):
self.db[str(i)] = lmdb.open(os.path.join(self.host, str(i) + '.lmdb'),
map_size=self.map_size / self.db_splits,
writemap=True, map_async=True, readahead=False)
self.db[str(i)] = lmdb.open(
os.path.join(self.host, f'{str(i)}.lmdb'),
map_size=self.map_size / self.db_splits,
writemap=True,
map_async=True,
readahead=False,
)

if self.has_index:
self.index_dbs: Dict[str] = {}
for index in self.indices:
for i in range(self.index_db_splits):
self.index_dbs[index + str(i)] = \
lmdb.open(os.path.join(self.host, index + str(i) + '.lmdb'),
lmdb.open(os.path.join(self.host, index + str(i) + '.lmdb'),
Comment on lines -67 to +80
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

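Function RawLMDBDatabase._setup_dbs refactored with the following changes:

  • Build the per-split database filename with an f-string (`f'{str(i)}.lmdb'`) instead of `str(i) + '.lmdb'`, and reformat the `lmdb.open(...)` call with one argument per line; the index-database `lmdb.open(...)` line changes in whitespace only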

map_size=self.map_size / self.index_db_splits,
writemap=True, map_async=True, readahead=False)
with open(os.path.join(self.host, 'db_info.json'), 'w') as o:
Expand Down Expand Up @@ -109,9 +114,7 @@ def get_item(self, item: str) -> Union[SeqRecord, None]:
with self.db[self._get_subdb(t.decode())].begin() as txn:
t = txn.get(t)
break
if t is None:
return None
return self._extract_seqrecord(t)
return None if t is None else self._extract_seqrecord(t)
Comment on lines -112 to +117
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

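Function RawLMDBDatabase.get_item refactored with the following changes:

  • Replace the `if t is None: return None` guard followed by a separate `return` with a single conditional-expression `return`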


def get_iter(self) -> Generator[SeqRecord, None, None]:
for i in range(self.db_splits):
Expand Down Expand Up @@ -148,8 +151,7 @@ def get_by(self, attr: str, value: str) -> List[SeqRecord]:
db = self.index_dbs[subdb].open_db(dupsort=True)
cur = txn.cursor(db=db)
if cur.set_key(value.encode()):
for i in cur.iternext_dup():
ret.append(self.get_item(i.decode()))
ret.extend(self.get_item(i.decode()) for i in cur.iternext_dup())
Comment on lines -151 to +154
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

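Function RawLMDBDatabase.get_by refactored with the following changes:

  • Replace the `for` loop that calls `ret.append(...)` with a single `ret.extend(...)` over a generator expression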

return ret

def _create_indices(self, background: bool = False) -> None:
Expand Down
10 changes: 2 additions & 8 deletions UniprotDB/MongoDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,7 @@ def __init__(self, database: str, host: Union[str, tuple] = ('localhost',), **kw

def get_item(self, item: str) -> Union[SeqRecord, None]:
t = self.col.find_one({'$or': [{i: item} for i in self.ids]}, {'raw_record': True})
if t is None:
return None
r = self._extract_seqrecord(t['raw_record'])
return r
return None if t is None else self._extract_seqrecord(t['raw_record'])
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

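Function MongoDatabase.get_item refactored with the following changes:

  • Collapse the `if t is None: return None` guard and the immediately returned temporary `r` into a single conditional-expression `return`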


def get_iter(self) -> Generator[SeqRecord, None, None]:
for entry in self.col.find({}, {'raw_record': True}):
Expand All @@ -38,11 +35,8 @@ def length(self) -> int:
return self.col.count_documents({})

def get_by(self, attr: str, value: str) -> List[SeqRecord]:
ret = []
res = self.col.find({attr: value}, {'raw_record': True})
for i in res:
ret.append(self._extract_seqrecord(i['raw_record']))
return ret
return [self._extract_seqrecord(i['raw_record']) for i in res]
Comment on lines -41 to +39
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

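Function MongoDatabase.get_by refactored with the following changes:

  • Build the result with a list comprehension and return it directly, instead of appending to the temporary list `ret` in a `for` loop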


def _reset(self) -> None:
self.client[self.database].proteins.drop()
Expand Down
5 changes: 2 additions & 3 deletions UniprotDB/SwissProtUtils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def _get_record(handle: BinaryIO, ignore: Collection[bytes] = (b'R', b'C')):
"""
lines = []
for line in handle:
if not line[0] in ignore:
if line[0] not in ignore:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function _get_record refactored with the following changes:

  • Simplify logical expression using De Morgan identities (de-morgan)

lines.append(line)
if line.startswith(b'//'):
yield b''.join(lines)
Expand All @@ -22,8 +22,7 @@ def filter_proks(record: bytes):
good_taxa = {b'Archaea', b'Bacteria', }
taxa = re.search(b'OC.*\n', record).group()[5:]
base_taxa = taxa.split(b'; ')[0]
good = base_taxa in good_taxa
return good
return base_taxa in good_taxa
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

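Function filter_proks refactored with the following changes:

  • Return the membership test `base_taxa in good_taxa` directly instead of through the temporary `good`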



def parse_raw_swiss(handle: BinaryIO, filter_fn: Callable[[bytes], bool] = None):
Expand Down
15 changes: 10 additions & 5 deletions UniprotDB/UniprotDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def update_trembl_taxa(self, taxa: Iterable, filter_fn: Callable[[bytes], bool]
import urllib.request
for taxon in taxa:
taxon_handle = gzip.open(urllib.request.urlopen(trembl_taxa_prefix.format(taxon)))
print("Updating {}".format(taxon))
print(f"Updating {taxon}")
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function SeqDB.update_trembl_taxa refactored with the following changes:

  • Replace the `"Updating {}".format(taxon)` call with the f-string `f"Updating {taxon}"`

self.update([taxon_handle], filter_fn, loud, workers=workers)
taxon_handle.close()

Expand Down Expand Up @@ -130,7 +130,12 @@ def create_index(flatfiles: Iterable, host: Union[str, tuple] = (),
host/filename + dbtype, fill the database with the protein entries and returns a SeqDB object.
"""
from .data_loader import process_main
s = process_main(flatfiles, host,
dbtype=dbtype, initialize=True, verbose=True, n_jobs=n_jobs, **kwargs)

return s
return process_main(
flatfiles,
host,
dbtype=dbtype,
initialize=True,
verbose=True,
n_jobs=n_jobs,
**kwargs
)
Comment on lines -133 to +141
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

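Function create_index refactored with the following changes:

  • Return the result of `process_main(...)` directly instead of through the temporary `s`; the call is reformatted with one argument per line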

4 changes: 2 additions & 2 deletions UniprotDB/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def _extract_seqrecord(raw_record: bytes, decompressor: zstd.ZstdDecompressor) -

def search_uniprot(value: str, retries: int = 3) -> Generator[bytes, None, None]:
possible_ids = []
for x in range(retries):
for _ in range(retries):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function search_uniprot refactored with the following changes:

  • Rename the unused loop variable `x` to `_` in both retry loops

try:
possible_ids = requests.get(query_req.format(value)).content.split()
break
Expand All @@ -87,7 +87,7 @@ def search_uniprot(value: str, retries: int = 3) -> Generator[bytes, None, None]

raw_record = None
for pid in possible_ids[:5]:
for x in range(retries):
for _ in range(retries):
try:
raw_record = requests.get(fetch_req.format(pid.decode())).content
break
Expand Down
7 changes: 2 additions & 5 deletions UniprotDB/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,7 @@ def process_main(dats: Iterable[str],

if verbose:
from tqdm import tqdm
if num_seqs:
pbar = tqdm(total=num_seqs)
else:
pbar = tqdm()
pbar = tqdm(total=num_seqs) if num_seqs else tqdm()
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

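Function process_main refactored with the following changes:

  • Replace the `if num_seqs: ... else: ...` assignment of `pbar` with a conditional expression
  • Drop the `f` prefix from the placeholder-free string `'Opening fifos'`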

current = len(seqdb)

with tempfile.TemporaryDirectory() as directory:
Expand All @@ -180,7 +177,7 @@ def process_main(dats: Iterable[str],
logging.debug('Started async processing')
logging.debug(f'Opening fifos {fifos}')
output_handles = [open(f, 'wb') for f in fifos]
logging.debug(f'Opening fifos')
logging.debug('Opening fifos')
feed_steps = feed_files(fh, output_handles)
all_fed = False
logging.debug(f'Starting fifo feeding from {dat}')
Expand Down
13 changes: 5 additions & 8 deletions UniprotDBTests.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,14 @@ def test_update(self):
with gzip.open('TestFiles/testbig.dat.gz', 'rb') as h:
self.db.update([h])
with gzip.open('TestFiles/testbig.dat.gz', 'rb') as h:
ids = set(line.split()[1].decode() for line in h if line.startswith(b'ID'))
inserted_ids = set(e.name for e in self.db)
ids = {line.split()[1].decode() for line in h if line.startswith(b'ID')}
inserted_ids = {e.name for e in self.db}
Comment on lines -65 to +66
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function SeqDBTest.test_update refactored with the following changes:

  • Replace `set(<generator expression>)` with set comprehensions for `ids` and `inserted_ids`

self.assertEqual(inserted_ids, ids)

def test_update_filtered(self):
with gzip.open('TestFiles/testbig.dat.gz', 'rb') as h:
self.db.update([h], filter_fn=filter_proks)
self.assertEqual(len(set(e.name for e in self.db)), 70)
self.assertEqual(len({e.name for e in self.db}), 70)
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function SeqDBTest.test_update_filtered refactored with the following changes:

  • Replace `set(e.name for e in self.db)` with the set comprehension `{e.name for e in self.db}`



@unittest.skipUnless(HAS_MONGO, "requires pymongo")
Expand All @@ -79,8 +79,7 @@ def setUp(self):
self.database = 'test_uni2'

import os
db_host = os.environ.get('TEST_DB_HOST')
if db_host:
if db_host := os.environ.get('TEST_DB_HOST'):
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

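Function MongoTest.setUp refactored with the following changes:

  • Use an assignment expression (`:=`, Python 3.8+) to bind `db_host` inside the `if` condition instead of assigning it on a separate line first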

self.db = UniprotDB.create_index(['TestFiles/test.dat.bgz'],
host=(db_host,),
database=self.database,
Expand All @@ -101,9 +100,7 @@ def setUp(self):
self.database = 'test_uni2'

import os
db_host = os.environ.get('TEST_DB_HOST')

if db_host:
if db_host := os.environ.get('TEST_DB_HOST'):
Comment on lines -104 to +103
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

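Function AsyncTest.setUp refactored with the following changes:

  • Use an assignment expression (`:=`, Python 3.8+) to bind `db_host` inside the `if` condition instead of assigning it on a separate line first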

self.db = UniprotDB.create_index(['TestFiles/test.dat.bgz'], host=(db_host,),
database=self.database, dbtype='mongoasync')
else:
Expand Down