Skip to content

Commit 29ff47f

Browse files
authored
Merge pull request #70 from Ensembl/feature/dataset_factory
Feature/dataset factory
2 parents fdc861d + 7f62c83 commit 29ff47f

File tree

15 files changed

+1056
-50
lines changed

15 files changed

+1056
-50
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.0.0
1+
2.0.1

requirements.in

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
ensembl-py@git+https://github.com/Ensembl/[email protected]
22
grpcio
33
grpcio-tools
4-
grpcio-reflection
5-
sqlalchemy
6-
types-pymysql
4+
sqlalchemy<=2.0
5+
types-pymysql
6+
urllib3~=1.26.15
7+

requirements.txt

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# pip-compile requirements.in
66
#
7-
certifi==2024.2.2
7+
certifi==2023.11.17
88
# via requests
99
charset-normalizer==3.3.2
1010
# via requests
@@ -16,14 +16,11 @@ exceptiongroup==1.2.0
1616
# via pytest
1717
greenlet==3.0.3
1818
# via sqlalchemy
19-
grpcio==1.62.0
19+
grpcio==1.60.0
2020
# via
2121
# -r requirements.in
22-
# grpcio-reflection
2322
# grpcio-tools
24-
grpcio-reflection==1.62.0
25-
# via -r requirements.in
26-
grpcio-tools==1.62.0
23+
grpcio-tools==1.60.0
2724
# via -r requirements.in
2825
idna==3.6
2926
# via requests
@@ -33,13 +30,11 @@ mysqlclient==2.1.1
3330
# via ensembl-py
3431
packaging==23.2
3532
# via pytest
36-
pluggy==1.4.0
33+
pluggy==1.3.0
3734
# via pytest
38-
protobuf==4.25.3
39-
# via
40-
# grpcio-reflection
41-
# grpcio-tools
42-
pytest==8.0.2
35+
protobuf==4.25.2
36+
# via grpcio-tools
37+
pytest==7.4.4
4338
# via
4439
# ensembl-py
4540
# pytest-dependency
@@ -62,8 +57,10 @@ tomli==2.0.1
6257
# via pytest
6358
types-pymysql==1.1.0.1
6459
# via -r requirements.in
65-
urllib3==2.2.1
66-
# via requests
60+
urllib3==1.26.18
61+
# via
62+
# -r requirements.in
63+
# requests
6764

6865
# The following packages are considered to be unsafe in a requirements file:
6966
# setuptools

src/ensembl/production/metadata/api/exceptions.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,8 @@ class UpdateBackCoreException(UpdaterException, RuntimeError):
4747

4848
class TypeNotFoundException(UpdaterException, RuntimeError):
    """Raised when a requested Dataset Type is not found."""
    pass
51+
52+
class DatasetFactoryException(Exception):
    """An error occurred while using the dataset factory."""
    pass

src/ensembl/production/metadata/api/factories/datasets.py

Lines changed: 338 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
#!/usr/bin/env python
2+
# See the NOTICE file distributed with this work for additional information
3+
# regarding copyright ownership.
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
'''
15+
Fetch Genome Info From New Metadata Database
16+
'''
17+
18+
import argparse
19+
import json
20+
import logging
21+
import re
22+
from dataclasses import dataclass, field
23+
from ensembl.database import DBConnection
24+
from ensembl.production.metadata.api.factories.datasets import DatasetFactory
25+
from ensembl.production.metadata.api.models.dataset import DatasetType, Dataset, DatasetSource
26+
from ensembl.production.metadata.api.models.genome import Genome, GenomeDataset
27+
from ensembl.production.metadata.api.models.organism import Organism, OrganismGroup, OrganismGroupMember
28+
from sqlalchemy import select, text
29+
from typing import List
30+
31+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
32+
logger = logging.getLogger(__name__)
33+
34+
35+
@dataclass
class GenomeInputFilters:
    """Filter criteria consumed by ``GenomeFactory.get_genomes``.

    All list filters default to empty (i.e. no restriction); ``dataset_status``
    defaults to ``["Submitted"]`` and ``dataset_type`` to ``"assembly"``.
    ``batch_size``/``page`` drive pagination, ``run_all`` overrides ``division``
    with every Ensembl division, and ``update_dataset_status`` (when non-empty)
    triggers a status update for each matched dataset.
    """

    # Mandatory MySQL URI of the metadata database.
    metadata_db_uri: str
    genome_uuid: List[str] = field(default_factory=list)
    dataset_uuid: List[str] = field(default_factory=list)
    division: List[str] = field(default_factory=list)
    dataset_type: str = "assembly"
    species: List[str] = field(default_factory=list)
    antispecies: List[str] = field(default_factory=list)
    dataset_status: List[str] = field(default_factory=lambda: ["Submitted"])
    batch_size: int = 50
    page: int = 1
    organism_group_type: str = "DIVISION"
    run_all: int = 0
    update_dataset_status: str = ""
    # Idiom fix: use the plain ``dict`` constructor instead of ``lambda: {}``.
    update_dataset_attribute: dict = field(default_factory=dict)
    # Columns selected by the genome query (SQLAlchemy model attributes);
    # the lambda defers model attribute access until instantiation.
    columns: List = field(default_factory=lambda: [
        Genome.genome_uuid,
        Genome.production_name.label('species'),
        Dataset.dataset_uuid,
        Dataset.status.label('dataset_status'),
        DatasetSource.name.label('dataset_source'),
        DatasetType.name.label('dataset_type'),
    ])
59+
@dataclass
class GenomeFactory:
    """Fetches genome/dataset info from the metadata database by filter criteria.

    Optionally updates each matched dataset's status (via ``DatasetFactory``)
    while iterating the results.
    """

    @staticmethod
    def _apply_filters(query, filters):
        """Apply the ``GenomeInputFilters`` criteria to the base select.

        Mutates ``filters.division`` when ``run_all`` is set and clamps
        ``filters.page`` to a minimum of 1.
        """
        query = query.filter(OrganismGroup.type == filters.organism_group_type)

        if filters.run_all:
            # run_all overrides any explicit division selection with every division.
            filters.division = [
                'EnsemblBacteria',
                'EnsemblVertebrates',
                'EnsemblPlants',
                'EnsemblProtists',
                'EnsemblMetazoa',
                'EnsemblFungi',
            ]

        if filters.genome_uuid:
            query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid))

        if filters.dataset_uuid:
            query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid))

        if filters.division:
            ensembl_divisions = filters.division
            if filters.organism_group_type == 'DIVISION':
                # Normalise e.g. "metazoa" / "EnsemblMetazoa" to "EnsemblMetazoa".
                pattern = re.compile(r'^(ensembl)?', re.IGNORECASE)
                ensembl_divisions = ['Ensembl' + pattern.sub('', d).capitalize()
                                     for d in ensembl_divisions if d]
            query = query.filter(OrganismGroup.name.in_(ensembl_divisions))

        if filters.species:
            # Requested species minus any explicitly excluded ones.
            species = set(filters.species) - set(filters.antispecies)
            if species:
                # BUGFIX: filter on the reduced set, not the raw filters.species,
                # so antispecies entries are actually excluded.
                query = query.filter(Genome.production_name.in_(species))
            else:
                query = query.filter(~Genome.production_name.in_(filters.antispecies))
        elif filters.antispecies:
            query = query.filter(~Genome.production_name.in_(filters.antispecies))

        if filters.dataset_type:
            query = query.filter(Genome.genome_datasets.any(DatasetType.name.in_([filters.dataset_type])))

        if filters.dataset_status:
            query = query.filter(Dataset.status.in_(filters.dataset_status))

        if filters.batch_size:
            filters.page = filters.page if filters.page > 0 else 1
            query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size)

        return query

    def _build_query(self, filters):
        """Build the base select over Genome joined to organism groups and datasets."""
        query = select(filters.columns) \
            .select_from(Genome) \
            .join(Genome.organism) \
            .join(Organism.organism_group_members) \
            .join(OrganismGroupMember.organism_group) \
            .outerjoin(Genome.genome_datasets) \
            .join(GenomeDataset.dataset) \
            .join(Dataset.dataset_source) \
            .join(Dataset.dataset_type) \
            .group_by(Genome.genome_id, Dataset.dataset_id) \
            .order_by(Genome.genome_uuid)

        return self._apply_filters(query, filters)

    def get_genomes(self, **filters):
        """Yield one info dict per matched genome/dataset row.

        ``filters`` are the keyword arguments of :class:`GenomeInputFilters`.
        When ``update_dataset_status`` is set, each yielded dict also carries
        ``updated_dataset_status`` (the new status, or None on failure).
        Rows without a dataset uuid are skipped with a warning.
        """
        input_filters = GenomeInputFilters(**filters)
        logger.info(f'Get Genomes with filters {input_filters}')

        with DBConnection(input_filters.metadata_db_uri).session_scope() as session:
            query = self._build_query(input_filters)
            logger.info(f'Executing SQL query: {query}')
            for genome in session.execute(query).fetchall():
                genome_info = genome._asdict()
                dataset_uuid = genome_info.get('dataset_uuid', None)

                # TODO: below code required with implementation of datasetstatus enum class in dataset models
                # #convert status enum object to string value
                # dataset_status = genome_info.get('dataset_status', None)
                # if dataset_status and isinstance(dataset_status, DatasetStatus) :
                #     genome_info['dataset_status'] = dataset_status.value

                if not dataset_uuid:
                    logger.warning(
                        f"No dataset uuid found for genome {genome_info} skipping this genome "
                    )
                    continue

                if input_filters.update_dataset_status:
                    _, status = DatasetFactory().update_dataset_status(
                        dataset_uuid, input_filters.update_dataset_status, session=session)
                    if input_filters.update_dataset_status == status:
                        logger.info(
                            f"Updated Dataset status for dataset uuid: {dataset_uuid}"
                            f" to {status} for genome {genome_info['genome_uuid']}"
                        )
                        genome_info['updated_dataset_status'] = status
                    else:
                        # BUGFIX: index genome_info (a dict), not the Row object —
                        # string indexing on a Row fails under SQLAlchemy 2.0.
                        logger.warning(
                            f"Cannot update status for dataset uuid: {dataset_uuid}"
                            f" {input_filters.update_dataset_status} to {status}"
                            f" for genome {genome_info['genome_uuid']}"
                        )
                        genome_info['updated_dataset_status'] = None

                yield genome_info
170+
171+
172+
173+
def main():
174+
parser = argparse.ArgumentParser(
175+
prog='genome.py',
176+
description='Fetch Ensembl genome info from the new metadata database'
177+
)
178+
parser.add_argument('--genome_uuid', type=str, nargs='*', default=[], required=False,
179+
help='List of genome UUIDs to filter the query. Default is an empty list.')
180+
parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False,
181+
help='List of dataset UUIDs to filter the query. Default is an empty list.')
182+
parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False,
183+
help='Organism group type to filter the query. Default is "DIVISION"')
184+
parser.add_argument('--division', type=str, nargs='*', default=[], required=False,
185+
help='List of organism group names to filter the query. Default is an empty list.')
186+
parser.add_argument('--dataset_type', type=str, default="assembly", required=False,
187+
help='List of dataset types to filter the query. Default is an empty list.')
188+
parser.add_argument('--species', type=str, nargs='*', default=[], required=False,
189+
help='List of Species Production names to filter the query. Default is an empty list.')
190+
parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False,
191+
help='List of Species Production names to exclude from the query. Default is an empty list.')
192+
parser.add_argument('--dataset_status', nargs='*', default=["Submitted"],
193+
choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False,
194+
help='List of dataset statuses to filter the query. Default is an empty list.')
195+
parser.add_argument('--update_dataset_status', type=str, default="", required=False,
196+
choices=['Submitted', 'Processing', 'Processed', 'Released', ''],
197+
help='Update the status of the selected datasets to the specified value. ')
198+
parser.add_argument('--batch_size', type=int, default=50, required=False,
199+
help='Number of results to retrieve per batch. Default is 50.')
200+
parser.add_argument('--page', default=1, required=False,
201+
type=lambda x: int(x) if int(x) > 0 else argparse.ArgumentTypeError("{x} is not a positive integer"),
202+
help='The page number for pagination. Default is 1.')
203+
parser.add_argument('--metadata_db_uri', type=str, required=True,
204+
help='metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata')
205+
parser.add_argument('--output', type=str, required=True, help='output file ex: genome_info.json')
206+
207+
args = parser.parse_args()
208+
209+
meta_details = re.match(r"mysql:\/\/.*:?(.*?)@(.*?):\d+\/(.*)", args.metadata_db_uri)
210+
with open(args.output, 'w') as json_output:
211+
logger.info(f'Connecting Metadata Database with host:{meta_details.group(2)} & dbname:{meta_details.group(3)}')
212+
213+
genome_fetcher = GenomeFactory()
214+
215+
logger.info(f'Writing Results to {args.output}')
216+
for genome in genome_fetcher.get_genomes(
217+
metadata_db_uri=args.metadata_db_uri,
218+
update_dataset_status=args.update_dataset_status,
219+
genome_uuid=args.genome_uuid,
220+
dataset_uuid=args.dataset_uuid,
221+
organism_group_type=args.organism_group_type,
222+
division=args.division,
223+
dataset_type=args.dataset_type,
224+
species=args.species,
225+
antispecies=args.antispecies,
226+
batch_size=args.batch_size,
227+
dataset_status=args.dataset_status,
228+
) or []:
229+
json.dump(genome, json_output)
230+
json_output.write("\n")
231+
232+
logger.info(f'Completed !')
233+
234+
235+
if __name__ == "__main__":
236+
logger.info('Fetching Genome Information From New Metadata Database')
237+
main()

src/ensembl/production/metadata/api/models/dataset.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,14 @@
99
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1010
# See the License for the specific language governing permissions and
1111
# limitations under the License.
12-
from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index
12+
import datetime
13+
import logging
14+
import uuid
15+
16+
from sqlalchemy import Column, Integer, String, Enum, text, ForeignKey, Index, JSON
1317
from sqlalchemy.dialects.mysql import DATETIME
1418
from sqlalchemy.orm import relationship
1519
from sqlalchemy.sql import func
16-
import datetime
17-
import uuid
18-
import logging
1920

2021
from ensembl.production.metadata.api.exceptions import MissingMetaException
2122
from ensembl.production.metadata.api.models.base import Base, LoadAble
@@ -48,8 +49,7 @@ class Dataset(LoadAble, Base):
4849
created = Column(DATETIME(fsp=6), server_default=func.now(), default=datetime.datetime.utcnow)
4950
dataset_source_id = Column(ForeignKey('dataset_source.dataset_source_id'), nullable=False, index=True)
5051
label = Column(String(128), nullable=False)
51-
status = Column(Enum('Submitted', 'Progressing', 'Processed', 'Released'),
52-
server_default=text("'Submitted'"))
52+
status = Column(Enum('Submitted', 'Processing', 'Processed', 'Released'), server_default=text('Submitted'))
5353

5454
# One to many relationships
5555
# dataset_id to dataset attribute and genome dataset
@@ -126,6 +126,9 @@ class DatasetType(LoadAble, Base):
126126
topic = Column(String(32), nullable=False)
127127
description = Column(String(255))
128128
details_uri = Column(String(255))
129+
parent = Column(String(128), default=None)
130+
depends_on = Column(String(128), default=None)
131+
filter_on = Column(JSON, default=None)
129132
# One to many relationships
130133
# dataset_type_id to dataset
131134
datasets = relationship('Dataset', back_populates='dataset_type')
Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,22 @@
1-
1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N
2-
2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N
3-
3 variation mRatBN7.2 Variation Annotation Short variant data for rattus_norvegicus \N
4-
4 evidence Variation Evidence Variation Annotation \N \N
5-
5 regulation_build Regulations Regulatory Annotation \N \N
6-
6 homologies Comparative homologies Comparative Annotation \N \N
7-
7 regulatory_features Regulatory Annotation Regulatory Annotation Regulatory annotation for an assembly \N
1+
1 assembly Genomic assembly Core Annotation Compilation of sequences for a genome \N \N \N \N
2+
2 genebuild Genomic Build Core Annotation Genomic annotations for an assembly \N \N \N \N
3+
3 variation mRatBN7.2 Variation Annotation Short variant data for rattus_norvegicus \N \N \N \N
4+
4 evidence Variation Evidence Variation Annotation \N \N \N \N \N
5+
5 regulation_build Regulations Regulatory Annotation \N \N \N \N \N
6+
6 homologies Comparative homologies Comparative Annotation \N \N \N \N \N
7+
7 regulatory_features Regulatory Annotation Regulatory Annotation Regulatory annotation for an assembly \N \N \N \N
8+
8 xrefs External References Production Compute Xref genome annotation for Genebuild \N 2 \N \N
9+
9 protein_features Protein Features Production Compute Interpro scan run against proteins \N 2 8 \N
10+
10 alpha_fold AlphaFold Production Compute AlphaFold compute against proteins \N 2 9 \N
11+
11 checksums	Checksums compute	Production Compute	Compute DNA sequences checksums	\N	2	\N	\N
12+
12 refget_load	Refget Loading	Production Compute	Refget database provisioning	\N	2	11	\N
13+
13 compara_load Compara Data Loading Production Release Preparation Load MongoDB homologies \N 6 15 \N
14+
14 search_dumps Data dumps for THOAS Production Release Preparation Dumps flat file for THOAS loading \N 2 1,8,9 \N
15+
15 compara_compute	Compute homologies database	Production Compute	Compute genome homologies database	\N	6	\N	\N
16+
16 ftp_dumps FTP File dumps Production Release Preparation Dumps all FTP File format from genebuild \N 2 1,8,9 \N
17+
17 compara_dumps Homologies file dumps Production Compute Dumped homologies tsv files \N 6 15 \N
18+
18 blast Blast file dumps Production Compute Dumps blast indexed files \N 2 \N \N
19+
20 variation_track Variation Track API update Production Release Preparation Load Variation Track API \N 3 \N \N
20+
21 genome_browser_track Genebuild Track API update Production Release Preparation Load Genebuild track API \N 2 \N \N
21+
22 regulation_track Regulation Track API update Production Release Preparation Load Regulation Track API \N 7 \N \N
22+
23 thoas_load Thoas Loading Production Release Preparation Load MongoDB THOAS collection \N 2 11,12,14 \N

0 commit comments

Comments
 (0)