|
| 1 | +#!/usr/bin/env python |
| 2 | +# See the NOTICE file distributed with this work for additional information |
| 3 | +# regarding copyright ownership. |
| 4 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | +# you may not use this file except in compliance with the License. |
| 6 | +# You may obtain a copy of the License at |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# Unless required by applicable law or agreed to in writing, software |
| 9 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 10 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 11 | +# See the License for the specific language governing permissions and |
| 12 | +# limitations under the License. |
| 13 | + |
| 14 | +''' |
| 15 | +Fetch Genome Info From New Metadata Database |
| 16 | +''' |
| 17 | + |
| 18 | +import argparse |
| 19 | +import json |
| 20 | +import logging |
| 21 | +import re |
| 22 | +from dataclasses import dataclass, field |
| 23 | +from ensembl.database import DBConnection |
| 24 | +from ensembl.production.metadata.api.factories.datasets import DatasetFactory |
| 25 | +from ensembl.production.metadata.api.models.dataset import DatasetType, Dataset, DatasetSource |
| 26 | +from ensembl.production.metadata.api.models.genome import Genome, GenomeDataset |
| 27 | +from ensembl.production.metadata.api.models.organism import Organism, OrganismGroup, OrganismGroupMember |
| 28 | +from sqlalchemy import select, text |
| 29 | +from typing import List |
| 30 | + |
# Module-wide logging configuration: INFO level with timestamped records;
# all messages in this script go through this module-level logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
| 33 | + |
| 34 | + |
@dataclass
class GenomeInputFilters:
    """Filter set applied when querying genomes from the metadata database.

    All list-valued filters default to empty (meaning "no restriction");
    `dataset_status` defaults to ["Submitted"] and `dataset_type` to
    "assembly", matching the CLI defaults in main().
    """

    # Connection URI of the metadata database (required).
    metadata_db_uri: str
    # Restrict to these genome / dataset UUIDs (empty list = no restriction).
    genome_uuid: List[str] = field(default_factory=list)
    dataset_uuid: List[str] = field(default_factory=list)
    # Organism-group (division) names; normalised to "Ensembl<Name>" in
    # GenomeFactory._apply_filters when organism_group_type == "DIVISION".
    division: List[str] = field(default_factory=list)
    dataset_type: str = "assembly"
    # Production names to include / exclude.
    species: List[str] = field(default_factory=list)
    antispecies: List[str] = field(default_factory=list)
    dataset_status: List[str] = field(default_factory=lambda: ["Submitted"])
    # Pagination: batch_size rows per page, 1-based page number.
    batch_size: int = 50
    page: int = 1
    organism_group_type: str = "DIVISION"
    # Truthy value makes _apply_filters select every known Ensembl division.
    run_all: int = 0
    # When non-empty, each fetched dataset's status is updated to this value.
    update_dataset_status: str = ""
    update_dataset_attribute: dict = field(default_factory=dict)
    # Columns selected by the query; labels define the keys of the dicts
    # yielded by GenomeFactory.get_genomes.
    columns: List = field(default_factory=lambda: [
        Genome.genome_uuid,
        Genome.production_name.label('species'),
        Dataset.dataset_uuid,
        Dataset.status.label('dataset_status'),
        DatasetSource.name.label('dataset_source'),
        DatasetType.name.label('dataset_type'),
    ])
@dataclass
class GenomeFactory:
    """Fetch genome/dataset information from the new metadata database.

    Builds a SQLAlchemy select over Genome, Organism, OrganismGroup and
    Dataset models, applies a GenomeInputFilters instance, and yields one
    dict per (genome, dataset) row — optionally updating each dataset's
    status via DatasetFactory.
    """

    @staticmethod
    def _apply_filters(query, filters):
        """Apply *filters* (GenomeInputFilters) to *query* and return it.

        NOTE(review): mutates *filters* in place — `division` is overwritten
        when `run_all` is set, and `page` is normalised to >= 1.
        """
        query = query.filter(OrganismGroup.type == filters.organism_group_type)

        if filters.run_all:
            # run_all overrides any explicitly requested divisions.
            filters.division = [
                'EnsemblBacteria',
                'EnsemblVertebrates',
                'EnsemblPlants',
                'EnsemblProtists',
                'EnsemblMetazoa',
                'EnsemblFungi',
            ]

        if filters.genome_uuid:
            query = query.filter(Genome.genome_uuid.in_(filters.genome_uuid))

        if filters.dataset_uuid:
            query = query.filter(Dataset.dataset_uuid.in_(filters.dataset_uuid))

        if filters.division:
            ensembl_divisions = filters.division

            if filters.organism_group_type == 'DIVISION':
                # Normalise e.g. "plants" / "EnsemblPlants" -> "EnsemblPlants".
                pattern = re.compile(r'^(ensembl)?', re.IGNORECASE)
                ensembl_divisions = ['Ensembl' + pattern.sub('', d).capitalize()
                                     for d in ensembl_divisions if d]

            query = query.filter(OrganismGroup.name.in_(ensembl_divisions))

        if filters.species:
            species = set(filters.species) - set(filters.antispecies)

            if species:
                # Fixed: previously filtered on filters.species, silently
                # ignoring the antispecies subtraction computed above.
                query = query.filter(Genome.production_name.in_(species))
            else:
                query = query.filter(~Genome.production_name.in_(filters.antispecies))

        elif filters.antispecies:
            query = query.filter(~Genome.production_name.in_(filters.antispecies))

        if filters.dataset_type:
            query = query.filter(Genome.genome_datasets.any(DatasetType.name.in_([filters.dataset_type])))

        if filters.dataset_status:
            query = query.filter(Dataset.status.in_(filters.dataset_status))

        if filters.batch_size:
            filters.page = filters.page if filters.page > 0 else 1
            query = query.offset((filters.page - 1) * filters.batch_size).limit(filters.batch_size)

        return query

    def _build_query(self, filters):
        """Build the base select for *filters* and delegate to _apply_filters."""
        query = select(filters.columns) \
            .select_from(Genome) \
            .join(Genome.organism) \
            .join(Organism.organism_group_members) \
            .join(OrganismGroupMember.organism_group) \
            .outerjoin(Genome.genome_datasets) \
            .join(GenomeDataset.dataset) \
            .join(Dataset.dataset_source) \
            .join(Dataset.dataset_type) \
            .group_by(Genome.genome_id, Dataset.dataset_id) \
            .order_by(Genome.genome_uuid)

        return self._apply_filters(query, filters)

    def get_genomes(self, **filters):
        """Yield one info dict per matching (genome, dataset) row.

        *filters* are GenomeInputFilters keyword arguments. Rows without a
        dataset_uuid are skipped with a warning. When
        `update_dataset_status` is set, each dataset's status is updated and
        the result recorded under 'updated_dataset_status' (None on failure).
        """
        filters = GenomeInputFilters(**filters)
        logger.info(f'Get Genomes with filters {filters}')

        with DBConnection(filters.metadata_db_uri).session_scope() as session:
            query = self._build_query(filters)
            logger.info(f'Executing SQL query: {query}')
            for genome in session.execute(query).fetchall():
                genome_info = genome._asdict()
                dataset_uuid = genome_info.get('dataset_uuid', None)

                # TODO: needed once a DatasetStatus enum lands in the dataset
                # models — convert the enum member to its string value:
                #   dataset_status = genome_info.get('dataset_status', None)
                #   if dataset_status and isinstance(dataset_status, DatasetStatus):
                #       genome_info['dataset_status'] = dataset_status.value

                if not dataset_uuid:
                    logger.warning(
                        f"No dataset uuid found for genome {genome_info} skipping this genome "
                    )
                    continue

                if filters.update_dataset_status:
                    _, status = DatasetFactory().update_dataset_status(dataset_uuid, filters.update_dataset_status,
                                                                       session=session)
                    if filters.update_dataset_status == status:
                        # Fixed log message: previously claimed "from X to X".
                        logger.info(
                            f"Updated Dataset status for dataset uuid: {dataset_uuid} to {status} for genome {genome_info['genome_uuid']}"
                        )
                        genome_info['updated_dataset_status'] = status
                    else:
                        # Fixed: was genome['genome_uuid'] — string indexing on
                        # a SQLAlchemy Row fails in 2.x; use the dict instead.
                        logger.warning(
                            f"Cannot update status for dataset uuid: {dataset_uuid} {filters.update_dataset_status} to {status} for genome {genome_info['genome_uuid']}"
                        )
                        genome_info['updated_dataset_status'] = None

                yield genome_info
| 170 | + |
| 171 | + |
| 172 | + |
def main():
    """CLI entry point: fetch genome info and write one JSON object per line.

    Parses filter arguments, streams genomes from GenomeFactory.get_genomes
    and dumps each info dict as a JSON line to --output.
    """

    def _positive_int(value):
        # Fixed: the original lambda RETURNED an ArgumentTypeError instance
        # (and never raised it), so a non-positive --page silently became an
        # exception object. Raise properly instead.
        ivalue = int(value)
        if ivalue <= 0:
            raise argparse.ArgumentTypeError(f"{value} is not a positive integer")
        return ivalue

    parser = argparse.ArgumentParser(
        prog='genome.py',
        description='Fetch Ensembl genome info from the new metadata database'
    )
    parser.add_argument('--genome_uuid', type=str, nargs='*', default=[], required=False,
                        help='List of genome UUIDs to filter the query. Default is an empty list.')
    parser.add_argument('--dataset_uuid', type=str, nargs='*', default=[], required=False,
                        help='List of dataset UUIDs to filter the query. Default is an empty list.')
    parser.add_argument('--organism_group_type', type=str, default='DIVISION', required=False,
                        help='Organism group type to filter the query. Default is "DIVISION"')
    parser.add_argument('--division', type=str, nargs='*', default=[], required=False,
                        help='List of organism group names to filter the query. Default is an empty list.')
    parser.add_argument('--dataset_type', type=str, default="assembly", required=False,
                        help='Dataset type to filter the query. Default is "assembly".')
    parser.add_argument('--species', type=str, nargs='*', default=[], required=False,
                        help='List of Species Production names to filter the query. Default is an empty list.')
    parser.add_argument('--antispecies', type=str, nargs='*', default=[], required=False,
                        help='List of Species Production names to exclude from the query. Default is an empty list.')
    parser.add_argument('--dataset_status', nargs='*', default=["Submitted"],
                        choices=['Submitted', 'Processing', 'Processed', 'Released'], required=False,
                        help='List of dataset statuses to filter the query. Default is ["Submitted"].')
    parser.add_argument('--update_dataset_status', type=str, default="", required=False,
                        choices=['Submitted', 'Processing', 'Processed', 'Released', ''],
                        help='Update the status of the selected datasets to the specified value. ')
    parser.add_argument('--batch_size', type=int, default=50, required=False,
                        help='Number of results to retrieve per batch. Default is 50.')
    parser.add_argument('--page', default=1, required=False, type=_positive_int,
                        help='The page number for pagination. Default is 1.')
    parser.add_argument('--metadata_db_uri', type=str, required=True,
                        help='metadata db mysql uri, ex: mysql://ensro@localhost:3366/ensembl_genome_metadata')
    parser.add_argument('--output', type=str, required=True, help='output file ex: genome_info.json')

    args = parser.parse_args()

    # Best-effort extraction of host/dbname purely for logging; guard against
    # a URI that does not match the expected mysql:// pattern.
    meta_details = re.match(r"mysql:\/\/.*:?(.*?)@(.*?):\d+\/(.*)", args.metadata_db_uri)
    with open(args.output, 'w') as json_output:
        if meta_details:
            logger.info(f'Connecting Metadata Database with host:{meta_details.group(2)} & dbname:{meta_details.group(3)}')
        else:
            logger.info('Connecting Metadata Database (URI did not match expected mysql:// pattern)')

        genome_fetcher = GenomeFactory()

        logger.info(f'Writing Results to {args.output}')
        for genome in genome_fetcher.get_genomes(
                metadata_db_uri=args.metadata_db_uri,
                update_dataset_status=args.update_dataset_status,
                genome_uuid=args.genome_uuid,
                dataset_uuid=args.dataset_uuid,
                organism_group_type=args.organism_group_type,
                division=args.division,
                dataset_type=args.dataset_type,
                species=args.species,
                antispecies=args.antispecies,
                batch_size=args.batch_size,
                dataset_status=args.dataset_status,
        ) or []:
            json.dump(genome, json_output)
            json_output.write("\n")

        logger.info('Completed !')
| 233 | + |
| 234 | + |
| 235 | +if __name__ == "__main__": |
| 236 | + logger.info('Fetching Genome Information From New Metadata Database') |
| 237 | + main() |
0 commit comments