7 changes: 6 additions & 1 deletion core/api/utils/samples.py
@@ -27,9 +27,14 @@ def prepare_fields_in_sample(s_data):

def split_sample_data(data):
"""Split the json request into dictionnaries with the right fields"""
ALIASES = {
"collecting_institution_code_1": "lab_code_1",
}
split_data = {"sample": {}, "author": {}, "gisaid": {}, "ena": {}}

for item, value in data.items():
normalized_items = {ALIASES.get(item, item): value for item, value in data.items()}

for item, value in normalized_items.items():
if "author" in item:
split_data["author"][item] = value
continue
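The alias map lets the API keep accepting the public collecting_institution_code_1 field while the rest of the pipeline works with the internal lab_code_1 name. A minimal, self-contained sketch of just the normalisation step (routing of the author/gisaid/ena keys is elided):

# Sketch of the alias normalisation added above; the real split_sample_data()
# also routes author/gisaid/ena keys into their own dictionaries.
ALIASES = {
    "collecting_institution_code_1": "lab_code_1",
}

def normalize_payload(data: dict) -> dict:
    """Rename aliased keys to their canonical names, leaving the rest untouched."""
    return {ALIASES.get(key, key): value for key, value in data.items()}

payload = {
    "collecting_institution_code_1": "1328000027",
    "sequencing_sample_id": "SEQ_001",
}
assert normalize_payload(payload) == {
    "lab_code_1": "1328000027",
    "sequencing_sample_id": "SEQ_001",
}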
69 changes: 64 additions & 5 deletions core/api/views.py
@@ -29,6 +29,7 @@
import core.api.utils.variants
import core.api.utils.common_functions
import core.config
import core.utils.lab_catalog


@extend_schema(
@@ -58,6 +59,7 @@
"study_title": "",
"study_type": "Whole Genome Sequencing",
"submitting_lab_sample_id": "LAB_856232",
"collecting_institution_code_1": "1328000027",
},
)
],
@@ -85,6 +87,7 @@
"study_title": serializers.CharField(required=False),
"study_type": serializers.CharField(required=False),
"submitting_lab_sample_id": serializers.CharField(),
"collecting_institution_code_1": serializers.CharField(),
},
),
description="More descriptive text",
@@ -105,6 +108,7 @@
"collecting_lab_sample_id": "1000",
"submitting_lab_sample_id": "None",
"collecting_institution": "Instituto de Salud Carlos III",
"lab_code_1": "1328000027",
"submitting_institution": "Instituto de Salud Carlos III",
"sequence_file_R1": "SAMPLE1_R1.fastq.gz",
"sequence_file_R2": "SAMPLE1_R2.fastq.gz",
@@ -142,6 +146,7 @@
"collecting_lab_sample_id": "1000",
"submitting_lab_sample_id": "None",
"collecting_institution": "Instituto de Salud Carlos III",
"lab_code_1": "1328000027",
"submitting_institution": "Instituto de Salud Carlos III",
"sequence_file_R1": "SAMPLE1_R1.fastq.gz",
"sequence_file_R2": "SAMPLE1_R2.fastq.gz",
@@ -186,20 +191,58 @@ def create_sample_data(request):
}
return Response(error, status=status.HTTP_400_BAD_REQUEST)
schema_id = schema_obj.get_schema_id()
# check if sample id field and collecting_institution are in the request
# Check mandatory identifiers (lab name is derived further below if needed)
required_db_fields = [
"sequencing_sample_id",
"collecting_lab_sample_id",
"submitting_institution",
"collecting_institution",
]
if any(field not in data for field in required_db_fields):
missing_fields = [f for f in required_db_fields if f not in data]
missing_fields = [f for f in required_db_fields if not data.get(f)]
if missing_fields:
print(f"ERROR. Missing: {missing_fields}")
return Response(
{"ERROR": f"Missing: {missing_fields}", "message": "", "data": {}},
status=status.HTTP_409_CONFLICT,
)
lab_code_field = "collecting_institution_code_1"
lab_code_raw = data.get(lab_code_field)
lab_code_value = str(lab_code_raw).strip() if lab_code_raw else ""
if lab_code_value in core.config.FIELD_EMPTY_VALUES or not lab_code_value:
print(f"ERROR. Missing: [{lab_code_field}]")
return Response(
{
"ERROR": f"Missing: [{lab_code_field}]",
"message": "",
"data": {},
},
status=status.HTTP_409_CONFLICT,
)
resolved_collecting_name = core.utils.lab_catalog.ensure_lab_display(
lab_code_value, fallback_name=data.get("collecting_institution")
)
provided_collecting_name = (data.get("collecting_institution") or "").strip()
if provided_collecting_name:
if (
resolved_collecting_name
and provided_collecting_name.lower() != resolved_collecting_name.lower()
):
# Canonicalise to the catalog name
data["collecting_institution"] = resolved_collecting_name
else:
if resolved_collecting_name:
data["collecting_institution"] = resolved_collecting_name
else:
print("Unable to resolve collecting_institution from lab_code_1")
return Response(
{
"ERROR": "Missing: ['collecting_institution']",
"message": "",
"data": {},
},
status=status.HTTP_409_CONFLICT,
)
# Include collecting_institution in the required fields list for fingerprint
required_db_fields.append("collecting_institution")
# check if sample is already defined
temp_fingerprint = core.utils.samples.build_sample_fingerprint(
*[data[field] for field in required_db_fields]
@@ -220,6 +263,8 @@ def create_sample_data(request):
split_data = core.api.utils.samples.split_sample_data(data)
# Add schema id to store in database
split_data["sample"]["schema_obj"] = schema_id
split_data["sample"]["lab_code_1"] = lab_code_value
split_data["sample"]["collecting_institution"] = data["collecting_institution"]
sample_serializer = core.api.serializers.CreateSampleSerializer(
data=split_data["sample"]
)
@@ -892,11 +937,19 @@ def check_sample_exists(request):
status=status.HTTP_400_BAD_REQUEST,
)
data = request.query_params
lab_code_field = "collecting_institution_code_1"
lab_code_raw = data.get(lab_code_field)
lab_code_value = str(lab_code_raw).strip() if lab_code_raw else ""
collecting_institution = (data.get("collecting_institution") or "").strip()
resolved_collecting_name = core.utils.lab_catalog.ensure_lab_display(
lab_code_value, fallback_name=collecting_institution
)

required_dict = {
"sequencing_sample_id": data.get("sequencing_sample_id"),
"collecting_lab_sample_id": data.get("collecting_lab_sample_id"),
"submitting_institution": data.get("submitting_institution"),
"collecting_institution": data.get("collecting_institution"),
"collecting_institution": collecting_institution or resolved_collecting_name,
}
if not all(required_dict.values()):
missing_fields = [x for x, v in required_dict.items() if not v]
@@ -908,6 +961,12 @@
},
status=status.HTTP_400_BAD_REQUEST,
)
if (
collecting_institution
and resolved_collecting_name
and collecting_institution.lower() != resolved_collecting_name.lower()
):
required_dict["collecting_institution"] = resolved_collecting_name
temp_fingerprint = core.utils.samples.build_sample_fingerprint(
*[value for value in required_dict.values()]
)
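The view code above calls core.utils.lab_catalog.ensure_lab_display, and core/models.py below relies on lab_catalog.get_lab_code; the module itself is not part of this diff. A hypothetical sketch of the interface those call sites assume (catalog contents and lookup semantics are assumptions, not taken from the PR):

# Hypothetical sketch of core/utils/lab_catalog.py as assumed by the call
# sites in this PR; the real catalog source and semantics are not shown here.
_CODE_TO_NAME = {
    "1328000027": "Instituto de Salud Carlos III",
}
_NAME_TO_CODE = {name.lower(): code for code, name in _CODE_TO_NAME.items()}


def ensure_lab_display(lab_code, fallback_name=None):
    """Return the catalog display name for a lab code, else the fallback name."""
    if not lab_code:
        return fallback_name or ""
    return _CODE_TO_NAME.get(str(lab_code).strip(), fallback_name or "")


def get_lab_code(lab_name):
    """Return the lab_code_1 registered for a laboratory name, or '' if unknown."""
    return _NAME_TO_CODE.get((lab_name or "").strip().lower(), "")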
3 changes: 2 additions & 1 deletion core/config.py
@@ -135,6 +135,7 @@
"microbiology_lab_sample_id",
"sequencing_sample_id",
"submitting_lab_sample_id",
"lab_code_1",
"sequence_file_R1",
"sequence_file_R2",
"sequence_file_R1_md5",
@@ -157,5 +158,5 @@
GROUPS_HIERARCHY_ORDERLIST = ("RelecovManager", "Submitter", "Collector")
INSTITUTION_FIELD_MAPDICT = {
"Submitter": "submitting_institution",
"Collector": "collecting_institution",
"Collector": "lab_code_1",
}
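Pointing the Collector entry at lab_code_1 means collector-scoped queries are expected to filter samples by lab code rather than by the free-text institution name. The consumer of INSTITUTION_FIELD_MAPDICT is not in this diff, so the helper below is only an illustration of how the mapping might be applied:

# Illustrative consumer of INSTITUTION_FIELD_MAPDICT; the real call sites are
# not shown in this diff, and samples_visible_to() is a hypothetical helper.
import core.config
import core.models


def samples_visible_to(profile, group_name):
    """Scope the Sample queryset to the institution tied to the user's group."""
    field = core.config.INSTITUTION_FIELD_MAPDICT.get(group_name)
    if field is None:  # e.g. RelecovManager: no institution restriction
        return core.models.Sample.objects.all()
    # With this change, "Collector" filters on lab_code_1 instead of the
    # free-text collecting_institution name.
    value = profile.get_lab_code() if field == "lab_code_1" else profile.get_lab_name()
    return core.models.Sample.objects.filter(**{field: value})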
52 changes: 50 additions & 2 deletions core/models.py
@@ -1,5 +1,6 @@
# Generic imports
import hashlib
import logging
from django.db import models, IntegrityError, transaction
from django.contrib.auth.models import User
from django.db.models.signals import post_save
@@ -25,8 +26,54 @@ def get_lab_name(self):
# Mapped to submitting_institution field in Sample model
return "%s" % (self.laboratory)

def get_lab_code(self):
return "%s" % (self.code_id)
def _resolve_lab_code(self):
"""Return the `lab_code_1` associated with the laboratory name."""

if not self.laboratory:
return ""
try:
from core.utils import lab_catalog
except ImportError:
logging.getLogger(__name__).warning(
"Unable to import lab_catalog to resolve lab codes"
)
return ""
resolved = lab_catalog.get_lab_code(self.laboratory)
return resolved or ""

def get_lab_code(self, fallback_to_lookup=True):
"""Return the stored lab code, looking it up if missing."""

if self.code_id:
return "%s" % (self.code_id)
if not fallback_to_lookup:
return ""

resolved = self._resolve_lab_code()
if resolved:
if self.pk:
type(self).objects.filter(pk=self.pk).update(code_id=resolved)
else:
self.code_id = resolved
return resolved

def save(self, *args, **kwargs):
resolved_code = self._resolve_lab_code()
if resolved_code:
self.code_id = resolved_code
update_fields = kwargs.get("update_fields")
if update_fields is not None:
if isinstance(update_fields, (list, tuple, set, frozenset)):
fields = []
for item in update_fields:
if item not in fields:
fields.append(item)
else:
fields = [update_fields]
if "code_id" not in fields:
fields.append("code_id")
kwargs["update_fields"] = fields
super().save(*args, **kwargs)


@receiver(post_save, sender=User)
@@ -705,6 +752,7 @@ class Sample(models.Model):
sequencing_sample_id = models.CharField(max_length=80, null=True, blank=True)
submitting_lab_sample_id = models.CharField(max_length=80, null=True, blank=True)
collecting_institution = models.CharField(max_length=120, null=True, blank=True)
lab_code_1 = models.CharField(max_length=80, null=True, blank=True)
submitting_institution = models.CharField(max_length=120, null=True, blank=True)
sequence_file_R1 = models.CharField(max_length=80, null=True, blank=True)
sequence_file_R2 = models.CharField(max_length=80, null=True, blank=True)
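The save() override backfills code_id from the lab catalog on every write, and get_lab_code() lazily repairs rows that were saved before the catalog existed. A short sketch of the intended call pattern; the model owning these methods is not named in this hunk, so Profile below is an assumed name:

# "Profile" and its import path are assumptions for illustration: the class
# owning get_lab_code()/save() is not identified in this diff hunk.
from core.models import Profile

profile = Profile.objects.filter(laboratory="Instituto de Salud Carlos III").first()
if profile is not None:
    # Returns the stored code_id if present; otherwise resolves it from the
    # catalog by laboratory name and persists it with a targeted UPDATE.
    code = profile.get_lab_code()

    # Skip the catalog round-trip and only report what is already stored.
    stored_only = profile.get_lab_code(fallback_to_lookup=False)

    # Partial saves stay consistent: the save() override appends "code_id" to
    # update_fields so the refreshed code is not silently dropped.
    profile.save(update_fields=["laboratory"])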
98 changes: 68 additions & 30 deletions core/utils/bioinfo_analysis.py
@@ -1,14 +1,26 @@
from django.db.models import Count, QuerySet
from django.core.cache import cache
from collections import Counter
from itertools import islice
from typing import Iterable, Union

from django.core.cache import cache
from django.db.models import QuerySet

import core.config
import core.models
import core.utils.samples

SchemaLike = Union["core.models.Schema", Iterable["core.models.Schema"], QuerySet]


def _chunked(iterator, chunk_size):
"""Yield lists pulled from ``iterator`` with up to ``chunk_size`` items."""
while True:
chunk = list(islice(iterator, chunk_size))
if not chunk:
break
yield chunk


def get_bio_analysis_stats_from_lab(
lab_name=None, institution_type="submitting_institution"
):
@@ -71,53 +83,79 @@ def get_bioinfo_analyis_fields_utilization(
schemas (all by default). Executes **one** heavy query + one light query.
Results can be cached for `cache_seconds`.
"""
# -- 0. Pick or normalise schemas ------------------------------------
# -- 0. Normalise schema identifiers ---------------------------------
if schema_qs is None:
schema_qs = core.models.Schema.objects.all()
schema_ids = list(core.models.Schema.objects.values_list("pk", flat=True))
elif isinstance(schema_qs, QuerySet):
pass
schema_ids = list(schema_qs.values_list("pk", flat=True))
elif isinstance(schema_qs, (list, tuple, set)):
schema_ids = [getattr(item, "pk", item) for item in schema_qs]
else:
schema_qs = schema_qs if isinstance(schema_qs, (list, tuple)) else [schema_qs]
schema_ids = [getattr(schema_qs, "pk", schema_qs)]

schema_ids = sorted({sid for sid in schema_ids if sid is not None})
if not schema_ids:
return {}

# -- 1. Check cache ---------------------------------------------------
cache_key = None
if use_cache:
cache_key = f"bioinfo_util_{hash(tuple(x.pk for x in schema_qs))}"
cache_key = f"bioinfo_util_{hash(tuple(schema_ids))}"
cached = cache.get(cache_key)
if cached:
return cached

sample_filter = {"schema_obj_id__in": schema_ids}

# -- 2. Total samples -------------------------------------------------
num_samples = (
core.models.Sample.objects.filter(schema_obj__in=schema_qs)
.only("id") # lighter count(*)
.count()
)
num_samples = core.models.Sample.objects.filter(**sample_filter).count()
if num_samples == 0:
return {}

# -- 3. One grouped query: filled counts ------------------------------
FIELD_EMPTY = core.config.FIELD_EMPTY_VALUES
rows = (
core.models.BioinfoAnalysisValue.objects.filter(
bioinfo_analysis_fieldID__schemaID__in=schema_qs,
value__isnull=False,
)
.exclude(value__in=FIELD_EMPTY)
.values("bioinfo_analysis_fieldID__label_name")
.annotate(filled=Count("sample", distinct=True))
# -- 3. Chunked scan of bioinfo values -------------------------------
FIELD_EMPTY = set(core.config.FIELD_EMPTY_VALUES)
through_model = core.models.Sample.bio_analysis_values.through
batch_size = 500
fields_counter = Counter()

sample_iter = (
core.models.Sample.objects.filter(**sample_filter)
.values_list("pk", flat=True)
.iterator(chunk_size=batch_size)
)

fields_value = {
r["bioinfo_analysis_fieldID__label_name"]: r["filled"] for r in rows
}
for batch in _chunked(sample_iter, batch_size):
if not batch:
continue

seen_pairs = set()
rows = (
through_model.objects.filter(sample_id__in=batch)
.filter(bioinfoanalysisvalue__value__isnull=False)
.exclude(bioinfoanalysisvalue__value__in=FIELD_EMPTY)
.values_list(
"bioinfoanalysisvalue__bioinfo_analysis_fieldID__label_name",
"sample_id",
)
.iterator(chunk_size=batch_size)
)

for label, sample_id in rows:
key = (label, sample_id)
if key in seen_pairs:
continue
seen_pairs.add(key)
fields_counter[label] += 1

fields_value = dict(fields_counter)
fields_norm = {k: v / num_samples for k, v in fields_value.items()}
labels_with_value = set(fields_value)

# -- 4. Single light query to fetch ALL labels ------------------------
# -- 4. Fetch defined labels -----------------------------------------
defined_labels = set(
core.models.BioinfoAnalysisField.objects.filter(
schemaID__in=schema_qs
).values_list("label_name", flat=True)
core.models.BioinfoAnalysisField.objects.filter(schemaID__pk__in=schema_ids)
.values_list("label_name", flat=True)
.distinct()
)
never_used = defined_labels - labels_with_value

@@ -130,7 +168,7 @@ def get_bioinfo_analyis_fields_utilization(
}

# -- 5. Cache for N seconds ------------------------------------------
if use_cache:
if cache_key:
cache.set(cache_key, result, cache_seconds)

return result
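The chunked scan replaces the single grouped Count query with per-batch deduplication of (label, sample_id) pairs, trading one heavy query for several bounded ones. The _chunked helper is plain Python and can be checked in isolation:

# Standalone check of the _chunked() generator used above: it drains any
# iterator in fixed-size lists and stops cleanly on exhaustion.
from itertools import islice


def _chunked(iterator, chunk_size):
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            break
        yield chunk


batches = list(_chunked(iter(range(7)), 3))
assert batches == [[0, 1, 2], [3, 4, 5], [6]]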