-
Notifications
You must be signed in to change notification settings - Fork 0
Review GAGE5 #42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Review GAGE5 #42
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,151 @@ | ||||||||
| from Bio import SeqIO | ||||||||
| from Bio.SeqUtils import GC | ||||||||
| from abc import ABC, abstractmethod | ||||||||
|
|
||||||||
| def filter_fastq(input_path, gc_bounds=(0, 100), length_bounds=(0, float('inf')), quality_threshold=0, output_filename=None): | ||||||||
| """ | ||||||||
| Filters a FASTQ file based on GC content, sequence length, and quality threshold using Biopython. | ||||||||
|
|
||||||||
| Args: | ||||||||
| - input_path (str): Path to the input FASTQ file. | ||||||||
| - gc_bounds (tuple): Tuple specifying the minimum and maximum GC content for filtering. Default is (0, 100). | ||||||||
| - length_bounds (tuple): Tuple specifying the minimum and maximum sequence length for filtering. Default is (0, infinity). | ||||||||
| - quality_threshold (float): Minimum quality score for filtering. Default is 0. | ||||||||
| - output_filename (str): Name of the output file. If None, the default filename will be used. | ||||||||
|
|
||||||||
| Returns: | ||||||||
| - str: Message indicating the success of the filtering process. | ||||||||
| """ | ||||||||
|
Comment on lines
+6
to
+18
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Очень классная докстринга: понятно что делает функция, какие аргументы, их типы, а также дефолтные значения. |
||||||||
| filtered_seqs = [] | ||||||||
|
|
||||||||
| with open(input_path, 'r') as fastq_file: | ||||||||
| for record in SeqIO.parse(fastq_file, 'fastq'): | ||||||||
| gc_content = GC(record.seq) | ||||||||
| seq_length = len(record.seq) | ||||||||
| quality_score = sum(record.letter_annotations["phred_quality"]) / seq_length | ||||||||
|
|
||||||||
| if (gc_bounds[0] <= gc_content <= gc_bounds[1] and | ||||||||
| length_bounds[0] <= seq_length <= length_bounds[1] and | ||||||||
| quality_score >= quality_threshold): | ||||||||
| filtered_seqs.append(record) | ||||||||
|
|
||||||||
| if output_filename is None: | ||||||||
| output_filename = f"filtered_{input_path}" | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Если в переменной |
||||||||
| elif not output_filename.endswith('.fastq'): | ||||||||
| output_filename += '.fastq' | ||||||||
|
Comment on lines
+34
to
+35
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. супер |
||||||||
|
|
||||||||
| with open(output_filename, 'w') as output_file: | ||||||||
| SeqIO.write(filtered_seqs, output_file, 'fastq') | ||||||||
|
|
||||||||
| return "Filtered data was saved into output file" | ||||||||
|
|
||||||||
|
|
||||||||
| class BiologicalSequence(ABC): | ||||||||
| """Abstract base class for biological sequences.""" | ||||||||
|
|
||||||||
| def __init__(self, sequence): | ||||||||
| """Initialize a BiologicalSequence object with a given sequence.""" | ||||||||
| self.sequence = sequence | ||||||||
|
|
||||||||
| def __len__(self): | ||||||||
| """Return the length of the sequence.""" | ||||||||
| return len(self.sequence) | ||||||||
|
|
||||||||
| def __getitem__(self, index): | ||||||||
| """Return the item at the specified index.""" | ||||||||
| return self.sequence[index] | ||||||||
|
|
||||||||
| def __str__(self): | ||||||||
| """Return the string representation of the sequence.""" | ||||||||
| return self.sequence | ||||||||
|
|
||||||||
| @abstractmethod | ||||||||
| def check_alphabet(self): | ||||||||
| """Check if the sequence contains valid alphabet characters.""" | ||||||||
| pass | ||||||||
|
|
||||||||
| class NucleicAcidSequence(BiologicalSequence): | ||||||||
| """Abstract base class""" | ||||||||
|
|
||||||||
| DNA_LETTERS = set("ATGCatgc") | ||||||||
| RNA_LETTERS = set("AUGCaugc") | ||||||||
| AMINO_ACID_LETTERS = set("ACDEFGHIKLMNPQRSTVWY") | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Возможно аминокислотный алфавит стоило вынести из класса для нуклеиновых кислот, чтобы наши абстракции не нарушали биологию 😅. В целом классно, что учтено наличие больших и маленьких букв. |
||||||||
|
|
||||||||
| def check_alphabet(self): | ||||||||
| """Check if the sequence contains valid nucleic- or aminoacid alphabet characters.""" | ||||||||
| return set(self.sequence).issubset(self.DNA_LETTERS | self.RNA_LETTERS | self.AMINO_ACID_LETTERS) | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Мне кажется, что проверку на принадлежность к аминокислотной последовательности можно было бы убрать из класса NucleicAcidSequence и оставить только в классе AminoAcidSequence.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. check_alphabet должа быть у всех из них |
||||||||
|
|
||||||||
| def complement(self): | ||||||||
| """Return the complement sequence.""" | ||||||||
| comp_map_dna = {"A": "T", "G": "C", "T": "A", "C": "G", "a": "t", "t": "a", "g": "c", "c": "g"} | ||||||||
| return ''.join(comp_map_dna.get(base, base) for base in self.sequence) | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Если будет буква
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Верно подмечено. Здорово тогда было бы добавить что это решается использованием не There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Подкину к комментарию про метод
Suggested change
|
||||||||
|
|
||||||||
| def gc_content(self): | ||||||||
| """Return the GC content of the sequence.""" | ||||||||
| gc_count = (self.sequence.upper().count('G') + self.sequence.upper().count('C')) / len(self.sequence) * 100 | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Учтено, что могут быть заглавные и прописные буквы. Класс! |
||||||||
| return gc_count | ||||||||
|
|
||||||||
| class DNASequence(NucleicAcidSequence): | ||||||||
| """Class representing a DNA sequence.""" | ||||||||
|
|
||||||||
| TRANSCRIBE_DICT = { | ||||||||
| 'T': 'U', | ||||||||
| 't': 'u' | ||||||||
| } | ||||||||
|
|
||||||||
| def __init__(self, sequence): | ||||||||
| """Initialize a DNASequence object with a given sequence.""" | ||||||||
| super().__init__(sequence) | ||||||||
| if not self.check_alphabet(): | ||||||||
| raise ValueError("Invalid DNA sequence") | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Круто, что есть встроенная проверка. Но если взять что-то типо
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Верно подмечено |
||||||||
|
|
||||||||
| def transcribe(self): | ||||||||
| """Transcribe the DNA sequence into an RNA sequence.""" | ||||||||
| transcribed_seq = ''.join(self.TRANSCRIBE_DICT.get(base, base) for base in self.sequence) | ||||||||
| return transcribed_seq | ||||||||
|
Comment on lines
+102
to
+105
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. сначала требует вызова There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Здесь в методе transcribe возвращается просто строка transcribed_seq, а ниже в классе RNASequence при вызове метода reverse возращается новый объект класса RNASequence. Можно было бы для достижения единообразия и там и там возращать строки, либо чтобы после вызова метода transcribe создавался объект класса RNASequence.
Suggested change
|
||||||||
|
|
||||||||
| class RNASequence(NucleicAcidSequence): | ||||||||
| """Class representing an RNA sequence.""" | ||||||||
|
|
||||||||
| def __init__(self, sequence): | ||||||||
| """Initialize an RNASequence object with a given sequence.""" | ||||||||
| super().__init__(sequence) | ||||||||
| if not self.check_alphabet(): | ||||||||
| raise ValueError("Invalid RNA sequence") | ||||||||
|
|
||||||||
| def reverse(self): | ||||||||
| """Return the reverse of the RNA sequence.""" | ||||||||
| return RNASequence(self.sequence[::-1]) | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Здесь также, как и в предыдущих случаях, можно возвращать просто развернутую последовательность, а не создавать новый экземпляр класса
Suggested change
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ну не совсем. У нас был объект сиквенс. Мы его развернули. Почему в ходе разворота тип данных изменила и последовательность превратилась в строку? Хотя тут проблема с тем что захардкожено имя класса, лучше через |
||||||||
|
|
||||||||
| class AminoAcidSequence(BiologicalSequence): | ||||||||
| """Class representing an amino acid sequence.""" | ||||||||
|
|
||||||||
| def check_alphabet(self): | ||||||||
| """Check if the sequence contains valid amino acid alphabet characters.""" | ||||||||
| if not set(self.sequence).issubset(NucleicAcidSequence.AMINO_ACID_LETTERS): | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Данный set лучше вынести из класса NucleicAcidSequence |
||||||||
| raise ValueError("Invalid amino acid sequence") | ||||||||
|
Comment on lines
+123
to
+126
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Если просто запустить |
||||||||
|
|
||||||||
| def amino_acid_profile(self): | ||||||||
| """Return the profile of the amino acid sequence.""" | ||||||||
| self.check_alphabet() | ||||||||
|
|
||||||||
| aa_biochemistry = { | ||||||||
| 'hydrophobic': ['G', 'A', 'V', 'L', 'I', 'P', 'F', 'M', 'W'], | ||||||||
| 'polar': ['S', 'T', 'C', 'N', 'Q', 'Y'], | ||||||||
| '- charged': ['E', 'D'], | ||||||||
| '+ charged': ['K', 'H', 'R'] | ||||||||
| } | ||||||||
| profile = {} | ||||||||
|
|
||||||||
| for group in aa_biochemistry: | ||||||||
| profile[group] = 0.0 | ||||||||
|
|
||||||||
| for amino_acid in self.sequence: | ||||||||
| for group_name, group_list in aa_biochemistry.items(): | ||||||||
| if amino_acid.upper() in group_list: | ||||||||
| profile[group_name] += 1 | ||||||||
|
|
||||||||
| total_length = len(self.sequence) | ||||||||
| for group, count in profile.items(): | ||||||||
| profile[group] = round((count / total_length), 2) | ||||||||
| return profile | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Классный метод! |
||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Переменные могут загрустить, что не знают свои типы здесь (но они точно есть в докстринге и это хорошо):