-
Notifications
You must be signed in to change notification settings - Fork 45
Hw4 petrikov #17
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Hw4 petrikov #17
Changes from all commits
3bd9f31
2657157
065e4ce
6d644d2
5ca47ee
b1c1858
e0da3fa
1786d8a
78ce026
f92e9e0
c52a25c
a3ebaa9
9185621
d9b28e4
b42cffb
e9b1c35
5e52e87
df0ba32
6396992
a49dbe7
52d72b5
7508ab3
e8c5942
0ae464f
1e7eb30
f1d3732
5bcb364
d27cf98
b8c1c15
1f337dd
47885a5
bb3afee
a278f9b
76b32f8
5891ff1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,225 @@ | ||
# One-letter amino acid symbol -> three-letter abbreviation
# (the 20 common amino acids).
AMINO_ACIDS_NAMES = {
    'A': 'Ala',
    'R': 'Arg',
    'N': 'Asn',
    'D': 'Asp',
    'V': 'Val',
    'H': 'His',
    'G': 'Gly',
    'Q': 'Gln',
    'E': 'Glu',
    'I': 'Ile',
    'L': 'Leu',
    'K': 'Lys',
    'M': 'Met',
    'P': 'Pro',
    'S': 'Ser',
    'Y': 'Tyr',
    'T': 'Thr',
    'W': 'Trp',
    'F': 'Phe',
    'C': 'Cys',
}

# One-letter amino acid symbol -> hydropathy value used for GRAVY.
GRAVY_AA_VALUES = {
    'L': 3.8,
    'K': -3.9,
    'M': 1.9,
    'F': 2.8,
    'P': -1.6,
    'S': -0.8,
    'T': -0.7,
    'W': -0.9,
    'Y': -1.3,
    'V': 4.2,
    'A': 1.8,
    'R': -4.5,
    'N': -3.5,
    'D': -3.5,
    'C': 2.5,
    'Q': -3.5,
    'E': -3.5,
    'G': -0.4,
    'H': -3.2,
    'I': 4.5,
}

# Symbols accepted in an input sequence: the keys of AMINO_ACIDS_NAMES.
VALID_SYMBOLS = set(AMINO_ACIDS_NAMES)
|
||
|
||
def calc_gravy(seq: str) -> float:
    """
    Return the GRAVY (grand average of hydropathy) value of a sequence.

    The hydropathy values of all residues are looked up in
    GRAVY_AA_VALUES, averaged over the sequence length and rounded
    to three decimal places.

    Raises KeyError for a symbol missing from GRAVY_AA_VALUES and
    ZeroDivisionError for an empty sequence.
    """
    hydropathy_total = 0
    # Plain left-to-right accumulation: keeps the floating-point result
    # independent of how the built-in sum() adds floats.
    for hydropathy in (GRAVY_AA_VALUES[residue] for residue in seq):
        hydropathy_total = hydropathy_total + hydropathy
    mean_hydropathy = hydropathy_total / len(seq)
    return round(mean_hydropathy, 3)
|
||
|
||
def calc_total_charge(charged_amino_ac_numbers_list: list,
                      ph_value: float) -> float:
    """
    Estimate the net charge of an amino acid sequence at a given pH.

    Only the two termini and the charged side chains are considered.
    charged_amino_ac_numbers_list holds the residue counts in the
    fixed order C, D, E, Y, H, K, R (indices 0-6).
    """
    # pKa values of the negatively charged groups, indices 0-3: C, D, E, Y.
    acidic_pkas = (8.18, 3.9, 4.07, 10.46)
    # pKa values of the positively charged groups, indices 4-6: H, K, R.
    basic_pkas = (6.04, 10.54, 12.48)

    # N-terminus (pKa 8.2) contributes positively, C-terminus (pKa 3.65)
    # negatively.
    charge = 1 / (1 + 10 ** (ph_value - 8.2))
    charge = charge + -1 / (1 + 10 ** (3.65 - ph_value))

    for index, pka in enumerate(acidic_pkas):
        count = charged_amino_ac_numbers_list[index]
        charge = charge + -count / (1 + 10 ** (pka - ph_value))

    for index, pka in enumerate(basic_pkas, start=len(acidic_pkas)):
        count = charged_amino_ac_numbers_list[index]
        charge = charge + count / (1 + 10 ** (ph_value - pka))

    return charge
|
||
|
||
def calc_iso_point(seq: str):
    """
    Estimate the isoelectric point of an amino acid sequence.

    The pH is scanned upwards from 0 in steps of 0.1; the first pH at
    which the estimated total charge is no longer positive is returned,
    rounded to one decimal place. Residues are counted with the
    upper-case symbols C, D, E, Y, H, K and R.
    """
    # Counts in the order expected by calc_total_charge.
    residue_counts = [seq.count(residue) for residue in "CDEYHKR"]

    ph_iso_point = 0.0
    while calc_total_charge(residue_counts, ph_iso_point) > 0:
        ph_iso_point += 0.1
    return round(ph_iso_point, 1)
|
||
|
||
def transform_to_three_letters(seq: str, sep: str = '-') -> str:
    """
    Convert a sequence of 1-letter amino acid symbols to 3-letter symbols.

    :param seq: sequence of 1-letter symbols present in AMINO_ACIDS_NAMES
    :param sep: string placed between the 3-letter symbols
        (a hyphen by default)
    :return: the 3-letter symbols joined by sep; an empty string for
        an empty sequence
    :raises KeyError: if a symbol is missing from AMINO_ACIDS_NAMES
    """
    # join() builds the result in one pass and never leaves a trailing
    # separator that would have to be sliced off afterwards.
    return sep.join(AMINO_ACIDS_NAMES[amino_acid] for amino_acid in seq)
|
||
|
||
def sequence_length(seq: str) -> int:
    """
    Return the number of amino acids (symbols) in the given sequence.
    """
    residue_count = len(seq)
    return residue_count
|
||
|
||
def calc_protein_mass(seq: str, avg_aa_mass: int = 110) -> int:
    """
    Estimate protein molecular weight from the sequence length.

    The mass is the number of residues multiplied by an average
    amino acid mass.

    :param seq: amino acid sequence, one symbol per residue
    :param avg_aa_mass: average molecular weight of one amino acid
        in Da (110 by default)
    :return: estimated molecular weight in Da; 0 for an empty sequence
    """
    return len(seq) * avg_aa_mass
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. У вас же есть функция return sequence_length(seq) * 110 |
||
|
||
|
||
def find_heaviest_proteins(sequence: list):
    """
    Find the heaviest protein(s) in a list of sequences.

    Every sequence is mapped to its mass with calc_protein_mass and
    the mapping is handed to count_uniq_max_mass, whose result is
    returned unchanged. Repeated sequences collapse into one entry.
    """
    mass_by_protein = {
        protein: calc_protein_mass(protein) for protein in sequence
    }
    return count_uniq_max_mass(mass_by_protein)
Comment on lines
+130
to
+138
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Все те же самые комменты, что и для функции |
||
|
||
|
||
def count_uniq_max_mass(protein_mass: dict) -> str:
    """
    Collect every protein that has the maximum mass.

    :param protein_mass: mapping of protein sequence -> mass
    :return: string of the form "[<sequences>] - <mass>", listing the
        sequences with the maximum mass in mapping order
    :raises ValueError: if protein_mass is empty
    """
    if not protein_mass:
        raise ValueError("No proteins given: cannot find the maximum mass")
    max_weight = max(protein_mass.values())
    # Keep only the proteins whose own mass equals the maximum.
    proteins = [protein
                for protein, weight in protein_mass.items()
                if weight == max_weight]
    return f'{proteins} - {max_weight}'
Comment on lines
+141
to
+154
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Все те же самые комменты, что и для функции |
||
|
||
|
||
def find_lightest_proteins(sequence: list):
    """
    Find the lightest protein(s) in a list of sequences.

    Every sequence is mapped to its mass with calc_protein_mass and
    the mapping is handed to count_uniq_min_mass, whose result is
    returned unchanged. Repeated sequences collapse into one entry.
    """
    mass_by_protein = {
        protein: calc_protein_mass(protein) for protein in sequence
    }
    return count_uniq_min_mass(mass_by_protein)
|
||
|
||
def count_uniq_min_mass(protein_mass: dict) -> str:
    """
    Collect every protein that has the minimum mass.

    :param protein_mass: mapping of protein sequence -> mass
    :return: string of the form "[<sequences>] - <mass>", listing the
        sequences with the minimum mass in mapping order
    :raises ValueError: if protein_mass is empty
    """
    if not protein_mass:
        raise ValueError("No proteins given: cannot find the minimum mass")
    min_weight = min(protein_mass.values())
    # Keep only the proteins whose own mass equals the minimum.
    proteins = [protein
                for protein, weight in protein_mass.items()
                if weight == min_weight]
    return f'{proteins} - {min_weight}'
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Тут надо бы было делать просто |
||
|
||
|
||
def check_sequences(seqs: list):
    """
    Validate a list of amino acid sequences.

    Raises ValueError if seqs is not a list, if any element is not a
    string, or if any sequence (compared case-insensitively) contains
    a symbol outside VALID_SYMBOLS. Returns None otherwise.
    """
    if not isinstance(seqs, list):
        raise ValueError("Enter valid protein sequence")
    for candidate in seqs:
        if not isinstance(candidate, str):
            raise ValueError("Enter valid protein sequence")
        if not VALID_SYMBOLS.issuperset(candidate.upper()):
            raise ValueError("Enter valid protein sequence")
|
||
|
||
# The dispatch tables are declared after the functions they reference,
# because those names must already exist when the dictionaries are built.

# Operations that take one sequence string.
FUNC_STR_INPUT = dict(
    gravy=calc_gravy,
    iso=calc_iso_point,
    rename=transform_to_three_letters,
    lengths=sequence_length,
    molw=calc_protein_mass,
)

# Operations that take the whole list of sequences.
FUNC_LIST_INPUT = dict(
    heavy=find_heaviest_proteins,
    light=find_lightest_proteins,
)
|
||
|
||
def process_seqs(option: str, seqs: list):
    """
    Run one of the supported operations on amino acid sequences.

    seqs may be a list of sequence strings or a single sequence string,
    which is treated as a one-element list. The sequences are validated
    with check_sequences before anything is computed.

    If option is a key of FUNC_STR_INPUT, the matching function is
    applied to the upper-cased form of every sequence and the list of
    results is returned. If option is a key of FUNC_LIST_INPUT, the
    matching function receives the whole list (sequences unchanged) and
    its result is returned.

    Raises ValueError for invalid sequences or an unknown option.
    """
    if isinstance(seqs, str):
        seqs = [seqs]
    check_sequences(seqs)

    if option in FUNC_STR_INPUT:
        operation = FUNC_STR_INPUT[option]
        return [operation(seq.upper()) for seq in seqs]
    if option in FUNC_LIST_INPUT:
        return FUNC_LIST_INPUT[option](seqs)
    raise ValueError("Enter valid operation")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# ProtSeqO | ||
|
||
## Tool for PROtein SEQuences Operation | ||
|
||
*This is the repo for the fourth homework of the BI Python 2023 course* | ||
|
||
This tool can perform some simple operations on amino acid sequences: | ||
* help you calculate protein lengths, molecular weights, isoelectric points and GRAVY values | ||
* find and show you heaviest and lightest proteins | ||
* rewrite 1-letter sequence to 3-letter sequence | ||
|
||
## How to use ProtSeqO | ||
Start Python from the directory that contains the script: | ||
```bash | ||
python3 | ||
>>> from ProtSeqO import process_seqs | ||
>>> print(process_seqs(__command__, __sequence or list of sequences__)) | ||
``` | ||
|
||
You can pass `process_seqs()` a single sequence as a string or a list of sequence strings. __Pay attention__: your sequence(s) must contain only 1-letter symbols (case does not matter) of the 20 common amino acids ('U' for selenocysteine and 'O' for pyrrolysine are not allowed). | ||
|
||
The command must be a string with one of the following options. | ||
|
||
## ProtSeqO options | ||
* 'lengths' - return list with numbers of AA in each sequence(s) | ||
* 'molw' - return list of protein molecular weight (use the average molecular weight of AA, 110 Da) | ||
* 'iso' - return list of approximate isoelectric point of given amino acids sequence | ||
* 'gravy' - return list of GRAVY (grand average of hydropathy) values | ||
* 'rename' - return list of sequences in 3-letter AA code (AA separated by hyphens) | ||
* 'heavy' - return the sequence(s) with the maximum molecular weight and the weight value | ||
* 'light' - return the sequence(s) with the minimum molecular weight and the weight value | ||
|
||
## ProtSeqO usage examples | ||
```python | ||
python3 | ||
>>> from ProtSeqO import process_seqs | ||
>>> print(process_seqs('iso', ['ACGTWWA', 'ILATTWP'])) | ||
### [5.8, 6.0] | ||
>>> print(process_seqs('gravy', 'ilattwp')) | ||
### [0.886] | ||
>>> print(process_seqs('rename', ['ACGTwwa'])) | ||
### ['Ala-Cys-Gly-Thr-Trp-Trp-Ala'] | ||
>>> print(process_seqs('heavy', ['ILATTWP', 'ACGTwwa'])) | ||
### ['ILATTWP', 'ACGTwwa'] - 770 | ||
``` | ||
|
||
## In case of problems, contact us on GitHub | ||
___Developers___: | ||
* Petrikov Kirill | ||
* Muradova Gulgaz | ||
* Yury Popov | ||
|
||
 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Не оч понял, а кто 4-ый девелопер ))) |
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Тут не оч хорошо использовать список. Лучше бы сделать тут вот так: