Skip to content

Commit 925b650

Browse files
authored
rewrite clean_fasta_cdna_cds.py
1 parent 54fb18c commit 925b650

File tree

1 file changed

+82
-163
lines changed

1 file changed

+82
-163
lines changed
Lines changed: 82 additions & 163 deletions
Original file line numberDiff line numberDiff line change
@@ -1,194 +1,113 @@
11

22
from Bio import SeqIO
3+
from Bio.Seq import Seq
34
import sys
4-
from os import listdir
5+
from os import listdir
6+
import os
57

6-
def read_fasta_files(input_folder, format_input="fna"):
    """Parse every FASTA file in a folder whose extension matches.

    Args:
        input_folder: Directory holding the FASTA files. A trailing path
            separator is accepted but not required.
        format_input: File extension (without the dot) that a file must
            have to be read.

    Returns:
        A ``(file_names, records_all)`` tuple. ``file_names`` lists the
        accepted file names and ``records_all`` holds, in the same order,
        the list of records parsed from each of those files.
    """
    file_names = []
    records_all = []
    # listdir() returns entries in arbitrary order; sorting keeps the order
    # of the returned files (and anything derived from it) reproducible.
    for file in sorted(listdir(input_folder)):
        path = os.path.join(input_folder, file)
        if file.split(".")[-1] != format_input:
            # Report the extension that was actually requested.
            print("we are not reading the file " + path + " since extension is not " + format_input + ".")
            continue
        file_names.append(file)
        records_all.append(list(SeqIO.parse(path, "fasta")))

    if records_all:
        print("there are ", len(file_names), format_input, " files, and the first file has ", len(records_all[0]), "sequences in it.")
    else:
        print("there is no " + format_input, " files in ", input_folder)
    return file_names, records_all
2929

3030

31-
def create_five_letter(file_names, output_five_letter_tsv="clean_five_letter_species.tsv"):
    """Assign a sequential species code to each file and save the mapping.

    Each code is "s" followed by the zero-padded position of the file in
    ``file_names`` ("s0000", "s0001", ...), so codes are exactly five
    characters long for the first 10000 files.

    Args:
        file_names: File names to assign codes to, in the desired order.
        output_five_letter_tsv: Path of the tab-separated file that receives
            one ``file_name<TAB>code`` line per entry.

    Returns:
        A dict mapping each file name to its species code.
    """
    fiveLetter_species_dic = {
        file_name: "s" + str(index).zfill(4)
        for index, file_name in enumerate(file_names)
    }

    # The context manager closes the file even if a write fails.
    with open(output_five_letter_tsv, "w") as file_out:
        for species_name, fiveLetter in fiveLetter_species_dic.items():
            file_out.write(species_name + "\t" + fiveLetter + "\n")
    print("the five letter codes for each faa files are written in "+output_five_letter_tsv)

    return fiveLetter_species_dic
5946

6047

61-
def clean_translate(records, species_fivelet):
    """Build renamed nucleotide records and their protein translations.

    For every input record the sequence is padded with "N" up to a multiple
    of three, the record id is stripped of "_" and "." and prefixed with the
    species code, and the padded sequence is translated. The input records
    themselves are left untouched.

    Args:
        records: Iterable of sequence records exposing ``.seq`` and ``.id``.
        species_fivelet: Species code prepended to every new record id.

    Returns:
        A ``(records_nuc, records_aa)`` tuple of lists holding, in input
        order, the cleaned nucleotide records and the translated records.
    """
    records_nuc = []
    records_aa = []
    for record in records:
        sequence = record.seq
        remainder = len(sequence) % 3
        if remainder:
            # Pad to whole codons so translate() never sees a partial one.
            # A new object is built; the caller's record is not modified.
            sequence = sequence + Seq("N" * (3 - remainder))

        id_new = species_fivelet + str(record.id).replace("_", "").replace(".", "")

        records_nuc.append(
            SeqIO.SeqRecord(sequence, id=id_new, description="cleaned for r2t", name=id_new)
        )
        records_aa.append(
            SeqIO.SeqRecord(sequence.translate(), id=id_new, description="cleaned for r2t", name=id_new)
        )

    print("the clean aa and nuc for "+species_fivelet+" is ready")

    return records_nuc, records_aa
75+
13776

13877

13978

140-
if __name__ == '__main__':
    # Script entry point: read every .fna file in the folder given as the
    # first argument, write one translated FASTA per species into
    # "clean_aa/", and write all cleaned nucleotide records to "dna_ref.fa".

    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit("usage: python " + sys.argv[0] + " <input_folder_fna>")

    input_folder_fna = sys.argv[1] + "/"  # "myfolder/input_fna/"

    file_names, records_all = read_fasta_files(input_folder_fna, "fna")
    fiveLetter_species_dic = create_five_letter(file_names)

    folder_aa = "clean_aa"
    if not os.path.exists(folder_aa):
        os.makedirs(folder_aa)
    else:
        # Existing output is only reported; files in it may be overwritten.
        print("ERROR the folder exists "+folder_aa +" better to remove it ")

    records_nuc_all_clean = []
    # file_names and records_all are parallel lists.
    for file_name, records in zip(file_names, records_all):
        species_fivelet = fiveLetter_species_dic[file_name]

        records_nuc, records_aa = clean_translate(records, species_fivelet)

        SeqIO.write(records_aa, folder_aa+"/"+species_fivelet+".fa", "fasta")

        records_nuc_all_clean += records_nuc  # one big list

    SeqIO.write(records_nuc_all_clean, "dna_ref.fa", "fasta")

    print("we wrote "+str(len(file_names))+" faa fiels in the folder "+folder_aa+" and the nucluetide sequences all together in dna_ref.fa" )

    print("Now you can use the folder with OMA standalone" )

0 commit comments

Comments
 (0)