-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbioprocessor.py
92 lines (80 loc) · 3.49 KB
/
bioprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import spacy
import pysolr
import re
class BioProcessor:
    """Find disease and drug mentions in text and resolve them against Solr.

    Mentions are detected with a spaCy NER pipeline (entity labels
    ``DISEASE`` and ``CHEMICAL``) and, for drugs, with a small list of
    name stems.  Each distinct mention is then looked up in a Solr index
    and the top-ranked document, if any, is converted into a plain dict.
    """

    # Runs of non-word characters are collapsed into a single space before a
    # mention is embedded in a Solr query.  The cleaned label therefore never
    # contains quotes or other query syntax.
    _NON_WORD = re.compile(r'\W+')

    # A token containing any of these substrings is treated as a drug
    # candidate even when the NER model does not tag it.
    _DRUG_STEMS = ("vir", "feron", "umab")

    # (Solr field, output key) pairs copied into each drug record when the
    # Solr document carries the field.
    _DRUG_FIELDS = (
        ("label_t", "name"),
        ("code_s", "atc_code"),
        ("parent_s", "atc_parent"),
        ("cui_s", "cui"),
        ("level_i", "level"),
    )

    def __init__(self,
                 model="en_ner_bc5cdr_md",
                 atc_url='http://librairy.linkeddata.es/solr/atc',
                 diseases_url='http://librairy.linkeddata.es/solr/diseases',
                 timeout=10):
        """Load the NER pipeline and create the two Solr clients.

        Args:
            model: Name of the spaCy pipeline passed to ``spacy.load``.
            atc_url: URL of the Solr core queried by ``get_drugs``.
            diseases_url: URL of the Solr core queried by ``get_diseases``.
            timeout: Timeout handed to both ``pysolr.Solr`` clients.
        """
        self.nlp = spacy.load(model)
        self.solr = pysolr.Solr(atc_url, timeout=timeout)
        self.solr_diseases = pysolr.Solr(diseases_url, timeout=timeout)

    @classmethod
    def _unique_labels(cls, mentions):
        """Clean each mention and return the distinct, non-blank labels.

        Order of first appearance is preserved so results are deterministic.
        """
        cleaned = (cls._NON_WORD.sub(' ', mention).strip() for mention in mentions)
        return list(dict.fromkeys(label for label in cleaned if label))

    @staticmethod
    def _disease_query(label):
        """Build the boosted name/synonyms/mappings phrase query for *label*."""
        # NOTE(review): the lowercase "or" is kept exactly as before.  Solr's
        # standard query parser documents uppercase OR as the operator, so
        # confirm how the target request handler parses this.
        return ("name_t:\"" + label + "\"^100 or synonyms:\"" + label
                + "\"^10 or mappings:\"" + label + "\"^1")

    @staticmethod
    def _first_hit(index, query):
        """Run *query* on *index* and return the first document, or None."""
        return next(iter(index.search(query)), None)

    def get_diseases(self, text):
        """Return one record per distinct disease mention found in *text*.

        Args:
            text: Raw text to analyse.

        Returns:
            A list of dicts with the keys ``name``, ``code`` and ``level``,
            taken from the ``name_t``, ``id`` and ``level_i`` fields of the
            first Solr document matching each mention.  Mentions without a
            match are omitted.

        Raises:
            KeyError: If a matching Solr document lacks one of those fields.
        """
        doc = self.nlp(text)
        mentions = [
            entity.text
            for entity in doc.ents
            if entity.label_ == "DISEASE" and len(entity.text) > 2
        ]

        diseases = []
        # De-duplicate on the cleaned text, not on the entity objects, so a
        # disease mentioned several times is looked up and reported once.
        for label in self._unique_labels(mentions):
            hit = self._first_hit(self.solr_diseases, self._disease_query(label))
            if hit is None:
                # Long mentions get one retry without their last word.
                words = label.split()
                if len(words) > 3:
                    shorter = " ".join(words[:-1])
                    hit = self._first_hit(self.solr_diseases,
                                          self._disease_query(shorter))
            if hit is None:
                continue
            diseases.append({
                "name": hit["name_t"],
                "code": hit["id"],
                "level": hit["level_i"],
            })
        return diseases

    def get_drugs(self, text):
        """Return one record per distinct drug mention found in *text*.

        Candidates are lowercased tokens containing one of ``_DRUG_STEMS``
        plus lowercased ``CHEMICAL`` entities longer than two characters.

        Args:
            text: Raw text to analyse.

        Returns:
            A list of dicts built from the first Solr document matching each
            candidate.  A key (``name``, ``atc_code``, ``atc_parent``,
            ``cui``, ``level``) is present only when the document carries the
            corresponding field.  Candidates without a match are omitted.
        """
        doc = self.nlp(text)

        mentions = []
        for token in doc:
            lowered = token.text.lower()
            if any(stem in lowered for stem in self._DRUG_STEMS):
                mentions.append(lowered)
        mentions.extend(
            entity.text.lower()
            for entity in doc.ents
            if entity.label_ == "CHEMICAL" and len(entity.text) > 2
        )

        drugs = []
        for label in self._unique_labels(mentions):
            # Quote the label so a multi-word name is matched as one phrase
            # in label_t instead of leaking its later words into the default
            # field.
            hit = self._first_hit(self.solr, "label_t:\"" + label + "\"")
            if hit is None:
                continue
            drugs.append({
                out_key: hit[field]
                for field, out_key in self._DRUG_FIELDS
                if field in hit
            })
        return drugs