-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
105 lines (91 loc) · 3.63 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from bs4 import BeautifulSoup
import requests
import nltk
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK resources used below (each call is a no-op if the
# resource is already cached locally): English stopword lists, the
# Punkt sentence tokenizer, and WordNet for the lemmatizer.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
class AI():
    """Best-effort question answering over an HTML document.

    Extracts paragraph text (<p> and <dd> tags) from the given HTML,
    splits it into sentences, and answers ``question`` by TF-IDF cosine
    similarity between the question and the document sentences. The
    best-matching paragraph is printed to stdout (``False`` when no
    sentence matches at all).
    """

    def __init__(self, html, question):
        """Parse ``html`` and immediately attempt to answer ``question``.

        html: URL-encoded HTML document text.
        question: natural-language question to answer.
        """
        self.end_chat = False
        self.got_topic = False
        self.do_not_respond = True
        self.title = None
        self.text_data = []      # one string per extracted <p>/<dd> paragraph
        self.sentences = []      # every sentence across all paragraphs
        self.para_indices = []   # sentence index -> owning paragraph index
        self.current_sent_idx = None
        # Translation table that deletes every ASCII punctuation character.
        self.punctuation_dict = str.maketrans({p: None for p in punctuation})
        self.lemmatizer = nltk.stem.WordNetLemmatizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.contentHtml = requests.utils.unquote(html)
        # BUG FIX: the question was hard-coded to a leftover debug string
        # ("when indonesia independents day?"), silently discarding the
        # caller's argument.
        self.question = question
        self.initHtml()

    def sinkronTokenizer(self):
        """Score every document sentence against the question and print
        the paragraph containing the best match, or ``False`` when no
        sentence has any similarity to the question."""
        vectorizer = TfidfVectorizer(tokenizer=self.preprocess)
        # Temporarily append the question so it is vectorized in the
        # same TF-IDF space as the document sentences.
        self.sentences.append(self.question)
        try:
            tfidf = vectorizer.fit_transform(self.sentences)
            scores = cosine_similarity(tfidf[-1], tfidf)
            # The highest score is the question matched against itself;
            # take the second highest as the best document sentence.
            self.current_sent_idx = scores.argsort()[0][-2]
            scores = scores.flatten()
            scores.sort()
            value = scores[-2]
            if value != 0:
                # BUG FIX: getQP() returns an empty list when the question
                # has no wh-word; unconditional [0] raised IndexError.
                q_types = self.getQP()
                prefix = q_types[0] if q_types else ''
                # BUG FIX: '+' binds tighter than 'or', so the original
                # '(prefix + paragraph) or sentence' never fell back to the
                # sentence as intended; parenthesize the fallback.
                answer = (self.text_data[self.para_indices[self.current_sent_idx]]
                          or self.sentences[self.current_sent_idx])
                print(prefix + answer)
            else:
                print(False)
        finally:
            # BUG FIX: always remove the appended question — previously an
            # exception above left it in self.sentences.
            del self.sentences[-1]

    def getQP(self):
        """Return the list of question-type prefixes ('who ', 'what ',
        'when ', 'where ', 'why ') for each stemmed wh-word found in the
        question, in order of appearance (empty when none is present)."""
        stemmer = nltk.PorterStemmer()
        stems = [stemmer.stem(w) for w in self.question.split()]
        # NOTE: the original also tested for the phrase 'name of' and set
        # an unused 'list_to_search' — both dead code (a single split word
        # can never equal a two-word phrase); removed.
        prefixes = {
            'who': 'who ',
            'what': 'what ',
            'when': 'when ',
            'where': 'where ',
            'why': 'why ',
        }
        return [prefixes[s] for s in stems if s in prefixes]

    def initHtml(self):
        """Extract paragraph text from the HTML, sentence-tokenize it,
        record sentence-to-paragraph indices, and kick off answering.

        Best-effort: any failure is printed rather than raised.
        """
        try:
            soup = BeautifulSoup(self.contentHtml, 'html.parser')
            # Keep the original ordering: all <p> paragraphs first,
            # then all <dd> entries.
            p_tags = soup.find_all('p')
            dd_tags = soup.find_all('dd')
            for tag in p_tags + dd_tags:
                parts = []
                for child in tag.contents:
                    # Skip footnote markers (<sup>) and non-text nodes;
                    # collapse internal whitespace runs to single spaces.
                    if child.name != 'sup' and child.string is not None:
                        parts.append(' '.join(child.string.strip().split()))
                self.text_data.append(' '.join(parts))
            for idx, para in enumerate(self.text_data):
                sents = nltk.sent_tokenize(para)
                self.sentences.extend(sents)
                self.para_indices.extend([idx] * len(sents))
            # BUG FIX: pages without an <h1> made .string raise
            # AttributeError on None; guard explicitly.
            heading = soup.find('h1')
            self.title = heading.string if heading is not None else None
            self.got_topic = True
            self.sinkronTokenizer()
        except Exception as e:
            # Best-effort behavior preserved: report instead of crashing.
            print(e)

    def preprocess(self, text):
        """Tokenizer handed to TfidfVectorizer: lowercase, strip
        punctuation, word-tokenize, drop English stopwords, and lemmatize.
        Returns the list of processed tokens."""
        text = text.lower().strip().translate(self.punctuation_dict)
        words = nltk.word_tokenize(text)
        return [self.lemmatizer.lemmatize(w) for w in words
                if w not in self.stopwords]
# Idiom fix: guard the script entry point so importing this module does
# not block on stdin; behavior when run as a script is unchanged.
if __name__ == "__main__":
    # First stdin line: URL-encoded HTML document; second: the question.
    html = input()
    question = input()
    AI(html, question)