app_final.py
import streamlit as st
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('fivethirtyeight')
import seaborn as sns
import time
import warnings
warnings.filterwarnings('ignore')
# Load the fitted vectorizer (vocabulary) for each toxicity category
# rb: read bytes
with open(r"./models/toxic_vect.pkl", "rb") as f:
    tox = pickle.load(f)
with open(r"./models/severe_toxic_vect.pkl", "rb") as f:
    sev = pickle.load(f)
with open(r"./models/obscene_vect.pkl", "rb") as f:
    obs = pickle.load(f)
with open(r"./models/insult_vect.pkl", "rb") as f:
    ins = pickle.load(f)
with open(r"./models/threat_vect.pkl", "rb") as f:
    thr = pickle.load(f)
with open(r"./models/identity_hate_vect.pkl", "rb") as f:
    ide = pickle.load(f)
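# Each vectorizer pickle is expected to hold a fitted text vectorizer
# (a CountVectorizer or TfidfVectorizer, per the imports above) whose
# .transform() maps raw text into the feature space its paired model expects.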
# Load the ML models
with open(r"./models/toxic_model.pkl", "rb") as f:
    tox_model = pickle.load(f)
with open(r"./models/severe_toxic_model.pkl", "rb") as f:
    sev_model = pickle.load(f)
with open(r"./models/obscene_model.pkl", "rb") as f:
    obs_model = pickle.load(f)
with open(r"./models/insult_model.pkl", "rb") as f:
    ins_model = pickle.load(f)
with open(r"./models/threat_model.pkl", "rb") as f:
    thr_model = pickle.load(f)
with open(r"./models/identity_hate_model.pkl", "rb") as f:
    ide_model = pickle.load(f)
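# All six (vectorizer, model) pairs are used further down with the same
# transform -> predict_proba pattern. The helper below is a minimal sketch of
# that pattern for reference; it is not wired into the app and assumes each
# model is a scikit-learn-style classifier exposing predict_proba, which
# matches how the models are called in the Predict block.
def toxicity_probability(vectorizer, model, texts):
    """Return the probability of the positive (toxic) class for each text."""
    features = vectorizer.transform(texts)
    return model.predict_proba(features)[:, 1]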
code_repo = '[Github repo](https://github.com/cristobalza/toxic_nlp_project)'
ibm_docu = '[here](https://developer.ibm.com/technologies/artificial-intelligence/models/max-toxic-comment-classifier/)'
kaggle_link = '[Kaggle Competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)'
# Title
st.title('Toxic Word Detector App')
st.markdown('## Introduction')
st.markdown('\n\n')
st.markdown("When we engage in conversations on online platforms, we usually know the intention behind our words and what we are trying to convey with them. Sometimes, however, we do not consider the possible consequences of our words and end up hurting other people. There are also people who cyberbully others just for fun, for the sake of being mean, or for some other reason. Either way, the Internet can be a place where words easily inflict great damage on entire communities, with or without the sender's knowledge. This app tries to detect the toxicity of words.")
st.markdown("Detecting toxicity in a binary way is a risky task that can end up doing more harm than good. This app does not try to detect cyberbullies and then silence them. Instead, it aims to mitigate cyberbullying by showing users why the things they say can hurt other people's feelings, and that often it is not a single word but the context of the sentence that causes the harm.")
st.markdown("This app uses Machine Learning to score your text against different categories of language toxicity: `toxic, severe toxic, obscene, insult, threat, identity hate`. To see more about the models and the work pipeline used in this project, you can visit my "+code_repo+".", unsafe_allow_html=True)
st.markdown("It is important for the user/reader to know that the data used to train the models for this project comes from a "+kaggle_link+", and the documentation of the toxicity categories can be found "+ibm_docu+".", unsafe_allow_html=True)
st.info("This app may show inaccurate results, so please do not take them at face value. The app is still under construction and is by no means ready for real-world use, such as moderating social media. \n \n Thanks for your understanding.")
# status = st.radio("Do you understand the above message? If so, please answer to continue with the app:", ("Yes", "No"))
# if status == 'Yes':
if st.checkbox("Check this box if you understand the above message and want to continue with the app."):
    # Text input
    st.markdown('#### Instructions')
    st.markdown('- Step 1: Enter the text you want to analyze.')
    st.markdown('- Step 2: Press the `Predict` button and see how toxic your text is.')
    time.sleep(5)
    text = st.text_area('Enter your text', 'Type here')
    if st.button('Predict'):
        # Take a string input from the user
        user_input = text
        data = [user_input]
        vect = tox.transform(data)
        pred_tox = tox_model.predict_proba(vect)[:, 1]
        vect = sev.transform(data)
        pred_sev = sev_model.predict_proba(vect)[:, 1]
        vect = obs.transform(data)
        pred_obs = obs_model.predict_proba(vect)[:, 1]
        vect = thr.transform(data)
        pred_thr = thr_model.predict_proba(vect)[:, 1]
        vect = ins.transform(data)
        pred_ins = ins_model.predict_proba(vect)[:, 1]
        vect = ide.transform(data)
        pred_ide = ide_model.predict_proba(vect)[:, 1]
        # Round it and multiply by 100
        out_tox = round(pred_tox[0], 2) * 100
        out_sev = round(pred_sev[0], 2) * 100
        out_obs = round(pred_obs[0], 2) * 100
        out_ins = round(pred_ins[0], 2) * 100
        out_thr = round(pred_thr[0], 2) * 100
        out_ide = round(pred_ide[0], 2) * 100
        bar_labels = ['toxic', 'severe_toxic', 'obscene', 'insult', 'threat', 'identity_hate']
        data = [out_tox, out_sev, out_obs, out_ins, out_thr, out_ide]
        st.set_option('deprecation.showPyplotGlobalUse', False)
        # Graph
        # Note: bar_labels and data were built in the same order, so no
        # reversal is needed when constructing the DataFrame.
        df = pd.DataFrame(data=[data], columns=bar_labels)
        # height = data
        # bars = bar_labels
        # y_pos = np.arange(len(bars))
        # # Create horizontal bars
        # plt.barh(y_pos, height)
        # # Format graph
        # plt.yticks(y_pos, bars)
        sns.barplot(data=df, orient='h', palette='inferno')
        plt.xlim(0, 100)
        plt.xlabel("% Toxicity Level Detected")
        st.pyplot()
        cols = df.columns
        df.rename(columns={i: i + ' %' for i in cols}, inplace=True)
        st.table(df)
        # End
        st.balloons()
# else:
#     st.error(":(")
link = '[cristobalza.com](http://cristobalza.com)'
st.markdown('See my website and other projects @ '+link, unsafe_allow_html=True)
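# Usage note (assuming Streamlit and the pickled files under ./models/ are
# available locally): the app can be started from this directory with
#     streamlit run app_final.py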