-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
154 lines (96 loc) · 3.44 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Core Pkgs
import streamlit as st  # UI framework the whole app is built on
# NLP
import neattext.functions as nfx  # text-cleaning helpers (remove_stopwords, remove_urls, ...)
# EDA
import pandas as pd
# Text Downloader
import base64  # used to embed results in data-URI download links
import time
# Timestamp captured once at import; shared by all generated download filenames.
timestr = time.strftime("%Y%m%d-%H%M%S")
# Data Viz Pkgs
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
# Non-interactive backend: figures are rendered via st.pyplot, never shown in a window.
matplotlib.use("Agg")
from wordcloud import WordCloud
def plot_wordcloud(my_text):
    """Render a word cloud of *my_text* inside the Streamlit app."""
    cloud = WordCloud().generate(my_text)
    figure = plt.figure()
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")  # hide axis ticks/frame around the image
    st.pyplot(figure)
# Load NLP Pkgs
import spacy
# Loaded once at module import and reused by text_analyzer.
# NOTE(review): requires the model to be installed
# (`python -m spacy download en_core_web_sm`) — confirm deployment includes it.
nlp = spacy.load('en_core_web_sm')
# Fxns
def text_analyzer(my_text):
    """Tokenize *my_text* with spaCy and tabulate per-token attributes.

    Returns a DataFrame with one row per token and the columns:
    Token, Shape, PoS, Tag, Lemma, IsAlpha, Is_Stopword.
    """
    columns = ['Token', 'Shape', 'PoS', 'Tag', 'Lemma', 'IsAlpha', 'Is_Stopword']
    rows = [
        (tok.text, tok.shape_, tok.pos_, tok.tag_, tok.lemma_, tok.is_alpha, tok.is_stop)
        for tok in nlp(my_text)
    ]
    return pd.DataFrame(rows, columns=columns)
def text_downloader(raw_text):
    """Offer *raw_text* for download as a .txt file via a base64 data-URI link."""
    encoded = base64.b64encode(raw_text.encode()).decode()
    filename = f"clean_text_result_{timestr}_.txt"
    st.markdown("### ** 📩 ⬇️ Download Cleaned Text file **")
    link = f'<a href="data:file/txt;base64,{encoded}" download="{filename}">Click here!</a>'
    # unsafe_allow_html is required so the raw <a> tag is rendered as a link
    st.markdown(link, unsafe_allow_html=True)
def make_downloadable(data):
    """Offer DataFrame *data* for download as a CSV via a base64 data-URI link."""
    as_csv = data.to_csv(index=False)
    encoded = base64.b64encode(as_csv.encode()).decode()
    filename = f"nlp_result_{timestr}_.csv"
    st.markdown("### ** 📩 ⬇️ Download CSV file **")
    link = f'<a href="data:file/csv;base64,{encoded}" download="{filename}">Click here!</a>'
    # unsafe_allow_html is required so the raw <a> tag is rendered as a link
    st.markdown(link, unsafe_allow_html=True)
def main():
    """Streamlit entry point.

    Sidebar menu selects between the TextCleaner page (upload a .txt file,
    apply checkbox-selected cleaning steps, inspect tokens, plot a word
    cloud and PoS counts) and an About page.
    """
    st.title("Text Cleaner App")
    menu = ["TextCleaner", "About"]
    choice = st.sidebar.selectbox("Menu", menu)
    if choice == "TextCleaner":
        st.subheader("Text Cleaning")
        text_file = st.file_uploader("Upload Txt File", type=['txt'])
        # Cleaning-step toggles (applied in order below).
        normalize_case = st.sidebar.checkbox("Normalize Case")
        clean_stopwords = st.sidebar.checkbox("Stopwords")
        clean_punctuations = st.sidebar.checkbox("Punctuations")
        clean_emails = st.sidebar.checkbox("Emails")
        clean_special_char = st.sidebar.checkbox("Special Characters")
        clean_numbers = st.sidebar.checkbox("Numbers")
        clean_urls = st.sidebar.checkbox("URLS")
        if text_file is not None:
            file_details = {"Filename": text_file.name, "Filesize": text_file.size, "Filetype": text_file.type}
            st.write(file_details)
            # Decode Text (assumes the uploaded file is UTF-8)
            raw_text = text_file.read().decode('utf-8')
            # NOTE(review): beta_columns/beta_expander were removed in
            # Streamlit 1.0 — on modern Streamlit use st.columns/st.expander.
            col1, col2 = st.beta_columns(2)
            with col1:
                with st.beta_expander("Original Text"):
                    st.write(raw_text)
            with col2:
                with st.beta_expander("Processed Text"):
                    if normalize_case:
                        raw_text = raw_text.lower()
                    if clean_stopwords:
                        raw_text = nfx.remove_stopwords(raw_text)
                    if clean_numbers:
                        raw_text = nfx.remove_numbers(raw_text)
                    if clean_urls:
                        raw_text = nfx.remove_urls(raw_text)
                    if clean_punctuations:
                        raw_text = nfx.remove_punctuations(raw_text)
                    # Fix: these two checkboxes were rendered but never
                    # applied — wire them into the cleaning pipeline.
                    if clean_emails:
                        raw_text = nfx.remove_emails(raw_text)
                    if clean_special_char:
                        raw_text = nfx.remove_special_characters(raw_text)
                    st.write(raw_text)
                    text_downloader(raw_text)
            with st.beta_expander("Text Analysis"):
                token_result_df = text_analyzer(raw_text)
                st.dataframe(token_result_df)
                make_downloadable(token_result_df)
            with st.beta_expander("Plot Wordcloud"):
                plot_wordcloud(raw_text)
            with st.beta_expander("Plot POS Tags"):
                fig = plt.figure()
                sns.countplot(token_result_df['PoS'])
                plt.xticks(rotation=45)  # tilt tag labels so they don't overlap
                st.pyplot(fig)
    else:
        st.subheader("About")
# Run the app only when executed directly (e.g. `streamlit run app.py`),
# not when imported as a module.
if __name__ == '__main__':
    main()