-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
165 lines (122 loc) · 5.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import pandas as pd
import numpy as np
import nltk
#nltk.download()
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
# Request the NLTK data packages this script asks for, in the original order.
for nltk_resource in (
    'omw-1.4',
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
):
    nltk.download(nltk_resource)
import streamlit as st
# Load the job postings table; the path is relative to the working directory
# the app is started from.
df = pd.read_csv("./clean_data.csv")

# Switch off two Streamlit deprecation notices.
# NOTE(review): these option keys may not exist in every Streamlit release --
# confirm against the version the app is deployed with.
for deprecation_flag in (
    'deprecation.showPyplotGlobalUse',
    'deprecation.showfileUploaderEncoding',
):
    st.set_option(deprecation_flag, False)

# Page heading followed by the two hint lines shown under it.
st.title('Job Recommender System')
for hint in (
    "Navigate to side bar to see project info",
    "See below for options",
):
    st.subheader(hint)

# CSS injected into the page to hide Streamlit's main menu and footer.
hide_streamlit_style = '''
<style>
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}
</style>
'''
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

# Sidebar copy, rendered top to bottom with an empty header between sections.
sidebar_sections = (
    """
----------
## Project Overview
This is a Job recommendation web app that uses filtering techniques and Natural Language Processing (NLP)
to suggest 10 top jobs to user upon entering a specific job/role (and probably other preferences).
""",
    """
----------
## Text data conversion method is "TF-IDF"
Term Frequency - Inverse Document Frequency (TF-IDF) converts text data to vectors as model can only process numerical data; it weights the word counts by measure of how often they appear in the dataset
""",
    """
----------
## NOTE:
If the Job/your preferences could not be matched with the available jobs, the overview of job data will be returned with their scores all labeled as "0.0"
""",
)
for section_number, section_text in enumerate(sidebar_sections):
    if section_number > 0:
        st.sidebar.header("")  # empty header acts as a spacer
    st.sidebar.markdown(section_text)
# Free-text query from the user, held in a one-row frame with a "Text" column
# so it can be passed to the vectorizers like the corpus column.
user_input = st.text_input("Enter any job you want recommendation(s) on")
from_user = pd.DataFrame(data=[user_input], columns=["Text"]).reset_index(drop=True)
st.text("Vectorizing Method: TF-IDF")

# Replace missing values with empty strings before concatenating columns.
df.fillna("", inplace=True)

# Build the jobs corpus: one space-separated text per posting, starting with
# the job title and appending the remaining descriptive columns in order.
corpus_text = df['Job title'].map(str)
for extra_column in ('Salary', 'Date', 'Company Location', 'Company Name', 'city', 'state'):
    corpus_text = corpus_text + ' ' + df[extra_column]
df['Text'] = corpus_text

# Working table for the recommender; the CSV's unnamed first column is
# exposed under the name "Job.ID".
data = (
    df[['Unnamed: 0', 'Text', 'Job title', 'Company Name']]
    .fillna(' ')
    .rename(columns={'Unnamed: 0': "Job.ID"})
)

# English stop words as a list and as a set, plus the lemmatizer used by
# the text-cleaning helpers.
stopword = stopwords.words('english')
stopword_ = set(stopword)
wn = WordNetLemmatizer()
def token_txt(token):
    """Return True when *token* should be kept in the cleaned text.

    A token is kept only if it is not in the module-level ``stopword_`` set,
    is not a single punctuation character from ``string.punctuation``, and
    is longer than two characters.
    """
    if token in stopword_:
        return False
    if token in set(string.punctuation):
        return False
    return len(token) > 2
def clean_txt(text):
    """Normalise *text* into a space-separated string of lemmatized tokens.

    Steps: drop apostrophes, collapse every run of digits / non-word
    characters into one space, remove the literal "nbsp", lower-case and
    tokenize, keep tokens accepted by ``token_txt``, lemmatize them as verbs,
    then filter once more because lemmatizing can produce a token that
    ``token_txt`` rejects.
    """
    without_quotes = re.sub("'", "", text)
    collapsed = re.sub("(\\d|\\W)+", " ", without_quotes)
    prepared = collapsed.replace("nbsp", "").lower()

    lemmas = []
    for word in word_tokenize(prepared):
        if token_txt(word):
            lemmas.append(wn.lemmatize(word, pos="v"))

    return " ".join(filter(token_txt, lemmas))
# Normalise every posting's text (stop-word removal, lemmatization) before
# it is vectorized.
data['Text'] = data['Text'].apply(clean_txt)

# Fit the TF-IDF vocabulary and weights on the cleaned postings.
tfidf_vect = TfidfVectorizer()
tfidf_comb = tfidf_vect.fit_transform(data['Text'])

# Run the query through the same clean_txt pipeline as the corpus. The fitted
# vocabulary only contains lemmatized tokens, so a raw query word in an
# inflected form would otherwise never match and would score 0.
user_tfidf = tfidf_vect.transform(from_user['Text'].apply(clean_txt))

# Computing the cosine similarity using TF-IDF: one batched call returns an
# (n_postings, 1) array instead of invoking cosine_similarity once per row.
cos_sim_tfidf = cosine_similarity(tfidf_comb, user_tfidf)

# Keep one (1, 1) array per posting so that indexing of the form
# rec1[i][0][0] continues to work on this list.
rec1 = list(cos_sim_tfidf.reshape(-1, 1, 1))
def get_recommendation(top, the_data, scores):
    """Build the recommendation table for the selected postings.

    Args:
        top: Iterable of row labels into ``the_data``, in the order the
            recommendations should be listed.
        the_data: DataFrame providing the 'Job.ID', 'Job title' and
            'Company Name' columns.
        scores: Sequence of scores aligned by position with ``top``
            (``scores[k]`` belongs to the k-th entry of ``top``).

    Returns:
        An object-dtype DataFrame with columns 'Job_ID', 'Job title',
        'Company Name' and 'Accuracy', one row per entry of ``top``,
        indexed 0..n-1. Empty (columns only) when ``top`` is empty.

    Raises:
        IndexError: if ``scores`` has fewer entries than ``top``.
    """
    columns = ['Job_ID', 'Job title', 'Company Name', 'Accuracy']
    # Collect all rows first and build the frame in one go; growing a
    # DataFrame cell-by-cell reallocates it on every new row.
    rows = [
        (
            the_data['Job.ID'][i],
            the_data['Job title'][i],
            the_data['Company Name'][i],
            scores[position],
        )
        for position, i in enumerate(top)
    ]
    return pd.DataFrame(rows, columns=columns, dtype=object)
#Using TF-IDF for recommendation
# Re-create the np.bool / np.object aliases.
# NOTE(review): presumably a shim for a dependency that still references
# aliases removed from recent NumPy releases -- confirm it is still needed.
np.bool = np.bool_
np.object = object

# Positions of the ten highest TF-IDF similarity scores, best first. Each
# rec1 entry is a (1, 1) array, hence the [0][0] to reach the scalar.
top10_tfidf = sorted(range(len(rec1)), key=lambda i: rec1[i][0][0], reverse=True)[:10]
list_scores_tfidf = [rec1[i][0][0] for i in top10_tfidf]
tfidf_recommendation = get_recommendation(top10_tfidf, data, list_scores_tfidf)  # Recommendation with TF-IDF
# Scores are stored as objects; cast to float and round to two decimals for display.
tfidf_recommendation["Accuracy"] = tfidf_recommendation["Accuracy"].astype(float).round(2)

# Another vectorizing method that could be of interest is using Count Vectorizer
count_vect = CountVectorizer()
# Fitting and transforming the vectorizer on the cleaned postings
count_comb = count_vect.fit_transform(data['Text'])
# The corpus text was cleaned with clean_txt, so clean the query the same way;
# otherwise inflected query words are missing from the fitted vocabulary.
user_count_countvec = count_vect.transform(from_user['Text'].apply(clean_txt))
# One batched similarity call giving an (n_postings, 1) array, split into a
# list of (1, 1) arrays to keep the original per-posting shape.
cos_sim_count_countvec = cosine_similarity(count_comb, user_count_countvec)
count_vec1 = list(cos_sim_count_countvec.reshape(-1, 1, 1))
top10_count_vec_count = sorted(
    range(len(count_vec1)), key=lambda i: count_vec1[i][0][0], reverse=True
)[:10]
list_scores_vec_count = [count_vec1[i][0][0] for i in top10_count_vec_count]
count_vec_recommendation = get_recommendation(top10_count_vec_count, data, list_scores_vec_count)  # Recommendation with count vectorizer
def main():
    """Show the TF-IDF recommendation table once "Recommend Jobs" is pressed."""
    if not st.button("Recommend Jobs"):
        return
    st.write(tfidf_recommendation)


if __name__ == '__main__':
    main()