-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtextpreprocessing.py
45 lines (38 loc) · 1.58 KB
/
textpreprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import pandas as pd
# Load the labelled URL data set. Column names are supplied explicitly, so
# pandas treats every row of the file (including the first) as data.
df = pd.read_csv(
    'FinalCsv.csv',
    names=['Url', 'label'],
)
# Show the first rows as a quick sanity check of the parsed columns.
print(df.head())
import re
# URL boilerplate tokens (schemes, common TLD parts, file extensions) that
# carry no signal about what a page is for. Entries are lower-case.
ignore_words = ['http', 'https', 'www', 'com', 'xml', 'php', 'xl', 'asp', 'co', 'za', 'html']


def extract_words(sentence):
    """Split a URL (or any string) into lower-cased word tokens.

    Tokens are maximal runs of word characters (letters, digits, underscore).
    Each token is lower-cased *before* it is compared against ``ignore_words``
    so that boilerplate such as ``WWW`` or ``HTTP`` is dropped regardless of
    the case used in the URL.

    Args:
        sentence: The text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance, with every
        token listed in ``ignore_words`` removed.
    """
    # Equivalent to replacing every non-word character with a space and
    # splitting on whitespace, but in a single pass and with a raw-string
    # pattern (a plain "\w" literal is an invalid escape sequence).
    words = (w.lower() for w in re.findall(r"\w+", sentence))
    return [w for w in words if w not in ignore_words]
# Tokenise every URL and keep the token lists in a new 'Tokens' column.
df['Tokens'] = df['Url'].apply(extract_words)
# Preview the frame again, now including the derived column.
print(df.head())
import re
# NOTE(review): this repeats the ignore_words / extract_words definitions made
# earlier in the file; being the later definition, it is the one in effect
# for the vectorizer built further down.
# URL boilerplate tokens (schemes, common TLD parts, file extensions) that
# carry no signal about what a page is for. Entries are lower-case.
ignore_words = ['http', 'https', 'www', 'com', 'xml', 'php', 'xl', 'asp', 'co', 'za', 'html']


def extract_words(sentence):
    """Split a URL (or any string) into lower-cased word tokens.

    Tokens are maximal runs of word characters (letters, digits, underscore).
    Each token is lower-cased *before* it is compared against ``ignore_words``
    so that boilerplate such as ``WWW`` or ``HTTP`` is dropped regardless of
    the case used in the URL.

    Args:
        sentence: The text to tokenise.

    Returns:
        A list of lower-cased tokens, in order of appearance, with every
        token listed in ``ignore_words`` removed.
    """
    # Equivalent to replacing every non-word character with a space and
    # splitting on whitespace, but in a single pass and with a raw-string
    # pattern (a plain "\w" literal is an invalid escape sequence).
    words = (w.lower() for w in re.findall(r"\w+", sentence))
    return [w for w in words if w not in ignore_words]
# Recompute the 'Tokens' column with the extract_words definition that is
# current at this point in the script.
df['Tokens'] = df['Url'].apply(extract_words)
from sklearn.model_selection import train_test_split

# Model inputs are the raw URL strings; targets are their labels.
x = df['Url']
y = df['label']
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is fixed, so only these four keywords are ever counted.
# stop_words is intentionally not passed: scikit-learn ignores it (and warns)
# when `analyzer` is a callable, and extract_words already drops ignore_words.
count_vectorizer = CountVectorizer(
    analyzer=extract_words,
    vocabulary=['about', "contact", 'gallery', "blog"],
)
count_train = count_vectorizer.fit_transform(x)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when present and fall back for older installations. The result is turned
# into a list so the printed form is the same either way.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = list(count_vectorizer.get_feature_names_out())
else:
    feature_names = count_vectorizer.get_feature_names()
print(feature_names)

# Dense keyword-count matrix: one row per URL, one column per vocabulary term.
X_vect = pd.DataFrame(count_train.toarray())
from sklearn.tree import DecisionTreeClassifier

# Fit the classifier on the whole data set (no hold-out split is made here).
dt = DecisionTreeClassifier()
dt.fit(X_vect, y)
from External_Internal import getAllInternalLinks as GLA
def predictor():
    """Classify every link scraped from a URL read from standard input.

    Reads one line from stdin, passes it to ``GLA`` (``getAllInternalLinks``),
    and for each item it yields prints the decision tree's prediction
    followed by that item.

    NOTE(review): ``GLA`` is presumably returning an iterable of URL strings;
    confirm against External_Internal.
    """
    url = input()
    scraped_url = GLA(url)
    for link in scraped_url:
        # transform(), not fit_transform(): the vectorizer was fitted during
        # training and must only be applied at prediction time.
        y_pred = dt.predict(count_vectorizer.transform([link]))
        # Show the link that was actually classified rather than repeating
        # the page URL typed by the user next to every prediction.
        print(y_pred, link)


predictor()