-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlogistic_regression_classifier.py
67 lines (53 loc) · 2.15 KB
/
logistic_regression_classifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from sklearn.linear_model import LogisticRegressionCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
#read data
yes_df = pd.read_csv('clean_data/vpn15s_cleaned.csv')
no_df = pd.read_csv('clean_data/novpn15s_cleaned.csv')
# Create labels for dataset
yes_df['label'] = 1
no_df['label'] = 0
#combine 2 dataset
combined_df = pd.concat([yes_df, no_df], ignore_index=True)
# separate features and specify target variable
X = combined_df.drop(columns=['label'])
y = combined_df['label']
#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
#specifying the category and numerical columns
cate_cols=[cname for cname in X_train.columns if X_train[cname].dtype=="object"]
nume_cols=[cname for cname in X_train.columns if X_train[cname].dtype in ['int64','float64']]
# preprocessing to deal with different type of datas
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
transformers=[
('num', numerical_transformer, nume_cols),
('cat', categorical_transformer, cate_cols)
])
cpu_cores=8
lr=Pipeline(steps=[('preprocessor', preprocessor),
('model', LogisticRegressionCV(n_jobs=cpu_cores,solver='liblinear'))
])
lr.fit(X_train,y_train)
# predicting some value
y_pred = lr.predict(X_test)
# evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# print the report
print(classification_report(y_test, y_pred))
# view some first prediction
print("First few predictions:", y_pred[:5])
print("First few actual values:", y_test[:5].values)