-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathped_append_analysis_ML.py
127 lines (81 loc) · 2.71 KB
/
ped_append_analysis_ML.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 18 20:28:49 2024
@author: Joseph LaVigne
"""
from ucimlrepo import fetch_ucirepo
# Download the dataset registered under UCI repository id 938.
regensburg_pediatric_appendicitis = fetch_ucirepo(id=938)

# Feature and target tables (delivered as pandas dataframes).
repo_data = regensburg_pediatric_appendicitis.data
X = repo_data.features
y = repo_data.targets

# Dataset-level metadata first, then the per-variable information.
print(regensburg_pediatric_appendicitis.metadata)
print(regensburg_pediatric_appendicitis.variables)

# The feature table is the working frame for the rest of the script; a CSV
# snapshot (without the index column) is written next to the script.
ped_append = X
ped_append.to_csv("ped_append.csv", index=False)
print(ped_append)
#prepare EDA
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#sns.pairplot(ped_append)
#plt.show()
#plt.clf()
# Print descriptive statistics for the two lab values used in the regression.
# The separator is loop-invariant, so it is built once before the loop.
header_line = "-" * 25
for col in ["Segmented_Neutrophils", "WBC_Count"]:
    print(f"{header_line} Summary for {col}{header_line}")
    print()
    # describe() reports count, mean, std, min, quartiles and max of the column.
    print(ped_append[col].describe())
    print()
#
# First look at how the two lab values relate to each other.
neutrophil_column = ped_append["Segmented_Neutrophils"]
wbc_column = ped_append["WBC_Count"]
plt.scatter(neutrophil_column, wbc_column)
plt.show()

# Each measurement reshaped into a single column, i.e. shape (-1, 1).
sn = pd.array(neutrophil_column).reshape(-1, 1)
wbc = pd.array(wbc_column).reshape(-1, 1)

# Simple linear regression model used for the ML prediction.
from sklearn.linear_model import LinearRegression
line_fitter = LinearRegression()
# line_fitter.fit(sn, wbc) is deliberately not run here: the model will not
# work with the sn and wbc arrays assigned above because there is missing
# data, so the data has to be cleaned first.
# Clean the data: the regression needs rows in which BOTH measurements exist.
print(ped_append.dtypes)
print(ped_append["Segmented_Neutrophils"].isna().sum())
print(ped_append.count())
# Original author's notes from an earlier run: 54 observations of segmented
# neutrophils and 776 observations of WBC were recorded.

# Per-column views kept for inspection: rows that have a neutrophil value and
# rows that have a WBC value, respectively.
sn_1 = ped_append.dropna(subset=["Segmented_Neutrophils"])
# info must be called; printing the bound method only echoes the frame's repr.
sn_1.info()
sn_2 = ped_append.dropna(subset=["WBC_Count"])
sn_2.info()

# Rows usable for the fit are those where neither measurement is missing.
# This replaces two earlier attempts:
#   * merging sn_1 with sn_2 on the neutrophil value, which pairs every two
#     rows that merely share a measurement value rather than matching a row
#     with itself (and whose result was overwritten anyway);
#   * concatenating sn_1 and sn_2 and then dropping missing neutrophil values
#     only, which yields sn_1 again and can still contain missing WBC values.
# Nothing is concatenated here, so no row is counted twice and no
# de-duplication step is required.
sn_3 = ped_append.dropna(subset=["Segmented_Neutrophils", "WBC_Count"])
# The original notes expected roughly 53-54 ROWS here (not columns); verify
# against the row count reported below.
sn_3.info()
# Separate the predictor and response columns. Rows missing either value are
# dropped here as well, so the fit never receives NaN however sn_3 was built.
paired = sn_3.dropna(subset=["Segmented_Neutrophils", "WBC_Count"])
paired[["Segmented_Neutrophils", "WBC_Count"]].info()

# Double brackets select a one-column DataFrame, so to_numpy() already yields
# the two-dimensional (n_samples, 1) layout scikit-learn takes for fitting;
# no pandas extension array or explicit reshape is needed.
sn_4 = paired[["Segmented_Neutrophils"]].to_numpy()
wbc_1 = paired[["WBC_Count"]].to_numpy()
print(sn_4, wbc_1)

# Fit WBC count as a linear function of the segmented neutrophil value.
line_fitter.fit(sn_4, wbc_1)
# predict() returns an array aligned row-for-row with sn_4. Wrapping it in a
# list would add an extra leading dimension that plt.plot cannot draw
# against sn_4.
wbc_1_predict = line_fitter.predict(sn_4)

# Observations are unordered samples, so they are drawn as points; joining
# them with line segments would produce a meaningless zigzag. The fitted
# values all lie on one straight line, so a line plot is appropriate for them.
plt.scatter(sn_4, wbc_1)
plt.plot(sn_4, wbc_1_predict, color="tab:orange")
plt.show()