Teamname: ML101 #13

Open · wants to merge 1 commit into base: main
206 changes: 206 additions & 0 deletions ML101.ipynb
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import libraries and load data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.impute import KNNImputer\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"\n",
"train_data = pd.read_csv(\"ML101_train_dataset.csv\")\n",
"test_data = pd.read_csv(\"ML101_dataset_test_feature.csv\")\n",
"\n",
"target_features = ['Systolic BP', 'Diastolic BP']\n",
"numerical_cols = [col for col in train_data.columns if train_data[col].dtype in ['int64', 'float64']]\n",
"categorical_cols = [col for col in train_data.columns if train_data[col].dtype == \"object\"]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fill NaN values using K-NN imputation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# drop first column\n",
"# train_data.drop(train_data.columns[0], axis=1, inplace=True)\n",
"data = train_data[numerical_cols].dropna(thresh=9)\n",
"# drop rows with missing target\n",
"data.dropna(subset=target_features, inplace=True)\n",
"\n",
"imputer = KNNImputer(n_neighbors=5)\n",
"imputer.fit(data)\n",
"data = pd.DataFrame(imputer.transform(data), columns=data.columns)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save the imputer for later use"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"na_filled_data = data\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Perform linear regression"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[184.6198455 236.20606093 184.39996347 ... 182.66436659 240.36077657\n",
" 218.73457401]\n",
"[ 84.1455178 106.91911572 76.97858708 ... 81.23055905 109.15988902\n",
" 103.09938601]\n"
]
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"\n",
"x_train_cols = [col for col in data.columns if col not in target_features]\n",
"x_train = data[x_train_cols]\n",
"y_train = data[target_features]\n",
"x_test = test_data[x_train_cols]\n",
"\n",
"# For Systolic BP\n",
"lr = LinearRegression()\n",
"lr.fit(x_train, y_train['Systolic BP'])\n",
"sys_pred = lr.predict(x_test)\n",
"print(sys_pred)\n",
"\n",
"# For Diastolic BP\n",
"lr = LinearRegression()\n",
"lr.fit(x_train, y_train['Diastolic BP'])\n",
"dia_pred = lr.predict(x_test)\n",
"print(dia_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save Predictions to CSV file"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"predictions = pd.DataFrame({'Systolic BP': sys_pred, 'Diastolic BP': dia_pred})\n",
"\n",
"# add index as last column with name 'ID'\n",
"predictions['ID'] = pd.Series(range(1, len(predictions) + 1))\n",
"predictions.set_index('ID', inplace=True)\n",
"\n",
"# save to csv\n",
"predictions.to_csv('kaggle.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Perform Classification of LifeStyle"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Bad' 'Bad' 'Bad' ... 'Bad' 'Bad' 'Bad']\n"
]
}
],
"source": [
"# add column 'Lifestyle' from train_data to data\n",
"data['LifeStyle'] = train_data['LifeStyle']\n",
"\n",
"# drop rows with missing target\n",
"data.dropna(subset=['LifeStyle'], inplace=True)\n",
"\n",
"X_train = data[x_train_cols]\n",
"Y_train = data[\"LifeStyle\"]\n",
"X_test = na_filled_data[x_train_cols]\n",
"\n",
"rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
"rf.fit(X_train, Y_train)\n",
"\n",
"rf_pred = rf.predict(X_test)\n",
"print(rf_pred)\n",
"\n",
"predictions = pd.DataFrame({'LifeStyle': rf_pred})\n",
"\n",
"# add index as last column with name 'ID'\n",
"predictions['ID'] = pd.Series(range(1, len(predictions) + 1))\n",
"predictions.set_index('ID', inplace=True)\n",
"\n",
"# save to csv\n",
"predictions.to_csv('ml101.csv')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}