Teamname: ML101 #13

Open · wants to merge 1 commit into base: main
206 changes: 206 additions & 0 deletions ML101.ipynb
@@ -0,0 +1,206 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import libraries and load data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.impute import KNNImputer\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"\n",
"train_data = pd.read_csv(\"ML101_train_dataset.csv\")\n",
"test_data = pd.read_csv(\"ML101_dataset_test_feature.csv\")\n",
"\n",
"target_features = ['Systolic BP', 'Diastolic BP']\n",
"numerical_cols = [col for col in train_data.columns if train_data[col].dtype in ['int64', 'float64']]\n",
"categorical_cols = [col for col in train_data.columns if train_data[col].dtype == \"object\"]\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Fill NaN values using K-NN imputation"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# drop first column\n",
"# train_data.drop(train_data.columns[0], axis=1, inplace=True)\n",
"data = train_data[numerical_cols].dropna(thresh=9)\n",
"# drop rows with missing target\n",
"data.dropna(subset=target_features, inplace=True)\n",
"\n",
"imputer = KNNImputer(n_neighbors=5)\n",
"imputer.fit(data)\n",
"data = pd.DataFrame(imputer.transform(data), columns=data.columns)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save the imputer for later use"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"na_filled_data = data\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Perform linear regression"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[184.6198455 236.20606093 184.39996347 ... 182.66436659 240.36077657\n",
" 218.73457401]\n",
"[ 84.1455178 106.91911572 76.97858708 ... 81.23055905 109.15988902\n",
" 103.09938601]\n"
]
}
],
"source": [
"from sklearn.linear_model import LinearRegression\n",
"\n",
"x_train_cols = [col for col in data.columns if col not in target_features]\n",
"x_train = data[x_train_cols]\n",
"y_train = data[target_features]\n",
"x_test = test_data[x_train_cols]\n",
"\n",
"# For Systolic BP\n",
"lr = LinearRegression()\n",
"lr.fit(x_train, y_train['Systolic BP'])\n",
"sys_pred = lr.predict(x_test)\n",
"print(sys_pred)\n",
"\n",
"# For Diastolic BP\n",
"lr = LinearRegression()\n",
"lr.fit(x_train, y_train['Diastolic BP'])\n",
"dia_pred = lr.predict(x_test)\n",
"print(dia_pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Save Predictions to CSV file"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"predictions = pd.DataFrame({'Systolic BP': sys_pred, 'Diastolic BP': dia_pred})\n",
"\n",
"# add index as last column with name 'ID'\n",
"predictions['ID'] = pd.Series(range(1, len(predictions) + 1))\n",
"predictions.set_index('ID', inplace=True)\n",
"\n",
"# save to csv\n",
"predictions.to_csv('kaggle.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Perform Classification of LifeStyle"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Bad' 'Bad' 'Bad' ... 'Bad' 'Bad' 'Bad']\n"
]
}
],
"source": [
"# add column 'Lifestyle' from train_data to data\n",
"data['LifeStyle'] = train_data['LifeStyle']\n",
"\n",
"# drop rows with missing target\n",
"data.dropna(subset=['LifeStyle'], inplace=True)\n",
"\n",
"X_train = data[x_train_cols]\n",
"Y_train = data[\"LifeStyle\"]\n",
"X_test = na_filled_data[x_train_cols]\n",
"\n",
"rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
"rf.fit(X_train, Y_train)\n",
"\n",
"rf_pred = rf.predict(X_test)\n",
"print(rf_pred)\n",
"\n",
"predictions = pd.DataFrame({'LifeStyle': rf_pred})\n",
"\n",
"# add index as last column with name 'ID'\n",
"predictions['ID'] = pd.Series(range(1, len(predictions) + 1))\n",
"predictions.set_index('ID', inplace=True)\n",
"\n",
"# save to csv\n",
"predictions.to_csv('ml101.csv')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}