diff --git a/Anupriya/Kaggle/kaggle_supervised.ipynb b/Anupriya/Kaggle/kaggle_supervised.ipynb new file mode 100644 index 00000000..c88c2dc3 --- /dev/null +++ b/Anupriya/Kaggle/kaggle_supervised.ipynb @@ -0,0 +1,2562 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "4NOZaiyCArZN", + "outputId": "f8daafbc-8e5c-4156-be27-f9ef465e733d" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "def load_data(s):\n", + " data=pd.read_csv(s)\n", + " return data\n", + "\n", + "path_a1raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/a1_raw.csv'\n", + "path_a2raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/a2_raw.csv'\n", + "path_a3raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/a3_raw.csv'\n", + "path_b1raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/b1_raw.csv'\n", + "path_b3raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/b3_raw.csv'\n", + "path_c1raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/c1_raw.csv'\n", + "path_c3raw='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/c3_raw.csv'\n", + "\n", + "path_a1='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/a1_va3.csv'\n", + 
"path_a3='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/a3_va3.csv'\n", + "path_test='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/test.csv'\n", + "path_b1='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/b1_va3.csv'\n", + "path_b3='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/b3_va3.csv'\n", + "path_c1='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/c1_va3.csv'\n", + "path_c3='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/c3_va3.csv'\n", + "path_test='/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/test.csv'\n", + "\n", + "data_a1=load_data(path_a1)\n", + "data_a3=load_data(path_a3)\n", + "data_b1=load_data(path_b1)\n", + "data_b3=load_data(path_b3)\n", + "data_c1=load_data(path_c1)\n", + "data_c3=load_data(path_c3)\n", + "data_a1raw=load_data(path_a1raw)\n", + "data_a2raw=load_data(path_a2raw)\n", + "data_a3raw=load_data(path_a3raw)\n", + "data_b1raw=load_data(path_b1raw)\n", + "data_b3raw=load_data(path_b3raw)\n", + "data_c1raw=load_data(path_c1raw)\n", + "data_c3raw=load_data(path_c3raw)\n", + "test=load_data(path_test)\n" + ], + "metadata": { + "id": "3oOZi4n0YtyQ" + }, + "execution_count": 15, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(data_a1raw['phase'].unique())\n", + "print(data_a2raw['phase'].unique())\n", + "print(data_a3raw['phase'].unique())\n", + "print(data_b1raw['phase'].unique())\n", + "print(data_b3raw['phase'].unique())\n", + "print(data_c1raw['phase'].unique())\n", + "print(data_c3raw['phase'].unique())\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wwsMc--NV7nr", + "outputId": "64bc90ef-367a-491f-bc18-0fa5680da663" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['Rest' 'Preparation' 'Stroke' 'Hold' 'Retraction']\n", + "['Rest' 'Preparation' 'Stroke' 'Retraction' 
'Hold']\n", + "['Rest' 'Preparation' 'Stroke' 'Retraction' 'Hold']\n", + "['Rest' 'Preparation' 'Hold' 'Stroke' 'Retraction' 'Preparação']\n", + "['Rest' 'Preparation' 'Hold' 'Stroke' 'Retraction']\n", + "['Rest' 'Preparation' 'Stroke' 'Hold' 'Retraction']\n", + "['Rest' 'Preparation' 'Stroke' 'Hold' 'Retraction']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "data_b1raw['phase']=data_b1raw['phase'].replace(['Preparação'],['Preparation'])\n", + "data_b1raw['phase'].unique()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "6wXLPeIeYoF2", + "outputId": "8085c572-51a8-42eb-da6e-3e723b132527" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array(['Rest', 'Preparation', 'Hold', 'Stroke', 'Retraction'],\n", + " dtype=object)" + ] + }, + "metadata": {}, + "execution_count": 17 + } + ] + }, + { + "cell_type": "code", + "source": [ + "framesprocessed=[data_a1,data_a3,data_b1,data_b3,data_c1,data_c3]\n", + "framesraw=[data_a1raw,data_a3raw,data_b1raw,data_b3raw,data_c1raw,data_c3raw]\n", + "\n", + "for i in framesraw:\n", + " i.drop(range(0,4),inplace=True)\n", + " i.reset_index()\n", + "\n", + "final=pd.concat(framesraw)\n", + "final.reset_index(inplace=True)\n", + "finalv=pd.concat(framesprocessed)\n", + "finalv.reset_index(inplace=True)\n", + "final_df=pd.concat([finalv,final],axis=1)\n", + "final_df" + ], + "metadata": { + "id": "0hnNF188SYzD", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 443 + }, + "outputId": "1c43b604-b0de-424c-b509-7facd10b885c" + }, + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " index 1 2 3 4 5 6 \\\n", + "0 0 -0.005009 -0.000964 0.000573 0.008623 0.005667 0.001302 \n", + "1 1 0.004905 0.001209 -0.000649 0.004737 0.003166 0.000819 \n", + "2 2 -0.002393 -0.000216 0.000136 0.003028 0.001212 0.000336 \n", + "3 3 -0.001394 -0.000242 0.000056 
0.001182 0.000575 0.000225 \n", + "4 4 -0.000156 -0.000004 0.000023 0.001585 0.000630 0.000094 \n", + "... ... ... ... ... ... ... ... \n", + "8608 1439 -0.003709 -0.006168 0.000786 -0.000155 0.001088 -0.000144 \n", + "8609 1440 -0.000727 0.001536 -0.000211 0.000700 -0.000975 0.000067 \n", + "8610 1441 0.003074 0.007870 -0.000962 0.000526 -0.000779 0.000090 \n", + "8611 1442 0.003297 0.008467 -0.001035 0.000578 -0.000740 0.000101 \n", + "8612 1443 0.000204 -0.000040 0.000058 0.000586 -0.000619 0.000087 \n", + "\n", + " 7 8 9 ... sy sz lwx \\\n", + "0 -0.000631 0.000130 -0.000048 ... 4.225485 1.775536 4.983912 \n", + "1 -0.000572 -0.000015 0.000023 ... 4.223284 1.777401 5.000410 \n", + "2 -0.000449 0.000017 0.000047 ... 4.223690 1.777571 5.001656 \n", + "3 -0.000479 -0.000050 0.000104 ... 4.224827 1.777669 5.002672 \n", + "4 -0.000303 0.000097 0.000065 ... 4.223671 1.778054 5.012298 \n", + "... ... ... ... ... ... ... ... \n", + "8608 -0.003815 -0.004658 0.000656 ... 4.199645 1.939572 3.801623 \n", + "8609 -0.001147 0.000177 0.000008 ... 4.199096 1.939843 3.736713 \n", + "8610 0.002786 0.005035 -0.000606 ... 4.200613 1.940351 3.736855 \n", + "8611 0.002947 0.005385 -0.000652 ... 4.200203 1.940679 3.736708 \n", + "8612 0.000229 0.000003 0.000061 ... 4.198929 1.941195 3.736303 \n", + "\n", + " lwy lwz rwx rwy rwz timestamp phase \n", + "0 4.296833 1.569889 5.193762 4.335417 1.560144 5702167 Rest \n", + "1 4.301358 1.566544 5.164159 4.313107 1.552097 5702307 Rest \n", + "2 4.299812 1.566537 5.136817 4.307087 1.551576 5702338 Rest \n", + "3 4.298810 1.566489 5.125220 4.300282 1.550805 5702370 Rest \n", + "4 4.298582 1.565061 5.114789 4.292008 1.549765 5702432 Rest \n", + "... ... ... ... ... ... ... ... 
\n", + "8608 5.192412 1.812156 5.206748 5.086565 1.837070 5432739 Rest \n", + "8609 5.067120 1.828599 5.205452 5.085346 1.837165 5432771 Rest \n", + "8610 5.068608 1.828367 5.202618 5.090534 1.836787 5432808 Rest \n", + "8611 5.067500 1.828450 5.196628 5.095811 1.836236 5432836 Rest \n", + "8612 5.066618 1.828540 5.194844 5.096206 1.836543 5432869 Rest \n", + "\n", + "[8613 rows x 55 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
index123456789...syszlwxlwylwzrwxrwyrwztimestampphase
00-0.005009-0.0009640.0005730.0086230.0056670.001302-0.0006310.000130-0.000048...4.2254851.7755364.9839124.2968331.5698895.1937624.3354171.5601445702167Rest
110.0049050.001209-0.0006490.0047370.0031660.000819-0.000572-0.0000150.000023...4.2232841.7774015.0004104.3013581.5665445.1641594.3131071.5520975702307Rest
22-0.002393-0.0002160.0001360.0030280.0012120.000336-0.0004490.0000170.000047...4.2236901.7775715.0016564.2998121.5665375.1368174.3070871.5515765702338Rest
33-0.001394-0.0002420.0000560.0011820.0005750.000225-0.000479-0.0000500.000104...4.2248271.7776695.0026724.2988101.5664895.1252204.3002821.5508055702370Rest
44-0.000156-0.0000040.0000230.0015850.0006300.000094-0.0003030.0000970.000065...4.2236711.7780545.0122984.2985821.5650615.1147894.2920081.5497655702432Rest
..................................................................
86081439-0.003709-0.0061680.000786-0.0001550.001088-0.000144-0.003815-0.0046580.000656...4.1996451.9395723.8016235.1924121.8121565.2067485.0865651.8370705432739Rest
86091440-0.0007270.001536-0.0002110.000700-0.0009750.000067-0.0011470.0001770.000008...4.1990961.9398433.7367135.0671201.8285995.2054525.0853461.8371655432771Rest
861014410.0030740.007870-0.0009620.000526-0.0007790.0000900.0027860.005035-0.000606...4.2006131.9403513.7368555.0686081.8283675.2026185.0905341.8367875432808Rest
861114420.0032970.008467-0.0010350.000578-0.0007400.0001010.0029470.005385-0.000652...4.2002031.9406793.7367085.0675001.8284505.1966285.0958111.8362365432836Rest
861214430.000204-0.0000400.0000580.000586-0.0006190.0000870.0002290.0000030.000061...4.1989291.9411953.7363035.0666181.8285405.1948445.0962061.8365435432869Rest
\n", + "

8613 rows × 55 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "source": [ + "final_df.drop(['Phase'],axis=1,inplace=True)\n", + "list1=list(range(0,5))\n", + "print(final_df['phase'].unique())\n", + "final_df['phase'].replace(final_df['phase'].unique(),list1,inplace=True)" + ], + "metadata": { + "id": "lAQuSOMcXhZo", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d40de825-89bb-45e5-cc5b-8cf8246325b0" + }, + "execution_count": 19, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "['Rest' 'Preparation' 'Stroke' 'Hold' 'Retraction']\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "y=final_df['phase'].values\n", + "final_df['phase'].unique()\n", + "y" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "DmFlaaLBcGSW", + "outputId": "eed7aec0-4d15-4e77-b849-f49823461ba4" + }, + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([0, 0, 0, ..., 0, 0, 0])" + ] + }, + "metadata": {}, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "source": [ + "X=final_df.drop(['phase'],axis=1)" + ], + "metadata": { + "id": "cLrh-HwpZQw2" + }, + "execution_count": 21, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X.drop(['index'],axis=1,inplace=True)" + ], + "metadata": { + "id": "O8glM7I1YIQb" + }, + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import lightgbm as lgbm\n", + "\n", + "lgbm=lgbm.LGBMClassifier()\n", + "\n", + "X" + ], + "metadata": { + "id": "zf3ja_z4ZzGS", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 443 + }, + "outputId": "ee4f18e3-958e-4db5-c9e1-952088693cef" + }, + "execution_count": 23, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " 1 2 3 4 5 6 7 \\\n", + "0 -0.005009 -0.000964 0.000573 0.008623 0.005667 0.001302 
-0.000631 \n", + "1 0.004905 0.001209 -0.000649 0.004737 0.003166 0.000819 -0.000572 \n", + "2 -0.002393 -0.000216 0.000136 0.003028 0.001212 0.000336 -0.000449 \n", + "3 -0.001394 -0.000242 0.000056 0.001182 0.000575 0.000225 -0.000479 \n", + "4 -0.000156 -0.000004 0.000023 0.001585 0.000630 0.000094 -0.000303 \n", + "... ... ... ... ... ... ... ... \n", + "8608 -0.003709 -0.006168 0.000786 -0.000155 0.001088 -0.000144 -0.003815 \n", + "8609 -0.000727 0.001536 -0.000211 0.000700 -0.000975 0.000067 -0.001147 \n", + "8610 0.003074 0.007870 -0.000962 0.000526 -0.000779 0.000090 0.002786 \n", + "8611 0.003297 0.008467 -0.001035 0.000578 -0.000740 0.000101 0.002947 \n", + "8612 0.000204 -0.000040 0.000058 0.000586 -0.000619 0.000087 0.000229 \n", + "\n", + " 8 9 10 ... sx sy sz \\\n", + "0 0.000130 -0.000048 0.007762 ... 5.052367 4.225485 1.775536 \n", + "1 -0.000015 0.000023 0.002706 ... 5.045395 4.223284 1.777401 \n", + "2 0.000017 0.000047 0.002868 ... 5.045374 4.223690 1.777571 \n", + "3 -0.000050 0.000104 0.001171 ... 5.045767 4.224827 1.777669 \n", + "4 0.000097 0.000065 0.001579 ... 5.047422 4.223671 1.778054 \n", + "... ... ... ... ... ... ... ... \n", + "8608 -0.004658 0.000656 0.000060 ... 4.473687 4.199645 1.939572 \n", + "8609 0.000177 0.000008 0.000423 ... 4.474245 4.199096 1.939843 \n", + "8610 0.005035 -0.000606 0.000413 ... 4.476590 4.200613 1.940351 \n", + "8611 0.005385 -0.000652 0.000580 ... 4.477201 4.200203 1.940679 \n", + "8612 0.000003 0.000061 0.000644 ... 4.478990 4.198929 1.941195 \n", + "\n", + " lwx lwy lwz rwx rwy rwz timestamp \n", + "0 4.983912 4.296833 1.569889 5.193762 4.335417 1.560144 5702167 \n", + "1 5.000410 4.301358 1.566544 5.164159 4.313107 1.552097 5702307 \n", + "2 5.001656 4.299812 1.566537 5.136817 4.307087 1.551576 5702338 \n", + "3 5.002672 4.298810 1.566489 5.125220 4.300282 1.550805 5702370 \n", + "4 5.012298 4.298582 1.565061 5.114789 4.292008 1.549765 5702432 \n", + "... ... ... ... ... ... ... ... 
\n", + "8608 3.801623 5.192412 1.812156 5.206748 5.086565 1.837070 5432739 \n", + "8609 3.736713 5.067120 1.828599 5.205452 5.085346 1.837165 5432771 \n", + "8610 3.736855 5.068608 1.828367 5.202618 5.090534 1.836787 5432808 \n", + "8611 3.736708 5.067500 1.828450 5.196628 5.095811 1.836236 5432836 \n", + "8612 3.736303 5.066618 1.828540 5.194844 5.096206 1.836543 5432869 \n", + "\n", + "[8613 rows x 51 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
12345678910...sxsyszlwxlwylwzrwxrwyrwztimestamp
0-0.005009-0.0009640.0005730.0086230.0056670.001302-0.0006310.000130-0.0000480.007762...5.0523674.2254851.7755364.9839124.2968331.5698895.1937624.3354171.5601445702167
10.0049050.001209-0.0006490.0047370.0031660.000819-0.000572-0.0000150.0000230.002706...5.0453954.2232841.7774015.0004104.3013581.5665445.1641594.3131071.5520975702307
2-0.002393-0.0002160.0001360.0030280.0012120.000336-0.0004490.0000170.0000470.002868...5.0453744.2236901.7775715.0016564.2998121.5665375.1368174.3070871.5515765702338
3-0.001394-0.0002420.0000560.0011820.0005750.000225-0.000479-0.0000500.0001040.001171...5.0457674.2248271.7776695.0026724.2988101.5664895.1252204.3002821.5508055702370
4-0.000156-0.0000040.0000230.0015850.0006300.000094-0.0003030.0000970.0000650.001579...5.0474224.2236711.7780545.0122984.2985821.5650615.1147894.2920081.5497655702432
..................................................................
8608-0.003709-0.0061680.000786-0.0001550.001088-0.000144-0.003815-0.0046580.0006560.000060...4.4736874.1996451.9395723.8016235.1924121.8121565.2067485.0865651.8370705432739
8609-0.0007270.001536-0.0002110.000700-0.0009750.000067-0.0011470.0001770.0000080.000423...4.4742454.1990961.9398433.7367135.0671201.8285995.2054525.0853461.8371655432771
86100.0030740.007870-0.0009620.000526-0.0007790.0000900.0027860.005035-0.0006060.000413...4.4765904.2006131.9403513.7368555.0686081.8283675.2026185.0905341.8367875432808
86110.0032970.008467-0.0010350.000578-0.0007400.0001010.0029470.005385-0.0006520.000580...4.4772014.2002031.9406793.7367085.0675001.8284505.1966285.0958111.8362365432836
86120.000204-0.0000400.0000580.000586-0.0006190.0000870.0002290.0000030.0000610.000644...4.4789904.1989291.9411953.7363035.0666181.8285405.1948445.0962061.8365435432869
\n", + "

8613 rows × 51 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 23 + } + ] + }, + { + "cell_type": "code", + "source": [ + "y" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "otA0Zm92fw0h", + "outputId": "b9c83261-686b-41a2-af8a-ebf22cc3f986" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([0, 0, 0, ..., 0, 0, 0])" + ] + }, + "metadata": {}, + "execution_count": 24 + } + ] + }, + { + "cell_type": "code", + "source": [ + "lgbm.fit(X,y)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 248 + }, + "id": "aqDhvd1g4b-z", + "outputId": "eba62194-ea74-4a98-bbee-f7531a491e02" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[LightGBM] [Warning] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001060 seconds.\n", + "You can set `force_row_wise=true` to remove the overhead.\n", + "And if memory is not enough, you can set `force_col_wise=true`.\n", + "[LightGBM] [Info] Total Bins 13005\n", + "[LightGBM] [Info] Number of data points in the train set: 8613, number of used features: 51\n", + "[LightGBM] [Info] Start training from score -1.341454\n", + "[LightGBM] [Info] Start training from score -1.512999\n", + "[LightGBM] [Info] Start training from score -1.229411\n", + "[LightGBM] [Info] Start training from score -2.210902\n", + "[LightGBM] [Info] Start training from score -2.152273\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "LGBMClassifier()" + ], + "text/html": [ + "
LGBMClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 25 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Y_a2=data_a2raw.drop([0,1,2,3])\n", + "Y_a2.reset_index(inplace=True)\n", + "Y_a2.drop(['index'],axis=1,inplace=True)\n", + "Y_a2\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 443 + }, + "id": "7j0HqnPSdL_1", + "outputId": "eebe6a42-4471-4ced-e3b5-47fe7e5b8d3f" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " lhx lhy lhz rhx rhy rhz hx \\\n", + "0 4.497225 5.598363 1.684676 6.932706 5.420004 1.648139 5.532781 \n", + "1 4.504442 5.563369 1.682441 6.984883 5.338807 1.662477 5.535068 \n", + "2 4.501790 5.587282 1.681250 6.936526 5.400179 1.650723 5.536529 \n", + "3 4.481035 5.558424 1.678168 6.935569 5.403949 1.650177 5.536117 \n", + "4 4.490471 5.547697 1.677370 6.954857 5.384894 1.654895 5.537160 \n", + "... ... ... ... ... ... ... ... \n", + "1255 3.766681 5.047060 1.599805 6.787716 4.922377 1.483261 5.494840 \n", + "1256 3.772569 5.135340 1.607126 6.771120 4.998858 1.492831 5.457356 \n", + "1257 3.781345 5.191569 1.616638 6.748940 5.098884 1.488763 5.463538 \n", + "1258 3.794636 5.245894 1.628859 6.735619 5.142869 1.507751 5.469743 \n", + "1259 3.815556 5.279714 1.638650 6.699938 5.184754 1.514433 5.480086 \n", + "\n", + " hy hz sx sy sz lwx lwy \\\n", + "0 1.472957 1.781428 5.581297 4.110899 1.776406 4.550096 5.212202 \n", + "1 1.473257 1.780948 5.581542 4.111409 1.776078 4.534203 5.175910 \n", + "2 1.473684 1.780335 5.581291 4.111289 1.775740 4.530342 5.199273 \n", + "3 1.472946 1.780279 5.581693 4.109772 1.775356 4.523950 5.174112 \n", + "4 1.473327 1.779768 5.582080 4.108705 1.775060 4.521791 5.162915 \n", + "... ... ... ... ... ... ... ... 
\n", + "1255 1.408275 1.695259 5.391037 4.153896 1.700377 4.035254 4.780551 \n", + "1256 1.410738 1.691791 5.375273 4.153625 1.699941 4.155283 5.046596 \n", + "1257 1.409960 1.692598 5.366023 4.152654 1.699411 4.154757 5.052544 \n", + "1258 1.409930 1.692958 5.356151 4.152353 1.699148 4.155412 5.076689 \n", + "1259 1.410949 1.694152 5.343333 4.151536 1.698735 4.169959 5.114348 \n", + "\n", + " lwz rwx rwy rwz timestamp phase \n", + "0 1.688152 6.621651 5.184755 1.650331 5103827 Rest \n", + "1 1.689498 6.619938 5.200892 1.661059 5103859 Rest \n", + "2 1.687336 6.613071 5.181889 1.651599 5103893 Rest \n", + "3 1.688738 6.613035 5.184223 1.651119 5103916 Rest \n", + "4 1.688910 6.594951 5.234004 1.653260 5103947 Rest \n", + "... ... ... ... ... ... ... \n", + "1255 1.625402 6.644851 4.683230 1.522554 5155770 Retraction \n", + "1256 1.621549 6.672435 4.837710 1.516715 5155833 Retraction \n", + "1257 1.622160 6.684550 4.894862 1.529447 5155902 Retraction \n", + "1258 1.623406 6.632552 4.916888 1.539666 5155939 Retraction \n", + "1259 1.622405 6.615832 4.942080 1.543551 5155956 Retraction \n", + "\n", + "[1260 rows x 20 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
lhxlhylhzrhxrhyrhzhxhyhzsxsyszlwxlwylwzrwxrwyrwztimestampphase
04.4972255.5983631.6846766.9327065.4200041.6481395.5327811.4729571.7814285.5812974.1108991.7764064.5500965.2122021.6881526.6216515.1847551.6503315103827Rest
14.5044425.5633691.6824416.9848835.3388071.6624775.5350681.4732571.7809485.5815424.1114091.7760784.5342035.1759101.6894986.6199385.2008921.6610595103859Rest
24.5017905.5872821.6812506.9365265.4001791.6507235.5365291.4736841.7803355.5812914.1112891.7757404.5303425.1992731.6873366.6130715.1818891.6515995103893Rest
34.4810355.5584241.6781686.9355695.4039491.6501775.5361171.4729461.7802795.5816934.1097721.7753564.5239505.1741121.6887386.6130355.1842231.6511195103916Rest
44.4904715.5476971.6773706.9548575.3848941.6548955.5371601.4733271.7797685.5820804.1087051.7750604.5217915.1629151.6889106.5949515.2340041.6532605103947Rest
...............................................................
12553.7666815.0470601.5998056.7877164.9223771.4832615.4948401.4082751.6952595.3910374.1538961.7003774.0352544.7805511.6254026.6448514.6832301.5225545155770Retraction
12563.7725695.1353401.6071266.7711204.9988581.4928315.4573561.4107381.6917915.3752734.1536251.6999414.1552835.0465961.6215496.6724354.8377101.5167155155833Retraction
12573.7813455.1915691.6166386.7489405.0988841.4887635.4635381.4099601.6925985.3660234.1526541.6994114.1547575.0525441.6221606.6845504.8948621.5294475155902Retraction
12583.7946365.2458941.6288596.7356195.1428691.5077515.4697431.4099301.6929585.3561514.1523531.6991484.1554125.0766891.6234066.6325524.9168881.5396665155939Retraction
12593.8155565.2797141.6386506.6999385.1847541.5144335.4800861.4109491.6941525.3433334.1515361.6987354.1699595.1143481.6224056.6158324.9420801.5435515155956Retraction
\n", + "

1260 rows × 20 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 26 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Encode a2 labels with the training label order (first-appearance order in final_df).\n", + "# Y_a2's own unique() order is Rest, Preparation, Stroke, Retraction, Hold, which swaps codes 3 and 4.\n", + "phase_order=['Rest','Preparation','Stroke','Hold','Retraction']\n", + "list1=list(range(0,5))\n", + "Y_a2['phase']=Y_a2['phase'].replace(phase_order,list1)\n", + "y_a2=Y_a2['phase'].values\n", + "\n", + "y_a2_nophase=Y_a2.drop(['phase'],axis=1)\n", + "xa2test=pd.concat([test,y_a2_nophase],axis=1)\n", + "xa2test\n", + "print(y_a2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QmhA-NzDdHo0", + "outputId": "d61aa073-e8c1-46aa-c936-293ca81071c9" + }, + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[0 0 0 ... 3 3 3]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "y_pred_lgbm_a2=lgbm.predict(xa2test)" + ], + "metadata": { + "id": "dWIHqNAvfJH4" + }, + "execution_count": 28, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.metrics import accuracy_score" + ], + "metadata": { + "id": "G19SoNqwaP2g" + }, + "execution_count": 29, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "accuracy_score(y_pred_lgbm_a2,y_a2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A3Ti4_f1fOdg", + "outputId": "0429dc66-511c-46c8-ebec-82580947f10f" + }, + "execution_count": 30, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.6753968253968254" + ] + }, + "metadata": {}, + "execution_count": 30 + } + ] + }, + { + "cell_type": "code", + "source": [ + "y_pred_lgbm_a2" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mvemRLtGf_h0", + "outputId": "dfdc029f-f006-4002-fcc9-9f8627b21e53" + }, + "execution_count": 31, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "array([1, 1, 1, ..., 4, 4, 4])" + ] + }, + "metadata": {}, + "execution_count": 31 + } + ] + }, + { + "cell_type": "code", + "source": [ + "list1= ['D','P','S','H','R']\n", + 
"final=[]\n", + "for i in y_pred_lgbm_a2:\n", + " final.append(list[i])\n" + ], + "metadata": { + "id": "6FWw6GRUSwuK" + }, + "execution_count": 61, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ID=[]\n", + "for i in range(1260):\n", + " ID.append(i+1)\n", + "print(ID)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dCfoKs8xUwqR", + "outputId": "bb6bff50-7504-43ca-aabd-a8565fc168fa" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 
311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 
711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 
1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "my_submission = pd.DataFrame({'ID': ID, 'Phase': final})\n", + "my_submission.to_csv('/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/submission_lgbm.csv', index=False)\n" + ], + "metadata": { + "id": "_LLzDj0wUNrj" + }, + "execution_count": 63, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "XGBoost" + ], + "metadata": { + "id": "BbRewsePXEiC" + } + }, + { + "cell_type": "code", + "source": [ + "import xgboost as xgb\n", + "from xgboost import XGBClassifier\n", + "from xgboost import cv" + ], + "metadata": { + "id": "4LHiV6iM6byg" + }, + "execution_count": 35, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 123)\n" + ], + "metadata": { + "id": "7Gizx_mONMmn" + }, + "execution_count": 36, + "outputs": [] + 
}, + { + "cell_type": "code", + "source": [ + "params = {\n", + " 'min_child_weight': [1, 5, 10],\n", + " 'gamma': [0.5, 1, 1.5, 2, 5],\n", + " 'subsample': [0.6, 0.8, 1.0],\n", + " 'colsample_bytree': [0.6, 0.8, 1.0],\n", + " 'max_depth': [3, 4, 5],\n", + " 'learning_rate':[0.1,0.01,0.02,1]\n", + " }" + ], + "metadata": { + "id": "hD78jJscLDQ3" + }, + "execution_count": 37, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "xgb_model=XGBClassifier(n_estimators=400,nthread=1,objective='multi:softmax')\n" + ], + "metadata": { + "id": "cb_3PVqTPA_I" + }, + "execution_count": 38, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "xgb_model.fit(X,y)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 248 + }, + "id": "PMX7ePXBK8RD", + "outputId": "82409f99-1741-4659-e601-233fc6aa13e5" + }, + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=None, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=None, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " n_estimators=400, n_jobs=None, nthread=1, num_parallel_tree=None,\n", + " objective='multi:softmax', ...)" + ], + "text/html": [ + "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
+              "              colsample_bylevel=None, colsample_bynode=None,\n",
+              "              colsample_bytree=None, early_stopping_rounds=None,\n",
+              "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
+              "              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,\n",
+              "              interaction_constraints=None, learning_rate=None, max_bin=None,\n",
+              "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
+              "              max_delta_step=None, max_depth=None, max_leaves=None,\n",
+              "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
+              "              n_estimators=400, n_jobs=None, nthread=1, num_parallel_tree=None,\n",
+              "              objective='multi:softmax', ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "code", + "source": [ + "y_pred_xgb_a2=xgb_model.predict(xa2test)\n" + ], + "metadata": { + "id": "wQ9NjC6q7Lj_" + }, + "execution_count": 55, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "accuracy_score(y_pred_xgb_a2,y_a2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8dDJw9AU7nXs", + "outputId": "02b6208b-255a-4d3b-ccb1-ed78f33026e3" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.6952380952380952" + ] + }, + "metadata": {}, + "execution_count": 56 + } + ] + }, + { + "cell_type": "code", + "source": [ + "list2= ['D','P','S','H','R']\n", + "final=[]\n", + "for i in y_pred_xgb_a2:\n", + " final.append(list[i])\n" + ], + "metadata": { + "id": "MsysV7bIYvwU" + }, + "execution_count": 58, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ID=[]\n", + "for i in range(1260):\n", + " ID.append(i+1)\n", + "print(ID)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "H5C-3gMqYx38", + "outputId": "7b9cd305-723f-4b9b-a5b9-5ec4996bb05a" + }, + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 
158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 
558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 
958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "my_submission_xgboost = pd.DataFrame({'ID': ID, 'Phase': final})\n", + 
"my_submission_xgboost.to_csv('/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/submission_xgboost.csv', index=False)\n" + ], + "metadata": { + "id": "LTZHXRLdY8sd" + }, + "execution_count": 60, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from sklearn.model_selection import RandomizedSearchCV, GridSearchCV\n", + "from sklearn.metrics import roc_auc_score\n", + "from sklearn.model_selection import StratifiedKFold" + ], + "metadata": { + "id": "9C5MT8mYX3GP" + }, + "execution_count": 45, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "folds = 5\n", + "param_comb = 5\n", + "\n", + "skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)\n", + "\n", + "randomsearch = RandomizedSearchCV(xgb_model, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )\n", + "randomsearch.fit(X, y)\n" + ], + "metadata": { + "id": "huEFEA_1XvQ9", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 169 + }, + "outputId": "a24767ab-1467-441c-89d5-331074980a0b" + }, + "execution_count": 46, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Fitting 5 folds for each of 5 candidates, totalling 25 fits\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py:952: UserWarning: One or more of the test scores are non-finite: [nan nan nan nan nan]\n", + " warnings.warn(\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RandomizedSearchCV(cv=,\n", + " estimator=XGBClassifier(base_score=None, booster=None,\n", + " callbacks=None,\n", + " colsample_bylevel=None,\n", + " colsample_bynode=None,\n", + " colsample_bytree=None,\n", + " early_stopping_rounds=None,\n", + " enable_categorical=False,\n", + " eval_metric=None, feature_types=None,\n", + " gamma=None, gpu_id=None,\n", + " 
grow_policy=None,\n", + " importance_type...\n", + " monotone_constraints=None,\n", + " n_estimators=400, n_jobs=None,\n", + " nthread=1, num_parallel_tree=None,\n", + " objective='multi:softmax', ...),\n", + " n_iter=5, n_jobs=4,\n", + " param_distributions={'colsample_bytree': [0.6, 0.8, 1.0],\n", + " 'gamma': [0.5, 1, 1.5, 2, 5],\n", + " 'learning_rate': [0.1, 0.01, 0.02, 1],\n", + " 'max_depth': [3, 4, 5],\n", + " 'min_child_weight': [1, 5, 10],\n", + " 'subsample': [0.6, 0.8, 1.0]},\n", + " random_state=1001, scoring='roc_auc', verbose=3)" + ], + "text/html": [ + "
RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f07ad022960>,\n",
+              "                   estimator=XGBClassifier(base_score=None, booster=None,\n",
+              "                                           callbacks=None,\n",
+              "                                           colsample_bylevel=None,\n",
+              "                                           colsample_bynode=None,\n",
+              "                                           colsample_bytree=None,\n",
+              "                                           early_stopping_rounds=None,\n",
+              "                                           enable_categorical=False,\n",
+              "                                           eval_metric=None, feature_types=None,\n",
+              "                                           gamma=None, gpu_id=None,\n",
+              "                                           grow_policy=None,\n",
+              "                                           importance_type...\n",
+              "                                           monotone_constraints=None,\n",
+              "                                           n_estimators=400, n_jobs=None,\n",
+              "                                           nthread=1, num_parallel_tree=None,\n",
+              "                                           objective='multi:softmax', ...),\n",
+              "                   n_iter=5, n_jobs=4,\n",
+              "                   param_distributions={'colsample_bytree': [0.6, 0.8, 1.0],\n",
+              "                                        'gamma': [0.5, 1, 1.5, 2, 5],\n",
+              "                                        'learning_rate': [0.1, 0.01, 0.02, 1],\n",
+              "                                        'max_depth': [3, 4, 5],\n",
+              "                                        'min_child_weight': [1, 5, 10],\n",
+              "                                        'subsample': [0.6, 0.8, 1.0]},\n",
+              "                   random_state=1001, scoring='roc_auc', verbose=3)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ] + }, + "metadata": {}, + "execution_count": 46 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print('\\n All results:')\n", + "print(randomsearch.cv_results_)\n", + "print('\\n Best estimator:')\n", + "print(randomsearch.best_estimator_)\n", + "print('\\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))\n", + "print(randomsearch.best_score_ * 2 - 1)\n", + "print('\\n Best hyperparameters:')\n", + "print(randomsearch.best_params_)\n", + "results = pd.DataFrame(randomsearch.cv_results_)\n", + "results.to_csv('xgb-random-grid-search-results-01.csv', index=False)" + ], + "metadata": { + "id": "YkegXZwm22_7", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "74a4404f-7cd8-4176-93c9-a57bc2526273" + }, + "execution_count": 47, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + " All results:\n", + "{'mean_fit_time': array([197.39904776, 227.97758846, 258.94723792, 139.75200572,\n", + " 244.76948318]), 'std_fit_time': array([ 4.87418783, 2.79416663, 2.5347546 , 1.46544292, 58.12875989]), 'mean_score_time': array([0.00361195, 0.00266466, 0.00352955, 0.00089574, 0.00095153]), 'std_score_time': array([0.00391645, 0.00138396, 0.00286 , 0.00091305, 0.00095201]), 'param_subsample': masked_array(data=[0.8, 1.0, 0.8, 0.8, 1.0],\n", + " mask=[False, False, False, False, False],\n", + " fill_value='?',\n", + " dtype=object), 'param_min_child_weight': masked_array(data=[5, 5, 10, 10, 1],\n", + " mask=[False, False, False, False, False],\n", + " fill_value='?',\n", + " dtype=object), 'param_max_depth': masked_array(data=[4, 4, 5, 4, 4],\n", + " mask=[False, False, False, False, False],\n", + " fill_value='?',\n", + " dtype=object), 'param_learning_rate': masked_array(data=[0.1, 0.01, 0.1, 0.02, 0.01],\n", + " mask=[False, False, False, False, False],\n", + " fill_value='?',\n", + " dtype=object), 'param_gamma': masked_array(data=[2, 2, 1, 1.5, 5],\n", + " 
mask=[False, False, False, False, False],\n", + " fill_value='?',\n", + " dtype=object), 'param_colsample_bytree': masked_array(data=[0.8, 0.8, 1.0, 0.6, 1.0],\n", + " mask=[False, False, False, False, False],\n", + " fill_value='?',\n", + " dtype=object), 'params': [{'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 2, 'colsample_bytree': 0.8}, {'subsample': 1.0, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 2, 'colsample_bytree': 0.8}, {'subsample': 0.8, 'min_child_weight': 10, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1.0}, {'subsample': 0.8, 'min_child_weight': 10, 'max_depth': 4, 'learning_rate': 0.02, 'gamma': 1.5, 'colsample_bytree': 0.6}, {'subsample': 1.0, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 5, 'colsample_bytree': 1.0}], 'split0_test_score': array([nan, nan, nan, nan, nan]), 'split1_test_score': array([nan, nan, nan, nan, nan]), 'split2_test_score': array([nan, nan, nan, nan, nan]), 'split3_test_score': array([nan, nan, nan, nan, nan]), 'split4_test_score': array([nan, nan, nan, nan, nan]), 'mean_test_score': array([nan, nan, nan, nan, nan]), 'std_test_score': array([nan, nan, nan, nan, nan]), 'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32)}\n", + "\n", + " Best estimator:\n", + "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.8, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=2, gpu_id=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.1, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=4, max_leaves=None,\n", + " min_child_weight=5, missing=nan, monotone_constraints=None,\n", + " n_estimators=400, n_jobs=None, nthread=1, num_parallel_tree=None,\n", + " 
objective='multi:softmax', ...)\n", + "\n", + " Best normalized gini score for 5-fold search with 5 parameter combinations:\n", + "nan\n", + "\n", + " Best hyperparameters:\n", + "{'subsample': 0.8, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 2, 'colsample_bytree': 0.8}\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "best_model = randomsearch.best_estimator_" + ], + "metadata": { + "id": "1smwxidb4Z2H" + }, + "execution_count": 48, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "y_a2_pred_rs=best_model.predict(xa2test)" + ], + "metadata": { + "id": "JF4VHNBfT3eZ" + }, + "execution_count": 49, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "accuracy_score(y_a2_pred_rs,y_a2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PcVBGk2mWINc", + "outputId": "0378a583-9cdf-4a48-b90b-cee4b842d31c" + }, + "execution_count": 50, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "0.6682539682539682" + ] + }, + "metadata": {}, + "execution_count": 50 + } + ] + }, + { + "cell_type": "code", + "source": [ + "list= ['D','P','S','H','R']\n", + "final=[]\n", + "for i in y_a2_pred_rs:\n", + " final.append(list[i])\n" + ], + "metadata": { + "id": "GCAlfjHXZXNU" + }, + "execution_count": 51, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ID=[]\n", + "for i in range(1260):\n", + " ID.append(i+1)\n", + "print(ID)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zZxtHIykZlcv", + "outputId": "c8c14832-418e-460c-c938-dd1da57c43f8" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 
65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 
472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 
872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 
1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "my_submission_rs = pd.DataFrame({'ID': ID, 'Phase': final})\n", + "my_submission_rs.to_csv('/content/drive/MyDrive/kaggle/dsg-challenge-1-supervised-learning/submission_rs.csv', index=False)\n" + ], + "metadata": { + "id": "5i6cj2-wZn12" + }, + "execution_count": 53, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/assMath/ass2B/2B_Anupriya.ipynb b/assMath/ass2B/2B_Anupriya.ipynb new file mode 100644 index 00000000..f3843d94 --- /dev/null +++ b/assMath/ass2B/2B_Anupriya.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "xdivz28Z6L02" + }, + "source": [ + "### __Assignment 2B__\n", + "#### __Question 1(a)__\n", + "\n", + "For a given distribution $p(y|x)$, that is, the distribution of the outcome $y$ given the data $x$, we usually estimate the mean because the mean gives us the expected value of $y$ given a certain $x$. It helps us realise the central value around which we can expect $y$ to be." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SqIZ6ebklw1G" + }, + "source": [ + "#### __Question 1(b)__\n", + "So, for vanilla linear regression, if $\\hat{y}$ is the predicted outcome of the model and $\\textbf{w}$ is the vector of parameters (weights), then we have,\n", + "\n", + "$\\hat{y}= \\textbf{w}^\\textbf{T}.\\textbf{x}$\n", + "\n", + "So our task is to be able to define such a $\\hat{y}$, or in other words, find the required $\\textbf{w}$. 
Since this is vanilla linear regression, there is just one independent variable $x$, hence just one parameter (weight), $w$.\n", + "To do so, we'll be using MSE here, so,\n", + "\n", + "$\\nabla_wMSE_{train}=0 $, which ultimately gives,\n", + "\n", + "$w=(X^TX)^{-1}X^Ty$\n", + "\n", + "after some linear algebra jazz that is too much to type here (from Goodfellow page 108,109)\n", + "\n", + "where, for us, $X$ is the design matrix (which, in our simple case, is just the column vector $\\textbf{x}$ itself, of size $n \\times 1$, since there is just one value of $x$ per training pair) and $y$ is a vector of all target values from the training set. So, one thing to note is that $X^TX$ is just a single value and its inverse is simply $\\frac{1}{X^TX}$.\n", + "\n", + "Now, we've been given that $p(y|x) \\sim N(y;\\mu,\\sigma)$ where $\\mu=w*x$ and $\\sigma=I$.\n", + "\n", + "So, assuming we generate a dataset that follows such a distribution, we need $x$, which will just be a bunch of single values from a distribution (not specified, could be anything we want, so let's take $N(0,1)$), we need a true value of $w$ that we will decide randomly and we need to find the corresponding $y$ which has a mean vector $w*x$ and covariance matrix (size $n \\times n$) equal to $I$, which will give us our required dataset. After this, we can use the above relation to estimate $w$ and that should be close to the true value of $w$.\n", + "\n", + "We could, like before, just define the loss function and minimise it using Scipy. So, we'll also try that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CpnDWH9DLg6K", + "outputId": "54aa8b31-dd50-450f-8e72-912bd98cc6b5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True value of parameter: 0.39942218451390976\n", + "Estimated value of parameter, thanks to MSE: 0.38803774521035544\n", + "Estimated value of parameter, thanks to Scipy: 0.388037742515314\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from numpy import random\n", + "\n", + "n=1000\n", + "x_mean=0\n", + "x_sd=1\n", + "X=np.random.normal(x_mean,x_sd,size=n)\n", + "\"\"\"\n", + "shape = 1000\n", + "value = 1\n", + "x_b = np.empty(shape, dtype=np.int)\n", + "x_b.fill(value)\n", + "X=np.column_stack((x, x_b))\n", + "\n", + "#print(X.shape) # matrix\n", + "\"\"\"\n", + "true_w=random.rand() # could be whatever we want technically\n", + "print(\"True value of parameter: \",true_w)\n", + "\n", + "y_mean=true_w*X\n", + "y_covariance=np.eye(n)\n", + "y = np.random.multivariate_normal(y_mean, y_covariance)\n", + "#print(y.shape) # also a vector\n", + "\n", + "#estimated_w=(np.linalg.inv(X.dot(X.T))).dot((X.T).dot(y))\n", + "\n", + "estimated_w=(np.power(X.dot(X.T),-1))*((X.T).dot(y))\n", + "print(\"Estimated value of parameter, thanks to MSE: \",estimated_w)\n", + "\n", + "# hence we can now use the line y=w*x for our vanilla linear regression model\n", + "\n", + "# however, we should be able to solve for w without doing all the math, so let's try that\n", + "\n", + "from scipy.optimize import minimize\n", + "\n", + "def loss(w,X,y):\n", + " J=0\n", + " for i in range(n):\n", + " J+=0.5*(np.power((w*X[i]-y[i]),2))\n", + " return J\n", + "initialparameters=1\n", + "result = minimize(loss, initialparameters, args=(X,y), method='L-BFGS-B')\n", + "estimated__w=result.x\n", + "print(\"Estimated value of parameter, thanks to Scipy: \",estimated__w[0])\n", + "\n" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": { + "id": "zJ1STkI_RQfc" + }, + "source": [ + "The choice of sigma here, that is the covariance, is basically telling us how varied our measurements are. So, in our final objective, which was minimising the squared error term, sigma is not exactly doing anything. The given value of sigma is just for generating the dataset we need. While applying MSE and minimising the function to obtain the corresponding $w$, we didn't really need sigma. So, parameterising sigma is not exactly going to do anything.\n", + "\n", + "However, when we observe our predictions, calculating their sigma and comparing it to the true sigma can tell us how varied our predictions are." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eEvyl1JBREZf" + }, + "source": [ + "#### __Question 1(c)__\n", + "Having an exponential family distribution means the superpower of being able to apply GLMs (generalised linear models). Exponential families have distribution functions of the form,\n", + "\n", + "$f(y;\\eta)=b(y)e^{\\eta^TT(y)-a(\\eta)}$\n", + "\n", + "So, let's choose the Poisson distribution. We wanna obtain $b(y), T(y), a(\\eta)$ and $\\eta$ itself by observing the Poisson distribution function,\n", + "\n", + "$f(y;\\lambda)=\\frac{\\lambda^y.e^{(-\\lambda)}}{y!}$, which can be rewritten as\n", + "\n", + "$f(y;\\lambda)=e^{y.ln(\\lambda)-\\lambda-ln(y!)}$\n", + "\n", + "So,\n", + "\n", + "$\\eta=ln(\\lambda)$ and $T(y)=y$, at least that is easy to see. Matching the remaining terms gives $a(\\eta)=\\lambda=e^{\\eta}$ and $b(y)=\\frac{1}{y!}$.\n", + "\n", + "Further, for the Poisson distribution, $\\mu=\\lambda$ and $\\sigma^2=\\lambda$\n", + "\n", + "Now, to construct a GLM, we want to predict the expected value of $T(y)$ given $x$. So, we are looking for a prediction $h(x)=E(y|x)$. 
Further, $\\eta$ and $x$ are related by $\\eta=w^Tx$, where $w$ is our unknown parameter.\n", + "\n", + "A naive choice would be an identity link, $h(x)=E(y|x)=\\mu=\\lambda=w^Tx$\n", + "\n", + "The problem is that, by definition and usage, $\\lambda$ is always greater than or equal to $0$, whereas $w^Tx$ can be negative, so a linear relationship like $\\lambda=w^Tx$ doesn't seem nice. So, instead, we can use the relationship that follows from $\\eta=ln(\\lambda)$,\n", + "\n", + "$ln(\\lambda_i)=w^Tx$\n", + "\n", + "$=\n", + "\\begin{pmatrix}\n", + "w_0\\;w_1\n", + "\\end{pmatrix}\n", + "\\times\n", + "\\begin{pmatrix}\n", + "1\\\\\n", + "x_i\n", + "\\end{pmatrix}\n", + "$\n", + "\n", + "Such that, $\\lambda_i=e^{w_0+w_1x_i}$\n", + "\n", + "So, what we are trying to estimate are these unknown parameters with the help of this relationship between $\\lambda$, $w$ and $x$.\n", + "We need to define a loss function, then minimise it to get the desired unknown parameters. Again, the problem is that this is no longer Gaussian! But we do know that we get the same results by minimising the loss function as we do by maximising the likelihood function. 
So, we'll have to go by that approach.\n", + "Ultimately, we'll be minimising the negative log-likelihood function, so in a way, that's going to be like our loss function.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "tUOV4XHosr0_", + "outputId": "c0b4aee4-a5df-44bb-a735-187641bc8f98" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True value of parameters: 0.5468524318685486 , 0.9830005256800832\n", + "Estimated value of parameter, thanks to Scipy: 0.49062509412517374 , 1.0114374335545537\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from numpy import random\n", + "\n", + "def log_likelihood_poisson(w,X,y):\n", + " L=0\n", + " for i in range(1000):\n", + " c=np.exp(w[0]+w[1]*X[i])\n", + " p=c-y[i]*(np.log(c))\n", + " L+=p\n", + " return L\n", + "\n", + "# Let's generate a simple dataset corresponding to all the above stuff\n", + "n=1000\n", + "X=np.random.randn(n)\n", + "w_0=random.rand() # could be whatever we want technically\n", + "w_1=random.rand()\n", + "print(\"True value of parameters: \",w_0,\",\",w_1)\n", + "lambda_i=np.exp(w_0+w_1*X)\n", + "y=np.random.poisson(lambda_i)\n", + "\n", + "\n", + "initialparameters=np.array([1,1]) # let's say\n", + "result = minimize(log_likelihood_poisson, initialparameters, args=(X,y), method='L-BFGS-B')\n", + "estimated__w=result.x\n", + "print(\"Estimated value of parameter, thanks to Scipy: \",estimated__w[0], \",\",estimated__w[1])\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zn0Xk9m938r5" + }, + "source": [ + "#### __Question 2__\n", + "\n", + "Firstly, the dataset is on my drive so path should be changed accordingly in order to run this code.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WYDvdI2D3u9h", + "outputId": 
"c58e9675-6f83-470e-bec5-3b4a87391795" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mounted at /content/drive\n" + ] + } + ], + "source": [ + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ExqUDGVV-HO2", + "outputId": "60cbaf3a-0bc3-46a5-ca55-e2662f2564ea" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mean Squared Error (MODEL A) (Test Set A): 22.25770734371434\n", + "Mean Squared Error (MODEL A) (Test Set B): 2.5616550522775925\n", + "Mean Squared Error (MODEL A) (Test Set C): 0.062457236803248974\n", + "Mean Squared Error (MODEL B) (Test Set A): 32.73219379378818\n", + "Mean Squared Error (MODEL B) (Test Set B): 1.9577195262268569e-25\n", + "Mean Squared Error (MODEL B) (Test Set C): 2.6140814129669656\n", + "Mean Squared Error (MODEL C) (Test Set A): 16.15674668223636\n", + "Mean Squared Error (MODEL C) (Test Set B): 15.774197648432851\n", + "Mean Squared Error (MODEL C) (Test Set C): 5.699794881410673\n", + "Minimum error: 1.9577195262268569e-25\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error Distribution (MODEL A) (Test Set A): Mean = 22.25770734371434 Standard Deviation = 53.96589716508258\n", + "Error Distribution (MODEL A) (Test Set B): Mean = 2.5616550522775925 Standard Deviation = 0.0022966488099699024\n", + "Error Distribution (MODEL A) (Test Set C): Mean = 0.062457236803248974 Standard Deviation = 0.08652385376675549\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error Distribution (MODEL B) (Test Set A): Mean = 32.73219379378818 Standard Deviation = 65.61751153849882\n", + "Error Distribution (MODEL B) (Test Set B): Mean = 1.9577195262268569e-25 Standard Deviation = 1.892899978318431e-25\n", + "Error Distribution (MODEL B) (Test Set C): Mean = 2.6140814129669656 Standard Deviation = 0.8034601498911891\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Error Distribution (MODEL C) (Test Set A): Mean = 16.15674668223636 Standard Deviation = 38.162560581973146\n", + "Error Distribution (MODEL C) (Test Set B): Mean = 15.774197648432851 Standard Deviation = 0.03993390753228745\n", + "Error Distribution (MODEL C) (Test Set C): Mean = 5.699794881410673 Standard Deviation = 1.1902720177778146\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error\n", + "import matplotlib.pyplot as plt\n", + "\n", + "ds_trainA = pd.read_csv('/content/drive/MyDrive/Q2/train_set_A.csv')\n", + "ds_trainB = pd.read_csv('/content/drive/MyDrive/Q2/train_set_B.csv')\n", + "ds_trainC = pd.read_csv('/content/drive/MyDrive/Q2/train_set_C.csv')\n", + "ds_testA = pd.read_csv('/content/drive/MyDrive/Q2/test_set_A.csv')\n", + "ds_testB = pd.read_csv('/content/drive/MyDrive/Q2/test_set_B.csv')\n", + "ds_testC = pd.read_csv('/content/drive/MyDrive/Q2/test_set_C.csv')\n", + "\n", + "x_train_A, y_train_A = ds_trainA[['x']], ds_trainA['y']\n", + "x_train_B, y_train_B = ds_trainB[['x']], ds_trainB['y']\n", + "x_train_C, y_train_C = ds_trainC[['x']], ds_trainC['y']\n", + "x_test_A, y_test_A = ds_testA[['x']], ds_testA['y']\n", + "x_test_B, y_test_B = ds_testB[['x']], ds_testB['y']\n", + "x_test_C, y_test_C = ds_testC[['x']], ds_testC['y']\n", + "\n", + "model_A = LinearRegression().fit(x_train_A, y_train_A)\n", + "model_B = LinearRegression().fit(x_train_B, y_train_B)\n", + "model_C = LinearRegression().fit(x_train_C, y_train_C)\n", + "\n", + "\n", + "# now since it's not necessary for train set A and test set A to come from the same distribution\n", + "# in order to find the actual data distribution, we'll have to see which train and test set pair\n", + "# has the least MSE, that pair would be 
the one coming from the actual distribution\n", + "\n", + "mse_AA= mean_squared_error(y_test_A, model_A.predict(x_test_A))\n", + "mse_AB= mean_squared_error(y_test_B, model_A.predict(x_test_A))\n", + "mse_AC= mean_squared_error(y_test_C, model_A.predict(x_test_A))\n", + "\n", + "mse_BA = mean_squared_error(y_test_A, model_B.predict(x_test_B))\n", + "mse_BB = mean_squared_error(y_test_B, model_B.predict(x_test_B))\n", + "mse_BC = mean_squared_error(y_test_C, model_B.predict(x_test_B))\n", + "\n", + "mse_CA = mean_squared_error(y_test_A, model_C.predict(x_test_C))\n", + "mse_CB = mean_squared_error(y_test_B, model_C.predict(x_test_C))\n", + "mse_CC = mean_squared_error(y_test_C, model_C.predict(x_test_C))\n", + "\n", + "print(\"Mean Squared Error (MODEL A) (Test Set A):\", mse_AA)\n", + "print(\"Mean Squared Error (MODEL A) (Test Set B):\", mse_AB)\n", + "print(\"Mean Squared Error (MODEL A) (Test Set C):\", mse_AC)\n", + "\n", + "print(\"Mean Squared Error (MODEL B) (Test Set A):\", mse_BA)\n", + "print(\"Mean Squared Error (MODEL B) (Test Set B):\", mse_BB)\n", + "print(\"Mean Squared Error (MODEL B) (Test Set C):\", mse_BC)\n", + "\n", + "print(\"Mean Squared Error (MODEL C) (Test Set A):\", mse_CA)\n", + "print(\"Mean Squared Error (MODEL C) (Test Set B):\", mse_CB)\n", + "print(\"Mean Squared Error (MODEL C) (Test Set C):\", mse_CC)\n", + "\n", + "errors=np.array([mse_AA,mse_AB,mse_AC,mse_BA,mse_BB,mse_BC,mse_CA,mse_CB,mse_CC])\n", + "min_error=np.min(errors)\n", + "print(\"Minimum error: \",min_error)\n", + "\n", + "# clearly it's train and test set B that comes from the original distribution\n", + "\n", + "# now let's plot stuff\n", + "\n", + "error_AA = np.power((y_test_A - model_A.predict(x_test_A)),2)\n", + "error_AB = np.power((y_test_B - model_A.predict(x_test_B)),2)\n", + "error_AC = np.power((y_test_C - model_A.predict(x_test_C)),2)\n", + "\n", + "error_BA = np.power((y_test_A - model_B.predict(x_test_A)),2)\n", + "error_BB = np.power((y_test_B - 
model_B.predict(x_test_B)),2)\n", + "error_BC = np.power((y_test_C - model_B.predict(x_test_C)),2)\n", + "\n", + "error_CA = np.power((y_test_A - model_C.predict(x_test_A)),2)\n", + "error_CB = np.power((y_test_B - model_C.predict(x_test_B)),2)\n", + "error_CC = np.power((y_test_C - model_C.predict(x_test_C)),2)\n", + "\n", + "\n", + "# model A\n", + "plt.figure(figsize=(10, 6))\n", + "plt.hist(error_AA, bins='auto', alpha=0.7, label='Test Set A')\n", + "plt.hist(error_AB, bins='auto', alpha=0.7, label='Test Set B')\n", + "plt.hist(error_AC, bins='auto', alpha=0.7, label='Test Set C')\n", + "plt.xlabel('Error')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Error Distribution Model A')\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "distribution_AA = np.mean(error_AA), np.std(error_AA)\n", + "distribution_AB = np.mean(error_AB), np.std(error_AB)\n", + "distribution_AC = np.mean(error_AC), np.std(error_AC)\n", + "\n", + "print(\"Error Distribution (MODEL A) (Test Set A): Mean =\", distribution_AA[0], \"Standard Deviation =\", distribution_AA[1])\n", + "print(\"Error Distribution (MODEL A) (Test Set B): Mean =\", distribution_AB[0], \"Standard Deviation =\", distribution_AB[1])\n", + "print(\"Error Distribution (MODEL A) (Test Set C): Mean =\", distribution_AC[0], \"Standard Deviation =\", distribution_AC[1])\n", + "\n", + "# model B\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.hist(error_BA, bins='auto', alpha=0.7, label='Test Set A')\n", + "plt.hist(error_BB, bins='auto', alpha=0.7, label='Test Set B')\n", + "plt.hist(error_BC, bins='auto', alpha=0.7, label='Test Set C')\n", + "plt.xlabel('Error')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Error Distribution Model B')\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "distribution_BA = np.mean(error_BA), np.std(error_BA)\n", + "distribution_BB = np.mean(error_BB), np.std(error_BB)\n", + "distribution_BC = np.mean(error_BC), np.std(error_BC)\n", + "\n", + "print(\"Error Distribution (MODEL B) 
(Test Set A): Mean =\", distribution_BA[0], \"Standard Deviation =\", distribution_BA[1])\n", + "print(\"Error Distribution (MODEL B) (Test Set B): Mean =\", distribution_BB[0], \"Standard Deviation =\", distribution_BB[1])\n", + "print(\"Error Distribution (MODEL B) (Test Set C): Mean =\", distribution_BC[0], \"Standard Deviation =\", distribution_BC[1])\n", + "\n", + "# model C\n", + "\n", + "plt.figure(figsize=(10, 6))\n", + "plt.hist(error_CA, bins='auto', alpha=0.7, label='Test Set A')\n", + "plt.hist(error_CB, bins='auto', alpha=0.7, label='Test Set B')\n", + "plt.hist(error_CC, bins='auto', alpha=0.7, label='Test Set C')\n", + "plt.xlabel('Error')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Error Distribution Model C')\n", + "plt.legend()\n", + "plt.show()\n", + "\n", + "distribution_CA = np.mean(error_CA), np.std(error_CA)\n", + "distribution_CB = np.mean(error_CB), np.std(error_CB)\n", + "distribution_CC = np.mean(error_CC), np.std(error_CC)\n", + "\n", + "print(\"Error Distribution (MODEL C) (Test Set A): Mean =\", distribution_CA[0], \"Standard Deviation =\", distribution_CA[1])\n", + "print(\"Error Distribution (MODEL C) (Test Set B): Mean =\", distribution_CB[0], \"Standard Deviation =\", distribution_CB[1])\n", + "print(\"Error Distribution (MODEL C) (Test Set C): Mean =\", distribution_CC[0], \"Standard Deviation =\", distribution_CC[1])\n", + "\n", + "# by comparing each of the parameters, we can see how mean and standard deviation are also lowest for train and test set B pair.\n", + "# test set A always has the highest error for each model\n", + "# further, test set A gives least error on model C, test set B on model B and test set C on model A\n", + "# we can see the second least error is for model A on test set C\n", + "\n", + "# We can observe these plots and see how they're different, however I can't see what vital information we can obtain from this" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "h-IYfEDyUQ7e" + }, + 
"source": [ + "#### __Question 3__\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "E9xIHA5vhr7y", + "outputId": "3d385032-ad88-4d29-b05f-7e0c33c19529" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "True weights are, w0= 4 and w1= 3\n", + "Maximum iterations reached.\n", + "Weights estimated by Batch Gradient Descent are, w0= 3.114909961958382 and w1= 22.272814185384274\n", + "Converged in 1 iterations.\n", + "Weights estimated by Fisher Scoring are, w0= 2.501166396975581 and w1= 0.3212582200477815\n" + ] + } + ], + "source": [ + "from os import waitstatus_to_exitcode\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "\n", + "n=1000\n", + "f=1\n", + "x=np.random.normal(0, 1, (n,f))\n", + "w0=4\n", + "w1=3\n", + "w=np.array([w0,w1])\n", + "x_b=np.ones(n)\n", + "x_new=np.column_stack((x_b,x))\n", + "z=np.dot(x_new,w)\n", + "p=1 / (1 + np.exp(-z))\n", + "y=np.random.binomial(1, p)\n", + "print(\"True weights are, w0= \",w0,\" and w1= \",w1)\n", + "\n", + "\n", + "def loglikelihood(w0,w1,x,y):\n", + " LL=0\n", + " m=x.shape[0]\n", + " for i in range(m):\n", + " z=w0+w1*x[i]\n", + " p=1/(1+np.exp(-z))\n", + " LL+=y[i]*(np.log(p))+(1-y[i])*(np.log(1-p))\n", + " return LL\n", + "\n", + "def cost_function(w0,w1,x, y):\n", + " m=x.shape[0]\n", + " C=0\n", + " for i in range(m):\n", + " z=w0+w1*x[i]\n", + " p=1 / (1 + np.exp(-z))\n", + " C+= np.power((p-y[i]),2)\n", + " return C\n", + "\n", + "def batchgradientdescent(w0,w1,x,y,l,i,c):\n", + " converged=False\n", + " i_current=0\n", + " m=x.shape[0]\n", + " m=x.shape[0]\n", + " J=0\n", + " for i in range(m):\n", + " z__=w0+w1*x[i]\n", + " p__=1 / (1 + np.exp(-z__))\n", + " J+= np.power((p__-y[i]),2)\n", + "\n", + " while not converged:\n", + "\n", + " w=np.array([w0,w1])\n", + " 
x_b=np.ones(m)\n", + " x_new=np.column_stack((x_b,x))\n", + " z=np.dot(x_new,w)\n", + " p=1/(1+np.exp(-z))\n", + "\n", + " update0=np.sum((1/m)*(p-y))\n", + " update1=np.sum((1/m)*((p-y)*x))\n", + "\n", + " temp_w0=w0-l*update0\n", + " temp_w1=w1-l*update1\n", + "\n", + " w0=temp_w0\n", + " w1=temp_w1\n", + "\n", + " e=0\n", + " for i in range(m):\n", + " z_=w0+w1*x[i]\n", + " p_=1 / (1 + np.exp(-z_))\n", + " e+= np.power((p_-y[i]),2)\n", + " if abs(J-e)<=c:\n", + " print(\"Converged in: \",i_current,\" iterations.\")\n", + " converged=True\n", + "\n", + " J=e\n", + " i_current+=1\n", + "\n", + " if i_current==i:\n", + " print(\"Maximum iterations reached.\")\n", + " converged=True\n", + " return w0,w1\n", + "\n", + "bgd_estimated_w0,bgd_estimated_w1=batchgradientdescent(0,0,x,y,0.01,5000,0.0001)\n", + "print(\"Weights estimated by Batch Gradient Descent are, w0= \",bgd_estimated_w0,\" and w1= \",bgd_estimated_w1)\n", + "\n", + "# I can't figure out why it's giving such a shitty results? 
:( Sometimes it's giving a reasonable value and sometimes it's just absurd (especially w1), there's no consistency\n", + "\n", + "def fisherscoring(X,Y,c):\n", + "\n", + " # desgin matrix X\n", + "\n", + " # response vector is y as usual, written as Y\n", + "\n", + " # probability vector p\n", + " w_initial=np.matrix(np.ones(np.shape(X)[1])).T\n", + " z = np.dot(X,w_initial)\n", + " p_initial=1/(1+np.exp(-z))\n", + "\n", + " # weight matrix\n", + " W_= (np.array(p_initial) * np.array(1 - p_initial))\n", + " W= np.matrix(np.diag(W_[:, 0]))\n", + "\n", + " # c is the threshold value above which iterations continue\n", + " c=0.001\n", + "\n", + " #First derivative of Log-Likelihood with respect to each weight\n", + " U=(X.T)*(Y-p_initial)\n", + "\n", + " # Second derivative of Log-Likelihood with respect to each weight\n", + " H=(X.T)*(W)*X\n", + "\n", + " i_current=0\n", + " while True:\n", + " i_current+=1\n", + " w_current=w_initial+ (np.linalg.inv(H))*U\n", + "\n", + " if (np.abs(np.array(w_current)-np.array(w_initial)) < c).all:\n", + " model_parameters = w_current\n", + " break\n", + " else:\n", + " z=np.dot(X,w_current)\n", + " p_initial=1/(1+np.exp(-z))\n", + " W_ = (np.array(p_initial) * np.array(1 - p_initial))\n", + " W = np.matrix(np.diag(W_[:, 0]))\n", + " H= X.T * W * X\n", + " U= X.T * (Y - p_initial)\n", + " w_initial= w_current\n", + " print(\"Converged in \",i_current,\" iterations.\")\n", + " return np.array(model_parameters)\n", + "\n", + "fs_estimated_w0,fs_estimated_w1=fisherscoring(x_new,y,0.001)\n", + "print(\"Weights estimated by Fisher Scoring are, w0= \",fs_estimated_w0[0],\" and w1= \",fs_estimated_w1[0])\n", + "\n", + "# again there's some issue in the value of w1 and weirdly, there's only one iteration happening? :(\n", + "# gave up after a while cause everything seems absurd now ugh" + ] + }, + { + "cell_type": "markdown", + "source": [ + "#### __Question 4__\n", + "\n", + "a. 
The sample size n is extremely large, and the number of predictors p is small.\n", + "\n", + "The performance of a flexible model would be better. This is because n is extremely large, so we have a lot of information to learn from. This is good because the problem of overfitting reduces. Overfitting usually occurs when we use a flexible model for small n. That's not the case here, though. We have sufficient data to ensure we don't overfit. Besides, the number of predictors (independent variables) is small, so we have a simpler predictor-response relationship and more generalisation.\n", + "For a flexible model, we usually have high variance and low bias. High variance will occur here if there is a very non-linear relationship between predictor (input) and the outcome (response). But with a smaller number of features, we can expect a simpler model mostly, so the variance shouldn't be very high.\n", + "\n", + "b. The number of predictors p is extremely large, and the number of observations n is small.\n", + "\n", + "Now, here an inflexible model is better.\n", + "Small n means the risk of overfitting is high with a flexible model. Plus, the number of predictors is extremely large, so that increases the risk because we have noise in the data. So, it's better we have an inflexible model such that we can have at least some generalisation.\n", + "Inflexible model implies lower variance and high number of predictors implies the possibility of a complex relationship between response and predictors so we might have low bias. In case there is a simple relation (as should be for an inflexible model), we can expect higher bias.\n", + "\n", + "c. The relationship between the predictors and response is highly non-linear.\n", + "\n", + "A flexible model is better since it can capture the intricacies of the non-linear relationship and hence give more accurate results. An inflexible model will miss out on most details. 
However, depending on the sample size, we might see overfitting of data.\n", + "Highly flexible model means lower bias but higher variance." + ], + "metadata": { + "id": "FWwl6GCEARkb" + } + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/assMath/ass2a/2A_Anupriya.ipynb b/assMath/ass2a/2A_Anupriya.ipynb new file mode 100644 index 00000000..55c657f6 --- /dev/null +++ b/assMath/ass2a/2A_Anupriya.ipynb @@ -0,0 +1,572 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "## **Assignment 2A**\n" + ], + "metadata": { + "id": "Nn59ntocyMfK" + } + }, + { + "cell_type": "markdown", + "source": [ + "### __Question 1__\n", + "\n", + "The posterior, likelihood and prior are related according to,\n", + "\n", + "$ P(\\theta | y) = \\frac{P(y | \\theta) P(\\theta)}{P(y)} $\n", + "\n", + "Let A denote that Alice is working, B denote that Bob is working and $d$ denote the data that the boss has collected.\n", + "\n", + "$ \\lambda_a (=10) $ is the average number of tickets Alice collects and $ \\lambda_b (=15) $ is the average number of tickets Bob collects and note that the number of tickets $ X$ that they collect is modelled by a Poisson distribution,\n", + "\n", + "$ P(X=x)= \\frac{\\lambda ^x e^{-\\lambda}}{x!}$\n", + "\n", + "Now, odds that Alice is working in place of Bob = $O(A)=\\frac{P(A)}{P(B)}=\\frac{1}{10}$\n", + "\n", + "Thus, $P(A)=\\frac{1}{11}$ and $P(B)=\\frac{10}{11}$\n", + "\n", + "Further, we need the posterior odds that Alice is filling in for Bob, that is,\n", + "\n", + "$O($Alice 
works|data$)=\\frac{P(Alice\\;works|data)}{P(Bob\\;works|data)}=\\frac{P(A|d)}{P(B|d)}$\n", + "\n", + "Here,\n", + "\n", + "$P(A|d)=\\frac{P(d|A).P(A)}{P(d)}=\\frac{\\prod\\limits_{i=1}^{5}\\frac{\\lambda_a ^{x_i} e^{-\\lambda}}{x_i!}.P(A)}{P(d)} = \\frac{\\frac{10^{12+10+11+4+11} e^{-50}}{12! 10! 11! 4! 11!}}{P(d)}.\\frac{1}{11}$\n", + "\n", + "$P(B|d)=\\frac{P(d|B). P(B)}{P(d)}= \\frac{\\prod\\limits_{i=1}^{5}\\frac{\\lambda_b ^{x_i} e^{-\\lambda}}{x_i!}.P(B)}{P(d)} =\\frac{ \\frac{15^{12+10+11+4+11} e^{-75}}{12! 10! 11! 4! 11!}}{P(d)}. \\frac{10}{11}$\n", + "\n", + "$O($Alice works|data$)=\\frac{P(d|A)}{P(d|B)}\\frac{P(A)}{P(B)} = (\\frac{10}{15})^{48}e^{25} (\\frac{1}{10}) ≈ 25.409$" + ], + "metadata": { + "id": "mFqla4Vxeys9" + } + }, + { + "cell_type": "markdown", + "source": [ + "### __Question 2__\n", + "\n", + "#### __Part (a)__\n", + "According to the question,\n", + "\n", + "$f(\\theta)\\sim N(5,9) \\implies P(\\theta)= \\frac{1}{3\\sqrt{2\\pi}}e^{\\frac{{-(\\theta-5)^2}}{18}}$ ...prior\n", + "\n", + "$f(x|\\theta) \\sim N(\\theta,4) \\implies P(x|\\theta)=\\frac{1}{2\\sqrt{2\\pi}}e^{\\frac{{-(x-\\theta)^2}}{8}}$ ...likelihood\n", + "\n", + "Where x=6, hence,\n", + "\n", + "$P(x=6|\\theta)= \\frac{1}{2\\sqrt{2\\pi}}e^{\\frac{{-(6-\\theta)^2}}{8}} $\n", + "\n", + "Then from the usual Bayesian update table,\n", + "\n", + "Posterior, i.e., $f(\\theta|x)=f(x|\\theta).f(\\theta)=\\frac{1}{12\\pi}(e^{-\\frac{(\\theta-5)^2}{18}}.e^{\\frac{-(6-\\theta)^2}{8}}) = \\frac{1}{12\\pi}(e^{{\\frac{-(\\theta-5)^2}{18}}-{\\frac{(6-\\theta)^2}{8}}})$\n", + "\n", + "We need to modify the posterior to obtain what the mean and variance will be for the Normal distribution,\n", + "\n", + "$\\frac{(\\theta-5)^2}{18}+ \\frac{(6-\\theta)^2}{8}= \\frac{1}{2}(\\frac{\\theta^2-10\\theta+25}{9} + \\frac{\\theta^2-12\\theta+36}{4})$\n", + "$= \\frac{1}{2}(\\frac{13\\theta^2-148\\theta+424}{36})$\n", + "$=\\frac{1}{2}(13\\frac{(\\theta^2-\\frac{74}{13})^2+36}{36}) $\n", + 
"$=\\frac{1}{2}(\\frac{(\\theta-\\frac{74}{13})^2+ \\frac{36}{169}}{\\frac{36}{13}})$\n", + "\n", + "Hence,\n", + "$f(\\theta|x) \\propto e^{-\\frac{1}{2}(\\frac{(\\theta-\\frac{74}{13})^2}{\\frac{36}{13}})}$\n", + "\n", + "$\\therefore f(\\theta|x)\\sim N(\\frac{74}{13},\\frac{36}{13})$" + ], + "metadata": { + "id": "-O6Dk3i4ywGa" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### __Part (b)__\n", + "\n", + "Using the formulas given to us,\n", + "\n", + "$\\sigma^2 = 4$\n", + "\n", + "$\\mu_{prior}=5$\n", + "\n", + "$\\sigma_{prior}^2=9$\n", + "\n", + "$\\bar{x}=6$\n", + "\n", + "$n=4$\n", + "\n", + "We get,\n", + "\n", + "$a=\\frac{1}{\\sigma_{prior}^2}=\\frac{1}{9}$\n", + "\n", + "$b=\\frac{n}{\\sigma^2}=1$\n", + "\n", + "$\\mu_{post}=\\frac{{a\\mu_{prior}+b\\bar{x}}}{a+b}=\\frac{\\frac{5}{9}+6}{\\frac{1}{9}+1}=5.9$\n", + "\n", + "$\\sigma_{post}^2=\\frac{1}{a+b}=\\frac{1}{\\frac{1}{9}+1}=0.9$\n", + "\n", + "Thus, the posterior on theta, $f(\\theta|x)\\sim N(5.9,0.9)$\n", + "\n", + "The plot showing the prior and posterior is-" + ], + "metadata": { + "id": "pJyTh7MjFFL_" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from scipy.stats import norm\n", + "x = np.arange(-10, 30, 0.0001)\n", + "plt.plot(x, norm.pdf(x, 5, 3), label='μ: 5, σ^2: 9')\n", + "plt.plot(x, norm.pdf(x, 5.9, 0.948683298), label='μ:5.9, σ^2: 0.9')\n", + "\n", + "plt.legend()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 447 + }, + "id": "ZX6hDNLZI-7n", + "outputId": "a5595ae1-6759-4fe3-c96a-466e4857f80d" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 8 + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "We see that compared to the prior, the posterior has very small variance $(=0.9)$ meaning more accuracy and we see that the posterior has a mean $(=5.9)$ which is very close to $\\bar{x}=6$. Thus we can say with more certainty that that value we receive is closer to $6$." + ], + "metadata": { + "id": "xiVBxQO9LB0b" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### __Part (c)__\n", + "\n", + "$a=\\frac{1}{\\sigma_{prior}^2}$\n", + "\n", + "$b=\\frac{n}{\\sigma^2}$\n", + "\n", + "$\\mu_{post}=\\frac{{a\\mu_{prior}+b\\bar{x}}}{a+b}$\n", + "\n", + "$\\sigma_{post}^2=\\frac{1}{a+b}$\n", + "\n", + "If there are more signals being received then $n$ increases. If $n$ increases, so does $b$. Clearly, then $\\sigma_{post}^2$ decreases, meaning less variance and more accuracy.\n", + "We can say about $\\mu_{post}$ that the weightage of $\\bar{x}$ increases as $b$ increases however no change occurs in $a$ or $\\mu_{prior}$. So, there is just more weightage of $\\bar{x}$ in the value of posterior mean." + ], + "metadata": { + "id": "uBe6gIGoMAaO" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### __Part (d)__\n", + "\n", + "We have been given that IQ follows a distribution $N(100,152)$, so this is the prior that we already have with us, where $\\theta$ is the unknown value of the true IQ such that,\n", + "\n", + "$ f(\\theta)\\sim N(100,152)$\n", + "\n", + "Next, if a person is tested multiple times, their measured IQ (say $x$) differs from their true IQ ($\\theta$) according to $N(0,102)$. 
As the mean is 0, the difference in the mean of the measured IQ and the true IQ (which is the true IQ itself which is no more unknown to us) is 0, or the measured IQ has a mean equal to the true value itself,\n", + "So, the likelihood of some value of IQ given some true value of IQ $\\theta$,\n", + "\n", + "$f(x|\\theta) \\sim N(\\theta,102)$\n", + "\n", + "$ (i)$ Randall Vard scored 80, so the measured IQ is 80. We can say that the expected value of his true IQ will be obtained from the posterior, the probability of his true IQ given his measured IQ. So, to get the expected value (or mean), we use,\n", + "\n", + "$a=\\frac{1}{\\sigma_{prior}^2}$\n", + "\n", + "$b=\\frac{n}{\\sigma^2}$\n", + "\n", + "$\\mu_{post}=\\frac{{a\\mu_{prior}+b\\bar{x}}}{a+b}$\n", + "\n", + "Where,\n", + "\n", + "$\\sigma_{prior}^2=152$\n", + "\n", + "$\\mu_{prior}=100$\n", + "\n", + "$n=1$ and\n", + "\n", + "$\\sigma^2=102$ (known variance)\n", + "\n", + "Using these,\n", + "\n", + "$a=\\frac{1}{152}$\n", + "\n", + "$b=\\frac{1}{102}$\n", + "\n", + "Here, $\\bar{x}=x=80$ thus,\n", + "\n", + "$\\mu_{post}= \\frac{\\frac{1}{152}.100+\\frac{1}{102}.80}{\\frac{1}{152}+\\frac{1}{102}} = 88.031$\n", + "\n", + "$(ii)$ Here, for Mary I. Taft, $\\bar{x}=x=150$ thus,\n", + "\n", + "$\\mu_{post}= \\frac{\\frac{1}{152}.100+\\frac{1}{102}.150}{\\frac{1}{152}+\\frac{1}{102}} = 129.914$\n" + ], + "metadata": { + "id": "el5ZV7bdPr64" + } + }, + { + "cell_type": "markdown", + "source": [ + "### __Question 3__\n", + "We already know how we can use MLE to estimate the unknown mean and variance of a given dataset that follows gaussian distribution.\n", + "\n", + "So, let $\\theta$ denote the set of unknown parameters $\\mu$ and $\\sigma$,\n", + "\n", + "$\\theta = \\{\\mu,\\sigma\\}$\n", + "\n", + "The goal of MLE is to maximize the probability (actually probability density since the data is continuous) of the data given $\\theta$.
Hence we want to maximise the following function -\n", + "\n", + "$f(x_1,x_2,\\ldots,x_n|\\theta)$\n", + "\n", + "where, of course,\n", + "\n", + "$f(x_i|\\theta)= \\frac{1}{\\sigma\\sqrt{2\\pi}}e^{\\frac{{-(x_i-\\mu)^2}}{2\\sigma^2}}$\n", + "\n", + "Thus we aim to find the optimal $\\theta$ for which $f$ is maximised,\n", + "\n", + "$\\hat{\\theta}_{MLE}=argmax_{\\theta}\\prod\\limits_{i}^{n}f(x_i|\\theta)$\n", + "\n", + "To simplify calculation (taking derivative to maximise $f$), we use logarithmic function. Then by simple algebra, our equation above changes to,\n", + "\n", + "$\\hat{\\theta}_{MLE}=argmax_{\\theta}\\sum\\limits_{i}^{n}ln(f(x_i|\\theta))$\n", + "\n", + "From MAN-006, we have already calculated the parameters estimated by MLE by taking the derivative of the log of the probability density f with respect to each unknown parameter and equating that to zero, and they turn out to be,\n", + "\n", + "$\\hat{\\mu}_{MLE}=\\frac{1}{n}\\sum\\limits_{i}^{n}x_i$\n", + "\n", + "Which is simply the sample mean, and,\n", + "\n", + "$\\hat{\\sigma}^2_{MLE}=\\frac{1}{n}\\sum\\limits_{i}^{n}(x_i-\\hat{\\mu}_{MLE})^2$\n", + "\n", + "Which is the (biased) sample variance.\n", + "\n", + "Now, our next goal is to use python to estimate this. So, all we should have to do is give a dataset that follows gaussian distribution, define the Maximum likelihood function (the logarithmic form) and then maximize it using python to obtain the required estimates.\n", + "\n", + "A small research on the internet revealed that the easiest way to maximise the function would be using Scipy. Instead of trying to maximize the log-likelihood function, the standard way seems to be to minimize the negative log-likelihood function."
+ ], + "metadata": { + "id": "Nrayn0lxLFd2" + } + }, + { + "cell_type": "code", + "source": [ + "# first let's generate a dataset following a gaussian distribution using numpy\n", + "import numpy as np\n", + "from scipy.optimize import minimize\n", + "from scipy.stats import norm\n", + "\n", + "truemean = 100\n", + "truevariance=152\n", + "truesd=np.sqrt(truevariance)\n", + "n=1000\n", + "dataset = np.random.normal(truemean,truesd, size=n)\n", + "print(\"True Mean:\", truemean)\n", + "print(\"True Variance:\",truevariance )\n", + "\n", + "#print(dataset)\n", + "\n", + "# now we must define the negative log-likelihood function\n", + "\"\"\"\n", + "def likelihood(parameters,dataset):\n", + " mean = parameters[0]\n", + " sd=parameters[1]\n", + " sum=0\n", + " for i in dataset:\n", + " data=i\n", + " pdf = norm.pdf(data , loc = mean , scale = sd )\n", + " LL=np.log(pdf)\n", + " sum+=LL\n", + " negativeLL= -1*sum\n", + " return negativeLL\n", + "\"\"\"\n", + "\n", + "def likelihood(parameters, dataset):\n", + " mean, variance = parameters\n", + " sum=0\n", + " N = len(dataset)\n", + " for i in dataset:\n", + " sum+=(i-mean)**2\n", + " LL = - N/2 * np.log(variance) - 1/(2*variance) * sum\n", + " return -LL\n", + "\n", + "# Scipy optimization algorithms require a intial parameter value, closer to the true value, faster the optimization algorithm\n", + "# we will set the initial parameters to the standard normal distribution values since we don't have a prior\n", + "initialparameters=[0,1]\n", + "\n", + "# now we can use Scipy to minimize the negative log-likelihood function\n", + "result= minimize(likelihood, initialparameters, args=(dataset,),method='L-BFGS-B')\n", + "estimatedmean,estimatedsd=result.x\n", + "print(\"Estimated Mean:\", estimatedmean)\n", + "print(\"Estimated Standard deviation:\",estimatedsd )\n", + "\n", + "# at first I was using scipy to define the normal pdf but....\n", + "# this was not working on high values of mean and standard deviation (bigger 
values are giving RuntimeWarning: divide by zero encountered in log )\n", + "# I tried searching about this error a lot but didn't find anything very helpful then I switched to defining log-likelihood function from scratch and it worked!\n", + "# further it important to specify the method of optimization as L-BFGS-B otherwise we keep getting \"invalid value encountered in log\" error. No idea what's happening!\n", + "# L-BFGS-B is a variation of the standard BFGS algorithm which is used when we have limited memory and our variables have some bound, its also much faster\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M5Z7s0BXWVOf", + "outputId": "089816a0-7b42-43fb-8b10-c5309fd03eb4" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "True Mean: 100\n", + "True Variance: 152\n", + "Estimated Mean: 99.46425688255611\n", + "Estimated Standard deviation: 167.37918914364832\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### __Question 4__\n", + "\n", + "Firstly, let's break down the problem and understand what is asked of us.\n", + "\n", + "So, we have a dataset of binary classification, which means our data has outcomes 0 or 1 only.
We want to use a logistic regression model for the classification and we've been given that the model parameters (conventionally taken as __w__) have a gaussian prior.\n", + "\n", + "Now, we want to formulate the likelihood, the prior and the posterior and then find the MAP estimate of the model parameters.\n", + "\n", + "After a bit of revision, we recall that in logistic regression we use the logistic function to define the likelihood function such that it is exactly of the form,\n", + "\n", + "$P(y=1|\\textbf{x})=\\frac{1}{1+e^{-(\\textbf{w}^{\\textbf{T}}\\textbf{x}+b)}}$\n", + "\n", + "Where, $\\textbf{w}$ is the vector of parameters.\n", + "\n", + "We might make assumptions on the posterior, that is,\n", + "\n", + "$P(x|y)$ but it doesn't matter since here we'll be directly using MAP to estimate the parameters.\n", + "\n", + "In MAP, our goal is to maximise the posterior (unlike MLE where we maximise the likelihood function). As we already know,\n", + "\n", + "$P(\\textbf{w}|y)\\propto P(y|\\textbf{x},\\textbf{w}).P(\\textbf{w})$\n", + "\n", + "Then, we require the value of $\\textbf{w}$ such that the posterior is maximized.\n", + "\n", + "$argmax_{\\textbf{w}}[P(y|\\textbf{w}).P(\\textbf{w})]=argmax_{\\textbf{w}}\\{ln[P(y|\\textbf{w})]+ ln[P(\\textbf{w})]\\}=argmin_{\\textbf{w}}\\{-ln[P(y|\\textbf{w})]- ln[P(\\textbf{w})]\\}$\n", + "\n", + "Now, we have a binary classification situation, so we have to utilise the fact that Y can only take values 0 and 1.
So, only one of the following two terms is going to be non-zero for any given Y,\n", + "\n", + "$P(Y=1|X)=\\frac{e^{(\\textbf{w}^{\\textbf{T}}\\textbf{x}+b)}}{1+e^{(\\textbf{w}^{\\textbf{T}}\\textbf{x}+b)}}$\n", + "\n", + "And,\n", + "\n", + "$P(Y=0|X)=\\frac{1}{1+e^{(\\textbf{w}^{\\textbf{T}}\\textbf{x}+b)}}$\n", + "\n", + "So now our goal becomes,\n", + "\n", + "$\\hat{\\textbf{w}}_{MAP}=argmax_{\\textbf{w}}\\{ln[\\prod\\limits_{1}^{n}P(y_i|\\textbf{w})]+ ln[P(\\textbf{w})]\\}$\n", + "$=argmax_{\\textbf{w}}\\{\\sum\\limits_{1}^{n}ln[P(y_i|\\textbf{w})]+ ln[P(\\textbf{w})]\\}$\n", + "\n", + "Now, according to the above utilisation, we can rewrite $\\sum\\limits_{1}^{n}ln[P(y_i|\\textbf{w})]$ as,\n", + "\n", + "$=\\sum\\limits_{1}^{n}Y_i.ln[P(Y_i=1|X_i,\\textbf{w})]+(1-Y_i).ln[P(Y_i=0|X_i,\\textbf{w})]$\n", + "\n", + "$=\\sum\\limits_{1}^{n}Y_i.ln[\\frac{P(Y_i=1|X_i,\\textbf{w})}{P(Y_i=0|X_i,\\textbf{w})}]+ln[P(Y_i=0|X_i,\\textbf{w})]$\n", + "\n", + "$=\\sum\\limits_{1}^{n}\\{Y_i.(\\textbf{w}^{\\textbf{T}}\\textbf{x}_i+b)-ln(1+e^{(\\textbf{w}^{\\textbf{T}}\\textbf{x}_i+b)})\\}$\n", + "\n", + "Now, let's come to the fact that our model parameters have a gaussian prior.
We will pick the simplest and most convenient zero-mean gaussian with a known variance $\\sigma^2$ for every parameter.\n", + "\n", + "$w_j \\sim N(0,\\sigma^2)$ such that $P(\\textbf{w})=\\prod\\limits_{j=1}^{f}\\frac{1}{\\sigma\\sqrt{2\\pi}}e^{\\{\\frac{{-w_j}^2}{2\\sigma^2}\\}}$\n", + "\n", + "Then, (excluding terms not containing $w_j$),\n", + "\n", + "$ln[P(\\textbf{w})]=-\\frac{1}{2\\sigma^2}\\sum\\limits_{1}^{f}{\\textbf{w}_j}^2$\n", + "\n", + "Putting everything together, our goal is,\n", + "\n", + "$argmin_{\\textbf{w}}\\{-ln[P(y|\\textbf{w})]- ln[P(\\textbf{w})]\\}=argmin_{\\textbf{w}}\\{\\frac{1}{2\\sigma^2}\\sum\\limits_{1}^{f}{\\textbf{w}_j}^2 - \\sum\\limits_{1}^{n}\\{Y_i.(\\textbf{w}^\\textbf{T}.x_i+b)-ln(1+e^{(\\textbf{w}^{\\textbf{T}}\\textbf{x}_i+b)})\\}\\}$\n", + "\n", + "So far, I believe we can do the same sort of thing we've done in the previous question - define each of the functions and finally use Scipy to minimise this negative log-posterior.\n", + "\n", + "We'll have to specify the prior ourselves, so for the sake of simplicity, we can absorb the parameter $b$ into $w$ through an additional constant dimension in $w$ (read in some papers).\n", + "\n", + "So, $w$ has multivariate gaussian prior and once again, for simplicity, let's say it has just two dimensions (we'll also include $b$, so the final dimension of $w$ will be 3). Our goal now looks like minimising the following function,\n", + "\n", + "$=\\sum\\limits_{1}^{f}\\frac{1}{2\\sigma^2}{\\textbf{w}_j}^2 -\\sum\\limits_{1}^{n}\\{Y_i.(\\textbf{w}^{\\textbf{T}}\\textbf{x}_i)-ln(1+e^{(\\textbf{w}^{\\textbf{T}}\\textbf{x}_i)})\\}$\n", + "\n", + "So, let us begin." + ], + "metadata": { + "id": "ugh_Avb9wn9f" + } + }, + { + "cell_type": "code", + "source": [ + "import numpy as np\n", + "from sklearn.datasets import make_classification\n", + "from scipy.optimize import minimize\n", + "\"\"\"\n", + "# first we'll have to generate a dataset, let size be n and number of features (dimension of w) be f.
Here we have just one feature\n", + "n= 1000\n", + "f= 2+1\n", + "x, y = make_classification(n_samples=n,n_features=f,n_redundant=0,n_informative=3,n_repeated=0, random_state=444)\n", + "#print(x.shape)\n", + "#print(y)\n", + "\n", + "# now for w, it's up to us to choose the variance, so let's assume the following\n", + "pmean=0\n", + "pvar=5\n", + "\n", + "def logprior(w,variance):\n", + " logprior = (-1/(2*variance)) * np.sum(np.square(w))\n", + " return logprior\n", + "\"\"\"\n", + "# first we'll have to generate a dataset, let size be n and number of features (dimension of w -1) be f.\n", + "n = 1000\n", + "f = 2\n", + "priormean=0 # for every parameter\n", + "priormeanvector = np.zeros(f + 1) # absorbing parameter b\n", + "priorvariance=5 # for every parameter and they are independent of each other\n", + "priorcovariance = priorvariance * np.eye(f + 1) # absorbing parameter b, covariances are all 0, variance of each is equal to priorvariance\n", + "\n", + "truew = np.random.multivariate_normal(priormeanvector, priorcovariance)\n", + "print(\"True parameters used for data generation :\",truew)\n", + "\n", + "x = np.hstack((np.ones((n, 1)), np.random.uniform(-1, 1, size=(n, f)))) # bias is fixed at 1, generating random features (n of them, each is a vector with 3 elements, first is 1)\n", + "z = np.dot(x, truew)\n", + "p = 1 / (1 + np.exp(-z)) # y=1 has this probability\n", + "y = np.random.binomial(1, p)\n", + "\n", + "def logprior(w,priormean,priorvariance):\n", + " LP = (-1/(2*priorvariance)) * np.sum(np.square(w-priormean))\n", + " return LP\n", + "\n", + "def loglikelihood(w, x, y):\n", + " LL=0\n", + " count=0\n", + " for i in range(n):\n", + " z = np.dot(x[i], w)\n", + " LL+=y[i]*(x[i].dot(w.T))-np.log(1+np.exp(z))\n", + " return LL\n", + "\n", + "\n", + "def neglogposterior(w,priormean,priorvariance,x,y):\n", + " return -logprior(w,priormean,priorvariance)-loglikelihood(w,x,y)\n", + "\n", + "# like before, we need initial parameters\n", + "# 
print(x.shape)\n", + "initialparameters_w=np.array([2,2,2]) # w is a vector after all\n", + "\n", + "result = minimize(neglogposterior, initialparameters_w, args=(priormean,priorvariance,x,y), method='L-BFGS-B')\n", + "westimate=result.x\n", + "print(\"MAP Estimated parameters: \",westimate) # it's pretty close\n", + "\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PB60-rFGx5Mi", + "outputId": "db68ff27-3cdf-4253-f0c9-73a31fbdb19e" + }, + "execution_count": 24, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "True parameters used for data generation : [ 1.20033799 -2.79411209 3.2057022 ]\n", + "MAP Estimated parameters: [ 1.24238212 -3.12122653 3.15600055]\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### __Question 5__\n", + "\n", + "We have been asked to find the VC dimension of some concept classes. To make sure I understood this right, VC dimension for a concept class is simply the maximum number of points that the concept class can create a sort of seperation (shatter) for (given each point belongs to one of two classes and taking all $2^n$ possibilites in mind) such that they're classified correctly in their regions and an important thing to note is that all we need is ONE configuration of these points (and the $2^n$ possibilities of class allocation) which can be shattered by the concept class in order to define the VC dimension for it. Meaning, we don't need every configuration (different locations of points, but any one) to be able to define the VC dimension. Also, I haven't gone through the math intensive article yet.\n", + "\n", + "#### __(a) Constant Function__\n", + "Meaning this is a function that can take any constant value. So, from my understanding, given a set of points which are to be classified into two classes, a constant function can only take either \"positive\" or \"negative\" for all the given points. 
So, if we had 10 points, we could only assign either positive or negative to all the 10 points. If this understanding is correct, then this concept class can only classify 1 point correctly. If we had 2 points then our possibilities (despite their configuration) become,\n", + "\n", + "1. +,-\n", + "\n", + "2. -,+\n", + "\n", + "3. +,+\n", + "\n", + "4. -,-\n", + "\n", + "And in such a case, assigning only positive or only negative value to both points doesn't satisfy all 4 cases (but only one at a time, either case 3 or case 4). So, the VC dimension should be 1.\n", + "\n", + "#### __(b) Linear Function in d dimensions__\n", + "A linear function would generate a line in 2 dimensions, a plane in 3 dimensions and so on. From the reading, a linear classifier has cardinality n+1 (given n is the dimension?). Which makes sense for 2 dimensions as the VC dimension there is 3. Again, I have yet to see the mathematical proof for this one. The VC dimension should be d+1.\n", + "\n", + "#### __(c) Axis aligned rectangle in 2 dimensions__\n", + "Here, we are being restricted by \"axis aligned\". If we have 1,2 or 3 points in space, we can just enclose the points of one class within a large enough axis aligned rectangle. If we had 4 points, we can imagine a configuration where two points are lying sort of diagonally to each other but are close enough in any one dimension to allow us to put them in a narrow axis aligned rectangle. So, we'd still be able to enclose 4 points. In case of 5 points however, even if we have 3 points close together, a different combination of classes would cause difficulty like if we had the centre point of one class (+), the two side points of another class (-) and the other two outward points also of the same class as the centre point (+), then a 2-D axis aligned rectangle can't shatter these points. No other configuration works either. 
So, VC Dimension has to be 4.\n", + "\n", + "#### __(d) Intervals__\n", + "Intervals as in some interval on an infinite line. Now, if we have 3 points, they'd obviously be collinear and then the situation where a negative(or positive) point lies between two positive (or negative) points, would make it impossible to define an interval to shatter them. So, the VC dimension is 2.\n", + "\n", + "\n" + ], + "metadata": { + "id": "ByZ8ATC1re9E" + } + } + ] +} \ No newline at end of file diff --git a/assMath/probStat/Anupriya/Assignment1_Anupriya.pdf b/assMath/probStat/Anupriya/Assignment1_Anupriya.pdf new file mode 100644 index 00000000..7c9f74a4 Binary files /dev/null and b/assMath/probStat/Anupriya/Assignment1_Anupriya.pdf differ diff --git a/assMath/probStat/Anupriya/Assignment1_Anupriya.pdf:Zone.Identifier b/assMath/probStat/Anupriya/Assignment1_Anupriya.pdf:Zone.Identifier new file mode 100644 index 00000000..053d1127 --- /dev/null +++ b/assMath/probStat/Anupriya/Assignment1_Anupriya.pdf:Zone.Identifier @@ -0,0 +1,3 @@ +[ZoneTransfer] +ZoneId=3 +HostUrl=about:internet diff --git a/assMath/probStat/Anupriya/kalmanfilter/README.md b/assMath/probStat/Anupriya/kalmanfilter/README.md new file mode 100644 index 00000000..f0e2c165 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/README.md @@ -0,0 +1,13 @@ +# Assignment +Make changes in deep_sort/kalman_filter_assignment.py +Run test.py code to test your code +Don't try to search for the original code. The assignment is meant to get you more familiar with using statistics in a real-life problem setting, and it's for your own benefit. + +## Dependencies + +The code is compatible with Python 2.7 and 3.
The following dependencies are +needed to run the tracker: + +* NumPy +* sklearn +* OpenCV diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__init__.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__init__.py new file mode 100644 index 00000000..f708a9b2 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__init__.py @@ -0,0 +1 @@ +__version__ = "1.3.2" diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/__init__.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 00000000..be38ef1b Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/__init__.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/__init__.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 00000000..7b79e4f5 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/__init__.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/deepsort_tracker.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/deepsort_tracker.cpython-38.pyc new file mode 100644 index 00000000..46d6f5f5 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/deepsort_tracker.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/deepsort_tracker.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/deepsort_tracker.cpython-39.pyc new file mode 100644 index 00000000..3e912377 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/__pycache__/deepsort_tracker.cpython-39.pyc differ diff --git 
# vim: expandtab:ts=4:sw=4
import numpy as np
import scipy.linalg


"""
Table for the 0.95 quantile of the chi-square distribution with N degrees of
freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
function and used as Mahalanobis gating threshold.
"""
### Don't change
chi2inv95 = {
    1: 3.8415,
    2: 5.9915,
    3: 7.8147,
    4: 9.4877,
    5: 11.070,
    6: 12.592,
    7: 14.067,
    8: 15.507,
    9: 16.919,
}


class KalmanFilter(object):
    """
    A simple Kalman filter for tracking bounding boxes in image space.

    The 8-dimensional state space

        x, y, a, h, vx, vy, va, vh

    contains the bounding box center position (x, y), aspect ratio a, height h,
    and their respective velocities.

    Object motion follows a constant velocity model. The bounding box location
    (x, y, a, h) is taken as direct observation of the state space (linear
    observation model).

    Attributes set by ``__init__``:

    * ``F`` -- 8x8 state transition matrix.
    * ``H`` -- 4x8 measurement matrix (selects x, y, a, h from the state).
    * ``Q`` -- 8x8 process noise covariance (identity).
    * ``R`` -- 4x4 measurement noise covariance (identity).

    """

    def __init__(self):
        dt = 1.0
        ndim = 4  # number of observed components (x, y, a, h)

        # State transition matrix of the constant velocity model: every
        # observed component advances by dt times its velocity, and the
        # velocities themselves are carried over unchanged. There is no
        # control matrix in this model.
        self.F = np.eye(2 * ndim)
        self.F[:ndim, ndim:] = dt * np.eye(ndim)

        # Measurement matrix. It must map the 8-dimensional state onto the
        # 4-dimensional measurement, hence 4x8 (a 4x4 identity cannot be
        # multiplied with the 8-dimensional mean / 8x8 covariance).
        self.H = np.eye(ndim, 2 * ndim)

        # Process noise covariance (covariance extrapolation equation).
        self.Q = np.eye(2 * ndim)

        # Measurement noise covariance.
        self.R = np.eye(ndim)

    def initiate(self, measurement):
        """Create track from unassociated measurement.

        Parameters
        ----------
        measurement : ndarray
            Bounding box coordinates (x, y, a, h) with center position (x, y),
            aspect ratio a, and height h.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector (8 dimensional) and covariance matrix (8x8
            dimensional) of the new track. Unobserved velocities are initialized
            to 0 mean.

        """
        # Float array on purpose: an integer array would silently truncate
        # fractional measurements when they are assigned below.
        mean = np.zeros(self.F.shape[0])
        # Initially x, y, a, h are observed and equal to the measurement; the
        # unobserved velocities (last 4 entries) keep their 0 mean.
        mean[:4] = measurement
        covariance = np.eye(self.F.shape[0])
        return mean, covariance

    def predict(self, mean, covariance):
        """Run Kalman filter prediction step.

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional mean vector of the object state at the previous
            time step.
        covariance : ndarray
            The 8x8 dimensional covariance matrix of the object state at the
            previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted
            state.

        """
        # State extrapolation: x(n+1) = F.x(n)
        mean = self.F.dot(mean)
        # Covariance extrapolation: P(n+1) = F.P(n).F^T + Q
        covariance = self.F.dot(covariance).dot(self.F.T) + self.Q
        return mean, covariance

    def project(self, mean, covariance):
        """Project state distribution to measurement space.

        Parameters
        ----------
        mean : ndarray
            The state's mean vector (8 dimensional array).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).

        Returns
        -------
        (ndarray, ndarray)
            Returns the projected mean and covariance matrix of the given state
            estimate.

        """
        # Measurement equations: z = H.x and S = H.P.H^T + R
        mean = self.H.dot(mean)
        covariance = self.H.dot(covariance).dot(self.H.T) + self.R
        return mean, covariance

    def update(self, mean, covariance, measurement):
        """Run Kalman filter correction step.

        Parameters
        ----------
        mean : ndarray
            The predicted state's mean vector (8 dimensional).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).
        measurement : ndarray
            The 4 dimensional measurement vector (x, y, a, h), where (x, y)
            is the center position, a the aspect ratio, and h the height of the
            bounding box.

        Returns
        -------
        (ndarray, ndarray)
            Returns the measurement-corrected state distribution.

        """
        projected_mean, projected_cov = self.project(mean, covariance)

        # Kalman gain (8x4): K = P.H^T.S^-1 with S = H.P.H^T + R.
        # Solving S^T.K^T = (P.H^T)^T avoids forming the explicit inverse.
        cross_covariance = covariance.dot(self.H.T)
        kalman_gain = np.linalg.solve(projected_cov.T, cross_covariance.T).T

        # Innovation: difference between the measurement and its prediction.
        innovation = np.asarray(measurement, dtype=float) - projected_mean

        # State update: x(n,n) = x(n,n-1) + K.(z - H.x(n,n-1))
        new_mean = mean + kalman_gain.dot(innovation)

        # Covariance update in the longer (Joseph) form, preferred because the
        # short form is numerically unstable:
        #   P(n,n) = (I - K.H).P(n,n-1).(I - K.H)^T + K.R.K^T
        # Note the order K.H (8x8), not H.K.
        i_minus_kh = np.eye(self.F.shape[0]) - kalman_gain.dot(self.H)
        new_covariance = (
            i_minus_kh.dot(covariance).dot(i_minus_kh.T)
            + kalman_gain.dot(self.R).dot(kalman_gain.T)
        )

        return new_mean, new_covariance

    def gating_distance(self, mean, covariance, measurements, only_position=False):
        """Compute gating distance between state distribution and measurements.

        A suitable distance threshold can be obtained from `chi2inv95`. If
        `only_position` is False, the chi-square distribution has 4 degrees of
        freedom, otherwise 2.

        Parameters
        ----------
        mean : ndarray
            Mean vector over the state distribution (8 dimensional).
        covariance : ndarray
            Covariance of the state distribution (8x8 dimensional).
        measurements : ndarray
            An Nx4 dimensional matrix of N measurements, each in
            format (x, y, a, h) where (x, y) is the bounding box center
            position, a the aspect ratio, and h the height.
        only_position : Optional[bool]
            If True, distance computation is done with respect to the bounding
            box center position only.

        Returns
        -------
        ndarray
            Returns an array of length N, where the i-th element contains the
            squared Mahalanobis distance between (mean, covariance) and
            `measurements[i]`.

        """
        ### Don't change anything
        mean, covariance = self.project(mean, covariance)
        if only_position:
            mean, covariance = mean[:2], covariance[:2, :2]
            measurements = measurements[:, :2]

        cholesky_factor = np.linalg.cholesky(covariance)
        d = measurements - mean
        z = scipy.linalg.solve_triangular(
            cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True
        )
        squared_maha = np.sum(z * z, axis=0)
        return squared_maha
b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/detection.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/detection.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/detection.cpython-39.pyc new file mode 100644 index 00000000..47cf7d13 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/detection.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/iou_matching.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/iou_matching.cpython-38.pyc new file mode 100644 index 00000000..fd6c8b67 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/iou_matching.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/iou_matching.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/iou_matching.cpython-39.pyc new file mode 100644 index 00000000..aa1759b4 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/iou_matching.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/kalman_filter.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/kalman_filter.cpython-38.pyc new file mode 100644 index 00000000..bd5aa79c Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/kalman_filter.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/kalman_filter.cpython-39.pyc 
b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/kalman_filter.cpython-39.pyc new file mode 100644 index 00000000..a4bf57fd Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/kalman_filter.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/linear_assignment.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/linear_assignment.cpython-38.pyc new file mode 100644 index 00000000..c19b0f97 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/linear_assignment.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/linear_assignment.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/linear_assignment.cpython-39.pyc new file mode 100644 index 00000000..10fa3c9b Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/linear_assignment.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/nn_matching.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/nn_matching.cpython-38.pyc new file mode 100644 index 00000000..7c209fd1 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/nn_matching.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/nn_matching.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/nn_matching.cpython-39.pyc new file mode 100644 index 00000000..424d4863 Binary files /dev/null and 
b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/nn_matching.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/track.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/track.cpython-38.pyc new file mode 100644 index 00000000..3f0ab52e Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/track.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/track.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/track.cpython-39.pyc new file mode 100644 index 00000000..b08ecbb0 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/track.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/tracker.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/tracker.cpython-38.pyc new file mode 100644 index 00000000..a33c461c Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/tracker.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/tracker.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/tracker.cpython-39.pyc new file mode 100644 index 00000000..519dcc76 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/__pycache__/tracker.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/detection.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/detection.py new file mode 100644 index 00000000..30c0a52e --- /dev/null +++ 
class Detection(object):
    """
    This class represents a bounding box detection in a single image.

    Parameters
    ----------
    ltwh : array_like
        Bounding box in format `(top left x, top left y, width, height)`.
    confidence : float
        Detector confidence score.
    feature : array_like | None
        A feature vector that describes the object contained in this image,
        or None when no appearance feature is available.
    class_name : Optional str
        Detector predicted class name.
    instance_mask : Optional
        Instance mask corresponding to bounding box.
    others : Optional any
        Other supplementary fields associated with the detection, stored
        unchanged on the instance.

    Attributes
    ----------
    ltwh : ndarray
        Bounding box in format `(top left x, top left y, width, height)`,
        stored as float32.
    confidence : float
        Detector confidence score.
    feature : ndarray | NoneType
        The feature vector as float32, or None if none was supplied.

    """

    def __init__(self, ltwh, confidence, feature, class_name=None, instance_mask=None, others=None):
        self.ltwh = np.asarray(ltwh, dtype=np.float32)
        self.confidence = float(confidence)
        # np.asarray(None, dtype=np.float32) yields a 0-d NaN array, which
        # would defeat any later `feature is not None` check; keep None as
        # None so the attribute matches its documented type.
        if feature is None:
            self.feature = None
        else:
            self.feature = np.asarray(feature, dtype=np.float32)
        self.class_name = class_name
        self.instance_mask = instance_mask
        self.others = others

    def get_ltwh(self):
        """Return a copy of the box as `(top left x, top left y, width, height)`."""
        return self.ltwh.copy()

    def to_tlbr(self):
        """Convert bounding box to format `(min x, min y, max x, max y)`, i.e.,
        `(top left, bottom right)`.
        """
        ret = self.ltwh.copy()
        # bottom-right corner = top-left corner + (width, height)
        ret[2:] += ret[:2]
        return ret

    def to_xyah(self):
        """Convert bounding box to format `(center x, center y, aspect ratio,
        height)`, where the aspect ratio is `width / height`.
        """
        ret = self.ltwh.copy()
        # center = top-left corner + half of (width, height)
        ret[:2] += ret[2:] / 2
        ret[2] /= ret[3]
        return ret
+ detections : List[deep_sort.detection.Detection] + A list of detections. + track_indices : Optional[List[int]] + A list of indices to tracks that should be matched. Defaults to + all `tracks`. + detection_indices : Optional[List[int]] + A list of indices to detections that should be matched. Defaults + to all `detections`. + + Returns + ------- + ndarray + Returns a cost matrix of shape + len(track_indices), len(detection_indices) where entry (i, j) is + `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`. + + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + cost_matrix = np.zeros((len(track_indices), len(detection_indices))) + candidates = np.asarray([detections[i].ltwh for i in detection_indices]) + + for row, track_idx in enumerate(track_indices): + if tracks[track_idx].time_since_update > 1: + cost_matrix[row, :] = linear_assignment.INFTY_COST + continue + + bbox = tracks[track_idx].to_ltwh() + cost_matrix[row, :] = 1.0 - iou(bbox, candidates) + return cost_matrix diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/kalman_filter.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/kalman_filter.py new file mode 100644 index 00000000..7741d00a --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/kalman_filter.py @@ -0,0 +1,245 @@ +# vim: expandtab:ts=4:sw=4 +import numpy as np +import scipy.linalg + + +""" +Table for the 0.95 quantile of the chi-square distribution with N degrees of +freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv +function and used as Mahalanobis gating threshold. 
class KalmanFilter(object):
    """
    A simple Kalman filter for tracking bounding boxes in image space.

    The 8-dimensional state space

        x, y, a, h, vx, vy, va, vh

    contains the bounding box center position (x, y), aspect ratio a, height h,
    and their respective velocities.

    Object motion follows a constant velocity model. The bounding box location
    (x, y, a, h) is taken as direct observation of the state space (linear
    observation model).

    Parameters
    ----------
    dt : float, optional
        Time step used in the constant-velocity transition matrix.
        Defaults to 1.0.

    Attributes
    ----------
    F : ndarray
        8x8 state transition matrix.
    H : ndarray
        4x8 measurement matrix that selects (x, y, a, h) from the state.
    Q : ndarray
        8x8 process noise covariance (identity).
    R : ndarray
        4x4 measurement noise covariance (identity).

    """

    def __init__(self, dt=1.0):
        ndim = 4

        # State transition for a constant velocity model:
        # position' = position + dt * velocity, velocity' = velocity.
        self.F = np.eye(2 * ndim)
        self.F[:ndim, ndim:] = dt * np.eye(ndim)

        # No control matrix since this is a constant velocity model.
        # The measurement matrix must map the 8-dimensional state onto the
        # 4-dimensional measurement, so it is [I | 0] with shape 4x8 (a 4x4
        # matrix cannot be multiplied with an 8-dimensional state).
        self.H = np.eye(ndim, 2 * ndim)

        # Process noise covariance (unit noise on every state component).
        self.Q = np.eye(2 * ndim)
        # Measurement noise covariance (unit noise on every measured value).
        self.R = np.eye(ndim)

    def initiate(self, measurement):
        """Create track from unassociated measurement.

        Parameters
        ----------
        measurement : ndarray
            Bounding box coordinates (x, y, a, h) with center position (x, y),
            aspect ratio a, and height h.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector (8 dimensional) and covariance matrix (8x8
            dimensional) of the new track. Unobserved velocities are initialized
            to 0 mean.

        """
        # Use a float vector: an integer array would silently truncate
        # fractional measurements such as the aspect ratio.
        mean = np.zeros(8)
        mean[:4] = measurement
        covariance = np.eye(8)
        return mean, covariance

    def predict(self, mean, covariance):
        """Run Kalman filter prediction step.

        Parameters
        ----------
        mean : ndarray
            The 8 dimensional mean vector of the object state at the previous
            time step.
        covariance : ndarray
            The 8x8 dimensional covariance matrix of the object state at the
            previous time step.

        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted
            state.

        """
        # State extrapolation: x(n+1) = F.x(n)
        mean = self.F.dot(mean)
        # Covariance extrapolation: P(n+1) = F.P(n).F^T + Q
        covariance = self.F.dot(covariance).dot(self.F.T) + self.Q
        return mean, covariance

    def project(self, mean, covariance):
        """Project state distribution to measurement space.

        Parameters
        ----------
        mean : ndarray
            The state's mean vector (8 dimensional array).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).

        Returns
        -------
        (ndarray, ndarray)
            Returns the projected mean (4 dimensional) and covariance matrix
            (4x4 dimensional) of the given state estimate.

        """
        # Measurement equations: z = H.x and S = H.P.H^T + R
        mean = self.H.dot(mean)
        covariance = self.H.dot(covariance).dot(self.H.T) + self.R
        return mean, covariance

    def update(self, mean, covariance, measurement):
        """Run Kalman filter correction step.

        Parameters
        ----------
        mean : ndarray
            The predicted state's mean vector (8 dimensional).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).
        measurement : ndarray
            The 4 dimensional measurement vector (x, y, a, h), where (x, y)
            is the center position, a the aspect ratio, and h the height of the
            bounding box.

        Returns
        -------
        (ndarray, ndarray)
            Returns the measurement-corrected state distribution.

        """
        projected_mean, innovation_covariance = self.project(mean, covariance)

        # Kalman gain (8x4): K(n) = P(n,n-1).H^T.(H.P(n,n-1).H^T + R)^-1
        kalman_gain = covariance.dot(self.H.T).dot(
            np.linalg.inv(innovation_covariance)
        )

        # Innovation: difference between the measurement and its prediction.
        innovation = measurement - projected_mean

        # State update: x(n,n) = x(n,n-1) + K(n).(z(n) - H.x(n,n-1))
        new_mean = mean + kalman_gain.dot(innovation)

        # Covariance update in the longer (Joseph) form, chosen by the author
        # because the short form is numerically unstable:
        # P(n,n) = (I - K.H).P(n,n-1).(I - K.H)^T + K.R.K^T
        # The identity is 8x8 because K.H acts on the state space.
        state_correction = np.eye(covariance.shape[0]) - kalman_gain.dot(self.H)
        new_covariance = state_correction.dot(covariance).dot(
            state_correction.T
        ) + kalman_gain.dot(self.R).dot(kalman_gain.T)

        return new_mean, new_covariance

    def gating_distance(self, mean, covariance, measurements, only_position=False):
        """Compute gating distance between state distribution and measurements.

        A suitable distance threshold can be obtained from `chi2inv95`. If
        `only_position` is False, the chi-square distribution has 4 degrees of
        freedom, otherwise 2.

        Parameters
        ----------
        mean : ndarray
            Mean vector over the state distribution (8 dimensional).
        covariance : ndarray
            Covariance of the state distribution (8x8 dimensional).
        measurements : ndarray
            An Nx4 dimensional matrix of N measurements, each in
            format (x, y, a, h) where (x, y) is the bounding box center
            position, a the aspect ratio, and h the height.
        only_position : Optional[bool]
            If True, distance computation is done with respect to the bounding
            box center position only.

        Returns
        -------
        ndarray
            Returns an array of length N, where the i-th element contains the
            squared Mahalanobis distance between (mean, covariance) and
            `measurements[i]`.

        """
        ### Don't change anything
        mean, covariance = self.project(mean, covariance)
        if only_position:
            mean, covariance = mean[:2], covariance[:2, :2]
            measurements = measurements[:, :2]

        cholesky_factor = np.linalg.cholesky(covariance)
        d = measurements - mean
        z = scipy.linalg.solve_triangular(
            cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True
        )
        squared_maha = np.sum(z * z, axis=0)
        return squared_maha
+ + Parameters + ---------- + distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray + The distance metric is given a list of tracks and detections as well as + a list of N track indices and M detection indices. The metric should + return the NxM dimensional cost matrix, where element (i, j) is the + association cost between the i-th track in the given track indices and + the j-th detection in the given detection_indices. + max_distance : float + Gating threshold. Associations with cost larger than this value are + disregarded. + tracks : List[track.Track] + A list of predicted tracks at the current time step. + detections : List[detection.Detection] + A list of detections at the current time step. + track_indices : List[int] + List of track indices that maps rows in `cost_matrix` to tracks in + `tracks` (see description above). + detection_indices : List[int] + List of detection indices that maps columns in `cost_matrix` to + detections in `detections` (see description above). + + Returns + ------- + (List[(int, int)], List[int], List[int]) + Returns a tuple with the following three entries: + * A list of matched track and detection indices. + * A list of unmatched track indices. + * A list of unmatched detection indices. + + """ + if track_indices is None: + track_indices = np.arange(len(tracks)) + if detection_indices is None: + detection_indices = np.arange(len(detections)) + + if len(detection_indices) == 0 or len(track_indices) == 0: + return [], track_indices, detection_indices # Nothing to match. 
def matching_cascade(
    distance_metric,
    max_distance,
    cascade_depth,
    tracks,
    detections,
    track_indices=None,
    detection_indices=None,
):
    """Run matching cascade.

    Tracks are matched in order of increasing `time_since_update`: level 0
    handles tracks with `time_since_update == 1`, level 1 those with 2, and
    so on. Each level only sees the detections left unmatched by the levels
    before it.

    Parameters
    ----------
    distance_metric : Callable[List[Track], List[Detection], List[int], List[int]) -> ndarray
        The distance metric is given a list of tracks and detections as well as
        a list of N track indices and M detection indices. The metric should
        return the NxM dimensional cost matrix, where element (i, j) is the
        association cost between the i-th track in the given track indices and
        the j-th detection in the given detection indices.
    max_distance : float
        Gating threshold. Associations with cost larger than this value are
        disregarded.
    cascade_depth: int
        The cascade depth, should be set to the maximum track age.
    tracks : List[track.Track]
        A list of predicted tracks at the current time step.
    detections : List[detection.Detection]
        A list of detections at the current time step.
    track_indices : Optional[List[int]]
        Indices into `tracks` that take part in the matching. Defaults to all
        tracks.
    detection_indices : Optional[List[int]]
        Indices into `detections` that take part in the matching. Defaults to
        all detections.

    Returns
    -------
    (List[(int, int)], List[int], List[int])
        Returns a tuple with the following three entries:
        * A list of matched track and detection indices.
        * A list of unmatched track indices.
        * A list of unmatched detection indices.

    """
    if track_indices is None:
        track_indices = list(range(len(tracks)))
    if detection_indices is None:
        detection_indices = list(range(len(detections)))

    matches = []
    remaining_detections = detection_indices
    for level in range(cascade_depth):
        # Stop as soon as every detection has been claimed.
        if len(remaining_detections) == 0:
            break

        # Tracks whose last update lies exactly `level + 1` frames back.
        level_tracks = [
            idx for idx in track_indices if tracks[idx].time_since_update == 1 + level
        ]
        if len(level_tracks) == 0:
            continue

        level_matches, _, remaining_detections = min_cost_matching(
            distance_metric,
            max_distance,
            tracks,
            detections,
            level_tracks,
            remaining_detections,
        )
        matches += level_matches

    matched_tracks = {track_idx for track_idx, _ in matches}
    unmatched_tracks = list(set(track_indices) - matched_tracks)
    return matches, unmatched_tracks, remaining_detections
+ cost_matrix : ndarray + The NxM dimensional cost matrix, where N is the number of track indices + and M is the number of detection indices, such that entry (i, j) is the + association cost between `tracks[track_indices[i]]` and + `detections[detection_indices[j]]`. + tracks : List[track.Track] + A list of predicted tracks at the current time step. + detections : List[detection.Detection] + A list of detections at the current time step. + track_indices : List[int] + List of track indices that maps rows in `cost_matrix` to tracks in + `tracks` (see description above). + detection_indices : List[int] + List of detection indices that maps columns in `cost_matrix` to + detections in `detections` (see description above). + gated_cost : Optional[float] + Entries in the cost matrix corresponding to infeasible associations are + set this value. Defaults to a very large value. + only_position : Optional[bool] + If True, only the x, y position of the state distribution is considered + during gating. Defaults to False. + + Returns + ------- + ndarray + Returns the modified cost matrix. 
+ + """ + gating_dim = 2 if only_position else 4 + gating_threshold = kalman_filter.chi2inv95[gating_dim] + measurements = np.asarray([detections[i].to_xyah() for i in detection_indices]) + for row, track_idx in enumerate(track_indices): + track = tracks[track_idx] + gating_distance = kf.gating_distance( + track.mean, track.covariance, measurements, only_position + ) + cost_matrix[row, gating_distance > gating_threshold] = gated_cost + return cost_matrix diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/nn_matching.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/nn_matching.py new file mode 100644 index 00000000..df6445ea --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/nn_matching.py @@ -0,0 +1,175 @@ +# vim: expandtab:ts=4:sw=4 +import numpy as np + + +def _pdist(a, b): + """Compute pair-wise squared distance between points in `a` and `b`. + + Parameters + ---------- + a : array_like + An NxM matrix of N samples of dimensionality M. + b : array_like + An LxM matrix of L samples of dimensionality M. + + Returns + ------- + ndarray + Returns a matrix of size len(a), len(b) such that eleement (i, j) + contains the squared distance between `a[i]` and `b[j]`. + + """ + a, b = np.asarray(a), np.asarray(b) + if len(a) == 0 or len(b) == 0: + return np.zeros((len(a), len(b))) + a2, b2 = np.square(a).sum(axis=1), np.square(b).sum(axis=1) + r2 = -2.0 * np.dot(a, b.T) + a2[:, None] + b2[None, :] + r2 = np.clip(r2, 0.0, float(np.inf)) + return r2 + + +def _cosine_distance(a, b, data_is_normalized=False): + """Compute pair-wise cosine distance between points in `a` and `b`. + + Parameters + ---------- + a : array_like + An NxM matrix of N samples of dimensionality M. + b : array_like + An LxM matrix of L samples of dimensionality M. + data_is_normalized : Optional[bool] + If True, assumes rows in a and b are unit length vectors. 
class NearestNeighborDistanceMetric(object):
    """
    A nearest neighbor distance metric that, for each target, returns
    the closest distance to any sample that has been observed so far.

    Parameters
    ----------
    metric : str
        Either "euclidean" or "cosine".
    matching_threshold: float
        The matching threshold. Samples with larger distance are considered an
        invalid match.
    budget : Optional[int]
        If not None, fix samples per class to at most this number. Removes
        the oldest samples when the budget is reached.

    Attributes
    ----------
    samples : Dict[int -> List[ndarray]]
        A dictionary that maps from target identities to the list of samples
        that have been observed so far.

    """

    def __init__(self, metric, matching_threshold, budget=None):
        # Reject unknown metric names before touching any state.
        if metric not in ("euclidean", "cosine"):
            raise ValueError("Invalid metric; must be either 'euclidean' or 'cosine'")
        if metric == "euclidean":
            self._metric = _nn_euclidean_distance
        else:
            self._metric = _nn_cosine_distance
        self.matching_threshold = matching_threshold
        self.budget = budget
        self.samples = {}

    def partial_fit(self, features, targets, active_targets):
        """Update the distance metric with new data.

        Parameters
        ----------
        features : ndarray
            An NxM matrix of N features of dimensionality M.
        targets : ndarray
            An integer array of associated target identities.
        active_targets : List[int]
            A list of targets that are currently present in the scene.

        """
        for feature, target in zip(features, targets):
            history = self.samples.setdefault(target, [])
            history.append(feature)
            if self.budget is not None:
                # Keep only the most recent `budget` samples for this target.
                self.samples[target] = history[-self.budget :]
        # Drop every target that is no longer active.
        self.samples = {target: self.samples[target] for target in active_targets}

    def distance(self, features, targets):
        """Compute distance between features and targets.

        Parameters
        ----------
        features : ndarray
            An NxM matrix of N features of dimensionality M.
        targets : List[int]
            A list of targets to match the given `features` against.

        Returns
        -------
        ndarray
            Returns a cost matrix of shape len(targets), len(features), where
            row i holds the value of the configured metric between the stored
            samples of `targets[i]` and every entry of `features`.

        """
        n_targets, n_features = len(targets), len(features)
        cost_matrix = np.zeros((n_targets, n_features))
        for row, target in zip(cost_matrix, targets):
            # `row` is a view into cost_matrix, so this fills it in place.
            row[:] = self._metric(self.samples[target], features)
        return cost_matrix
+ original_ltwh : Optional List + Bounding box associated with matched detection + det_class : Optional str + Classname of matched detection + det_conf : Optional float + Confidence associated with matched detection + instance_mask : Optional + Instance mask associated with matched detection + others : Optional any + Any supplementary fields related to matched detection + + Attributes + ---------- + mean : ndarray + Mean vector of the initial state distribution. + covariance : ndarray + Covariance matrix of the initial state distribution. + track_id : int + A unique track identifier. + hits : int + Total number of measurement updates. + age : int + Total number of frames since first occurrence. + time_since_update : int + Total number of frames since last measurement update. + state : TrackState + The current track state. + features : List[ndarray] + A cache of features. On each measurement update, the associated feature + vector is added to this list. + + """ + + def __init__( + self, + mean, + covariance, + track_id, + n_init, + max_age, + feature=None, + original_ltwh=None, + det_class=None, + det_conf=None, + instance_mask=None, + others=None, + ): + self.mean = mean + self.covariance = covariance + self.track_id = track_id + self.hits = 1 + self.age = 1 + self.time_since_update = 0 + + self.state = TrackState.Tentative + self.features = [] + self.latest_feature = None + if feature is not None: + self.features.append(feature) + self.latest_feature = feature + + + self._n_init = n_init + self._max_age = max_age + + self.original_ltwh = original_ltwh + self.det_class = det_class + self.det_conf = det_conf + self.instance_mask = instance_mask + self.others = others + + def to_tlwh(self, orig=False, orig_strict=False): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. This function is POORLY NAMED. But we are keeping the way it works the way it works in order not to break any older libraries that depend on this. 
+ + Returns + ------- + ndarray + The KF-predicted bounding box by default. + If `orig` is True and track is matched to a detection this round, then the original det is returned. + """ + return self.to_ltwh(orig=orig, orig_strict=orig_strict) + + def to_ltwh(self, orig=False, orig_strict=False): + """Get current position in bounding box format `(top left x, top left y, + width, height)`. + + Params + ------ + orig : bool + To use original detection (True) or KF predicted (False). Only works for original dets that are horizontal BBs. + orig_strict: bool + Only relevant when orig is True. If orig_strict is True, it ONLY outputs original bbs and will not output kalman mean even if original bb is not available. + + Returns + ------- + ndarray + The KF-predicted bounding box by default. + If `orig` is True and track is matched to a detection this round, then the original det is returned. + + """ + if orig: + if self.original_ltwh is None: + if orig_strict: + return None + # else if not orig_strict, return kalman means below + else: + return self.original_ltwh.copy() + + ret = self.mean[:4].copy() + ret[2] *= ret[3] + ret[:2] -= ret[2:] / 2 + return ret + + def to_tlbr(self, orig=False, orig_strict=False): + """Get current position in bounding box format `(min x, miny, max x, + max y)`. This original function is POORLY NAMED. But we are keeping the way it works the way it works in order not to break any older projects that depend on this. + USE THIS AT YOUR OWN RISK. LIESSSSSSSSSS! + Returns LIES + ------- + ndarray + The KF-predicted bounding box by default. + If `orig` is True and track is matched to a detection this round, then the original det is returned. + """ + return self.to_ltrb(orig=orig, orig_strict=orig_strict) + + def to_ltrb(self, orig=False, orig_strict=False): + """Get current position in bounding box format `(min x, miny, max x, + max y)`. + + Params + ------ + orig : bool + To use original detection (True) or KF predicted (False). 
Only works for original dets that are horizontal BBs. + + Returns + ------- + ndarray + The KF-predicted bounding box by default. + If `orig` is True and track is matched to a detection this round, then the original det is returned. + """ + ret = self.to_ltwh(orig=orig, orig_strict=orig_strict) + if ret is not None: + ret[2:] = ret[:2] + ret[2:] + return ret + + def get_det_conf(self): + """ + `det_conf` will be None is there are no associated detection this round + """ + return self.det_conf + + def get_det_class(self): + """ + Only `det_class` will be persisted in the track even if there are no associated detection this round. + """ + return self.det_class + + def get_instance_mask(self): + ''' + Get instance mask associated with detection. Will be None is there are no associated detection this round + ''' + return self.instance_mask + + def get_det_supplementary(self): + """ + Get supplementary info associated with the detection. Will be None is there are no associated detection this round. + """ + return self.others + + def get_feature(self): + ''' + Get latest appearance feature + ''' + return self.latest_feature + + def predict(self, kf): + """Propagate the state distribution to the current time step using a + Kalman filter prediction step. + + Parameters + ---------- + kf : kalman_filter.KalmanFilter + The Kalman filter. + + """ + self.mean, self.covariance = kf.predict(self.mean, self.covariance) + self.age += 1 + self.time_since_update += 1 + self.original_ltwh = None + self.det_conf = None + self.instance_mask = None + self.others = None + + def update(self, kf, detection): + """Perform Kalman filter measurement update step and update the feature + cache. + + Parameters + ---------- + kf : kalman_filter.KalmanFilter + The Kalman filter. + detection : Detection + The associated detection. 
+ + """ + self.original_ltwh = detection.get_ltwh() + self.mean, self.covariance = kf.update( + self.mean, self.covariance, detection.to_xyah() + ) + self.features.append(detection.feature) + self.latest_feature = detection.feature + self.det_conf = detection.confidence + self.det_class = detection.class_name + self.instance_mask = detection.instance_mask + self.others = detection.others + + self.hits += 1 + + self.time_since_update = 0 + if self.state == TrackState.Tentative and self.hits >= self._n_init: + self.state = TrackState.Confirmed + + def mark_missed(self): + """Mark this track as missed (no association at the current time step).""" + if self.state == TrackState.Tentative: + self.state = TrackState.Deleted + elif self.time_since_update > self._max_age: + self.state = TrackState.Deleted + + def is_tentative(self): + """Returns True if this track is tentative (unconfirmed).""" + return self.state == TrackState.Tentative + + def is_confirmed(self): + """Returns True if this track is confirmed.""" + return self.state == TrackState.Confirmed + + def is_deleted(self): + """Returns True if this track is dead and should be deleted.""" + return self.state == TrackState.Deleted diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/tracker.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/tracker.py new file mode 100644 index 00000000..e6f7c7e1 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/deep_sort/tracker.py @@ -0,0 +1,211 @@ +# vim: expandtab:ts=4:sw=4 +from __future__ import absolute_import +from datetime import datetime +import numpy as np +from . import kalman_filter +from . import linear_assignment +from . import iou_matching +from .track import Track + + +class Tracker: + """ + This is the multi-target tracker. + + Parameters + ---------- + metric : nn_matching.NearestNeighborDistanceMetric + A distance metric for measurement-to-track association. 
class Tracker:
    """
    This is the multi-target tracker.

    Parameters
    ----------
    metric : nn_matching.NearestNeighborDistanceMetric
        A distance metric for measurement-to-track association.
    max_iou_distance : float
        Threshold handed to the IoU-based association stage.
    max_age : int
        Maximum number of consecutive misses before a track is deleted.
    n_init : int
        Number of consecutive detections before the track is confirmed. The
        track state is set to `Deleted` if a miss occurs within the first
        `n_init` frames.
    override_track_class : Optional[type]
        If given, used instead of `Track` when new tracks are created.
    today: Optional[datetime.date]
        Provide today's date, for naming of tracks
    gating_only_position : Optional[bool]
        Used during gating, comparing KF predicted and measured states. If True, only the x, y position of the state distribution is considered during gating. Defaults to False, where x,y, aspect ratio and height will be considered.

    Attributes
    ----------
    metric : nn_matching.NearestNeighborDistanceMetric
        The distance metric used for measurement to track association.
    max_age : int
        Maximum number of consecutive misses before a track is deleted.
    n_init : int
        Number of frames that a track remains in initialization phase.
    kf : kalman_filter.KalmanFilter
        A Kalman filter to filter target trajectories in image space.
    tracks : List[Track]
        The list of active tracks at the current time step.
    del_tracks_ids : List
        Identifiers of the tracks removed by the most recent `update`.
    """

    def __init__(
        self,
        metric,
        max_iou_distance=0.7,
        max_age=30,
        n_init=3,
        override_track_class=None,
        today=None,
        gating_only_position=False,
    ):
        self.today = today
        self.metric = metric
        self.max_iou_distance = max_iou_distance
        self.max_age = max_age
        self.n_init = n_init
        self.gating_only_position = gating_only_position

        self.kf = kalman_filter.KalmanFilter()
        self.tracks = []
        self.del_tracks_ids = []
        self._next_id = 1
        self.track_class = override_track_class if override_track_class else Track

    def predict(self):
        """Propagate track state distributions one time step forward.

        This function should be called once every time step, before `update`.
        """
        for track in self.tracks:
            track.predict(self.kf)

    def update(self, detections, today=None):
        """Perform measurement update and track management.

        Parameters
        ----------
        detections : List[deep_sort.detection.Detection]
            A list of detections at the current time step.
        today: Optional[datetime.date]
            Provide today's date, for naming of tracks. Only consulted when
            the tracker was constructed with a date; otherwise it is ignored.
        """
        if self.today:
            if today is None:
                today = datetime.now().date()
            # On a new day, restart the running track index.
            if today != self.today:
                self.today = today
                self._next_id = 1

        # Run matching cascade.
        matches, unmatched_tracks, unmatched_detections = self._match(detections)

        # Update track set.
        for track_idx, detection_idx in matches:
            self.tracks[track_idx].update(self.kf, detections[detection_idx])
        for track_idx in unmatched_tracks:
            self.tracks[track_idx].mark_missed()
        for detection_idx in unmatched_detections:
            self._initiate_track(detections[detection_idx])

        # Remember which tracks were dropped this step, then drop them.
        self.del_tracks_ids = [t.track_id for t in self.tracks if t.is_deleted()]
        self.tracks = [t for t in self.tracks if not t.is_deleted()]

        # Update distance metric with the features gathered by confirmed
        # tracks; each track's cache is emptied once handed over.
        active_targets = [t.track_id for t in self.tracks if t.is_confirmed()]
        features, targets = [], []
        for track in self.tracks:
            if not track.is_confirmed():
                continue
            features += track.features
            targets += [track.track_id] * len(track.features)
            track.features = []
        self.metric.partial_fit(
            np.asarray(features), np.asarray(targets), active_targets
        )

    def _match(self, detections):
        """Associate detections with tracks.

        Returns
        -------
        (matches, unmatched_tracks, unmatched_detections)
            `matches` holds (track index, detection index) pairs; the other
            two are lists of indices into `self.tracks` / `detections`.
        """

        def gated_metric(tracks, dets, track_indices, detection_indices):
            features = np.array([dets[i].feature for i in detection_indices])
            targets = np.array([tracks[i].track_id for i in track_indices])
            cost_matrix = self.metric.distance(features, targets)
            cost_matrix = linear_assignment.gate_cost_matrix(
                self.kf,
                cost_matrix,
                tracks,
                dets,
                track_indices,
                detection_indices,
                only_position=self.gating_only_position,
            )
            return cost_matrix

        # Split track set into confirmed and unconfirmed tracks.
        confirmed_tracks = [i for i, t in enumerate(self.tracks) if t.is_confirmed()]
        unconfirmed_tracks = [
            i for i, t in enumerate(self.tracks) if not t.is_confirmed()
        ]

        # Associate confirmed tracks using appearance features.
        (
            matches_a,
            unmatched_tracks_a,
            unmatched_detections,
        ) = linear_assignment.matching_cascade(
            gated_metric,
            self.metric.matching_threshold,
            self.max_age,
            self.tracks,
            detections,
            confirmed_tracks,
        )

        # Associate remaining tracks together with unconfirmed tracks using IOU.
        # Only confirmed tracks missed for exactly one step get this second chance.
        iou_track_candidates = unconfirmed_tracks + [
            k for k in unmatched_tracks_a if self.tracks[k].time_since_update == 1
        ]
        unmatched_tracks_a = [
            k for k in unmatched_tracks_a if self.tracks[k].time_since_update != 1
        ]
        (
            matches_b,
            unmatched_tracks_b,
            unmatched_detections,
        ) = linear_assignment.min_cost_matching(
            iou_matching.iou_cost,
            self.max_iou_distance,
            self.tracks,
            detections,
            iou_track_candidates,
            unmatched_detections,
        )

        matches = matches_a + matches_b
        unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b))
        return matches, unmatched_tracks, unmatched_detections

    def _initiate_track(self, detection):
        """Create a new track from `detection` and append it to `self.tracks`.

        The identifier is the running index rendered as a string, prefixed
        with `self.today` and an underscore when a date is set.
        """
        mean, covariance = self.kf.initiate(detection.to_xyah())

        if self.today:
            track_id = "{}_{}".format(self.today, self._next_id)
        else:
            track_id = "{}".format(self._next_id)
        self.tracks.append(
            self.track_class(
                mean,
                covariance,
                track_id,
                self.n_init,
                self.max_age,
                feature=detection.feature,
                original_ltwh=detection.get_ltwh(),
                det_class=detection.class_name,
                det_conf=detection.confidence,
                instance_mask=detection.instance_mask,
                others=detection.others,
            )
        )
        self._next_id += 1

    def delete_all_tracks(self):
        """Drop every track and restart the running track index at 1."""
        self.tracks = []
        self._next_id = 1
logger = logging.getLogger(__name__)

EMBEDDER_CHOICES = [
    "mobilenet",
    "torchreid",
    "clip_RN50",
    "clip_RN101",
    "clip_RN50x4",
    "clip_RN50x16",
    "clip_ViT-B/32",
    "clip_ViT-B/16",
]


class DeepSort(object):
    """Front end that turns raw detections into `Detection` objects
    (embedding them if needed) and feeds them to a `Tracker`."""

    def __init__(
        self,
        max_iou_distance=0.7,
        max_age=30,
        n_init=3,
        nms_max_overlap=1.0,
        max_cosine_distance=0.2,
        nn_budget=None,
        gating_only_position=False,
        override_track_class=None,
        embedder="mobilenet",
        half=True,
        bgr=True,
        embedder_gpu=True,
        embedder_model_name=None,
        embedder_wts=None,
        polygon=False,
        today=None,
    ):
        """

        Parameters
        ----------
        max_iou_distance : Optional[float] = 0.7
            Gating threshold on IoU. Associations with cost larger than this value are
            disregarded. Argument for deep_sort_realtime.deep_sort.tracker.Tracker.
        max_age : Optional[int] = 30
            Maximum number of consecutive misses before a track is deleted. Argument for deep_sort_realtime.deep_sort.tracker.Tracker.
        n_init : int
            Number of frames that a track remains in initialization phase. Defaults to 3. Argument for deep_sort_realtime.deep_sort.tracker.Tracker.
        nms_max_overlap : Optional[float] = 1.0
            Non-maxima suppression threshold: Maximum detection overlap, if is 1.0, nms will be disabled
        max_cosine_distance : Optional[float] = 0.2
            Gating threshold for cosine distance
        nn_budget :  Optional[int] = None
            Maximum size of the appearance descriptors, if None, no budget is enforced
        gating_only_position : Optional[bool]
            Used during gating, comparing KF predicted and measured states. If True, only the x, y position of the state distribution is considered during gating. Defaults to False, where x,y, aspect ratio and height will be considered.
        override_track_class : Optional[object] = None
            Giving this will override default Track class, this must inherit Track. Argument for deep_sort_realtime.deep_sort.tracker.Tracker.
        embedder : Optional[str] = 'mobilenet'
            Whether to use in-built embedder or not. If None, then embeddings must be given during update.
            Choice of ['mobilenet', 'torchreid', 'clip_RN50', 'clip_RN101', 'clip_RN50x4', 'clip_RN50x16', 'clip_ViT-B/32', 'clip_ViT-B/16']
        half : Optional[bool] = True
            Whether to use half precision for deep embedder (applicable for mobilenet only)
        bgr : Optional[bool] = True
            Whether frame given to embedder is expected to be BGR or not (RGB)
        embedder_gpu: Optional[bool] = True
            Whether embedder uses gpu or not
        embedder_model_name: Optional[str] = None
            Only used when embedder=='torchreid'. This provides which model to use within torchreid library. Check out torchreid's model zoo.
        embedder_wts: Optional[str] = None
            Optional specification of path to embedder's model weights. Will default to looking for weights in `deep_sort_realtime/embedder/weights`. If deep_sort_realtime is installed as a package and CLIP models is used as embedder, best to provide path.
        polygon: Optional[bool] = False
            Whether detections are polygons (e.g. oriented bounding boxes)
        today: Optional[datetime.date]
            Provide today's date, for naming of tracks. Argument for deep_sort_realtime.deep_sort.tracker.Tracker.
        """
        self.nms_max_overlap = nms_max_overlap
        metric = nn_matching.NearestNeighborDistanceMetric(
            "cosine", max_cosine_distance, nn_budget
        )
        self.tracker = Tracker(
            metric,
            max_iou_distance=max_iou_distance,
            max_age=max_age,
            n_init=n_init,
            override_track_class=override_track_class,
            today=today,
            gating_only_position=gating_only_position,
        )

        if embedder is not None:
            if embedder not in EMBEDDER_CHOICES:
                raise Exception(f"Embedder {embedder} is not a valid choice.")
            # Embedder back ends are imported lazily so that only the chosen
            # one has to be importable.
            if embedder == "mobilenet":
                from deep_sort_realtime.embedder.embedder_pytorch import (
                    MobileNetv2_Embedder as Embedder,
                )

                self.embedder = Embedder(
                    half=half,
                    max_batch_size=16,
                    bgr=bgr,
                    gpu=embedder_gpu,
                    model_wts_path=embedder_wts,
                )
            elif embedder == 'torchreid':
                from deep_sort_realtime.embedder.embedder_pytorch import TorchReID_Embedder as Embedder

                self.embedder = Embedder(
                    bgr=bgr,
                    gpu=embedder_gpu,
                    model_name=embedder_model_name,
                    model_wts_path=embedder_wts,
                )
            elif embedder.startswith('clip_'):
                from deep_sort_realtime.embedder.embedder_clip import (
                    Clip_Embedder as Embedder,
                )

                # "clip_ViT-B/32" -> "ViT-B/32"
                model_name = "_".join(embedder.split("_")[1:])
                self.embedder = Embedder(
                    model_name=model_name,
                    model_wts_path=embedder_wts,
                    max_batch_size=16,
                    bgr=bgr,
                    gpu=embedder_gpu,
                )
        else:
            self.embedder = None
        self.polygon = polygon
        logger.info("DeepSort Tracker initialised")
        logger.info(f"- max age: {max_age}")
        logger.info(f"- appearance threshold: {max_cosine_distance}")
        logger.info(
            f'- nms threshold: {"OFF" if self.nms_max_overlap==1.0 else self.nms_max_overlap }'
        )
        logger.info(f"- max num of appearance features: {nn_budget}")
        logger.info(
            f'- overriding track class : {"No" if override_track_class is None else "Yes"}'
        )
        logger.info(f'- today given : {"No" if today is None else "Yes"}')
        logger.info(f'- in-build embedder : {"No" if self.embedder is None else "Yes"}')
        logger.info(f'- polygon detections : {"No" if polygon is False else "Yes"}')

    def update_tracks(self, raw_detections, embeds=None, frame=None, today=None, others=None, instance_masks=None):
        """Run multi-target tracker on a particular sequence.

        Parameters
        ----------
        raw_detections (horizontal bb) : List[ Tuple[ List[float or int], float, str ] ]
            List of detections, each in tuples of ( [left,top,w,h] , confidence, detection_class)
        raw_detections (polygon) : List[ List[float], List[int or str], List[float] ]
            List of Polygons, Classes, Confidences. All 3 sublists of the same length. A polygon defined as a ndarray-like [x1,y1,x2,y2,...].
        embeds : Optional[ List[] ] = None
            List of appearance features corresponding to detections
        frame : Optional [ np.ndarray ] = None
            if embeds not given, Image frame must be given here, in [H,W,C].
        today: Optional[datetime.date]
            Provide today's date, for naming of tracks
        others: Optional[ List ] = None
            Other things associated to detections to be stored in tracks, usually, could be corresponding segmentation mask, other associated values, etc. Currently others is ignored when polygon is True.
        instance_masks: Optional [ List ] = None
            Instance masks corresponding to detections. If given, they are used to filter out background and only use foreground for appearance embedding. Expects numpy boolean mask matrix.

        Returns
        -------
        list of track objects (Look into track.py for more info)

        """
        if embeds is None:
            if self.embedder is None:
                raise Exception(
                    "Embedder not created during init so embeddings must be given now!"
                )
            if frame is None:
                raise Exception("either embeddings or frame must be given!")

        assert isinstance(raw_detections,Iterable)

        if len(raw_detections) > 0:
            if not self.polygon:
                assert len(raw_detections[0][0])==4
                # Drop boxes with non-positive width or height. The per-detection
                # companions must be filtered with the same indices, otherwise
                # embeddings/masks/extras end up attached to the wrong box.
                total = len(raw_detections)
                keep = [
                    i for i, d in enumerate(raw_detections)
                    if d[0][2] > 0 and d[0][3] > 0
                ]
                if len(keep) != total:
                    embeds = self._select_rows(embeds, keep, total)
                    instance_masks = self._select_rows(instance_masks, keep, total)
                    others = self._select_rows(others, keep, total)
                raw_detections = [raw_detections[i] for i in keep]

                if embeds is None:
                    embeds = self.generate_embeds(frame, raw_detections, instance_masks=instance_masks)

                # Proper deep sort detection objects that consist of bbox, confidence and embedding.
                detections = self.create_detections(raw_detections, embeds, instance_masks=instance_masks, others=others)
            else:
                polygons, bounding_rects = self.process_polygons(raw_detections[0])

                if embeds is None:
                    embeds = self.generate_embeds_poly(frame, polygons, bounding_rects)

                # Proper deep sort detection objects that consist of bbox, confidence and embedding.
                detections = self.create_detections_poly(
                    raw_detections, embeds, bounding_rects,
                )
        else:
            detections = []

        # Run non-maxima suppression (disabled when the threshold is 1.0).
        if self.nms_max_overlap < 1.0:
            boxes = np.array([d.ltwh for d in detections])
            scores = np.array([d.confidence for d in detections])
            indices = non_max_suppression(boxes, self.nms_max_overlap, scores)
            detections = [detections[i] for i in indices]

        # Update tracker.
        self.tracker.predict()
        self.tracker.update(detections, today=today)

        return self.tracker.tracks

    def refresh_track_ids(self):
        """Restart the tracker's running track index at 1."""
        self.tracker._next_id = 1

    @staticmethod
    def _select_rows(values, keep, total):
        """Return the entries of a per-detection sequence at positions `keep`.

        `values` is returned unchanged when it is None, not iterable, has no
        length, or its length differs from `total` (i.e. it is not one entry
        per detection).
        """
        if values is None or not isinstance(values, Iterable):
            return values
        try:
            if len(values) != total:
                return values
        except TypeError:
            return values
        return [values[i] for i in keep]

    def generate_embeds(self, frame, raw_dets, instance_masks=None):
        """Crop every detection out of `frame` and embed the crops.

        When instance masks are given, pixels outside the mask are replaced
        by a constant fill colour before embedding.
        """
        crops, cropped_inst_masks = self.crop_bb(frame, raw_dets, instance_masks=instance_masks)
        if cropped_inst_masks is not None:
            masked_crops = []
            for crop, mask in zip(crops, cropped_inst_masks):
                masked_crop = np.zeros_like(crop)
                masked_crop = masked_crop + np.array([123.675, 116.28, 103.53], dtype=crop.dtype)
                masked_crop[mask] = crop[mask]
                masked_crops.append(masked_crop)
            return self.embedder.predict(masked_crops)
        else:
            return self.embedder.predict(crops)

    def generate_embeds_poly(self, frame, polygons, bounding_rects):
        """Embed polygon detections using black-padded crops."""
        crops = self.crop_poly_pad_black(frame, polygons, bounding_rects)
        return self.embedder.predict(crops)

    def create_detections(self, raw_dets, embeds, instance_masks=None, others=None):
        """Build `Detection` objects from (bbox, confidence[, class]) tuples.

        `instance_masks` / `others` are indexed per detection when iterable,
        otherwise the same value is attached to every detection.
        """
        detection_list = []
        for i, (raw_det, embed) in enumerate(zip(raw_dets, embeds)):
            detection_list.append(
                Detection(
                    raw_det[0],
                    raw_det[1],
                    embed,
                    class_name=raw_det[2] if len(raw_det)==3 else None,
                    instance_mask = instance_masks[i] if isinstance(instance_masks, Iterable) else instance_masks,
                    others = others[i] if isinstance(others, Iterable) else others,
                )
            )  # raw_det = [bbox, conf_score, class]
        return detection_list

    def create_detections_poly(self, dets, embeds, bounding_rects):
        """Build `Detection` objects for polygon detections.

        `dets` is [polygons, classes, confidences]; it is read, not modified.
        The raw polygon is stored in the detection's `others` field.
        """
        detection_list = []
        # Zip the extra columns alongside instead of extending the caller's list.
        for raw_polygon, cl, score, embed, bounding_rect in zip(
            *dets, embeds, bounding_rects
        ):
            x, y, w, h = bounding_rect
            x = max(0, x)
            y = max(0, y)
            bbox = [x, y, w, h]
            detection_list.append(
                Detection(bbox, score, embed, class_name=cl, others=raw_polygon)
            )
        return detection_list

    @staticmethod
    def process_polygons(raw_polygons):
        """Split flat [x1,y1,x2,y2,...] polygons into point pairs and compute
        their bounding rectangles with `cv2.boundingRect`."""
        polygons = [
            [polygon[x : x + 2] for x in range(0, len(polygon), 2)]
            for polygon in raw_polygons
        ]
        bounding_rects = [
            cv2.boundingRect(np.array([polygon]).astype(int)) for polygon in polygons
        ]
        return polygons, bounding_rects

    @staticmethod
    def crop_bb(frame, raw_dets, instance_masks=None):
        """Crop each [left, top, w, h] box (clipped to the frame) out of `frame`.

        Returns
        -------
        (crops, masks)
            `masks` holds the identically cropped instance masks, or None
            when `instance_masks` is None.
        """
        crops = []
        im_height, im_width = frame.shape[:2]
        masks = [] if instance_masks is not None else None
        for i, detection in enumerate(raw_dets):
            l, t, w, h = [int(x) for x in detection[0]]
            r = l + w
            b = t + h
            crop_l = max(0, l)
            crop_r = min(im_width, r)
            crop_t = max(0, t)
            crop_b = min(im_height, b)
            crops.append(frame[crop_t:crop_b, crop_l:crop_r])
            if instance_masks is not None:
                masks.append(instance_masks[i][crop_t:crop_b, crop_l:crop_r])

        return crops, masks

    @staticmethod
    def crop_poly_pad_black(frame, polygons, bounding_rects):
        """Return, per polygon, the bounding-rect crop of `frame` with every
        pixel outside the polygon set to black."""
        masked_polys = []
        im_height, im_width = frame.shape[:2]
        for polygon, bounding_rect in zip(polygons, bounding_rects):
            mask = np.zeros(frame.shape, dtype=np.uint8)
            polygon_mask = np.array([polygon]).astype(int)
            cv2.fillPoly(mask, polygon_mask, color=(255, 255, 255))

            # apply the mask
            masked_image = cv2.bitwise_and(frame, mask)

            # crop masked image
            x, y, w, h = bounding_rect
            crop_l = max(0, x)
            crop_r = min(im_width, x + w)
            crop_t = max(0, y)
            crop_b = min(im_height, y + h)
            masked_polys.append(masked_image[crop_t:crop_b, crop_l:crop_r].copy())
        return masked_polys

    def delete_all_tracks(self):
        """Drop every track held by the underlying tracker."""
        self.tracker.delete_all_tracks()
b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/__init__.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/__init__.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 00000000..a0f15654 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/__init__.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/embedder_pytorch.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/embedder_pytorch.cpython-38.pyc new file mode 100644 index 00000000..1e8f1701 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/embedder_pytorch.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/embedder_pytorch.cpython-39.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/embedder_pytorch.cpython-39.pyc new file mode 100644 index 00000000..b482b5b7 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/embedder_pytorch.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/mobilenetv2_bottle.cpython-38.pyc b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/mobilenetv2_bottle.cpython-38.pyc new file mode 100644 index 00000000..5757620b Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/mobilenetv2_bottle.cpython-38.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/mobilenetv2_bottle.cpython-39.pyc 
b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/mobilenetv2_bottle.cpython-39.pyc new file mode 100644 index 00000000..2e1f275d Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/__pycache__/mobilenetv2_bottle.cpython-39.pyc differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/embedder_clip.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/embedder_clip.py new file mode 100644 index 00000000..06a42cb3 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/embedder_clip.py @@ -0,0 +1,101 @@ +import os +import logging +from pathlib import Path + +import clip +import cv2 +import numpy as np +import pkg_resources +import torch +from PIL import Image + +logger = logging.getLogger(__name__) + + +def _batch(iterable, bs=1): + l = len(iterable) + for ndx in range(0, l, bs): + yield iterable[ndx : min(ndx + bs, l)] + + +class Clip_Embedder(object): + """ + Clip_Embedder loads a CLIP model of specified architecture, outputting a feature of size 1024. + + Params + ------ + - model_name (optional, str) : CLIP model to use + - model_wts_path (optional, str): Optional specification of path to CLIP model weights. Defaults to None and look for weights in `deep_sort_realtime/embedder/weights` or clip will download from internet into their own cache. 
def __init__(
    self,
    model_name="ViT-B/32",
    model_wts_path=None,
    max_batch_size=16,
    bgr=True,
    gpu=True,
):
    """Load the CLIP model and run one warm-up inference.

    Params
    ------
    - model_name (optional, str) : CLIP model to use
    - model_wts_path (optional, str): path to CLIP model weights. When None, a
      file named after the model is looked up in the `weights` directory next
      to this module; if it is missing, the model name itself is handed to
      `clip.load`.
    - max_batch_size (optional, int) : max batch size for embedder, defaults to 16
    - bgr (optional, Bool) : boolean flag indicating if input frames are bgr or not, defaults to True
    - gpu (optional, Bool) : boolean flag indicating if gpu is enabled or not, defaults to True.
      Falls back to CPU when CUDA is not available.
    """
    if model_wts_path is None:
        assert model_name in clip.available_models()

        weights_name = model_name.replace("/", "-")
        weights_path = (
            Path(__file__).parent.resolve() / "weights" / f"{weights_name}.pt"
        )
        if weights_path.is_file():
            model_wts_path = str(weights_path)
        else:
            model_wts_path = model_name

    # Only select CUDA when it can actually be used, mirroring
    # MobileNetv2_Embedder; otherwise the default gpu=True fails on
    # CPU-only hosts.
    use_gpu = bool(gpu) and torch.cuda.is_available()
    self.device = "cuda" if use_gpu else "cpu"
    self.model, self.img_preprocess = clip.load(model_wts_path, device=self.device)
    self.model.eval()

    self.max_batch_size = max_batch_size
    self.bgr = bgr

    logger.info("Clip Embedder for Deep Sort initialised")
    logger.info(f"- gpu enabled: {use_gpu}")
    logger.info(f"- max batch size: {self.max_batch_size}")
    logger.info(f"- expects BGR: {self.bgr}")
    logger.info(f"- model name: {model_name}")

    zeros = np.zeros((100, 100, 3), dtype=np.uint8)
    self.predict([zeros])  # warmup


def predict(self, np_images):
    """
    batch inference

    Params
    ------
    np_images : list of ndarray
        list of (H x W x C), bgr or rgb according to self.bgr

    Returns
    ------
    list of features (one np.array per input image; empty list for empty
    input). NOTE(review): the feature size presumably depends on the chosen
    CLIP architecture - confirm before relying on a fixed dimension.

    """
    if not np_images:
        return []

    if self.bgr:
        np_images = [cv2.cvtColor(img, cv2.COLOR_BGR2RGB) for img in np_images]

    pil_images = [
        self.img_preprocess(Image.fromarray(rgb)).to(self.device)
        for rgb in np_images
    ]

    all_feats = []
    for this_batch in _batch(pil_images, bs=self.max_batch_size):
        batch = torch.stack(this_batch, 0)
        with torch.no_grad():
            feats = self.model.encode_image(batch)
        all_feats.extend(feats.cpu().data.numpy())
    return all_feats
a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/embedder_pytorch.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/embedder_pytorch.py new file mode 100644 index 00000000..9847edf4 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/embedder_pytorch.py @@ -0,0 +1,232 @@ +import os +import logging + +import cv2 +import numpy as np +import pkg_resources +import torch +from torchvision.transforms import transforms + +from deep_sort_realtime.embedder.mobilenetv2_bottle import MobileNetV2_bottle + +logger = logging.getLogger(__name__) + +MOBILENETV2_BOTTLENECK_WTS = pkg_resources.resource_filename( + "deep_sort_realtime", "embedder/weights/mobilenetv2_bottleneck_wts.pt" +) + +TORCHREID_OSNET_AIN_X1_0_MS_D_C_WTS = pkg_resources.resource_filename( + "deep_sort_realtime", "embedder/weights/osnet_ain_ms_d_c_wtsonly.pth" +) + +INPUT_WIDTH = 224 + + +def batch(iterable, bs=1): + l = len(iterable) + for ndx in range(0, l, bs): + yield iterable[ndx : min(ndx + bs, l)] + + +class MobileNetv2_Embedder(object): + """ + MobileNetv2_Embedder loads a Mobilenetv2 pretrained on Imagenet1000, with classification layer removed, exposing the bottleneck layer, outputing a feature of size 1280. 
class MobileNetv2_Embedder(object):
    """
    MobileNetv2_Embedder loads a Mobilenetv2 pretrained on Imagenet1000, with classification layer removed, exposing the bottleneck layer, outputing a feature of size 1280.

    Params
    ------
    - model_wts_path (optional, str) : path to mobilenetv2 model weights, defaults to the model file in ./mobilenetv2
    - half (optional, Bool) : boolean flag to use half precision or not, defaults to True. Only honoured when the model runs on gpu
    - max_batch_size (optional, int) : max batch size for embedder, defaults to 16
    - bgr (optional, Bool) : boolean flag indicating if input frames are bgr or not, defaults to True
    - gpu (optional, Bool) : boolean flag indicating if gpu is enabled or not; ignored when CUDA is not available
    """

    def __init__(
        self, model_wts_path=None, half=True, max_batch_size=16, bgr=True, gpu=True
    ):
        if model_wts_path is None:
            model_wts_path = MOBILENETV2_BOTTLENECK_WTS
        assert os.path.exists(
            model_wts_path
        ), f"Mobilenetv2 model path {model_wts_path} does not exists!"
        self.model = MobileNetV2_bottle(input_size=INPUT_WIDTH, width_mult=1.0)
        # Always deserialise onto the CPU so that a checkpoint saved from CUDA
        # tensors still loads on a CPU-only machine; the model is moved to the
        # GPU below when one is available.
        self.model.load_state_dict(torch.load(model_wts_path, map_location="cpu"))

        self.gpu = gpu and torch.cuda.is_available()
        if self.gpu:
            self.model.cuda()  # loads model to gpu
            self.half = half
            if self.half:
                self.model.half()
        else:
            # half precision is only used together with the gpu
            self.half = False

        self.model.eval()  # inference mode, deactivates dropout layers

        self.max_batch_size = max_batch_size
        self.bgr = bgr

        # Built once here instead of once per image in preprocess().
        self._transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                ),
            ]
        )

        logger.info("MobileNetV2 Embedder for Deep Sort initialised")
        logger.info(f"- gpu enabled: {self.gpu}")
        logger.info(f"- half precision: {self.half}")
        logger.info(f"- max batch size: {self.max_batch_size}")
        logger.info(f"- expects BGR: {self.bgr}")

        zeros = np.zeros((100, 100, 3), dtype=np.uint8)
        self.predict([zeros])  # warmup

    def preprocess(self, np_image):
        """
        Preprocessing for embedder network: Flips BGR to RGB, resize, convert to torch tensor, normalise with imagenet mean and variance, reshape. Note: input image yet to be loaded to GPU through tensor.cuda()

        Parameters
        ----------
        np_image : ndarray
            (H x W x C)

        Returns
        -------
        Torch Tensor
            shape (1, 3, INPUT_WIDTH, INPUT_WIDTH)

        """
        if self.bgr:
            np_image_rgb = np_image[..., ::-1]
        else:
            np_image_rgb = np_image

        input_image = cv2.resize(np_image_rgb, (INPUT_WIDTH, INPUT_WIDTH))
        input_image = self._transform(input_image)
        input_image = input_image.view(1, 3, INPUT_WIDTH, INPUT_WIDTH)

        return input_image

    def predict(self, np_images):
        """
        batch inference

        Params
        ------
        np_images : list of ndarray
            list of (H x W x C), bgr or rgb according to self.bgr

        Returns
        ------
        list of features (np.array with dim = 1280)

        """
        all_feats = []

        preproc_imgs = [self.preprocess(img) for img in np_images]

        # Pure inference: disabling autograd avoids building a graph for
        # every batch and keeps memory usage down.
        with torch.no_grad():
            for this_batch in batch(preproc_imgs, bs=self.max_batch_size):
                this_batch = torch.cat(this_batch, dim=0)
                if self.gpu:
                    this_batch = this_batch.cuda()
                    if self.half:
                        this_batch = this_batch.half()
                output = self.model(this_batch)

                all_feats.extend(output.cpu().numpy())

        return all_feats
class TorchReID_Embedder(object):
    """
    Embedder that works with torchreid (https://github.com/KaiyangZhou/deep-person-reid). Model zoo: https://kaiyangzhou.github.io/deep-person-reid/MODEL_ZOO

    Params
    ------
    - model_name (optional, str): name of model, see torchreid model zoo. defaults to osnet_ain_x1_0
    - model_wts_path (optional, str) : path to torchreid model weights, defaults to TORCHREID_OSNET_AIN_X1_0_MS_D_C_WTS if model_name=='osnet_ain_x1_0' (default) and else, imagenet pretrained weights of given model
    - bgr (optional, Bool) : boolean flag indicating if input frames are bgr or not, defaults to True
    - gpu (optional, Bool) : boolean flag indicating if gpu is enabled or not; ignored when CUDA is not available
    - max_batch_size: Does nothing, just for compatibility to other embedder classes
    """

    def __init__(
        self, model_name=None, model_wts_path=None, bgr=True, gpu=True, max_batch_size=None,
    ):
        try:
            import torchreid  # noqa: F401 -- imported only to check that it is installed
        except ImportError as err:
            # ImportError is still an Exception, so existing handlers keep
            # working, and chaining preserves the original failure reason.
            raise ImportError(
                'ImportError: torchreid is not installed, please install and try again or choose another embedder'
            ) from err

        from torchreid.utils import FeatureExtractor

        if model_name is None:
            model_name = 'osnet_ain_x1_0'

        if model_wts_path is None:
            model_wts_path = ''

        # The bundled weights are only used for the default model.
        if model_name == 'osnet_ain_x1_0' and model_wts_path == '':
            model_wts_path = TORCHREID_OSNET_AIN_X1_0_MS_D_C_WTS

        self.gpu = gpu and torch.cuda.is_available()
        device = 'cuda' if self.gpu else 'cpu'

        self.model = FeatureExtractor(
            model_name=model_name,
            model_path=model_wts_path,
            device=device,
        )

        self.bgr = bgr

        logger.info("TorchReID Embedder for Deep Sort initialised")
        logger.info(f"- gpu enabled: {self.gpu}")
        logger.info(f"- expects BGR: {self.bgr}")

        zeros = np.zeros((100, 100, 3), dtype=np.uint8)
        self.predict([zeros])  # warmup

    def preprocess(self, np_image):
        """
        Preprocessing for embedder network: flips BGR to RGB when self.bgr is set, otherwise returns the image unchanged. Resizing and normalisation are left to torchreid.

        Parameters
        ----------
        np_image : ndarray
            (H x W x C)

        Returns
        -------
        ndarray
            (H x W x C) in RGB channel order

        """
        if self.bgr:
            np_image_rgb = np_image[..., ::-1]
        else:
            np_image_rgb = np_image
        # torchreid handles the rest of the preprocessing
        return np_image_rgb

    def predict(self, np_images):
        """
        batch inference

        Params
        ------
        np_images : list of ndarray
            list of (H x W x C), bgr or rgb according to self.bgr

        Returns
        ------
        np.array with one feature row per input image (the feature length is
        determined by the torchreid model); an empty list for empty input

        """
        # Nothing to embed: skip the extractor, which is not given an empty
        # batch, and mirror the empty result of the other embedders.
        if len(np_images) == 0:
            return []
        preproc_imgs = [self.preprocess(img) for img in np_images]
        output = self.model(preproc_imgs)
        return output.detach().cpu().numpy()
def batch(iterable, bs=1):
    """
    Yield consecutive slices of a sequence, each holding at most `bs` items.

    Raises ValueError (when iteration starts) if `bs` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop every item.
    if bs < 1:
        raise ValueError(f"batch size must be a positive integer, got {bs!r}")
    total = len(iterable)
    for start in range(0, total, bs):
        yield iterable[start : start + bs]


def get_mobilenetv2_with_preproc(wts="imagenet"):
    """
    Build a Keras model that takes uint8 images and returns MobileNetV2 features.

    The input is cast to float32 and passed through the MobileNetV2
    preprocess_input function, then through MobileNetV2 cut at the output of
    its second-to-last layer. The model summary is printed as a side effect.

    Params
    ------
    wts (optional, str or Path) : value forwarded (as str) to the `weights` argument of MobileNetV2, defaults to "imagenet"

    Returns
    ------
    tf.keras.Model
    """
    i = tf.keras.layers.Input([None, None, 3], dtype=tf.uint8)
    x = tf.cast(i, tf.float32)
    x = tf.keras.applications.mobilenet_v2.preprocess_input(x)

    full_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
        input_shape=None,
        weights=str(wts),
        classifier_activation=None,
    )
    # Drop the final classification layer.
    core_model = tf.keras.Model(full_model.input, full_model.layers[-2].output)

    x = core_model(x)

    model = tf.keras.Model(inputs=[i], outputs=[x])
    model.summary()
    return model


class MobileNetv2_Embedder(object):
    """
    MobileNetv2_Embedder loads a Mobilenetv2 pretrained on Imagenet1000, with classification layer removed, exposing the bottleneck layer, outputing a feature of size 1280.

    Params
    ------
    - model_wts_path (optional, str) : path to mobilenetv2 model weights, defaults to the model file in ./mobilenetv2
    - max_batch_size (optional, int) : max batch size for embedder, defaults to 16
    - bgr (optional, Bool) : boolean flag indicating if input frames are bgr or not, defaults to True
    - gpu (optional, Bool) : boolean flag indicating if gpu is enabled or not
    """

    def __init__(self, model_wts_path=None, max_batch_size=16, bgr=True, gpu=True):

        if not gpu:
            os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
            # This module already queried the GPUs when it was imported, so the
            # environment variable alone may come too late; also ask
            # TensorFlow directly to hide them.
            try:
                tf.config.set_visible_devices([], "GPU")
            except RuntimeError as err:
                # Raised once the TensorFlow runtime has been initialised.
                logger.warning(f"Could not hide GPUs from TensorFlow: {err}")

        if model_wts_path is None:
            model_wts_path = MOBILENETV2_BOTTLENECK_WTS
        model_wts_path = Path(model_wts_path)
        assert (
            model_wts_path.is_file()
        ), f"Mobilenetv2 model path {model_wts_path} does not exists!"

        self.model = get_mobilenetv2_with_preproc(wts=model_wts_path)

        self.max_batch_size = max_batch_size
        self.bgr = bgr

        logger.info("MobileNetV2 Embedder (tf) for Deep Sort initialised")
        logger.info(f"- max batch size: {self.max_batch_size}")
        logger.info(f"- expects BGR: {self.bgr}")

        zeros = np.zeros((100, 100, 3), dtype=np.uint8)
        self.predict([zeros, zeros])  # warmup

    def preprocess(self, np_image):
        """
        Flip BGR to RGB when self.bgr is set, resize to INPUT_WIDTH x INPUT_WIDTH and convert to a TF tensor.

        Parameters
        ----------
        np_image : ndarray
            (H x W x C)

        Returns
        -------
        TF Tensor

        """
        if self.bgr:
            np_image_rgb = np_image[..., ::-1]
        else:
            np_image_rgb = np_image
        np_image_rgb = cv2.resize(np_image_rgb, (INPUT_WIDTH, INPUT_WIDTH))
        return tf.convert_to_tensor(np_image_rgb)

    def predict(self, np_images):
        """
        batch inference

        Params
        ------
        np_images : list of ndarray
            list of (H x W x C), bgr or rgb according to self.bgr

        Returns
        ------
        list of features (np.array with dim = 1280)

        """
        all_feats = []

        preproc_imgs = [self.preprocess(img) for img in np_images]

        for this_batch in batch(preproc_imgs, bs=self.max_batch_size):
            this_batch = tf.stack(this_batch, axis=0)
            output = self.model(this_batch)
            all_feats.extend(output.numpy())

        return all_feats
def conv_bn(inp, oup, stride):
    """Return a 3x3 convolution (padding 1, no bias) followed by batch norm and ReLU6."""
    layers = [
        nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True),
    ]
    return nn.Sequential(*layers)


def conv_1x1_bn(inp, oup):
    """Return a 1x1 convolution (stride 1, no bias) followed by batch norm and ReLU6."""
    layers = [
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.ReLU6(inplace=True),
    ]
    return nn.Sequential(*layers)


class InvertedResidual(nn.Module):
    """
    Inverted residual block: optional 1x1 expansion, 3x3 depthwise
    convolution and 1x1 linear projection.

    The input is added back to the output only when stride is 1 and the
    input and output channel counts match.
    """

    def __init__(self, inp, oup, stride, expand_ratio):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = round(inp * expand_ratio)
        self.use_res_connect = self.stride == 1 and inp == oup

        stages = []
        if expand_ratio != 1:
            # pw: expand to hidden_dim channels (skipped when there is no expansion)
            stages += [
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.ReLU6(inplace=True),
            ]
        stages += [
            # dw
            nn.Conv2d(
                hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False
            ),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
            # pw-linear
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        out = self.conv(x)
        return x + out if self.use_res_connect else out


class MobileNetV2_bottle(nn.Module):
    """
    MobileNetV2 feature extractor without a classifier: forward() returns the
    spatial mean of the last feature map, i.e. `last_channel` values per image.

    Params
    ------
    - input_size (optional, int) : must be a multiple of 32, defaults to 224
    - width_mult (optional, float) : channel width multiplier, defaults to 1.0
    """

    def __init__(self, input_size=224, width_mult=1.0):
        super().__init__()
        # (expansion t, channels c, repeats n, stride s) per stage
        stage_settings = (
            (1, 16, 1, 1),
            (6, 24, 2, 2),
            (6, 32, 3, 2),
            (6, 64, 4, 2),
            (6, 96, 3, 1),
            (6, 160, 3, 2),
            (6, 320, 1, 1),
        )

        assert input_size % 32 == 0
        channels = int(32 * width_mult)
        # The bottleneck width only grows with the multiplier, it never shrinks.
        self.last_channel = int(1280 * width_mult) if width_mult > 1.0 else 1280

        # first layer
        layers = [conv_bn(3, channels, 2)]
        # inverted residual blocks: only the first block of a stage uses its stride
        for t, c, n, s in stage_settings:
            out_channels = int(c * width_mult)
            for idx in range(n):
                block_stride = s if idx == 0 else 1
                layers.append(
                    InvertedResidual(
                        channels, out_channels, block_stride, expand_ratio=t
                    )
                )
                channels = out_channels
        # last layer
        layers.append(conv_1x1_bn(channels, self.last_channel))
        self.features = nn.Sequential(*layers)

        self._initialize_weights()

    def forward(self, x):
        feature_map = self.features(x)
        # average over width, then height
        return feature_map.mean(3).mean(2)

    def _initialize_weights(self):
        """Re-initialise convolution, batch norm and linear parameters in place."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                fan_out = (
                    module.kernel_size[0] * module.kernel_size[1] * module.out_channels
                )
                nn.init.normal_(module.weight, 0, math.sqrt(2.0 / fan_out))
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.BatchNorm2d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.zeros_(module.bias)
https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt +# wget https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/download_tf_wts.sh b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/download_tf_wts.sh new file mode 100644 index 00000000..2c13bac4 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/download_tf_wts.sh @@ -0,0 +1,7 @@ +# Checks if gdown is installed else install +if ! type "gdown" > /dev/null; then + pip3 install gdown +fi + +# Downloads with gdown +gdown https://drive.google.com/uc?id=1RBroAFc0tmfxgvrh7iXc2e1EK8TVzXkA diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/mobilenetv2_bottleneck_wts.pt b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/mobilenetv2_bottleneck_wts.pt new file mode 100644 index 00000000..12074439 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/mobilenetv2_bottleneck_wts.pt differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/osnet_ain_ms_d_c_wtsonly.pth b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/osnet_ain_ms_d_c_wtsonly.pth new file mode 100644 index 00000000..20e46f48 Binary files /dev/null and b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/embedder/weights/osnet_ain_ms_d_c_wtsonly.pth differ diff --git a/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/utils/__init__.py b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/utils/__init__.py new file mode 100644 index 00000000..43e08fb8 --- /dev/null +++ b/assMath/probStat/Anupriya/kalmanfilter/deep_sort_realtime/utils/__init__.py @@ -0,0 +1 @@ +# vim: 
def non_max_suppression(boxes, max_bbox_overlap, scores=None):
    """Suppress overlapping detections.
    Original code from [1]_ has been adapted to include confidence score.
    .. [1] http://www.pyimagesearch.com/2015/02/16/
           faster-non-maximum-suppression-python/
    Examples
    --------
        >>> boxes = [d.roi for d in detections]
        >>> scores = [d.confidence for d in detections]
        >>> indices = non_max_suppression(boxes, max_bbox_overlap, scores)
        >>> detections = [detections[i] for i in indices]
    Parameters
    ----------
    boxes : array_like
        ROIs (x, y, width, height), one per row. Lists of boxes are accepted
        as well as ndarrays.
    max_bbox_overlap : float
        ROIs that overlap more than this value are suppressed. The overlap is
        the intersection area divided by the area of the candidate box (not
        intersection over union).
    scores : Optional[array_like]
        Detector confidence score. When omitted, boxes are ranked by their
        bottom edge instead.
    Returns
    -------
    List[int]
        Returns indices of detections that have survived non-maxima
        suppression, highest ranked first.
    """
    if len(boxes) == 0:
        return []

    # asarray (rather than .astype) lets plain lists of boxes through, as in
    # the usage example above.
    boxes = np.asarray(boxes, dtype=np.float32)
    pick = []

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2] + boxes[:, 0]
    y2 = boxes[:, 3] + boxes[:, 1]

    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    # Ascending sort: the best remaining candidate is always the last entry.
    if scores is not None:
        idxs = np.argsort(scores)
    else:
        idxs = np.argsort(y2)

    while len(idxs) > 0:
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(int(i))  # plain int, as promised by the return type

        # Intersection of the picked box with every remaining candidate.
        xx1 = np.maximum(x1[i], x1[idxs[:last]])
        yy1 = np.maximum(y1[i], y1[idxs[:last]])
        xx2 = np.minimum(x2[i], x2[idxs[:last]])
        yy2 = np.minimum(y2[i], y2[idxs[:last]])

        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)

        overlap = (w * h) / area[idxs[:last]]

        # Drop the picked box and every candidate it overlaps too much.
        idxs = np.delete(
            idxs, np.concatenate(([last], np.where(overlap > max_bbox_overlap)[0]))
        )

    return pick
import datetime
from ultralytics import YOLO
import cv2
from deep_sort_realtime.deepsort_tracker import DeepSort
import numpy as np

# Detections scoring below this confidence are not handed to the tracker.
CONFIDENCE_THRESHOLD = 0.8
GREEN = (0, 255, 0)
WHITE = (255, 255, 255)

# initialize the video capture object
# NOTE(review): only the backend constant is passed here; presumably this
# relies on OpenCV's "camera index + backend offset" form to open camera 0
# through V4L2 -- confirm on the target machine.
video_cap = cv2.VideoCapture(cv2.CAP_V4L2)

# load the pre-trained YOLOv8 model ("yolov8l.pt" checkpoint) and the tracker
model = YOLO("yolov8l.pt")
tracker = DeepSort(max_age=50)

try:
    while True:
        start = datetime.datetime.now()
        ret, frame = video_cap.read()

        if not ret:
            break

        detections = model(frame)[0]

        # Convert the detections to the ([left, top, width, height],
        # confidence, class) triples passed to the tracker, as plain Python
        # numbers rather than tensors.
        results = []
        boxes = detections.boxes
        for xyxy, confidence, class_id in zip(
            boxes.xyxy.tolist(), boxes.conf.tolist(), boxes.cls.tolist()
        ):
            # filter out weak detections
            if confidence < CONFIDENCE_THRESHOLD:
                continue
            xmin, ymin, xmax, ymax = xyxy
            results.append(
                [[xmin, ymin, xmax - xmin, ymax - ymin], confidence, int(class_id)]
            )

        tracks = tracker.update_tracks(results, frame=frame)
        # loop over the tracks
        for track in tracks:
            # if the track is not confirmed, ignore it
            if not track.is_confirmed():
                continue

            # get the track id and the bounding box
            track_id = track.track_id
            ltrb = track.to_ltrb()

            xmin, ymin, xmax, ymax = (int(value) for value in ltrb[:4])
            # draw the bounding box and the track id
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), GREEN, 2)
            cv2.rectangle(frame, (xmin, ymin - 20), (xmin + 20, ymin), GREEN, -1)
            cv2.putText(frame, str(track_id), (xmin + 5, ymin - 8),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, WHITE, 2)

        # end time to compute the fps
        end = datetime.datetime.now()
        elapsed = (end - start).total_seconds()
        # show the time it took to process 1 frame
        print(f"Time to process 1 frame: {elapsed * 1000:.0f} milliseconds")
        # calculate the frame per second and draw it on the frame; a zero
        # elapsed time (coarse clock) must not crash the loop
        fps = f"FPS: {1 / elapsed:.2f}" if elapsed > 0 else "FPS: inf"
        cv2.putText(frame, fps, (50, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 0, 255), 8)

        # show the frame to our screen
        cv2.imshow("Frame", frame)
        if cv2.waitKey(1) == ord("q"):
            break
finally:
    # Release the camera and close the window even if a frame fails midway.
    video_cap.release()
    cv2.destroyAllWindows()