Commit 934e71b

Added Results data and IPython notebooks. Updated gdrive_download script. Minor fix in trex_inference.
jimmy-sonny committed Sep 6, 2022
1 parent 891028b commit 934e71b
Showing 62 changed files with 7,548 additions and 5,476 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -24,4 +24,6 @@ DBs/Dataset-2/features/
 DBs/Dataset-2/pairs/
 DBs/Dataset-Vulnerability/*.csv
 DBs/Dataset-Vulnerability/features/
-DBs/Dataset-Vulnerability/pairs/
+DBs/Dataset-Vulnerability/pairs/
+
+Results/data/
4 changes: 2 additions & 2 deletions Models/Trex/NeuralNetwork/trex_inference.py
@@ -107,8 +107,8 @@ def main(input_pairs, input_traces, model_checkpoint_dir,

         cs_list.append(torch.cosine_similarity(emb_a, emb_b)[0].item())

-    # Saving the cosine similarity in the 'cs' column
-    df['cs'] = cs_list[:df.shape[0]]
+    # Saving the cosine similarity in the 'sim' column
+    df['sim'] = cs_list[:df.shape[0]]

     pairs_fname = ntpath.basename(input_pairs)
     df_out = os.path.join(output_dir, "{}.trex_out.csv".format(pairs_fname))
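For context on this computation: `torch.cosine_similarity` reduces along the feature dimension and returns a batch of similarities, which is why the code indexes `[0]` before calling `.item()`. A minimal sketch (the embedding size is an arbitrary example):

```python
import torch

# Two embeddings with a leading batch dimension of 1 (768 is an arbitrary size)
emb_a = torch.randn(1, 768)
emb_b = torch.randn(1, 768)

# cosine_similarity reduces along dim=1 and returns a tensor of shape (1,);
# [0].item() extracts the single similarity as a plain Python float
sim = torch.cosine_similarity(emb_a, emb_b)[0].item()
print(sim)  # a value in [-1.0, 1.0]
```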
58 changes: 57 additions & 1 deletion Results/README.md
@@ -1,4 +1,60 @@
# Results

-The CSV files with the results from all the experiments and the IPython notebooks to extract the different metrics will be released soon.
+This folder contains the results of the experiments and the IPython notebooks to extract the different metrics and generate the plots.

## Download the output data for each model we tested

**Warning: the following steps will require about 13GB of free disk space.**

To download the data from [Google Drive](https://drive.google.com/drive/folders/13kyJagd1eBR3CC5shnR5DdCGOFWF0Dbe?usp=sharing), use the [`gdrive_download.py`](../gdrive_download.py) Python3 script and follow the instructions below:

1. Install the Python3 [virtualenv](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/#installing-virtualenv)

2. Create a new virtualenv and install the required packages
```bash
# Create a new "env" virtual environment
python3 -m venv ../env
# Activate the virtual environment
source ../env/bin/activate

# Install the requirements in the current environment
pip install -r ../requirements.txt
```

3. Download and unzip the data in the corresponding folders:
```bash
python3 ../gdrive_download.py --results
```

The data will be unzipped in the following directories:
```bash
Results/data/Dataset-1
Results/data/Dataset-1-CodeCMR
Results/data/Dataset-2
Results/data/Dataset-Vulnerability
Results/data/raw_results
```

## Process the data to extract the different metrics and generate the plots

Most of the model implementations directly return the similarity between the function pairs for each [dataset](../DBs/) we tested. The CSV files with the results are saved in the corresponding `Dataset` folder under the `data` directory.

All the CSV files use the same header:
```csv
idb_path_1,fva_1,idb_path_2,fva_2,sim
```
* The `idb_path` and `fva` columns are used as "primary keys" to identify a single function
* The `sim` column contains the similarity (or distance) value computed with the specific metric required by each approach (see the loading example below).
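As a quick sanity check, any of these CSVs can be loaded with pandas (the file name below is a hypothetical example):

```python
import pandas as pd

# Hypothetical result file: every CSV shares the same five-column header
df = pd.read_csv("data/Dataset-1/pos_testing_Dataset-1_asm2vec_e10.csv")

print(df.columns.tolist())   # ['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2', 'sim']
print(df["sim"].describe())  # distribution of the similarity values
```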

However, some models require an intermediate step to convert the output to this standard form. The `data/raw_results` folder includes the output from Asm2vec/Doc2vec, Catalog1, CodeCMR and FunctionSimSearch.
* Use the [`Convert Asm2vec results`](./notebooks/Convert%20Asm2vec%20results.ipynb) IPython notebook to process the Asm2vec and Doc2vec output (`data/raw_results/Asm2vec`)
* Use the [`Convert Catalog1 results`](./notebooks/Convert%20Catalog1%20results.ipynb) IPython notebook to process the Catalog1 output (`data/raw_results/Catalog1`)
* Use the [`Convert CodeCMR results`](./notebooks/Convert%20CodeCMR%20results.ipynb) IPython notebook to process the CodeCMR output (`data/raw_results/CodeCMR`)
* Use the [`Convert FunctionSimSearch results`](./notebooks/Convert%20FunctionSimSearch%20results.ipynb) IPython notebook to process the FunctionSimSearch output (`data/raw_results/FunctionSimSearch`). A condensed sketch of this kind of conversion is shown below.
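For reference, a minimal sketch of the embedding-based conversion (mirroring the Asm2vec notebook; the `;`-separated embedding serialization is an assumption for illustration):

```python
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine

# Raw model output: one row per function with a serialized embedding
df_emb = pd.read_csv("data/raw_results/Asm2vec/Dataset-1_asm2vec_e10/embeddings.csv")
df_emb["embeddings"] = df_emb["embeddings"].apply(
    lambda s: np.array(s.split(";"), dtype=np.float32))

df_pairs = pd.read_csv(
    "../DBs/Dataset-1/pairs/testing/pos_testing_Dataset-1.csv", index_col=0)

# Attach the embedding of each side of the pair via the (idb_path, fva) keys
for side in ("1", "2"):
    df_pairs = df_pairs.merge(
        df_emb.rename(columns={"embeddings": "embeddings_" + side}),
        how="left",
        left_on=["idb_path_" + side, "fva_" + side],
        right_on=["idb_path", "fva"])

# Cosine similarity (1 - cosine distance), saved in the standard 'sim' column
df_pairs["sim"] = [1 - cosine(e1, e2) for e1, e2
                   in zip(df_pairs["embeddings_1"], df_pairs["embeddings_2"])]
df_pairs[["idb_path_1", "fva_1", "idb_path_2", "fva_2", "sim"]].to_csv(
    "data/Dataset-1/pos_testing_Dataset-1_asm2vec_e10.csv", index=False)
```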

Finally, there are three IPython notebooks to extract the metrics for all the experiments:
- [`AUC and similarity plots`](./notebooks/AUC%20and%20similarity%20plots.ipynb) computes the AUC for each task and model configuration
- [`MRR@10 and Recall@K`](./notebooks/MRR@10%20and%[email protected]) computes the *MRR@10* and *Recall@K* metrics
- [`Vulnerability task eval`](./notebooks/Vulnerability%20task%20eval.ipynb) generates the metrics for the Vulnerability test case.

The output is saved in the [`metrics_and_plots`](./notebooks/metrics_and_plots) folder.
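As a rough illustration of the ranking metrics (not the notebooks' exact code): for each query function the candidate pool contains one positive and many negatives, MRR@10 averages the reciprocal rank of the positive when it appears in the top 10, and Recall@K measures how often it appears in the top K. A sketch under those assumptions:

```python
import numpy as np

def ranking_metrics(pos_sims, neg_sims, k=10):
    """MRR@k and Recall@k with one positive per query.

    pos_sims: shape (num_queries,), similarity of each query to its positive
    neg_sims: shape (num_queries, num_negatives), similarities to the negatives
    """
    # Rank of the positive = 1 + number of negatives with a higher similarity
    ranks = 1 + (neg_sims > pos_sims[:, None]).sum(axis=1)
    mrr_at_k = np.where(ranks <= k, 1.0 / ranks, 0.0).mean()
    recall_at_k = (ranks <= k).mean()
    return mrr_at_k, recall_at_k

# Toy usage with random scores: 100 queries, 100 negatives each
rng = np.random.default_rng(0)
mrr10, rec10 = ranking_metrics(rng.random(100), rng.random((100, 100)))
print(f"MRR@10={mrr10:.3f}  Recall@10={rec10:.3f}")
```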
4,399 changes: 4,399 additions & 0 deletions Results/notebooks/AUC and similarity plots.ipynb


138 changes: 89 additions & 49 deletions Results/notebooks/Convert Asm2vec results.ipynb
@@ -52,7 +52,6 @@
"import pandas as pd\n",
"\n",
"from scipy.spatial.distance import cosine\n",
"from sklearn import metrics\n",
"from tqdm import tqdm"
]
},
@@ -63,9 +62,6 @@
"metadata": {},
"outputs": [],
"source": [
"def dot_product(e1, e2):\n",
" return np.dot(e1, e2)\n",
"\n",
"def cosine_similarity(e1, e2):\n",
" return 1 - cosine(e1, e2)"
]
@@ -100,24 +96,24 @@
"metadata": {},
"outputs": [],
"source": [
"def compute_embedding_similarity(df_pairs, df_asm2vec, is_pos):\n",
"def compute_embedding_similarity(df_pairs, df_asm2vec):\n",
" \n",
" df_asm2vec = df_asm2vec[['idb_path', 'fva', 'embeddings']]\n",
" \n",
" df_pairs = df_pairs.merge(df_asm2vec,\n",
" how='left',\n",
" left_on=['idb_path_1', 'fva_1'],\n",
" right_on=['idb_path', 'fva'])\n",
" df_pairs.rename(columns={'embeddings': 'embeddings_1'}, inplace=True)\n",
"\n",
" \n",
" df_pairs = df_pairs.merge(df_asm2vec,\n",
" how='left',\n",
" left_on=['idb_path_2', 'fva_2'],\n",
" right_on=['idb_path', 'fva'])\n",
" df_pairs.rename(columns={'embeddings': 'embeddings_2'}, inplace=True)\n",
"\n",
" df_pairs['sim'] = compute_cosine_similarity(df_pairs)\n",
"\n",
" del df_pairs['embeddings_1']\n",
" del df_pairs['embeddings_2']\n",
"\n",
" df_pairs = df_pairs[['idb_path_1','fva_1','idb_path_2','fva_2','sim']]\n",
" return df_pairs"
]
},
@@ -131,17 +127,69 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "d450d9ad",
"execution_count": 6,
"id": "7430e0e2",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[D] Processing ../data/raw_results/Asm2vec/Dataset-1_asm2vec_e10/embeddings.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"450000it [00:37, 11876.41it/s]\n",
"450000it [00:37, 11934.86it/s]\n",
"800it [00:00, 11988.99it/s]\n",
"80000it [00:06, 12150.48it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[D] Processing ../data/raw_results/Asm2vec/Dataset-1_pvdbow_e10/embeddings.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"450000it [00:37, 11980.94it/s]\n",
"450000it [00:37, 12116.82it/s]\n",
"800it [00:00, 12090.82it/s]\n",
"80000it [00:06, 12286.63it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[D] Processing ../data/raw_results/Asm2vec/Dataset-1_pvdm_e10/embeddings.csv\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"450000it [00:37, 11915.80it/s]\n",
"450000it [00:37, 11966.94it/s]\n",
"800it [00:00, 12042.17it/s]\n",
"80000it [00:06, 12161.65it/s]\n"
]
}
],
"source": [
"DB1_PATH = \"../../DBs/Dataset-1/pairs/testing/\"\n",
"\n",
"for folder in [\n",
-" 'Dataset-1_asm2vec_e3',\n",
-" 'Dataset-1_pvdbow_e3',\n",
-" 'Dataset-1_pvdm_e3']:\n",
+" 'Dataset-1_asm2vec_e10',\n",
+" 'Dataset-1_pvdbow_e10',\n",
+" 'Dataset-1_pvdm_e10']:\n",
"\n",
" embedding_path = os.path.join(\n",
" \"../data/raw_results/Asm2vec/\", folder, \"embeddings.csv\")\n",
@@ -157,10 +205,10 @@
" df_pos_rank = pd.read_csv(os.path.join(DB1_PATH, \"pos_rank_testing_Dataset-1.csv\"), index_col=0)\n",
" df_neg_rank = pd.read_csv(os.path.join(DB1_PATH, \"neg_rank_testing_Dataset-1.csv\"), index_col=0)\n",
" \n",
-" df_pos = compute_embedding_similarity(df_pos, df_emb, is_pos=True)\n",
-" df_neg = compute_embedding_similarity(df_neg, df_emb, is_pos=True)\n",
-" df_pos_rank = compute_embedding_similarity(df_pos_rank, df_emb, is_pos=True)\n",
-" df_neg_rank = compute_embedding_similarity(df_neg_rank, df_emb, is_pos=True)\n",
+" df_pos = compute_embedding_similarity(df_pos, df_emb)\n",
+" df_neg = compute_embedding_similarity(df_neg, df_emb)\n",
+" df_pos_rank = compute_embedding_similarity(df_pos_rank, df_emb)\n",
+" df_neg_rank = compute_embedding_similarity(df_neg_rank, df_emb)\n",
"\n",
" df_pos.to_csv(\"../data/Dataset-1/pos_testing_{}.csv\".format(folder), index=False)\n",
" df_neg.to_csv(\"../data/Dataset-1/neg_testing_{}.csv\".format(folder), index=False)\n",
@@ -178,7 +226,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "monthly-birthday",
"metadata": {},
"outputs": [
@@ -193,10 +241,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
"150000it [00:12, 12291.55it/s]\n",
"150000it [00:12, 12296.05it/s]\n",
"600it [00:00, 12261.66it/s]\n",
"60000it [00:04, 12310.80it/s]\n"
"150000it [00:12, 11899.46it/s]\n",
"150000it [00:12, 12039.02it/s]\n",
"600it [00:00, 11950.26it/s]\n",
"60000it [00:04, 12146.01it/s]\n"
]
},
{
@@ -210,10 +258,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
"150000it [00:12, 12035.30it/s]\n",
"150000it [00:12, 11957.38it/s]\n",
"600it [00:00, 11958.90it/s]\n",
"60000it [00:04, 12043.97it/s]\n"
"150000it [00:12, 12077.09it/s]\n",
"150000it [00:12, 11948.48it/s]\n",
"600it [00:00, 12087.27it/s]\n",
"60000it [00:04, 12163.42it/s]\n"
]
},
{
@@ -227,10 +275,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
"150000it [00:12, 11913.91it/s]\n",
"150000it [00:12, 11684.97it/s]\n",
"600it [00:00, 11644.43it/s]\n",
"60000it [00:04, 12045.19it/s]\n"
"150000it [00:12, 12048.42it/s]\n",
"150000it [00:12, 12142.73it/s]\n",
"600it [00:00, 11897.95it/s]\n",
"60000it [00:04, 12185.50it/s]\n"
]
}
],
@@ -256,10 +304,10 @@
" df_pos_rank = pd.read_csv(os.path.join(DB2_PATH, \"pos_rank_testing_Dataset-2.csv\"), index_col=0)\n",
" df_neg_rank = pd.read_csv(os.path.join(DB2_PATH, \"neg_rank_testing_Dataset-2.csv\"), index_col=0)\n",
" \n",
-" df_pos = compute_embedding_similarity(df_pos, df_emb, is_pos=True)\n",
-" df_neg = compute_embedding_similarity(df_neg, df_emb, is_pos=True)\n",
-" df_pos_rank = compute_embedding_similarity(df_pos_rank, df_emb, is_pos=True)\n",
-" df_neg_rank = compute_embedding_similarity(df_neg_rank, df_emb, is_pos=True)\n",
+" df_pos = compute_embedding_similarity(df_pos, df_emb)\n",
+" df_neg = compute_embedding_similarity(df_neg, df_emb)\n",
+" df_pos_rank = compute_embedding_similarity(df_pos_rank, df_emb)\n",
+" df_neg_rank = compute_embedding_similarity(df_neg_rank, df_emb)\n",
"\n",
" df_pos.to_csv(\"../data/Dataset-2/pos_testing_{}.csv\".format(folder), index=False)\n",
" df_neg.to_csv(\"../data/Dataset-2/neg_testing_{}.csv\".format(folder), index=False)\n",
@@ -292,7 +340,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"88700it [00:07, 12208.86it/s]\n"
"88700it [00:07, 12256.27it/s]\n"
]
},
{
@@ -306,7 +354,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"88700it [00:07, 12258.50it/s]\n"
"88700it [00:07, 12256.20it/s]\n"
]
},
{
@@ -320,7 +368,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"88700it [00:07, 12289.16it/s]\n"
"88700it [00:07, 12178.80it/s]\n"
]
}
],
@@ -343,18 +391,10 @@
"\n",
" df_testing = pd.read_csv(os.path.join(DB2_PATH, \"pairs_testing_Dataset-Vulnerability.csv\"), index_col=0)\n",
" \n",
-" df_testing = compute_embedding_similarity(df_testing, df_emb, is_pos=True)\n",
+" df_testing = compute_embedding_similarity(df_testing, df_emb)\n",
"\n",
" df_testing.to_csv(\"../data/Dataset-Vulnerability/testing_{}.csv\".format(folder), index=False)"
]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"id": "87332de1",
-"metadata": {},
-"outputs": [],
-"source": []
}
],
"metadata": {