From a78b60e086a55bd1fa2c1e74a17fb6a5bfe79c8d Mon Sep 17 00:00:00 2001
From: Juan Caballero <juan.caballero@imdea.org>
Date: Tue, 1 Sep 2020 19:26:48 +0200
Subject: [PATCH] Added first version of AVClass2

---
 LICENSE                                       |    2 +-
 README.md                                     |  552 ++-----
 avclass/README.md                             |  392 +++++
 .../avclass_alias_detect.py                   |    0
 .../avclass_generic_detect.py                 |    0
 .../avclass_labeler.py                        |    0
 {data => avclass/data}/default.aliases        |    0
 {data => avclass/data}/default.generics       |    0
 {lib => avclass/lib}/avclass_common.py        |    0
 {lib => avclass/lib}/evaluate_clustering.py   |    0
 avclass2/README.md                            |  252 ++++
 avclass2/avclass2_input_checker.py            |   51 +
 avclass2/avclass2_labeler.py                  |  469 ++++++
 avclass2/avclass2_update_module.py            |  480 ++++++
 avclass2/data/expansion                       |   17 +
 avclass2/data/tagging                         | 1300 +++++++++++++++++
 avclass2/data/taxonomy                        | 1138 +++++++++++++++
 avclass2/lib/avclass2_common.py               |  636 ++++++++
 avclass2/lib/evaluate_clustering.py           |  141 ++
 {data => examples}/malheurReference_gt.tsv    |    0
 {data => examples}/malheurReference_lb.json   |    0
 examples/vtv3_sample.json                     |    1 +
 22 files changed, 4999 insertions(+), 432 deletions(-)
 create mode 100644 avclass/README.md
 rename avclass_alias_detect.py => avclass/avclass_alias_detect.py (100%)
 rename avclass_generic_detect.py => avclass/avclass_generic_detect.py (100%)
 rename avclass_labeler.py => avclass/avclass_labeler.py (100%)
 rename {data => avclass/data}/default.aliases (100%)
 rename {data => avclass/data}/default.generics (100%)
 rename {lib => avclass/lib}/avclass_common.py (100%)
 rename {lib => avclass/lib}/evaluate_clustering.py (100%)
 create mode 100644 avclass2/README.md
 create mode 100755 avclass2/avclass2_input_checker.py
 create mode 100755 avclass2/avclass2_labeler.py
 create mode 100755 avclass2/avclass2_update_module.py
 create mode 100644 avclass2/data/expansion
 create mode 100644 avclass2/data/tagging
 create mode 100644 avclass2/data/taxonomy
 create mode 100755 avclass2/lib/avclass2_common.py
 create mode 100755 avclass2/lib/evaluate_clustering.py
 rename {data => examples}/malheurReference_gt.tsv (100%)
 rename {data => examples}/malheurReference_lb.json (100%)
 create mode 100644 examples/vtv3_sample.json

diff --git a/LICENSE b/LICENSE
index 7996e63..1b6a62b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2016 MaliciaLab @ IMDEA Software Institute
+Copyright (c) 2016-2020 MaliciaLab @ IMDEA Software Institute
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index f7d1ee8..fcdd06f 100644
--- a/README.md
+++ b/README.md
@@ -1,484 +1,174 @@
-# AVClass
+# AVClass and AVClass2
 
-[AVClass](https://github.com/malicialab/avclass) 
-is a malware labeling tool.
+AVClass and AVClass2 are Python tools to tag / label malware samples. 
+You give them as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) 
+and they output tags extracted from the AV labels of each sample. 
+The original AVClass only outputs family names (i.e., family tags). 
+By default, it outputs the most likely family for each sample (e.g., *zbot*, *virut*). 
+It can also output a ranking of all alternative family names it found for each sample.
+The newer AVClass2, in addition to family names, also outputs other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
 
-You give it as input the AV labels for a large number of 
-malware samples (e.g., VirusTotal JSON reports) and it outputs the most 
-likely family name for each sample that it can extract from the AV labels. 
-It can also output a ranking of all alternative names it found for each sample.
+A quick example helps illustrate the differences. If you run AVClass2 on our example input file:
 
-The design and evaluation of AVClass is detailed in our 
-[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf):
-
-> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. 
-AVClass: A tool for Massive Malware Labeling. 
-In Proceedings of the International Symposium on Research in 
-Attacks, Intrusions and Defenses,
-September 2016.
-
-In a nutshell, AVClass comprises two phases: 
-preparation (optional) and labeling.
-Code for both is included, 
-but most users will be only interested in the labeling, which outputs the 
-family name for the samples. 
-The preparation produces a list of aliases and generic tokens 
-used by the labeling. 
-If you use our default aliases and generic tokens lists, 
-you do not need to run the preparation.
-
-**Why is AVClass useful?**
-
-Because a lot of times security researchers want to extract 
-family information from AV labels, 
-but this process is not as simple as it looks, 
-especially if you need to do it for large numbers (e.g., millions) of 
-samples. Some advantages of AVClass are:
-
-1. *Automatic.* 
-  AVClass removes manual analysis limitations on the size of 
-  the input dataset.
-
-2. *Vendor-agnostic.*
-  AVclass operates on the labels of any available set of AV engines, 
-  which can vary from sample to sample.
-
-3. *Cross-platform.*
-  AVclass can be used for any platforms supported by AV engines, 
-  e.g., Windows or Android malware.
-
-4. *Does not require executables.*
-  AV labels can be obtained from online services like VirusTotal 
-  using a sample's hash, even when the executable is not available.
-
-5. *Quantified accuracy.* 
-  We have evaluated AVClass on 5 publicly available malware datasets with 
-  ground truth. Details are in the above RAID 2016 paper. 
-
-6. *Open source.*
-  The code is available and we are happy to incorporate suggestions and 
-  improvements so that the security community benefits from AVClass.
-
-**Limitations**
-
-The main limitation of AVClass is that its output depends on the input 
-AV labels. 
-It tries to compensate for the noise on those labels, but 
-cannot identify the family of a sample if AV engines do not provide 
-non-generic family names to that sample. 
-In particular, it cannot label samples if at least 2 AV engines 
-do not agree on a non-generic family name. 
-Results on 8 million samples showed that AVClass could label 81% of the 
-samples. 
-In other words, it could not label 19% of the 
-samples because their labels contained only generic tokens.
-
-Still, there are many samples that AVClass can label and thus we believe 
-you will find it a useful tool. 
-We recommend you to read the discussion section in our RAID 2016 paper for 
-more details.
-
-## Labeling 
-   
-The labeler takes as input 
-a JSON file with the AV labels of malware samples (-vt or -lb switches), 
-a file with generic tokens (-gen switch), 
-and a file with aliases (-alias switch). 
-It outputs the most likely family name for each sample.
-If you do not provide alias or generic tokens files, 
-the default ones in the *data* folder are used.
-
-```
-$./avclass_labeler.py -lb data/malheurReference_lb.json -v > malheurReference.labels
-```
-  
-The above command labels the samples whose AV labels are in the 
-*data/malheurReference_lb.json* file.
-It prints the results to stdout, 
-which we redirect to the *malheurReference.labels* file.
-The output looks like this:
-
-```
-aca2d12934935b070df8f50e06a20539 adrotator
-67d15459e1f85898851148511c86d88d adultbrowser
+```shell
+$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p
 ```
 
-which means sample aca2d12934935b070df8f50e06a20539 is most likely 
-from the *adrotator* family and 
-67d15459e1f85898851148511c86d88d from the *adultbrowser* family.
-
-The verbose (-v) switch makes it output an extra 
-*malheurReference_lb.verbose* file
-with all families extracted for each sample ranked by the number of AV 
-engines that use that family.
-The file looks like this:
+the output on stdout is:
 
 ```
-aca2d12934935b070df8f50e06a20539  [(u'adrotator', 8), (u'zlob', 2)]
-ee90a64fcfaa54a314a7b5bfe9b57357  [(u'swizzor', 19)]
-f465a2c1b852373c72a1ccd161fbe94c  SINGLETON:f465a2c1b852373c72a1ccd161fbe94c
+aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2
+67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2
 ```
+which means sample *aca2d12934935b070df8f50e06a20539* 
+was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, 
+8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, 
+3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family.
+Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
+consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on.
 
-which means that for sample aca2d12934935b070df8f50e06a20539 
-there are 8 AV engines assigning *adrotator* as the family and  
-another 2 assigning *zlob*.
-Thus, *adrotator* is the most likely family.
-On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV 
-engines assigning *swizzor* as family, 
-and no other family was found.
-The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c
-no family name was found in the AV labels. 
-Thus, the sample is placed by himself in a singleton cluster 
-with the name of the cluster being the sample's hash.
-
-Note that the sum of the number of AV engines may not equal the number 
-of AV engines with a label for that sample in the input file 
-because the labels of some AV engines may only include generic tokens 
-that are removed by AVClass.
-
-
-## Input JSON format
-
-AVClass supports two input JSON formats: 
-
-1. VirusTotal JSON reports (*-vt file*), 
-where each line in *file* should be the full JSON of a 
-VirusTotal report as fetched through the VirusTotal API.
-By default, it assumes the VT reports are from VT API version 2.
-If the VT reports are from VT API version 3, add the -vt3 command line option.
+If you instead run AVClass on the same input file:
 
-2. Simplified JSON (*-lb file*),
-where each line in *file* should be a JSON 
-with (at least) these fields:
-{md5, sha1, sha256, av_labels}. 
-There is an example of such input file in *data/malheurReference_lb.json*
-
-**Why have 2 different input formats?**
-
-We believe most users will get the AV labels using VirusTotal. 
-However, AVClass is IO-bound and a VirusTotal report 
-in addition to the AV labels and hashes includes 
-much other data that AVClass does not need. 
-Thus, when applying AVClass to millions of samples,
-reducing the input file size by removing unnnecessary data 
-significantly improves efficiency. 
-Furthermore, users could obtain AV labels from other sources and 
-the simpler the input JSON format, 
-the easier to convert those AV labels into an input file.
-
-**Multiple input files**
-
-AVClass can handle multiple input files putting the results in the same 
-output files
-(if you want results in separate files, process each input file separately).
-
-It is possible to provide the -vt and -lb input options multiple times.
-
-```
-$./avclass_labeler.py -vt <file1> -vt <file2> > all.labels
-```
-```
-$./avclass_labeler.py -lb <file1> -lb <file2> > all.labels
-```
-
-There are also -vtdir and -lbdir options that can be used to provide 
-an input directory where all files are VT (-vtdir) 
-or simplified (-lbdir) JSON reports
-
-
-```
-$./avclass_labeler.py -vtdir <directory> > all.labels
-```
-
-It is also possible to combine -vt with -vtdir and -lb with -lbdir, 
-but you cannot combine input files of different format. 
-Thus, this command works:
-
-
-```
-$./avclass_labeler.py -vt <file> -vtdir <directory> > all.labels
+```shell
+$./avclass/avclass_labeler.py -lb examples/malheurReference_lb.json
 ```
 
-But, this one throws an error: 
+the output looks like this:
 
 ```
-$./avclass_labeler.py -vt <file1> -lb <file2> > all.labels
-```
-
-
-At this point you have read the most important information on how to use 
-AVClass. 
-The following sections describe steps that most users will not need.
-
-## Labeling: Family Ranking
-
-AVClass has a -fam switch to output a file with a ranking of the 
-families assigned to the input samples. 
+aca2d12934935b070df8f50e06a20539 adrotator
+67d15459e1f85898851148511c86d88d adultbrowser
+``` 
 
-```
-$./avclass_labeler.py -lb data/malheurReference_lb.json -v -fam > malheurReference.labels
-```
+which simply reports the most common family name for each sample.
 
-will produce a file called *malheurReference_lb.families* with two columns:
+In a nutshell, that is the main difference between both tools. 
+Of course, there are more options for both tools, 
+which you can read about in their corresponding README files. 
 
-```
-virut 441
-allaple 301
-podnuha 300
-```
 
-indicating that 441 samples were classified in the virut family, 
-301 as allaple, and 300 as podnuha.
+## Which one should I use?
 
-This switch is very similar to using the following shell command:
+AVClass2 is the newer tool and it extracts more information 
+from the input AV labels.
+So, if you are new to AVClass and AVClass2, we recommend trying AVClass2 first.
 
-```
-$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr
-```
+However, there are several reasons to keep AVClass around. 
+First, it is more mature and used by many analysts, 
+so we want to preserve backwards compatibility.
+Second, for some applications only family names are needed and 
+for that AVClass is enough.
+Third, AVClass is faster than AVClass2 since it extracts less info. 
+The lower runtime is nice when processing millions of samples and 
+not requiring the extra tags AVClass2 provides. 
 
-The main difference is that using the -fam switch all SINGLETON samples, 
-i.e., those for which no label was found, 
-are grouped into a fake *SINGLETONS* family, 
-while the shell command would leave each singleton as a separate family.
+## References
 
+The design and evaluation of AVClass is detailed in our 
+[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf):
 
-## Labeling: PUP Classification
+> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. 
+AVClass: A Tool for Massive Malware Labeling. 
+In Proceedings of the International Symposium on Research in 
+Attacks, Intrusions and Defenses,
+September 2016.
 
-AVClass also has a -pup switch to classify a sample as
-Potentially Unwanted Program (PUP) or malware.
-This classification looks for PUP-related keywords
-(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our
-[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf):
+The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper.
 
-> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero.
-Certified PUP: Abuse in Authenticode Code Signing.
-In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015
+> Silvia Sebastián, Juan Caballero. 
+AVClass2: Massive Malware Tag Extraction from AV Labels. 
+In Proceedings of the Annual Computer Security Applications Conference, December 2020.
 
-```
-$./avclass_labeler.py -lb data/malheurReference_lb.json -v -pup > malheurReference.labels
-```
+## Why are AVClass and AVClass2 useful?
 
-With the -pup switch the output of the *malheurReference.labels* file
-looks like this:
+Because a lot of times security researchers want to extract family and other 
+information from AV labels, but this process is not as simple as it looks, 
+especially if you need to do it for large numbers (e.g., millions) of samples. 
+Some advantages of AVClass and AVClass2 are:
 
-```
-aca2d12934935b070df8f50e06a20539 adrotator 1
-67d15459e1f85898851148511c86d88d adultbrowser 0
-```
+1. *Automatic.* They remove manual analysis limitations on the size of the 
+input 
+dataset.
 
-The digit at the end is a Boolean flag that 
-indicates sample aca2d12934935b070df8f50e06a20539 is
-(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not.
-
-In our experience the PUP classification is conservative,
-i.e., if it says the sample is PUP, it most likely is.
-But, if it says that it is not PUP, it could still be PUP if the AV labels
-do not contain PUP-related keywords.
-Note that it is possible that some samples from a family get 
-the PUP flag while other samples from the same family do not
-because the PUP-related keywords may not appear in the labels of 
-all samples from the same family. 
-To address this issue, you can combine the -pup switch with the -fam switch.
-This combination will add into the families file the classification of the 
-family as malware or PUP, based on a majority vote among the samples in a 
-family.
+2. *Vendor-agnostic.* They operate on the labels of any available set of AV 
+engines, which can vary from sample to sample.
 
-```
-$./avclass_labeler.py -lb data/malheurReference_lb.json -v -pup -fam > malheurReference.labels
-```
+3. *Cross-platform.* They can be used for any platforms supported by AV 
+engines, e.g., Windows or Android malware.
 
-will produce a file called *malheurReference_lb.families* with five columns:
+4. *Does not require executables.* AV labels can be obtained from online services
+ like VirusTotal using a sample's hash, even when the executable is not available.
 
-```
-# Family  Total Malware PUP FamType
-virut 441 441 0 malware
-magiccasino 173 0 173 pup
-ejik  168 124 44  malware
-```
+5. *Quantified accuracy.* We have evaluated AVClass and AVClass2 on millions of 
+samples and publicly available malware datasets with ground truth. 
+Evaluation details are in the RAID 2016 and ACSAC 2020 papers.
 
-For virut, the numbers indicate all the 441 virut samples are classified 
-as malware, and thus the last column states that virut is a malware family. 
-For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP.
-For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, 
-so the family is classified as malware.
+6. *Open source.* The code is available and we are happy to incorporate 
+suggestions and improvements so that the security community benefits from 
+these tools.
 
+## Limitations
 
-## Labeling: Ground Truth Evaluation
+The main limitation of AVClass and AVClass2 is that their output depends 
+on the input AV labels. 
+Both tools try to compensate for the noise on the AV labels, 
+but cannot identify tags if AV engines do not provide non-generic tokens 
+in the labels of a sample. 
+In particular, they cannot tag samples if at least 2 AV engines 
+do not agree on a tag. 
 
-If you have ground truth for some malware samples, 
-i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that 
-ground truth.
-The evaluation metrics used are precision, recall, and F1 measure.
-See our RAID 2016 paper above for their definition.
+Still, there are many samples that both tools can tag
+and thus we believe you will find them useful.
+We recommend reading the RAID 2016 and ACSAC 2020 papers for more details.
 
-```
-$./avclass_labeler.py -lb data/malheurReference_lb.json -v -gt data/malheurReference_gt.tsv -eval > data/malheurReference.labels
-```
+## Input JSON format
 
-The output includes these lines:
+AVClass and AVClass2 support two input JSON formats: 
 
-```
-Calculating precision and recall
-3131 out of 3131
-Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
-```
+1. VirusTotal JSON reports (*-vt file*), 
+where each line in *file* should be the full JSON of a 
+VirusTotal report as fetched through the VirusTotal API.
+By default, the tools assume the VT reports are from VT API version 2.
+If the VT reports are from VT API version 3, add the -vt3 command line option.
 
-The last line corresponds to the accuracy metrics obtained by 
-comparing AVClass results with the provided ground truth.
+2. Simplified JSON (*-lb file*),
+where each line in *file* should be a JSON 
+with (at least) these fields:
+{md5, sha1, sha256, av_labels}. 
+There is an example of such an input file in *examples/malheurReference_lb.json*.
 
-Each line in the *data/malheurReference_gt.tsv* file has 
-two **tab-separated** columns:
+**Why have two different input formats?**
 
-```
-0058780b175c3ce5e244f595951f611b8a24bee2 CASINO
-```
+We believe most users will get the AV labels using VirusTotal. 
+However, AVClass and AVClass2 are IO-bound and a VirusTotal report 
+in addition to the AV labels and hashes includes 
+much other data that the tools do not need. 
+Thus, when applying AVClass or AVClass2 to millions of samples,
+reducing the input file size by removing unnecessary data 
+significantly improves efficiency. 
+Furthermore, users could obtain AV labels from other sources and 
+the simpler the input JSON format, 
+the easier to convert those AV labels into an input file.
 
-which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 
-is known to be of the *CASINO* family.
-Each sample in the input file should also appear in the ground truth file.
-Note that the particular label assigned to each family does not matter. 
-What matters is that all samples in the same family are assigned the 
-same family name (i.e., the same string in the second column) 
-
-The ground truth can be obtained from publicly available malware 
-datasets. 
-The one in *data/malheurReference_gt.tsv* comes from the 
-[Malheur](http://www.mlsec.org/malheur/) dataset. 
-There are other public datasets with ground truth such as 
-[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
-[Malicia](http://malicia-project.com/dataset.html).
-
-
-## Preparation: Generic Token Detection
-
-The labeling takes as input a file with generic tokens that should be 
-ignored in the AV labels, e.g., trojan, virus, generic, linux.
-By default, the labeling uses the *data/default.generics* 
-generic tokens file.
-You can edit that file to add additional generic tokens you feel 
-we are missing.
-
-In our RAID 2016 paper we describe an automatic approach to 
-identify generic tokens, which **requires ground truth**, 
-i.e., it requires knowing the true family for each input sample.
-Not only that, but **the ground truth should be large**, 
-i.e., contain at least one hundred thousand samples. 
-In our work we identified generic tokens using as ground truth 
-the concatenation of all datasets for which we had ground truth.
-This requirement of a large ground truth dataset is why we expect most users 
-will skip this step and simply use our provided default file.
-
-If you want to test generic token detection you can do:
+## Dependencies
 
-```
- $./avclass_generic_detect.py -lb data/malheurReference_lb.json -gt data/malheurReference_gt.tsv -tgen 10 > malheurReference.gen 
-```
+AVClass and AVClass2 are both written in Python. 
+They should both run on Python 2 (version 2.7 or above) and on Python 3.
 
-Each line in the *data/malheurReference_gt.tsv* file has 
-two **tab-separated** columns:
+They do not require installing any dependencies.
 
-```
-0058780b175c3ce5e244f595951f611b8a24bee2 CASINO
-```
+## Support and Contributing
 
-which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 
-is known to be of the *CASINO* family.
-
-The *-tgen 10* switch is a threshold for the minimum number of families 
-where a token has to be observed to be considered generic. 
-If the switch is ommitted, the default threshold of 8 is used.
-
-The above command outputs two files: 
-*malheurReference.gen* and *malheurReference_lb.gen*. 
-Each of them has 2 columns: token and number of families where the token 
-was observed.
-File *malheurReference.gen* is the final output with the detected 
-generic tokens for which the number of families is above 
-the given threshold. 
-The file *malheurReference_lb.gen* has this information for all tokens.
-Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. 
-
-However, note that in the above command you are trying to identify generic 
-tokens from a small dataset since Drebin only contains 3K labeled samples. 
-Thus, *malheurReference.gen* only contains 25 identified generic tokens. 
-Using those 25 generic tokens will produce significantly worse results 
-than using the generic tokens in *data/default.generics*. 
-For more details you can refer to our RAID 2016 paper.
-
-
-## Preparation: Alias Detection
-
-Different vendors may assign different names (i.e., aliases) for the same
-family. For example, some vendors may use *zeus* and others *zbot* 
-as aliases for the same malware family. 
-The labeling takes as input a file with aliases that should be merged.
-By default, the labeling uses the *data/default.aliases* aliases file.
-You can edit that file to add additional aliases you feel we are missing.
-
-In our RAID 2016 paper we describe an automatic approach 
-to identify aliases.
-Our alias detection approach 
-**requires as input the AV labels for large set of samples**, 
-e.g., several million samples. 
-In contrast with the generic token detection, the input samples for 
-alias detection **do not need to be labeled**, 
-i.e., no need to know their family.
-In our work we identified aliases using as input the largest of our 
-unlabeled datasets, which contained nearly 8M samples. 
-This requirement of a large input dataset is why we expect most users
-will skip this step and simply use our provided default file.
-
-If you want to test alias detection you can do:
+If you have issues or want to contribute, please file an issue or open a 
+pull request through GitHub.
 
-```
-$./avclass_alias_detect.py -lb data/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases
-```
+## License
 
-The -nalias threshold provides the minimum number of samples two tokens 
-need to be observed in to be considered aliases. 
-If the switch is not provided the default is 20.
-
-The -talias threshold provides the minimum fraction of times that 
-the samples appear together.
-If the switch is not provided the default is 0.94 (94%).
-
-The above command outputs two files:
-*malheurReference.aliases* and *malheurReference_lb.alias*.
-Each of them has 6 columns: 
-1. t1: token that is an alias
-2. t2: family for which t1 is an alias
-3. |t1|: number of input samples where t1 was observed
-4. |t2|: number of input samples where t2 was observed
-5. |t1^t2|: number of input samples where both t1 and t2 were observed
-6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 
-were observed over the number of input samples where t1 was observed.
-
-File *malheurReference.aliases* is the final output with the 
-detected aliases that satisfy the -nalias and -talias thresholds.
-The file *malheurReference_lb.alias* has this information for all tokens.
-Thus, *malheurReference.aliases* is a subset 
-of *malheurReference_lb.alias*.
-
-However, note that in the above command you are trying to identify aliases
-from a small dataset since Drebin only contains 3K samples.
-Thus, *malheurReference.aliases* only contains 6 identified aliases. 
-Using those 6 aliases will produce significantly worse results than using 
-the aliases in *data/default.aliases*.
-As mentioned, to improve the identified aliases you should provide as 
-input several million samples.
-For more details you can refer to our RAID 2016 paper.
-
-
-## Support
-
-If you have issues or want to contribute generic tokens and/or aliases,
-please file a bug report through GitHub.
+AVClass and AVClass2 are both released under the MIT license.
 
 ## Contributors
 
-Several members of the MaliciaLab at the 
-[IMDEA Software Institute](http://software.imdea.org) 
-have contributed code to AVClass including:
-Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, and 
-Juan Caballero.
+Several members of the MaliciaLab at the [IMDEA Software Institute](http://software.imdea.org) 
+have contributed code to AVClass and AVClass2: 
+Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, Silvia Sebastián, and Juan Caballero.
 
diff --git a/avclass/README.md b/avclass/README.md
new file mode 100644
index 0000000..134fa87
--- /dev/null
+++ b/avclass/README.md
@@ -0,0 +1,392 @@
+# AVClass
+
+AVClass is a malware labeling tool.
+
+You give it as input the AV labels for a large number of 
+malware samples (e.g., VirusTotal JSON reports) and it outputs the most 
+likely family name for each sample that it can extract from the AV labels. 
+It can also output a ranking of all alternative names it found for each sample.
+
+The design and evaluation of AVClass is detailed in our 
+[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf):
+
+> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. 
+AVClass: A Tool for Massive Malware Labeling. 
+In Proceedings of the International Symposium on Research in 
+Attacks, Intrusions and Defenses,
+September 2016.
+
+In a nutshell, AVClass comprises two phases: 
+preparation (optional) and labeling.
+Code for both is included, 
+but most users will be only interested in the labeling, which outputs the 
+family name for the samples. 
+The preparation produces a list of aliases and generic tokens 
+used by the labeling. 
+If you use our default aliases and generic tokens lists, 
+you do not need to run the preparation.
+
+
+## Labeling 
+   
+The labeler takes as input 
+a JSON file with the AV labels of malware samples (-vt or -lb options), 
+a file with generic tokens (-gen option), 
+and a file with aliases (-alias option). 
+It outputs the most likely family name for each sample.
+If you do not provide alias or generic tokens files, 
+the default ones in the *data* folder are used.
+
+```shell
+$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v > malheurReference.labels
+```
+  
+The above command labels the samples whose AV labels are in the 
+*../examples/malheurReference_lb.json* file.
+It prints the results to stdout, 
+which we redirect to the *malheurReference.labels* file.
+The output looks like this:
+
+```
+aca2d12934935b070df8f50e06a20539 adrotator
+67d15459e1f85898851148511c86d88d adultbrowser
+```
+
+which means sample aca2d12934935b070df8f50e06a20539 is most likely 
+from the *adrotator* family and 
+67d15459e1f85898851148511c86d88d from the *adultbrowser* family.
+
+The verbose (-v) option makes it output an extra 
+*malheurReference_lb.verbose* file
+with all families extracted for each sample ranked by the number of AV 
+engines that use that family.
+The file looks like this:
+
+```
+aca2d12934935b070df8f50e06a20539  [(u'adrotator', 8), (u'zlob', 2)]
+ee90a64fcfaa54a314a7b5bfe9b57357  [(u'swizzor', 19)]
+f465a2c1b852373c72a1ccd161fbe94c  SINGLETON:f465a2c1b852373c72a1ccd161fbe94c
+```
+
+which means that for sample aca2d12934935b070df8f50e06a20539 
+there are 8 AV engines assigning *adrotator* as the family and  
+another 2 assigning *zlob*.
+Thus, *adrotator* is the most likely family.
+On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV 
+engines assigning *swizzor* as family, 
+and no other family was found.
+The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c
+no family name was found in the AV labels. 
+Thus, the sample is placed by itself in a singleton cluster 
+with the name of the cluster being the sample's hash.
+
+Note that the sum of the number of AV engines may not equal the number 
+of AV engines with a label for that sample in the input file 
+because the labels of some AV engines may only include generic tokens 
+that are removed by AVClass.
+
+## Input JSON format
+
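+AVClass supports three input JSON formats (VirusTotal API v2 reports, VirusTotal API v3 reports, and simplified JSON), provided as one of two kinds of input file: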
+
+1. VirusTotal JSON reports (**-vt** file), where each line in file should be 
+   the full JSON of a VirusTotal report as fetched through the VirusTotal API. 
+   By default, it assumes the VT reports are from VT API version 2. 
+   If the VT reports are from VT API version 3, add the **-vt3** command line option.
+
+2. Simplified JSON (**-lb** file), where each line in file should be a JSON with 
+   (at least) these fields: {md5, sha1, sha256, scan_date, av_labels}. 
+   There is an example of such input file in ../examples/malheurReference_lb.json
+
+**Multiple input files**
+
+AVClass can handle multiple input files putting the results in the same output files 
+(if you want results in separate files, process each input file separately).
+
+It is possible to provide the -vt and -lb input options multiple times.
+
+```shell
+$./avclass_labeler.py -vt <file1> -vt <file2>
+```
+```shell
+$./avclass_labeler.py -lb <file1> -lb <file2>
+```
+
+There are also -vtdir and -lbdir options that can be used to provide 
+an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports:
+
+```shell
+$./avclass_labeler.py -vtdir <directory>
+```
+
+It is also possible to combine -vt with -vtdir and -lb with -lbdir, 
+but you cannot combine input files of different format. Thus, this command works:
+
+```shell
+$./avclass_labeler.py -vt <file> -vtdir <directory>
+```
+
+But, this one throws an error:
+
+```shell
+$./avclass_labeler.py -vt <file1> -lb <file2>
+```
+
+## Labeling: Family Ranking
+
+AVClass has a -fam option to output a file with a ranking of the 
+families assigned to the input samples. 
+
+```shell
+$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -fam > malheurReference.labels
+```
+
+will produce a file called *malheurReference_lb.families* with two columns:
+
+```
+virut 441
+allaple 301
+podnuha 300
+```
+
+indicating that 441 samples were classified in the virut family, 
+301 as allaple, and 300 as podnuha.
+
+This option is very similar to using the following shell command:
+
+```shell
+$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr
+```
+
+The main difference is that using the -fam option all SINGLETON samples, 
+i.e., those for which no label was found, 
+are grouped into a fake *SINGLETONS* family, 
+while the shell command would leave each singleton as a separate family.
+
+
+## Labeling: PUP Classification
+
+AVClass also has a -pup option to classify a sample as
+Potentially Unwanted Program (PUP) or malware.
+This classification looks for PUP-related keywords
+(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our
+[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf):
+
+> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero.
+Certified PUP: Abuse in Authenticode Code Signing.
+In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015
+
+```shell
+$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup > malheurReference.labels
+```
+
+With the -pup option the output of the *malheurReference.labels* file
+looks like this:
+
+```
+aca2d12934935b070df8f50e06a20539 adrotator 1
+67d15459e1f85898851148511c86d88d adultbrowser 0
+```
+
+The digit at the end is a Boolean flag that 
+indicates sample aca2d12934935b070df8f50e06a20539 is
+(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not.
+
+In our experience the PUP classification is conservative,
+i.e., if it says the sample is PUP, it most likely is.
+But, if it says that it is not PUP, it could still be PUP if the AV labels
+do not contain PUP-related keywords.
+Note that it is possible that some samples from a family get 
+the PUP flag while other samples from the same family do not
+because the PUP-related keywords may not appear in the labels of 
+all samples from the same family. 
+To address this issue, you can combine the -pup option with the -fam option.
+This combination will add into the families file the classification of the 
+family as malware or PUP, based on a majority vote among the samples in a 
+family.
+
+```shell
+$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup -fam > malheurReference.labels
+```
+
+will produce a file called *malheurReference_lb.families* with five columns:
+
+```
+# Family  Total Malware PUP FamType
+virut 441 441 0 malware
+magiccasino 173 0 173 pup
+ejik  168 124 44  malware
+```
+
+For virut, the numbers indicate all the 441 virut samples are classified 
+as malware, and thus the last column states that virut is a malware family. 
+For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP.
+For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, 
+so the family is classified as malware.
+
+
+## Labeling: Ground Truth Evaluation
+
+If you have ground truth for some malware samples, 
+i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that 
+ground truth.
+The evaluation metrics used are precision, recall, and F1 measure.
+See our RAID 2016 paper above for their definition.
+
+```shell
+$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -gt ../examples/malheurReference_gt.tsv -eval > malheurReference.labels
+```
+
+The output includes these lines:
+
+```
+Calculating precision and recall
+3131 out of 3131
+Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
+```
+
+The last line corresponds to the accuracy metrics obtained by 
+comparing AVClass results with the provided ground truth.
+
+Each line in the *../examples/malheurReference_gt.tsv* file has 
+two **tab-separated** columns:
+
+```
+0058780b175c3ce5e244f595951f611b8a24bee2 CASINO
+```
+
+which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 
+is known to be of the *CASINO* family.
+Each sample in the input file should also appear in the ground truth file.
+Note that the particular label assigned to each family does not matter. 
+What matters is that all samples in the same family are assigned the 
+same family name (i.e., the same string in the second column) 
+
+The ground truth can be obtained from publicly available malware 
+datasets. 
+The one in *../examples/malheurReference_gt.tsv* comes from the 
+[Malheur](http://www.mlsec.org/malheur/) dataset. 
+There are other public datasets with ground truth such as 
+[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
+[Malicia](http://malicia-project.com/dataset.html).
+
+
+## Preparation: Generic Token Detection
+
+The labeling takes as input a file with generic tokens that should be 
+ignored in the AV labels, e.g., trojan, virus, generic, linux.
+By default, the labeling uses the *data/default.generics* 
+generic tokens file.
+You can edit that file to add additional generic tokens you feel 
+we are missing.
+
+In our RAID 2016 paper we describe an automatic approach to 
+identify generic tokens, which **requires ground truth**, 
+i.e., it requires knowing the true family for each input sample.
+Not only that, but **the ground truth should be large**, 
+i.e., contain at least one hundred thousand samples. 
+In our work we identified generic tokens using as ground truth 
+the concatenation of all datasets for which we had ground truth.
+This requirement of a large ground truth dataset is why we expect most users 
+will skip this step and simply use our provided default file.
+
+If you want to test generic token detection you can do:
+
+```shell
+ $./avclass_generic_detect.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv -tgen 10 > malheurReference.gen 
+```
+
+Each line in the *../examples/malheurReference_gt.tsv* file has 
+two **tab-separated** columns:
+
+```
+0058780b175c3ce5e244f595951f611b8a24bee2 CASINO
+```
+
+which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 
+is known to be of the *CASINO* family.
+
+The *-tgen 10* option is a threshold for the minimum number of families 
+where a token has to be observed to be considered generic. 
+If the option is omitted, the default threshold of 8 is used.
+
+The above command outputs two files: 
+*malheurReference.gen* and *malheurReference_lb.gen*. 
+Each of them has 2 columns: token and number of families where the token 
+was observed.
+File *malheurReference.gen* is the final output with the detected 
+generic tokens for which the number of families is above 
+the given threshold. 
+The file *malheurReference_lb.gen* has this information for all tokens.
+Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. 
+
+However, note that in the above command you are trying to identify generic 
+tokens from a small dataset since the Malheur reference dataset only contains 3K labeled samples. 
+Thus, *malheurReference.gen* only contains 25 identified generic tokens. 
+Using those 25 generic tokens will produce significantly worse results 
+than using the generic tokens in *data/default.generics*. 
+For more details you can refer to our RAID 2016 paper.
+
+
+## Preparation: Alias Detection
+
+Different vendors may assign different names (i.e., aliases) for the same
+family. For example, some vendors may use *zeus* and others *zbot* 
+as aliases for the same malware family. 
+The labeling takes as input a file with aliases that should be merged.
+By default, the labeling uses the *data/default.aliases* aliases file.
+You can edit that file to add additional aliases you feel we are missing.
+
+In our RAID 2016 paper we describe an automatic approach 
+to identify aliases.
+Our alias detection approach 
+**requires as input the AV labels for a large set of samples**, 
+e.g., several million samples. 
+In contrast with the generic token detection, the input samples for 
+alias detection **do not need to be labeled**, 
+i.e., no need to know their family.
+In our work we identified aliases using as input the largest of our 
+unlabeled datasets, which contained nearly 8M samples. 
+This requirement of a large input dataset is why we expect most users
+will skip this step and simply use our provided default file.
+
+If you want to test alias detection you can do:
+
+```shell
+$./avclass_alias_detect.py -lb ../examples/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases
+```
+
+The -nalias threshold provides the minimum number of samples two tokens 
+need to be observed in to be considered aliases. 
+If the option is not provided the default is 20.
+
+The -talias threshold provides the minimum fraction of times that 
+the samples appear together.
+If the option is not provided the default is 0.94 (94%).
+
+The above command outputs two files:
+*malheurReference.aliases* and *malheurReference_lb.alias*.
+Each of them has 6 columns: 
+1. t1: token that is an alias
+2. t2: family for which t1 is an alias
+3. |t1|: number of input samples where t1 was observed
+4. |t2|: number of input samples where t2 was observed
+5. |t1^t2|: number of input samples where both t1 and t2 were observed
+6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 
+were observed over the number of input samples where t1 was observed.
+
+File *malheurReference.aliases* is the final output with the 
+detected aliases that satisfy the -nalias and -talias thresholds.
+The file *malheurReference_lb.alias* has this information for all tokens.
+Thus, *malheurReference.aliases* is a subset 
+of *malheurReference_lb.alias*.
+
+However, note that in the above command you are trying to identify aliases
+from a small dataset since the Malheur reference dataset only contains 3K samples.
+Thus, *malheurReference.aliases* only contains 6 identified aliases. 
+Using those 6 aliases will produce significantly worse results than using 
+the aliases in *data/default.aliases*.
+As mentioned, to improve the identified aliases you should provide as 
+input several million samples.
+For more details you can refer to our RAID 2016 paper.
+
diff --git a/avclass_alias_detect.py b/avclass/avclass_alias_detect.py
similarity index 100%
rename from avclass_alias_detect.py
rename to avclass/avclass_alias_detect.py
diff --git a/avclass_generic_detect.py b/avclass/avclass_generic_detect.py
similarity index 100%
rename from avclass_generic_detect.py
rename to avclass/avclass_generic_detect.py
diff --git a/avclass_labeler.py b/avclass/avclass_labeler.py
similarity index 100%
rename from avclass_labeler.py
rename to avclass/avclass_labeler.py
diff --git a/data/default.aliases b/avclass/data/default.aliases
similarity index 100%
rename from data/default.aliases
rename to avclass/data/default.aliases
diff --git a/data/default.generics b/avclass/data/default.generics
similarity index 100%
rename from data/default.generics
rename to avclass/data/default.generics
diff --git a/lib/avclass_common.py b/avclass/lib/avclass_common.py
similarity index 100%
rename from lib/avclass_common.py
rename to avclass/lib/avclass_common.py
diff --git a/lib/evaluate_clustering.py b/avclass/lib/evaluate_clustering.py
similarity index 100%
rename from lib/evaluate_clustering.py
rename to avclass/lib/evaluate_clustering.py
diff --git a/avclass2/README.md b/avclass2/README.md
new file mode 100644
index 0000000..fb5ca17
--- /dev/null
+++ b/avclass2/README.md
@@ -0,0 +1,252 @@
+# AVClass2
+
+AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
+
+You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports)
+and it outputs tags observed in the AV labels, ranked by decreasing popularity. 
+
+The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper.
+
+> Silvia Sebastián, Juan Caballero. 
+AVClass2: Massive Malware Tag Extraction from AV Labels. 
+In proceedings of the Annual Computer Security Applications Conference, December 2020.
+
+In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module.
+
+
+## Labeling
+
+The labeler takes as input a JSON file with the AV labels of malware samples 
+(-vt or -lb options), 
+a file with the taxonomy (-tax option), 
+a file with tagging rules (-tag option), and
+a file with expansion rules (-exp option). 
+It outputs a set of ranked tags. 
+If you do not provide taxonomy, expansion or tagging files, 
+the default ones in the data folder are used.
+
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json
+```
+
+The above command labels the samples whose AV labels are in 
+the ../examples/malheurReference_lb.json file. 
+It prints the results to stdout. 
+The output looks like this: 
+
+```
+aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2
+67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2
+```
+
+which means sample *aca2d12934935b070df8f50e06a20539* 
+was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, 
+8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, 
+3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family.
+Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
+consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
+
+The -p option outputs the full path of each tag in the taxonomy: 
+
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p
+```
+
+The above command line outputs:
+
+```
+aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2
+67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2
+```
+
+where each tag has been replaced by its taxonomy path, which starts with the category in capitals, 
+followed by the path in the category (if any), and the tag itself, all separated by colons. 
+For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, 
+*CLASS:grayware* that *grayware* is a malware class, and 
+*CLASS:grayware:adware* that *adware* is a subclass of *grayware*.
+
+**Compatibility mode**
+
+The compatibility -c option makes AVClass2 output the same format as AVClass. 
+
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c
+```
+
+outputs:
+
+```
+bb23e1d296cf01bbaf32ed3938f9b0b8 allaple
+cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349
+```
+
+As in AVClass, the output contains only the family name, 
+which corresponds to the highest ranked family tag, all other tags are ignored.
+Samples for which a family cannot be obtained are labeled as singletons with their hash.
+ 
+It is important to note that AVClass2 compatibility mode results can differ from AVClass results
+on the same input file.
+The differences in family names are due to differences between the generics and aliases files 
+used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. 
+In the future, we may change AVClass to use the taxonomy and rules from AVClass2 
+as input (instead of the generics and aliases files) 
+to minimize such differences and avoid maintaining different data files.
+
+
+## Input JSON format
+
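+AVClass2 supports three input JSON formats (VirusTotal API v2 reports, VirusTotal API v3 reports, and simplified JSON), provided as one of two kinds of input file: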
+
+1. VirusTotal JSON reports (**-vt** file), where each line in file should be 
+   the full JSON of a VirusTotal report as fetched through the VirusTotal API. 
+   By default, it assumes the VT reports are from VT API version 2. 
+   If the VT reports are from VT API version 3, add the **-vt3** command line option.
+
+2. Simplified JSON (**-lb** file), where each line in file should be a JSON with 
+   (at least) these fields: {md5, sha1, sha256, scan_date, av_labels}. 
+   There is an example of such input file in ../examples/malheurReference_lb.json
+
+**Multiple input files**
+
+AVClass2 can handle multiple input files putting the results in the same output files 
+(if you want results in separate files, process each input file separately).
+
+It is possible to provide the -vt and -lb input options multiple times.
+
+```shell
+$./avclass2_labeler.py -vt <file1> -vt <file2>
+```
+```shell
+$./avclass2_labeler.py -lb <file1> -lb <file2>
+```
+
+There are also -vtdir and -lbdir options that can be used to provide 
+an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports:
+
+```shell
+$./avclass2_labeler.py -vtdir <directory>
+```
+
+It is also possible to combine -vt with -vtdir and -lb with -lbdir, 
+but you cannot combine input files of different format. Thus, this command works:
+
+```shell
+$./avclass2_labeler.py -vt <file> -vtdir <directory>
+```
+
+But, this one throws an error:
+
+```shell
+$./avclass2_labeler.py -vt <file1> -lb <file2>
+```
+
+At this point you have read the most important information on how to use AVClass2. 
+The following sections describe steps that most users will not need.
+
+## Labeling: Ground Truth Evaluation
+
+If you have family ground truth for some malware samples, i.e., 
+you know the true family for those samples, you can evaluate the accuracy 
+of the family tags output by AVClass2 on those samples with respect to that ground truth. 
+The evaluation metrics used are precision, recall, and F1 measure. 
+See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition.
+Note that the ground truth evaluation does not apply to non-family tags, 
+i.e., it only evaluates the output of the compatibility mode.
+
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels
+```
+
+The output includes these lines:
+
+```
+Calculating precision and recall
+3131 out of 3131
+Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
+```
+
+Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns:
+
+```
+aca2d12934935b070df8f50e06a20539 ADROTATOR
+```
+
+which indicates that sample aca2d12934935b070df8f50e06a20539 is known 
+to be of the *ADROTATOR* family. 
+Each sample in the input file should also appear in the ground truth file. 
+Note that the particular label assigned to each family does not matter. 
+What matters is that all samples in the same family are assigned 
+the same family name (i.e., the same string in the second column)
+
+The ground truth can be obtained from publicly available malware datasets. 
+The one in *../examples/malheurReference_gt.tsv* comes from the 
+[Malheur](http://www.mlsec.org/malheur/) dataset. 
+There are other public datasets with ground truth such as 
+[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
+[Malicia](http://malicia-project.com/dataset.html).
+
+## Update Module
+
+The update module can be used to suggest additions and changes to the input 
+taxonomy, tagging rules, and expansion rules. 
+Using the update module comprises two steps.
+The first step is obtaining an alias file from the labeler:
+
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect
+```
+
+The above command will create a file named \<file\>.alias, 
+malheurReference_lb.alias in our example. This file has 7 columns:
+
+1. t1: token that is an alias
+2. t2: tag for which t1 is an alias
+3. |t1|: number of input samples where t1 was observed
+4. |t2|: number of input samples where t2 was observed
+5. |t1^t2|: number of input samples where both t1 and t2 were observed
+6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed.
+7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed.
+
+
+The Update Module takes the above file as input with the -alias option, 
+as well as the default taxonomy, tagging, and expansion files in the data directory. 
+It outputs updated taxonomy, tagging, and expansion files that include the 
+suggested additions and changes. 
+
+```shell
+$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix
+```
+
+This will produce three files: 
+output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. 
+You can diff the output and input files to analyze the proposed changes.
+
+You can also modify the input taxonomy, tagging, and expansion rules in place, 
+rather than producing new files:
+
+
+```shell
+$./avclass2_update_module.py -alias malheurReference_lb.alias -update
+```
+
+
+## Customizing AVClass2
+
+AVClass2 is fully customizable: 
+Tagging, Expansion and Taxonomy files can be easily modified by the analyst 
+either manually or by running the update module. 
+
+If you change those files manually, we recommend running 
+afterwards the input checker script to keep them tidy. 
+It sorts the tags in the taxonomy and performs some basic cleaning like 
+removing redundant entries:
+
+```shell
+$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansion_file
+```
+
+If the modifications are in the default files in the data directory you can simply run: 
+
+```shell
+$./avclass2_input_checker.py 
+```
diff --git a/avclass2/avclass2_input_checker.py b/avclass2/avclass2_input_checker.py
new file mode 100755
index 0000000..adbf8e5
--- /dev/null
+++ b/avclass2/avclass2_input_checker.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python2
+'''
+AVClass2 input checker: rewrites the taxonomy, tagging, and expansion files
+in place after loading them and validating the rules against the taxonomy
+'''
+import os
+import sys
+import argparse
+script_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, os.path.join(script_dir, 'lib/'))
+from avclass2_common import Taxonomy, Tagging, Expansion
+
+# Default files are anchored to the script directory (as in the labeler),
+# so the checker works regardless of the current working directory
+default_tag_file = os.path.join(script_dir, "data/tagging")
+default_tax_file = os.path.join(script_dir, "data/taxonomy")
+default_exp_file = os.path.join(script_dir, "data/expansion")
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(prog='input_checker',
+        description='Checks format of files Tagging, Expansion and Taxonomy.')
+
+    # Every option falls back to the corresponding default file
+    argparser.add_argument('-tag', help='tagging file', default=default_tag_file)
+    argparser.add_argument('-tax', help='taxonomy file', default=default_tax_file)
+    argparser.add_argument('-exp', help='expansion file', default=default_exp_file)
+
+    # Parse arguments
+    args = argparser.parse_args()
+
+    # Normalize taxonomy (loaded first: the rules below are validated against it)
+    taxonomy = Taxonomy(args.tax)
+    taxonomy.to_file(args.tax)
+    sys.stdout.write('[-] Normalized %d tags in taxonomy %s\n' % (
+                        len(taxonomy), args.tax))
+
+    # Normalize tagging rules
+    tagging = Tagging(args.tag)
+    tagging.validate(taxonomy)
+    # tagging.expand_all_destinations()
+    tagging.to_file(args.tag)
+    sys.stdout.write('[-] Normalized %d tagging rules in %s\n' % (
+                        len(tagging), args.tag))
+
+    # Normalize expansion rules
+    expansion = Expansion(args.exp)
+    expansion.validate(taxonomy)
+    expansion.to_file(args.exp)
+    sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % (
+                        len(expansion), args.exp))
+
diff --git a/avclass2/avclass2_labeler.py b/avclass2/avclass2_labeler.py
new file mode 100755
index 0000000..cf872dd
--- /dev/null
+++ b/avclass2/avclass2_labeler.py
@@ -0,0 +1,469 @@
+#!/usr/bin/env python2
+'''
+AVClass2 labeler
+'''
+
+import os
+import sys
+script_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, os.path.join(script_dir, 'lib/'))
+import argparse
+from avclass2_common import AvLabels
+from operator import itemgetter
+import evaluate_clustering as ec
+import json
+import traceback
+
+# Default tagging rules file, located relative to this script's directory
+default_tag_file = os.path.join(script_dir, "data/tagging")
+# Default expansion rules file, located relative to this script's directory
+default_exp_file = os.path.join(script_dir, "data/expansion")
+# Default taxonomy file, located relative to this script's directory
+default_tax_file = os.path.join(script_dir, "data/taxonomy")
+
+def guess_hash(h):
+    ''' Infer the hash algorithm of a digest string from its length.
+
+        Returns 'md5', 'sha1' or 'sha256' for strings of 32, 40 or 64
+        characters respectively, and None for any other length. '''
+    algorithm_by_length = {
+        32: 'md5',
+        40: 'sha1',
+        64: 'sha256',
+    }
+    return algorithm_by_length.get(len(h))
+
+def format_tag_pairs(l, taxonomy=None):
+    ''' Render ranked (tag, score) pairs as "tag|score,tag|score,...".
+        When a taxonomy is given, each tag is shown as its taxonomy path '''
+    if not l:
+        return ""
+    # Identity mapping unless a taxonomy is available to resolve paths
+    show = taxonomy.get_path if taxonomy else (lambda tag: tag)
+    fields = ["%s|%d" % (show(l[0][0]), l[0][1])]
+    fields.extend("%s|%d" % (show(t), s) for (t, s) in l[1:])
+    return ",".join(fields)
+
+def list_str(l, sep=", ", prefix=""):
+    ''' Join a list of strings into a single string.
+        sep separates consecutive items and prefix is prepended to the
+        result; an empty (or None) list yields "" without the prefix '''
+    if l:
+        # Same result as prepending prefix to l[0] and appending the rest
+        return prefix + sep.join(l)
+    return ""
+
+def main(args):
+    ''' Label every sample in the input files; args is the parsed CLI namespace '''
+    hash_type = args.hash if args.hash else 'md5'  # sample id, MD5 by default
+
+    # If ground truth provided, read it from file
+    gt_dict = {}
+    if args.gt:
+        with open(args.gt, 'r') as gt_fd:
+            for line in gt_fd:
+                gt_hash, family = map(str, line.strip().split('\t', 1))
+                gt_dict[gt_hash] = family
+
+        # Guess hash type from a ground truth key; keep current one if unsure
+        hash_type = guess_hash(next(iter(gt_dict), '')) or hash_type
+
+    # Create AvLabels object
+    av_labels = AvLabels(args.tag, args.exp, args.tax,
+                         args.av, args.aliasdetect)
+
+    # Build list of input files
+    # NOTE: duplicate input files are not removed
+    ifile_l = []
+    if (args.vt):
+        ifile_l += args.vt
+        ifile_are_vt = True
+    if (args.lb):
+        ifile_l += args.lb
+        ifile_are_vt = False
+    if (args.vtdir):
+        ifile_l += [os.path.join(args.vtdir,
+                                  f) for f in os.listdir(args.vtdir)]
+        ifile_are_vt = True
+    if (args.lbdir):
+        ifile_l += [os.path.join(args.lbdir,
+                                  f) for f in os.listdir(args.lbdir)]
+        ifile_are_vt = False
+
+    # Select correct sample info extraction function
+    if not ifile_are_vt:
+        get_sample_info = av_labels.get_sample_info_lb
+    elif args.vt3:
+        get_sample_info = av_labels.get_sample_info_vt_v3
+    else:
+        get_sample_info = av_labels.get_sample_info_vt_v2
+
+    # Select output prefix
+    out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0])
+
+    # Initialize state
+    first_token_dict = {}
+    token_count_map = {}
+    pair_count_map = {}
+    vt_all = 0
+    avtags_dict = {}
+    stats = {'samples': 0, 'noscans': 0, 'tagged': 0, 'maltagged': 0,
+             'FAM': 0, 'CLASS': 0, 'BEH': 0, 'FILE': 0, 'UNK': 0}
+
+    # Process each input file
+    for ifile in ifile_l:
+        # Open file
+        fd = open(ifile, 'r')
+
+        # Debug info, file processed
+        sys.stderr.write('[-] Processing input file %s\n' % ifile)
+
+        # Process all lines in file
+        for line in fd:
+
+            # If blank line (including whitespace-only or CRLF), skip
+            if not line.strip():
+                continue
+
+            # Debug info
+            if vt_all % 100 == 0:
+                sys.stderr.write('\r[-] %d JSON read' % vt_all)
+                sys.stderr.flush()
+            vt_all += 1
+
+            # Read JSON line
+            vt_rep = json.loads(line)
+
+            # Extract sample info
+            sample_info = get_sample_info(vt_rep)
+
+            # If no sample info, log error and continue
+            if sample_info is None:
+                try:
+                    name = vt_rep['md5']
+                    sys.stderr.write('\nNo scans for %s\n' % name)
+                except KeyError:
+                    sys.stderr.write('\nCould not process: %s\n' % line)
+                sys.stderr.flush()
+                stats['noscans'] += 1
+                continue
+
+            # Sample's name is selected hash type (md5 by default)
+            name = getattr(sample_info, hash_type)
+
+            # If the VT report has no AV labels, output and continue
+            if not sample_info.labels:
+                sys.stdout.write('%s\t-\t[]\n' % (name))
+                # sys.stderr.write('\nNo AV labels for %s\n' % name)
+                # sys.stderr.flush()
+                continue
+
+            # Compute VT_Count
+            vt_count = len(sample_info.labels)
+
+            # Get the distinct tokens from all the av labels in the report
+            # And print them.
+            try:
+                av_tmp = av_labels.get_sample_tags(sample_info)
+                tags = av_labels.rank_tags(av_tmp)
+
+                # AV VENDORS PER TOKEN
+                if args.avtags:
+                    for t in av_tmp:
+                        tmap = avtags_dict.get(t, {})
+                        for av in av_tmp[t]:
+                            ctr = tmap.get(av, 0)
+                            tmap[av] = ctr + 1
+                        avtags_dict[t] = tmap
+
+                if args.aliasdetect:
+                    prev_tokens = set()
+                    for entry in tags:
+                        curr_tok = entry[0]
+                        curr_count = token_count_map.get(curr_tok, 0)
+                        token_count_map[curr_tok] = curr_count + 1
+                        for prev_tok in prev_tokens:
+                            if prev_tok < curr_tok:
+                                pair = (prev_tok,curr_tok)
+                            else:
+                                pair = (curr_tok,prev_tok)
+                            pair_count = pair_count_map.get(pair, 0)
+                            pair_count_map[pair] = pair_count + 1
+                        prev_tokens.add(curr_tok)
+
+                # Collect stats
+                # FIX: should iterate once over tags,
+                # for both stats and aliasdetect
+                if tags:
+                    stats["tagged"] += 1
+                    if args.stats:
+                        if (vt_count > 3):
+                            stats["maltagged"] += 1
+                            cat_map = {'FAM': False, 'CLASS': False,
+                                       'BEH': False, 'FILE': False, 'UNK':
+                                           False}
+                            for t in tags:
+                                path, cat = av_labels.taxonomy.get_info(t[0])
+                                cat_map[cat] = True
+                            for c in cat_map:
+                                if cat_map[c]:
+                                    stats[c] += 1
+
+                # Check if sample is PUP, if requested
+                if args.pup:
+                    if av_labels.is_pup(tags, av_labels.taxonomy):
+                        is_pup_str = "\t1"
+                    else:
+                        is_pup_str = "\t0"
+                else:
+                    is_pup_str =  ""
+
+                # Select family for sample if needed,
+                # i.e., for compatibility mode or for ground truth
+                if args.c or args.gt:
+                    fam = "SINGLETON:" + name
+                    # fam = ''
+                    for (t,s) in tags:
+                        cat = av_labels.taxonomy.get_category(t)
+                        if (cat == "UNK") or (cat == "FAM"):
+                            fam = t
+                            break
+
+                # Get ground truth family, if available
+                if args.gt:
+                    first_token_dict[name] = fam
+                    gt_family = '\t' + gt_dict.get(name, "")
+                else:
+                    gt_family = ""
+
+                # Get VT tags as string
+                if args.vtt:
+                    vtt = list_str(sample_info.vt_tags, prefix="\t")
+                else:
+                    vtt = ""
+
+                # Print family (and ground truth if available) to stdout
+                if not args.c:
+                    if args.path:
+                        tag_str = format_tag_pairs(tags, av_labels.taxonomy)
+                    else:
+                        tag_str = format_tag_pairs(tags)
+                    sys.stdout.write('%s\t%d\t%s%s%s%s\n' %
+                                     (name, vt_count, tag_str, gt_family,
+                                      is_pup_str, vtt))
+                else:
+                    sys.stdout.write('%s\t%s%s%s\n' %
+                                     (name, fam, gt_family, is_pup_str))
+            except Exception:  # log and skip the sample; Ctrl-C still works
+                traceback.print_exc(file=sys.stderr)
+                continue
+
+        # Debug info
+        sys.stderr.write('\r[-] %d JSON read' % vt_all)
+        sys.stderr.flush()
+        sys.stderr.write('\n')
+
+        # Close file
+        fd.close()
+
+    # Print statistics
+    sys.stderr.write(
+            "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" % (
+                vt_all, stats['noscans'], vt_all - stats['tagged'],
+                len(gt_dict)))
+
+    # If ground truth, print precision, recall, and F1-measure
+    if args.gt:
+        precision, recall, fmeasure = \
+                    ec.eval_precision_recall_fmeasure(gt_dict,
+                                                      first_token_dict)
+        sys.stderr.write(
+            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \
+                          (precision, recall, fmeasure))
+
+    # Output stats (a percentage is reported as 0 when its denominator is 0)
+    if args.stats:
+        def pct(num, den):
+            return float(num) / float(den) * 100 if den else 0.0
+        num_tagged = stats['tagged']
+        num_maltagged = stats['maltagged']
+        with open("%s.stats" % out_prefix, 'w') as stats_fd:
+            stats_fd.write('Samples: %d\n' % vt_all)
+            stats_fd.write('Tagged (all): %d (%.01f%%)\n' % (
+                num_tagged, pct(num_tagged, vt_all)))
+            stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (
+                num_maltagged, pct(num_maltagged, vt_all)))
+            for c in ['FILE','CLASS','BEH','FAM','UNK']:
+                # Category share among samples with more than 3 AV labels
+                stats_fd.write('%s: %d (%.01f%%)\n' % (
+                    c, stats[c], pct(stats[c], num_maltagged)))
+
+    # Output vendor info
+    if args.avtags:
+        avtags_fd = open("%s.avtags" % out_prefix, 'w')
+        for t in sorted(avtags_dict.keys()):
+            avtags_fd.write('%s\t' % t)
+            pairs = sorted(avtags_dict[t].items(),
+                            key=lambda pair : pair[1],
+                            reverse=True)
+            for pair in pairs:
+                avtags_fd.write('%s|%d,' % (pair[0], pair[1]))
+            avtags_fd.write('\n')
+        avtags_fd.close()
+
+    # If alias detection, print map
+    if args.aliasdetect:
+        # Open alias file
+        alias_filename = out_prefix + '.alias'
+        alias_fd = open(alias_filename, 'w+')
+        # Sort token pairs by number of times they appear together
+        sorted_pairs = sorted(
+            pair_count_map.items(), key=itemgetter(1))
+        # sorted_pairs = sorted(
+        #     pair_count_map.items())
+
+        # Output header line
+        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t"
+                       "|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
+        # Compute token pair statistic and output to alias file
+        for (t1, t2), c in sorted_pairs:
+            n1 = token_count_map[t1]
+            n2 = token_count_map[t2]
+            if (n1 < n2):
+                x = t1
+                y = t2
+                xn = n1
+                yn = n2
+            else:
+                x = t2
+                y = t1
+                xn = n2
+                yn = n1
+            f = float(c) / float(xn)
+            finv = float(c) / float(yn)
+            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (
+                x, y, xn, yn, c, f, finv))
+        # Close alias file
+        alias_fd.close()
+        sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))
+
+
+if __name__=='__main__':
+    argparser = argparse.ArgumentParser(prog='avclass2_labeler',
+        description='''Extracts tags for a set of samples.
+            Also calculates precision and recall if ground truth available''')
+
+    argparser.add_argument('-vt', action='append',
+        help='file with VT reports '
+             '(Can be provided multiple times)')
+
+    argparser.add_argument('-lb', action='append',
+        help='file with simplified JSON reports'
+             '{md5,sha1,sha256,scan_date,av_labels} '
+             '(Can be provided multiple times)')
+
+    argparser.add_argument('-vtdir',
+        help='existing directory with VT reports')
+
+    argparser.add_argument('-lbdir',
+        help='existing directory with simplified JSON reports')
+
+    argparser.add_argument('-vt3', action='store_true',
+        help='input are VT v3 files')
+
+    argparser.add_argument('-gt',
+        help='file with ground truth. '
+             'If provided it evaluates clustering accuracy. '
+             'Prints precision, recall, F1-measure.')
+
+    argparser.add_argument('-vtt',
+        help='Include VT tags in the output.',
+        action='store_true')
+
+    argparser.add_argument('-tag',
+        help='file with tagging rules.',
+        default = default_tag_file)
+
+    argparser.add_argument('-tax',
+        help='file with taxonomy.',
+        default = default_tax_file)
+
+    argparser.add_argument('-exp',
+        help='file with expansion rules.',
+        default = default_exp_file)
+
+    argparser.add_argument('-av',
+        help='file with list of AVs to use')
+
+    argparser.add_argument('-avtags',
+        help='extracts tags per av vendor',
+        action='store_true')
+
+    argparser.add_argument('-pup',
+        action='store_true',
+        help='if used each sample is classified as PUP or not')
+
+    argparser.add_argument('-p', '--path',
+        help='output.full path for tags',
+        action='store_true')
+
+    argparser.add_argument('-hash',
+        help='hash used to name samples. Should match ground truth',
+        choices=['md5', 'sha1', 'sha256'])
+
+    argparser.add_argument('-c',
+        help='Compatibility mode. Outputs results in AVClass format.',
+        action='store_true')
+
+    argparser.add_argument('-aliasdetect',
+        action='store_true',
+        help='if used produce aliases file at end')
+
+    argparser.add_argument('-stats',
+                           action='store_true',
+                           help='if used produce 1 file '
+                                'with stats per category '
+                                '(File, Class, '
+                                'Behavior, Family, Unclassified)')
+
+    args = argparser.parse_args()
+    # Need at least one input source; VT and simplified reports cannot mix
+    if not (args.vt or args.lb or args.vtdir or args.lbdir):
+        sys.stderr.write('Argument -vt, -lb, -vtdir or -lbdir is required\n')
+        sys.exit(1)
+
+    if (args.vt or args.vtdir) and (args.lb or args.lbdir):
+        sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir, not both.\n')
+        sys.exit(1)
+
+    if args.tag:
+        if args.tag == '/dev/null':
+            sys.stderr.write('[-] Using no tagging rules\n')
+        else:
+            sys.stderr.write('[-] Using tagging rules in %s\n' % (
+                              args.tag))
+    else:
+        sys.stderr.write('[-] Using default tagging rules in %s\n' % (
+                          default_tag_file))
+
+    if args.tax:
+        if args.tax == '/dev/null':
+            sys.stderr.write('[-] Using no taxonomy\n')
+        else:
+            sys.stderr.write('[-] Using taxonomy in %s\n' % (
+                              args.tax))
+    else:
+        sys.stderr.write('[-] Using default taxonomy in %s\n' % (
+                          default_tax_file))
+
+    if args.exp:
+        if args.exp == '/dev/null':
+            sys.stderr.write('[-] Using no expansion tags\n')
+        else:
+            sys.stderr.write('[-] Using expansion tags in %s\n' % (
+                              args.exp))
+    else:
+        sys.stderr.write('[-] Using default expansion tags in %s\n' % (
+                          default_exp_file))
+
+    main(args)
diff --git a/avclass2/avclass2_update_module.py b/avclass2/avclass2_update_module.py
new file mode 100755
index 0000000..0dd2209
--- /dev/null
+++ b/avclass2/avclass2_update_module.py
@@ -0,0 +1,480 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+'''
+AVClass2 Update module
+'''
+import sys
+import os
+import argparse
+import logging
+# Make sure paths are relative to execution path
+script_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, os.path.join(script_dir, 'lib/'))
+from operator import itemgetter
+from collections import namedtuple
+from avclass2_common import Taxonomy, Expansion, Tagging
+# from Levenshtein import ratio as levenshtein_ratio
+
+# Module-level logger; records propagate to the root logger set up below
+log = logging.getLogger(__name__)
+
+# Log INFO and above to stderr, printing only the message text
+formatter = logging.Formatter(u'%(message)s')
+handler_stderr = logging.StreamHandler(sys.stderr)
+handler_stderr.setLevel(logging.INFO)
+handler_stderr.setFormatter(formatter)
+root = logging.getLogger()
+root.setLevel(logging.DEBUG)
+root.addHandler(handler_stderr)
+
+
+# Default tagging file
+default_tagging_file = os.path.join(script_dir, "data/tagging")
+# Default expansion file
+default_expansion_file = os.path.join(script_dir, "data/expansion")
+# Default taxonomy file
+default_taxonomy_file = os.path.join(script_dir, "data/taxonomy")
+
+# Threshold for string similarity (only referenced by commented-out code)
+sim_threshold = 0.6
+
+# Relation row: t1, t2, |t1|, |t2|, |t1^t2|, |t1^t2|/|t1|, |t1^t2|/|t2|
+Rel = namedtuple('Rel', ['t1', 't2', 't1_num', 't2_num', 
+                         'nalias_num', 'talias_num', 'tinv_alias_num'])
+
+class Update:
+    ''' Derives taxonomy, tagging and expansion updates from alias relations '''
+    def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion,
+                    n, t):
+        ''' Load the relations in rel_filepath that satisfy the thresholds.
+
+            in_taxonomy, in_tagging and in_expansion are the objects that
+            the update methods modify. A relation is kept only if its tags
+            co-occur at least n times and its |t1^t2|/|t1| ratio is at
+            least t (see is_weak_rel) '''
+        # Initialize inputs
+        self.__out_taxonomy = in_taxonomy
+        self.__out_tagging = in_tagging
+        self.__out_expansion = in_expansion
+        self.__n = n
+        self.__t = t
+        # Initialize blacklist
+        self.blist = in_taxonomy.platform_tags()
+        log.info(self.blist)
+        # Maps src -> cnt
+        self.src_map = {}
+        # Read relations from file
+        self.rel_set = self.read_relations(rel_filepath)
+
+    def num_rules(self):
+        ''' Return the number of relations still pending '''
+        return len(self.rel_set)
+
+    def is_weak_rel(self, rel):
+        ''' Return true if relationship is weak,
+            i.e., does not meet thresholds '''
+        return ((int(rel.nalias_num) < self.__n) or
+                (float(rel.talias_num) < self.__t))
+
+    def is_blacklisted_rel(self, rel):
+        ''' Return true if relationship is blacklisted '''
+        return (rel.t1 in self.blist) or (rel.t2 in self.blist)
+
+    def is_known_rel(self, rel):
+        ''' Return true if relationship is known '''
+        t1 = rel.t1
+        t2 = rel.t2
+        # Known taxonomy relation
+        if self.__out_taxonomy.overlaps(t1,t2):
+            return True
+        # Known expansion rule
+        t1_dst = self.__out_expansion.get_dst(t1)
+        t2_dst = self.__out_expansion.get_dst(t2)
+        if (t2 in t1_dst) or (t1 in t2_dst):
+            return True
+        # Known tagging rule
+        t1_dst = sorted(self.__out_tagging.get_dst(t1))
+        t2_dst = sorted(self.__out_tagging.get_dst(t2))
+        if (t2 in t1_dst) or (t1 in t2_dst):
+            return True
+        # Known alias in tagging
+        if t1_dst and (t1_dst == t2_dst):
+            return True
+        return False
+
+    def add_tag(self, name, path):
+        ''' Add tag to taxonomy if not in tagging '''
+        l = self.__out_tagging.get_dst(name)
+        if (not l):
+            self.__out_taxonomy.add_tag(path)
+
+    def add_expansion(self, src, dst_l):
+        ''' Add expansion rule fixing destination if src in tagging '''
+        # Select source handling aliases
+        l = self.__out_tagging.get_dst(src)
+        if l:
+            new_src = l[0]
+        else:
+            new_src = src
+        # Select destinations removing overlaps with existing rule
+        l = self.__out_expansion.get_dst(src)
+        if l:
+            l.extend(dst_l)
+            target_l = self.__out_taxonomy.remove_overlaps(l)
+            self.__out_expansion.add_rule(new_src, target_l, True)
+        else:
+            self.__out_expansion.add_rule(new_src, dst_l, True)
+
+    def add_alias(self, src, dst, dst_prefix):
+        ''' Add alias relation to taxonomy, tagging '''
+        # If src in tagging, use most popular target
+        l = self.__out_tagging.get_dst(src)
+        target = dst
+        if l:
+            cnt_max = self.src_map.get(dst, 0)
+            for e in l:
+                cnt = self.src_map.get(e, 0)
+                if cnt > cnt_max:
+                    # Keep the running maximum so the best one wins
+                    target, cnt_max = e, cnt
+        # If dst is in tagging, update tagging rule destination,
+        l = self.__out_tagging.get_dst(dst)
+        if l:
+            target_l = l
+        # else add dst to taxonomy
+        else:
+            target_l = [target]
+            self.__out_taxonomy.add_tag('%s:%s' % (dst_prefix, dst))
+        # Remove src from taxonomy
+        self.__out_taxonomy.remove_tag(src)
+        # Replace tagging rule
+        self.__out_tagging.add_rule(src, target_l, True)
+
+    def is_expansion_rel(self, rel):
+        ''' Return true if relation implies expansion rule '''
+        c1 = self.__out_taxonomy.get_category(rel.t1)
+        c2 = self.__out_taxonomy.get_category(rel.t2)
+        return (((c1 == "FAM") and (c2 != c1) and (c2 != "UNK")) or
+                ((c1 == "CLASS") and ((c2 == "FILE") or (c2 == "BEH"))) or
+                ((c1 == "UNK") and ((c2 == "BEH") or (c2 == "CLASS"))))
+
+    def find_expansions(self):
+        ''' Find expansions among relations '''
+        acc = []
+        for rel in self.rel_set:
+            p1 = self.__out_taxonomy.get_path(rel.t1)
+            p2 = self.__out_taxonomy.get_path(rel.t2)
+            log.info("Processing %s\t%s" % (p1, p2))
+            if self.is_expansion_rel(rel):
+                self.add_expansion(rel.t1, [rel.t2])
+                acc.append(rel)
+        for rel in acc:
+            self.rel_set.remove(rel)
+
+    def process_relation(self, rel):
+        ''' Process relation and update taxonomy/tagging correspondingly.
+
+            Returns 1 if the relation was applied, 0 if it is left pending
+            for a later pass, and -1 if it is rejected as inconsistent '''
+
+        # Obtain tag info
+        t1 = rel.t1
+        t2 = rel.t2
+        p1,c1 = self.__out_taxonomy.get_info(rel.t1)
+        p2,c2 = self.__out_taxonomy.get_info(rel.t2)
+
+        log.info("Processing %s\t%s" % (p1, p2))
+
+        # If both directions strong, then equivalent, i.e., alias
+        if (float(rel.tinv_alias_num) >= self.__t):
+            if (c1 != "UNK") and (c2 == "UNK"):
+                prefix = p1[0:p1.rfind(':')]
+            elif (c1 == "UNK") and (c2 != "UNK"):
+                prefix = p2[0:p2.rfind(':')]
+            elif (c1 == "UNK") and (c2 == "UNK"):
+                prefix = "FAM"
+            elif (c1 == c2):
+                prefix = p1[0:p1.rfind(':')]
+            else:
+                log.warning("Equivalent rule with different categories: %s\t%s" %
+                            (p1, p2))
+                return -1
+            self.add_alias(t1, t2, prefix)
+            return 1
+
+        # UNK -> FAM : alias-family
+        elif (c1 == "UNK") and (c2 == 'FAM'):
+            self.add_alias(t1, t2, "FAM")
+            return 1
+
+        # UNK -> CLASS : taxonomy-family
+        # Return 0 so that expansion handled at end
+        elif (c1 == "UNK") and (c2 == 'CLASS'):
+            self.add_tag(t1, 'FAM:%s' % t1)
+            return 0
+
+        # UNK -> BEH : taxonomy-family
+        # Return 0 so that expansion handled at end
+        elif (c1 == "UNK") and (c2 == 'BEH'):
+            self.add_tag(t1, 'FAM:%s' % t1)
+            return 0
+
+        # UNK -> FILE : taxonomy-file
+        elif (c1 == "UNK") and (c2 == 'FILE'):
+            self.add_tag(t1, '%s:%s' % (p2, t1))
+            return 1
+
+        # UNK -> UNK
+        elif (c1 == "UNK") and (c2 == "UNK"):
+            self.add_alias(t1, t2, "FAM")
+            return 1
+
+        # FAM -> UNK : alias-family
+        elif (c1 == "FAM") and (c2 == "UNK"):
+            self.add_alias(t1, t2, "FAM")
+            return 1
+
+        # FILE -> UNK : alias-file
+        elif (c1 == "FILE") and (c2 == "UNK"):
+            prefix = p1[0:p1.rfind(':')]
+            self.add_alias(t1, t2, prefix)
+            return 1
+
+        # Same category : alias
+        elif (c1 == "FAM") and (c2 == "FAM"):
+        #elif c1 == c2:
+            prefix = p2[0:p2.rfind(':')]
+            self.add_alias(t1, t2, prefix)
+            return 1
+
+        # Target unknown
+        elif (c2 == "UNK"):
+            # If tokens are similar, likely family aliases
+            # log.info("Similarity: %.02f" % levenshtein_ratio(t1, t2))
+            # if (levenshtein_ratio(t1, t2) > sim_threshold):
+            #     prefix = p1[0:p1.rfind(':')]
+            #     self.add_alias(t1, t2, prefix)
+            #     return 1
+            # else:
+            #     return 0
+            return 0
+
+        # Default: review taxonomy
+        else:
+            return 0
+
+
+    def run(self):
+        ''' Apply relations in repeated passes until a pass makes no
+            progress, then derive expansion rules from whatever relations
+            remain pending '''
+        num_iter = 0
+        while self.rel_set:
+            # Do a pass in remaining relations
+            cnt = 0
+            new_set = set()
+            log.info("[-] %03d Processing relations" % num_iter)
+            while self.rel_set:
+                rel = self.rel_set.pop()
+                # If known relation, continue
+                if self.is_known_rel(rel):
+                    continue
+
+                # Process relation
+                result = self.process_relation(rel)
+
+                if result:
+                    cnt += 1
+                else:
+                    new_set.add(rel)
+
+            # Update relation set
+            self.rel_set = new_set
+
+            # If no relations processed, finish
+            if cnt == 0:
+                break
+            else:
+                num_iter += 1
+
+        # Find expansions
+        log.info("[-] Finding expansions")
+        self.find_expansions()
+
+
+    def read_relations(self, filepath):
+        ''' Returns relations in file as a set
+            Filters weak and blacklisted relations.
+            Each line has 7 tab-separated fields (see output_relations
+            for the column names); # comments and blank lines are skipped '''
+        rel_set = set()
+        with open(filepath, 'r') as fd:
+            for line in fd:
+                # Ignore comments and blank lines
+                if line.startswith('#') or not line.strip():
+                    continue
+                # Parse line
+                t1, t2, t1_num, t2_num, nalias_num, talias_num, \
+                  tinv_alias_num = line.strip().split('\t')
+                # Build relation
+                rel = Rel(t1, t2, t1_num, t2_num, nalias_num,
+                          talias_num, tinv_alias_num)
+                # Ignore weak relations
+                if self.is_weak_rel(rel):
+                    continue
+                # Ignore blacklisted relations
+                if self.is_blacklisted_rel(rel):
+                    continue
+                # Ignore known relations
+                # NOTE: commented since we check if a
+                # relation is known before processing it
+                #if self.is_known_rel(rel):
+                #    continue
+                # Add relation to set
+                rel_set.add(rel)
+                # Add to src_map (as integers, so counts compare numerically)
+                self.src_map[rel.t1] = int(rel.t1_num)
+                self.src_map[rel.t2] = int(rel.t2_num)
+
+        return rel_set
+
+    def output_relations(self, filepath):
+        ''' Write the pending relations to filepath, sorted by categories '''
+        sorted_rules = sorted(self.rel_set,
+                              key=(lambda r: (
+                                self.__out_taxonomy.get_category(r.t1),
+                                self.__out_taxonomy.get_category(r.t2))),
+                              reverse=False)
+        with open(filepath, 'w') as fd:
+            fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t"
+                      "|t1^t2|/|t2|\n")
+            for rel in sorted_rules:
+                p1,c1 = self.__out_taxonomy.get_info(rel.t1)
+                p2,c2 = self.__out_taxonomy.get_info(rel.t2)
+                fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(
+                    p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num,
+                    rel.talias_num, rel.tinv_alias_num))
+
+    def output_rule_stats(self, fd):
+        ''' Write to fd how many pending relations exist per pair of
+            categories, followed by how many relations point at each
+            destination tag '''
+        # Initialize maps for statistics
+        self.dst_map = {}
+        self.cat_pairs_map = {}
+        # Compute rule statistics
+        for rel in self.rel_set:
+            c1 = self.__out_taxonomy.get_category(rel.t1)
+            c2 = self.__out_taxonomy.get_category(rel.t2)
+            self.cat_pairs_map[(c1,c2)] = self.cat_pairs_map.get((c1,
+                                                                  c2), 0) + 1
+            self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1
+        # Output statistics
+        cat_pairs = sorted(self.cat_pairs_map.items(), key=itemgetter(1,0),
+                            reverse=True)
+        for (c1,c2), cnt in cat_pairs:
+            fd.write("%s\t%s\t%03d\n" % (c1, c2, cnt))
+
+        # Print dst statistics
+        dst_pairs = sorted(self.dst_map.items(), key=itemgetter(1,0),
+                            reverse=False)
+        for dst, cnt in dst_pairs:
+            fd.write("%s\t%03d\n" % (self.__out_taxonomy.get_path(dst), cnt))
+
+    def output(self, out_prefix):
+        ''' Write the updated taxonomy, tagging and expansion files.
+            With an empty out_prefix the default data files are
+            overwritten; otherwise <out_prefix>.taxonomy/.tagging/.expansion '''
+        if (not out_prefix):
+            tax_filepath = default_taxonomy_file
+            tag_filepath = default_tagging_file
+            exp_filepath = default_expansion_file
+        else:
+            tax_filepath = out_prefix + ".taxonomy"
+            tag_filepath = out_prefix + ".tagging"
+            exp_filepath = out_prefix + ".expansion"
+        self.__out_taxonomy.to_file(tax_filepath)
+        self.__out_tagging.expand_all_destinations()
+        self.__out_tagging.to_file(tag_filepath)
+        self.__out_expansion.to_file(exp_filepath)
+
+
+if __name__ == '__main__':
+    argparser = argparse.ArgumentParser(
+        description='''Given a .alias file from the labeler, 
+        generates updates for the taxonomy, tagging, and expansion files.''')
+
+    argparser.add_argument('-alias',
+        help='file to parse with alias from labeler '
+             '(required)')
+
+    argparser.add_argument('-n',
+        help='Minimum number of times that a pair of tokes have been seen.'
+             'Default: 20',
+        type=int,
+        default=20)
+
+    argparser.add_argument('-t',
+        help='Minimum percentage of times two tokens appear together.'
+             'Default: 0.94',
+        type=float,
+        default=0.94)
+
+    argparser.add_argument('-o',
+        help='output prefix for files')
+
+    argparser.add_argument('-update',
+        action='store_true',
+        help='update default taxonomy,tagging,expansion files in place')
+
+    # Parse arguments
+    args = argparser.parse_args()
+
+    # Check we have the input
+    if not args.alias:
+        log.error('[-] Please provide an alias file with -alias')
+        sys.exit(1)
+
+    # Set output prefix
+    if args.o:
+        out_prefix = args.o
+    else:
+        out_prefix = os.path.splitext(args.alias)[0]
+
+    # Read taxonomy
+    taxonomy = Taxonomy(default_taxonomy_file)
+
+    # Read expansion rules
+    expansion = Expansion(default_expansion_file)
+
+    # Read tagging rules
+    tagging = Tagging(default_tagging_file)
+
+    # Build update object
+    # NOTE: -alias is mandatory (checked above), so its value is used as
+    # is. The former fallback branch referenced an undefined name and
+    # could never execute.
+    alias_fname = args.alias
+    update = Update(alias_fname, taxonomy, tagging, expansion, args.n, args.t)
+
+    log.info('[-] Read %d relations satisfying t>=%.2f n>=%d\n' % (
+                        update.num_rules(), args.t, args.n))
+
+    # Output initial rules
+    update.output_relations(out_prefix + ".orig.rules")
+
+    # Output initial rules statistics
+    update.output_rule_stats(sys.stderr)
+
+    # Process relations
+    update.run()
+
+    # Output updated taxonomy,tagging,expansion
+    if args.update:
+        update.output(None)
+    else:
+        update.output(out_prefix)
+
+    # Output final rules
+    update.output_relations(out_prefix + ".final.rules")
+
diff --git a/avclass2/data/expansion b/avclass2/data/expansion
new file mode 100644
index 0000000..3582f10
--- /dev/null
+++ b/avclass2/data/expansion
@@ -0,0 +1,17 @@
+backdoor	server
+bitcoinminer	bitcoinmining
+clicker	click
+ddoser	ddos
+dialer	dial
+downloader	execdownload
+gamania	gamethief
+keylogger	keylog
+miner	mining
+onlinegames	gamethief
+ransomware	filecrypt
+rogueware	alertuser
+rootkit	osmodify
+searcher	search
+smshoax	sendssms
+virus	filemodify
+worm	selfpropagate
diff --git a/avclass2/data/tagging b/avclass2/data/tagging
new file mode 100644
index 0000000..dbd54a9
--- /dev/null
+++ b/avclass2/data/tagging
@@ -0,0 +1,1300 @@
+0052f0b	gappusin
+0053284d	plankton
+4share	4shared
+6a53ba64ab	smsreg
+aacf	dnotua
+achros	cova
+actehc	gomanag
+activshop	activshopper
+acute	pullupdate
+adanalysis	winkad
+adclicker	clicker
+addisplay	adware
+addrop	adware
+adfltnet	amonetize
+adgazele	adgazelle
+adiwky	airpush
+adknowledge	adware
+adload	adware
+admin	downloadadmin
+adop	fakeapp
+adplugin	adware
+adpoooh	poohad
+adspy	hotbar
+adswo	adwo
+adtrafficanalysis	winkad
+adwareeorezo	eorezo
+afoynq	ksapp
+agemt	domob
+agewap	opfake
+agile	biige
+agilebinary	biige
+agnsmit	infectionads
+airad	airinstaller
+airadinstaller	airinstaller
+airinstall	airinstaller
+akan	winwebsec
+allad	airpush
+almanahe	alman
+alureon	tdss
+amab	mobidash
+amorba	ipamor
+andef	fkdefend
+andr	android
+androidos	android
+androm	gamarue
+andromeda	gamarue
+androsmscontrol	ansmcon
+androways	badnews
+andup	fakeangry
+angel	virut
+angryangel	virut
+anserver	basebridge
+ansver	basebridge
+antiav	killsectool
+antifw	killsectool
+antimalwaredefender	defmid
+anudow	anydown
+anways	badnews
+anxin	lovetrap
+anzhi	dowgin
+apke8bd	dowgin
+apperhand	plankton
+appinventor	steek
+appleservice	coogos
+applicunsaf	grayware
+applicunwnt	grayware
+applovin	plankton
+appquanta	wkload
+apprisk	grayware
+appsgeyser	fakeflash
+aque	beebone
+arcadeparlor	gamevance
+arcadeweb	gamevance
+archsms	smshoax
+arcparlor	gamevance
+armour	androidarmour
+arto	renos
+artro	renos
+aservicea	kuguo
+autokms	winactivator
+autoruner	autorun	vobfus
+autorunerent	autorun	palevo
+avalod	sinowal
+aveasms	smskey
+avkill	killsectool
+bacteraloh	sality
+badao	smsspy
+badday	badda
+badmacro	macro
+badnew	badnews
+banach	hotbar
+bandito	unruy
+banito	unruy
+banker	infosteal
+bankrypt	bancos
+banloader	rimod
+basebrid	basebridge
+batteryd	fakedoc
+batterydoctor	fakedoc
+bbridge	basebridge
+bckdr	backdoor
+bean	nandrobox
+bearshare	bandoo
+bergat	xtrat
+bertlea	bertle
+bespal	netins
+betterinstaller	somoto
+bflient	palevo
+bibean	faketimer
+biez	loadmoney
+bitcoin	bitcoinminer
+bitminer	bitcoinminer
+bjlog	zegost
+bkdr	backdoor
+blackice	whiteice
+blic	whiteice
+blocal	vmvol
+blocker	killsectool
+bobic	bobax
+botnet	gidix
+bototer	wapomi
+boxer	fakeinst
+boxersms	fakeinst
+braininst	installbrain
+brantall	installbrain
+brappware	multiplug
+browsepulse	browsefox
+browsermodifier	multiplug
+browserplugin	multiplug
+bsihai	kabun
+bsurf	bettersurf
+btapk	smsreg
+btcmine	bitcoinminer
+bulknet	webprefix
+bundl	bundlore
+bundleapp	bundle
+bundled	bundle
+bundleinstaller	bundle
+bundler	bundle
+bundpil	gamarue
+buzb	bzub
+bxib	softonic
+c2lop	swizzor
+cabby	dalexis
+caphaw	shylock
+casonline	casino
+cawitt	smsbot
+ceeinject	inject
+cellphonetrack	mytrackp
+cellspy	mobilespy
+ceshark	cellshark
+changeup	vobfus
+chard	hiddad
+checks-gps	locationcheck
+cheval	detroie
+chinesehacker	chir
+chinky	vobfus
+chydo	pykspa
+cidox	vundo
+cimag	hiloti
+cinmeng	cinmus
+citirevo	vundo
+clemag	cleaman
+click	clicker
+clickfraud	clicker
+clickpotato	hotbar
+clickrun	installcore
+clickrunsoftware	installcore
+clickspring	purityscan
+clientconnect	opencandy
+climap	androrat
+clkpotato	hotbar
+clspring	purityscan
+cobbler	focobers
+cobblerone	focobers
+cobbleronea	focobers
+codecpa	renos
+codecpack	renos
+codepack	renos
+coee	cooee
+coinmine	miner
+coinminer	miner
+collector	autoins
+comet	darkkomet
+cometsys	darkkomet
+cometsystems	darkkomet
+condestil	firseria
+conduit	opencandy
+contrand	sckeylog
+controlrandom	sckeylog
+coolpaperleak	coolwall
+copycat	airpush
+corrupt	corrupted
+cosha	lovetrap
+counterclank	plankton
+crack	tool
+cracktool	tool
+crisis	morcut
+crori	crossrider
+crosate	svpeng
+crwind	crusewind
+cryp	packed
+crypt	packed
+cryptdomaiq	domaiq
+crypted	packed
+crypter	packed
+cryptic	packed
+cryptinno	installcore
+cryptodefense	cryptodef
+cryptominer	miner
+cryptor	packed
+cson	simbot
+ctblocker	dalexis
+cudos	fosniw
+cupi	smssend
+cybota	cycbot
+cycler	unruy
+dadmin	downloadadmin
+dailer	dialer
+dalamodo	cossta
+damaged	corrupted
+darksnow	whiteice
+datasetaler	infosteal
+daytre	upatre
+ddlight	droiddreamlight
+dealcabby	adpeak
+debris	gamarue
+delf	delphi
+delfiles	filedelete
+delfinject	delphi	inject
+delfloader	delphi	downloader
+delfsnif	delphi	infosteal
+delpbanc	delphi	infosteal
+delpdldr	delphi	downloader
+derdroi	simbad
+desktoplightning	cashon
+detroi	detroie
+detroia	detroie
+dial	dialer
+dialers	dialer
+dialpass	egroupdial
+dialplatform	dialer
+didat	dabom
+diple	vobfus
+directdown	directdownloader
+dldr	downloader
+dldrop	downloader
+dload	downloader
+dloade	downloader
+dloader	downloader
+dloadr	downloader
+dloadware	adware
+dnschanger	dnsmodify
+docdl	downloader	msoffice
+docdrop	downloader	msoffice
+docdrp	downloader	msoffice
+dogbite	dogowar
+dogwar	dogowar
+doidroot	rooter
+domainiq	domaiq
+domalq	domaiq
+domlq	domaiq
+dontlback	fakeinst
+doods	loic
+dordae	droiddreamlight
+dordrae	droiddreamlight
+dotdo	multiplug
+dotdoads	multiplug
+douga	dougalek
+dougaleaker	dougalek
+dowcen	centim
+dowins	inservice
+downad	adware
+downagent	downloader
+downldexe	downloader
+downldr	downloader
+download	downloader
+downloadasist	downloadassistant
+downloaderguide	downloadguide
+downloadmin	downloadadmin
+downloadmr	firseria
+downloadnsave	megasearch
+downloadware	adware
+downsms	dropdialer
+downware	downloader
+dracur	rebhip
+dragonball	vietsms
+dragonbranch	browsefox
+drddream	droiddream
+drdelux	droiddeluxe
+dreamexploid	droiddream
+dridexdownloader	dridex
+dridld	dridex
+driverupd	softpulse
+drixed	dridex
+droidap	smssend
+droidapp	smssend
+droiddelux	droiddeluxe
+droidkrungfu	droidkungfu
+droidlive	rootsmart
+droidrooter	rooter
+drokole	lockscreen
+dromedan	gamarue
+drop	downloader
+dropped	downloader
+dropper	downloader
+droppr	downloader
+dropr	downloader
+duel	loveletter
+dumobove	hiddad
+duptwux	lolbot
+dwnldr	downloader
+dwonk	pykspa
+easydl	amonetize
+echiui	invis
+ecsys	mailcab
+egbii	biige
+egroup	egroupdial
+eicar	testvirus
+electron	sytro
+elenoocka	dalexis
+elephant	dowgin
+elkern	klez
+elpso	vidro
+emagsoftware	smsreg
+email	spam
+emailspy	maistealer
+emerleox	fujacks
+emud	emudbot
+encoder	filecrypt	ransomware
+encpk	packed
+engwings	cardserv
+epicgames	gamevance
+epicplay	gamevance
+eqdrug	equationdrug
+equation	equationdrug
+erop	smssend
+escape	laroux
+escop	laroux
+evitanf	hiddenapp
+ewalls	imlog
+excel	msexcel
+exedial	egroupdial
+exedown	downloader
+exedrop	downloader
+expl	exploit
+expressfind	browsefox
+extens	damon
+extension	damon
+extrat	xtrat
+eydrop	dinwod
+fakapp	styricka
+fakealert	rogueware
+fakeav	rogueware
+fakebattscar	fakedoc
+fakebrows	fakeinst
+fakecodec	renos
+fakedefend	fkdefend
+fakedefender	fkdefend
+fakefldr	fakefolder
+fakeicq	fakeinst
+fakeinstall	fakeinst
+fakeinsthw	fakeinst
+fakeinstsms	fakeinst
+fakejoboffer	fakejob
+fakelogosms	fakelogo
+fakelt	elite
+fakemini	opfake
+fakemms	fakeplayer
+fakems	fakepublisher	signed
+fakengry	fakeangry
+fakenotify	opfake
+fakeplay	fakeplayer
+fakeqou	styricka
+fakerecovery	fakesysdef
+fakerun	airpush
+fakesecsuit	spyeye
+fakesite	perkel
+fakeumg	gumen
+fakeupdates	gamex
+fakmod	fakeapp
+fakromup	soft32downloader
+faktvx	fakeangry
+farex	fearso
+fastsave	megasearch
+fastsaveapp	megasearch
+fatakr	steek
+fech	wroba
+fenomen	fenomengame
+fenomengamet	fenomengame
+fenservice	fengvi
+fidgo	opfake
+filecoder	filecrypt	ransomware
+filehunter	winpump
+fileinfector	infector
+filesearch	amonetize
+finfisher	finspy
+finloski	darkkomet
+finlosky	darkkomet
+fipp	morto
+firser	firseria
+firseriainstaller	firseria
+fiseria	firseria
+fixflo	pioneer
+fkangry	fakeangry
+fkclip	smssend
+fkealrt	rogueware
+fksite	perkel
+fktime	faketimer
+flofix	pioneer
+flooder	ddos
+floxif	pioneer
+floxlib	pioneer
+flyagent	flystudio
+flystud	flystudio
+fodeg	fakeinst
+fokonge	droidkungfu
+foncysms	foncy
+foran	anforen
+fraud	rogueware
+fraudload	downloader	rogueware
+fraudtool	tool
+freepds	hotclip
+frogonal	ginmaster
+fujack	fujacks
+funclub	smssend
+funweb	mywebsearch
+fynloski	darkkomet
+gaba	gabpath
+gael	tenga
+gaelicum	tenga
+gallm	nandrobox
+game	grayware
+gamehack	onlinegames
+gamevancecs	gamevance
+gampass	gamethief
+ganelp	griptolo
+gaobot	agobot
+gasms	gambler
+gastab	gabas
+gavir	viking
+gbot	cycbot
+gdjowa	joye
+gdream	golddream
+gectams	smsspy
+geimini	geinimi
+geinim	geinimi
+geksone	crytex
+gemest	smishing
+genericab	wroba
+genericgb	basebridge
+genpack	packed
+gentroj	trojan
+gepat	airpush
+getextension	eorezo
+getfaster	4shared
+geyser	plankton
+ggsmart	rootsmart
+ggtracker	ggtrack
+ghostbot	gobot
+ghostpush	ztorg
+ginermaster	kuguo
+gingermaster	ginmaster
+glassbottle	browsefox
+gldct	loadmoney
+gletan	ganlet
+glodream	golddream
+glogo	fakeapp
+gmaster	ginmaster
+gmasterb	kuguo
+gmastere	kuguo
+gmeil	gamex
+gnurbulf	rungbu
+goidu	oveead
+goldclick	hiddad
+gonca	gonesixty
+gone	gonesixty
+gonfu	droidkungfu
+gongfu	droidkungfu
+goolbot	cycbot
+gopf	uupay
+gploader	ewind
+gprice	gorillaprice
+gray	grayware
+greatfind	browsefox
+guarder	virut
+gugespy	qplus
+gulpix	plugx
+gunpoder	dowgin
+gupboot	urelas
+gvance	gamevance
+h5games	hiddad
+habey	elite
+hackav	kiser
+hackkms	winactivator
+hacktool	tool
+hacyayu	winwebsec
+hamob	fakeflash
+hdusafe	wapron
+helldoor	hilldoor
+hellospy	spyoo
+hiddenad	hiddad
+hiddeninstall	jsmshider
+hidrag	jeefo
+hippo	hipposms
+hipsmser	hipposms
+hispo	hipposms
+hktl	tool
+hllp	virus
+hllw	worm
+hlux	kelihos
+homepage	browsermodify
+hongtoutou	adrd
+horse	trojan
+hosts-modifier	hostsmodify
+hublo	crytex
+huigezi	hupigon
+hype	loadmoney
+hyteod	kovter
+iadpush	dowgin
+ibank	shiz
+ibashade	drolnux
+ibrain	installbrain
+iceboy	icekboy
+ickboy	icekboy
+iconos	iconosys
+iconosis	iconosys
+idapk	opfake
+ihouse	spyagent
+ikangoo	smssend
+ilivid	bandoo
+imestartup	cyfin
+imonetize	amonetize
+inboxtoolbar	inbox
+indirect	directdownloader
+infdas	infectionads
+inffinity	toggle
+inffinityinternet	toggle
+infostealer	infosteal
+injcrypt	inject
+injected	inject
+injecter	inject
+injection	inject
+injector	inject
+inoco	zdtad
+inservc	inservice
+install	installer
+installcloud	installerex
+installco	installcore
+installcube	icloader
+installmat	installmate
+installmet	installmetrix
+installmon	installmonster
+installmonst	installmonster
+installmonstr	installmonster
+installq	installiq
+installrex	installerex
+installvibe	bundlore
+instantaccess	egroupdial
+instmonetizer	installmonetizer
+intex	intexdial
+intexus	intexdial
+invader	daws
+ipatre	upatre
+ircbot	bot	irc
+ispyoo	spyoo
+j2me	java
+jackpos	jinupd
+jadtre	wapomi
+javak	suggestor
+jedan	kuguo
+jelbrus	techsnab
+joke	hoax
+joleee	tedroo
+juched	griptolo
+kaka	telman
+kanav	alyak
+kasandra	sandr
+kashu	sality
+kazaa	benjamin
+keepmusic	hiddad
+keji	basebridge
+kelvin	smssend
+kernelpatch	geral
+keygen	tool
+keylog	keylogger
+kgbkeylogger	kgbspy
+kibi	ksapp
+kichhoat	smsreg
+killav	killsectool
+killfiles	files
+kituri	placms
+klevate	webprefix
+klezer	beebone
+kmsauto	winactivator
+koceg	socks
+koler	svpeng
+kometa	rukometa
+kongfu	droidkungfu
+kouto	koutodoor
+koyotelab	bandoo
+krademok	darkkomet
+kranxpay	mmarketpay
+krypt	packed
+kryptik	packed
+kryptk	packed
+kucirc	cosmu
+kuku	sality
+kungfu	droidkungfu
+kusasesms	hipposms
+lacon	laconic
+langya	lien
+lanucher	bgserv
+lavandos	vidro
+ldmon	loadmoney
+lebag	ramnit
+legana	droidkungfu
+legendmir	lmir
+legmir	lmir
+lemir	lmir
+letang	ganlet
+licat	murofet
+licum	tenga
+liezar	rasteal
+lightdd	droiddreamlight
+lijo	smssend
+lilu	gamarue
+limpopo	loadmoney
+lineage	gamania
+linkun	linkular
+liteweb	browsefox
+livesecurity	winwebsec
+livesoft	getnow
+livesoftaction	getnow
+llond	lardlond
+loadmoneyent	loadmoney
+locker	lockscreen
+locm	locmg
+lohmys	midia
+looked	viking
+loorp	wapomi
+lootor	exploit
+lotoor	exploit
+lower	airpush
+lozfoon	loozfon
+macosx	mac
+macrodown	downloader	macro
+madanf	virut
+madang	virut
+madangel	virut
+magania	gamania
+magmedia	mediamagnet
+mailer	spam
+mailstealer	maistealer
+mainservice	pjapps
+maklt	renos
+malcrypt	packed
+malhome	updtkiller
+maliciousmacro	macro
+mallocker	lockscreen
+malob	packed
+malpack	packed
+malpe	corrupted
+manalo	laroux
+mandaph	socks
+marketpay	mmarketpay
+massmailer	spam
+masterkey	master
+maxplus	zeroaccess
+maxplusent	zeroaccess
+mayachok	vundo
+mazel	somoto
+mazig	fakeinst
+mbro	winwebsec
+mdropper	downloader
+meredrop	vobfus
+meterpreter	metasploit
+mfinder	mediafinder
+midgare	bifrose
+midhos	medfos
+mikcer	wapomi
+milicenso	pirminay
+mimobsms	minimob
+mindspark	mywebsearch
+miscosms	gidix
+misosms	gidix
+mixor	loveletter
+mketpay	mmarketpay
+mmag	mediamagnet
+mmarket	mmarketpay
+mmarketp	mmarketpay
+mmob	minimob
+mo97	macro
+mobcore	airpush
+mobi	fakeinst
+mobigapp	gamex
+mobilehotdog	nandrobox
+mobinauten	smsspy
+mobistealth	stealthcell
+mobkong	smssend
+mobspy	trackplus
+mobsqueeze	fakedoc
+mofksys	swisyn
+monad	damon
+monderb	vundo
+monitor	infosteal
+monocle	monokle
+monstruos	installmonster
+montiera	delbar
+morefi	memery
+morepak	pushad
+morstar	firseria
+morstars	firseria
+mosky	skymobi
+mostofate	softomate
+mplug	multiplug
+msilobfuscator	msil	packed
+mspyonline	mspy
+msteal	maistealer
+mswdm	ipamor
+mufanom	hiloti
+mulad	kuguo
+muldrop	downloader
+multibardown	multibar
+multibardownloader	multibar
+multiinstall	vilsel
+multipluggen	multiplug
+musictoolbar	bandoo
+mutibar	multibar
+mutopy	rodecap
+mvlove	vmvol
+mw97	macro
+mytrack	mytrackp
+nabucur	virlock
+najin	feejar
+nandrob	nandrobox
+nemucod	smsreg
+neshuta	neshta
+netboxserver	netbox
+neteyes	ipamor
+netfilter	network
+netweird	netwiredrc
+networm	worm
+newyearl	plankton
+nextup	verti
+nickibot	nickyspy
+nickispy	nickyspy
+nickspy	nickyspy
+nicky	nickyspy
+nidb	spyoo
+nimefas	mseg
+nimnul	wapomi
+ninebox	kuguo
+nioserv	nocoma
+nisev	nocoma
+nofear	fearso
+nofer	fearso
+noico	zdtad
+noiconads	zdtad
+nopoc	smforw
+not-a-virus	grayware
+notcom	nocoma
+notcompatible	nocoma
+noticemob	ginmaster
+nsanti	packed
+nuwar	tibs
+nyearleaker	airpush
+nyleaker	airpush
+o97m	macro
+obfus	packed
+obfusc	packed
+obfuscate	packed
+obfuscated	packed
+obfuscator	packed
+odyssey	loadmoney
+offerad	appoffer
+office	msoffice
+ogimant	loadmoney
+olmarik	tdss
+onbsms	smssend
+oneclick	oneclickfraud
+oneclickdownload	1clickdownload
+onestep	zwangi
+onlineg	onlinegames
+onlinega	onlinegames
+onlinegam	onlinegames
+onlinegame	onlinegames
+onlinegamehack	onlinegames
+ooqqxx	boqx
+opclose	sillyfdc
+opfakesms	fakeinst
+optimizerpro	speedingupmypc
+optimum	ibryte
+optimuminstall	ibryte
+optimuminstaller	ibryte
+optinstall	ibryte
+optiuminstaller	ibryte
+optixp	optix
+optixpro	optix
+osx	mac
+osx32	mac
+otran	vobfus
+otwycal	wapomi
+overdoom	cosmu
+overt	sadenav
+overtls	sadenav
+ozotshielder	kmin
+pace	socks
+padobot	korgo
+padodor	berbew
+pakes	packed
+panda	zbot
+pandaent	zbot
+pandora	nandrobox
+parnian	smssend
+patch	filemodify
+patched	filemodify
+patcher	filemodify
+patchfile	filemodify
+pate	parite
+payint	domaiq
+payment	basebridge
+pazetus	brontok
+pe	windows
+peacomm	tibs
+pemalform	corrupted
+pemask	maskpe
+penetrata	penetho
+penetrate	penetho
+pepatch	filemodify
+perfectkeylogger	perflogger
+perfkey	perflogger
+perfloger	perflogger
+perkele	perkel
+petrolan	petrolin
+philis	viking
+pigeon	hupigon
+pigetrl	lockscreen
+pikor	wapomi
+pikorms	wapomi
+pilleuz	palevo
+pinball	hotbar
+pinfi	parite
+pinny	shiz
+pirater	walkinwat
+pirrit	tirrip
+pirritsuggestor	tirrip
+placsms	placms
+plangton	plankton
+plite	urelas
+plocust	loadmoney
+plosa	karagany
+plugin	multiplug
+pmax	zeroaccess
+podec	fobus
+podnuha	boaxxe
+poisonivy	poison
+polip	cardserv
+polipos	cardserv
+polycryptt	polycrypt
+polyransom	virlock
+popeler	firseria
+popov	fakeinst
+popuppers	soft32downloader
+porn	porndialer
+porndial	porndialer
+pornlocker	lockscreen
+portscan	network
+positivefinds	browsefox
+positmob	fakeinst
+potentially	grayware
+poweliks	wowlik
+powerliks	wowlik
+powerpack	linkular
+powessere	wowlik
+pp97m	macro
+preloader	megasearch
+premiumsms	smskey
+premiumsmsscam	smshoax
+privacyrisk	grayware
+privitize	techsnab
+prockill	killproc
+prodatect	fakesysdef
+pronny	vobfus
+protexor	ramnit
+protil	wapomi
+provar	fakeinst
+pswtool	infosteal
+pua	grayware
+pup	grayware
+pupil	plemood
+purity	purityscan
+purora	vobfus
+purple	plemood
+purplemood	plemood
+pushdo	cutwail
+putalol	couponmarvel
+pwsonlinegames	onlinegames
+pwsteal	infosteal
+pwstealer	infosteal
+pwszbot	zbot
+pykse	pykspa
+qakbot	qbot
+qhost	hostsmodify
+qhosts	hostsmodify
+qqrobber	qqrob
+qukart	berbew
+qvod	wapomi
+rabbhome	fjcon
+rabidog	dogowar
+rahack	allaple
+rahiwi	brontok
+raideloz	vobfus
+ramdo	redyms
+ranck	ranky
+ransom	ransomware
+ransomcrypt	filecrypt	ransomware
+ransomlock	lockscreen	ransomware
+rapiddown	firseria
+ratab	mamianune
+razel	rasteal
+raziel	rasteal
+recal	mogap
+recordpage	browsefox
+redirector	network
+reefwal	kalfere
+refogkeylogger	refog
+regie	fosniw
+relevant	relevantknowledge
+relik	updtkiller
+remtasu	xtrat
+renamer	files
+reptilic	reptilicus
+revmob	plankton
+revtcp	metasploit
+rimecud	palevo
+risk	grayware
+risktool	grayware	tool
+riskware	grayware
+rivalgame	gamevance
+rkdoor	koutodoor
+rknowledge	relevantknowledge
+rlemon	lemon
+rmnet	ramnit
+rodricter	simda
+rogue	rogueware
+roguesppush	shastrosms
+rollaround	browsefox
+rontokbr	brontok
+rontokbro	brontok
+roop	svpeng
+rootcager	droiddream
+ropin	leadbolt
+rorpian	zeroaccess
+ruftar	usteal
+rugo	hotbar
+runitslf	looper
+runonce	chir
+runouce	chir
+safekidzone	sakezon
+sahagent	sahat
+saho	wroba
+saiva	smammer
+saldrop	sality
+salecharger	browsefox
+salicode	sality
+salitystub	sality
+salload	sality
+salpack	sality
+salrenmetie	sality
+sambamedia	softpulse
+sancmed	sanctionedmedia
+sandrorat	sandr
+saveshare	megasearch
+scareware	rogueware
+scavir	fakeinst
+sckeylogger	sckeylog
+sclog	sckeylog
+screenblaze	prosti
+screensaver	hotbar
+script	jswebinject
+searchprotect	opencandy
+searchsuite	bandoo
+seasuite	bandoo
+seaweed	seaweth
+secretspy	smforw
+secshieldfraud	securityshield
+securitydefender	defmid
+securitytool	tool
+secxplod	securityxploded
+secxploded	securityxploded
+selfdel	beebone
+sendpay	shastrosms
+serbg	bgserv
+serpip	morto
+sethom	hiddad
+sexxoo	redmobile
+sexyclip	smssend
+sharestar	gappusin
+shell	shellcode
+shellkode	shellcode
+sheriff	sheridroid
+shifu	shiz
+shohdi	shodi
+shopathome	sahat
+signalbooster	fakedoc
+signalboosterb	fakedoc
+silentcaller	dialer
+simfect	wapomi
+simplock	simplocker
+sinodo	sinowal
+sintal	plankton
+sirefef	zeroaccess
+skanik	smssend
+slybdb	blohi
+smabo	adialer
+smadow	zeroaccess
+smbot	fakeins
+smbox	fakeinst
+smfrow	dowgin
+smokeloader	dofoil
+smsarch	smshoax
+smsbank	smsreg
+smsbox	fakeinst
+smsboxer	fakeinst
+smscc	smcc
+smser	smssend
+smsfakesky	opfake
+smsforward	smforw
+smsfraud	smshoax
+smsfwd	smforw
+smshider	jsmshider
+smsilence	smscatch
+smskute	smsagent
+smsseaw	seaweth
+smssilence	smscatch
+smsstealer	smsspy
+smthief	smsthief
+smtp	spam
+snadapps	typstu
+sndapps	typstu
+sneakytrail	installerex
+sniffer	network
+sobot	clientor
+soft32down	soft32downloader
+soft32download	soft32downloader
+softbase	softobase
+softcentral	sckeylog
+softonicdownloader	softonic
+softpules	softpulse
+softwarebundler	bundle
+sohand	sohanad
+sohaned	sohanad
+solimba	firseria
+soltern	sytro
+somato	somoto
+somotobetterinstaller	somoto
+somotoltd	somoto
+soobek	lockscreen
+spacer	unruy
+spakrab	vidro
+spambot	spam
+spammer	spam
+spamtool	spam	tool
+spatet	rebhip
+spdupmypc	speedingupmypc
+speedupmypc	uniblue
+spez	spyzie
+spyagnt	piom
+spybubb	spybubble
+spyeyes	spyeye
+spygold	golddream
+spymob	trackplus
+spyphone	phonespy
+spyrat	rebhip
+spysat	spyset
+spysheriff	harnig
+spytomobile	gpspy
+spytrack	spyset
+spyweep	spyeye
+square	squarenet
+ssam	guerrilla
+starman	allaple
+starsys	plankton
+startapp	plankton
+startpage	browsermodify
+statblaster	winfetcher
+stealer	infosteal
+steekt	steek
+stesec	smssend
+stmp	spam
+stration	warezov
+strongsignal	browsefox
+stubofsality	sality
+stufik	tufik
+sunnet	smsreg
+superoptimizer	speedingupmypc
+superpctools	speedingupmypc
+suspiciouspacker	packed
+susppack	packed
+sventore	firseria
+swiftbrowse	browsefox
+system	droidkungfu
+systemfix	fakesysdef
+systemsecurity	winwebsec
+systex	daws
+systro	sytro
+sysvenfak	loadmoney
+talklog	talkw
+taojin	taojinstar
+tapsnake	gpspy
+tattoohack	exploid
+tatus	tetus
+tazebama	mabezat
+tdownloader	installerex
+tdssrt	tdss
+tedro	tedroo
+temai	ksapp
+tepfer	fareit
+test	testvirus
+testfile	testvirus
+tibspak	tibs
+tibspk	tibs
+tibsys	tibser
+tibsystems	tibser
+ticno	multibar
+tidserv	tdss
+tiger	tigerbot
+tigrbot	tigerbot
+timpdoor	clientor
+tinbakd	tinba
+tinbelog	nandrobox
+tiny	small
+tklocker	lockscreen
+tonclank	plankton
+toorch	rootnik
+tophos	stegvob
+torchmedia	bandoo
+torpump	winpump
+tovkater	installmonster
+towelexploit	towel
+trj	trojan
+trjdown	downloader	trojan
+trjndwnlder	downloader	trojan
+troj	trojan
+trojanapt	apt	trojan
+trojanbanker	infosteal	trojan
+trojanclicker	adware	clicker	trojan
+trojandldr	downloader	trojan
+trojandownloader	downloader	trojan
+trojandropper	downloader	trojan
+trojandwnldr	downloader	trojan
+trojanfakeav	alertuser	rogueware	trojan
+trojanhorse	trojan
+trojanproxy	proxy	trojan
+trojanpsw	infosteal	trojan
+trojanransom	filecrypt	ransomware	trojan
+trojansms	sms	trojan
+trojanspy	spyware	trojan
+trojware	trojan
+truedown	truedownloader
+tsuploader	installerex
+tufei	tufik
+tugspay	domaiq
+tunkoo	silentboot
+turk	alyak
+turko	turkojan
+tuto4pc	eorezo
+tweetbot	smsbot
+twetty	twetti
+txmob	mobiletx
+typnotify	typstu
+ucont	spyagent
+ultradownload	vilsel
+ultradownloads	vilsel
+umeng	gumen
+unix	linux
+unsafe	grayware
+unwanted	grayware
+unwnt	grayware
+updatekiller	updtkiller
+updtkill	updtkiller
+uracto	maistealer
+uuser	uuserv
+uxipp	yzhc
+valhalla	xorala
+valla	xorala
+vbccrypt	vobfus
+vbcrypt	packed	visualbasic
+vbinject	inject	visualbasic
+vbkrypt	packed	visualbasic
+vbna	vobfus
+vbobf	vobfus
+vbobfus	vobfus
+vbpack	packed	visualbasic
+vernet	dusvext
+vertex	dusvext
+vertexb	dusvext
+vertexbot	dusvext
+vetor	virut
+vflood	vtflooder
+vflooder	vtflooder
+vils	vilsel
+virransom	virlock
+virtob	virut
+virtool	tool
+vitallia	vittalia
+vjadtre	wapomi
+vmdetector	vmdetect
+vmpbad	vmprotect
+vnfraye	dusvext
+vsaas	vsas
+vserv	viser
+vxidl	tibs
+w2km	macro
+w32	windows
+w64	windows
+w97m	macro
+wakeful	cardserv
+wali	wapomi
+walkfree	kalfere
+walksteal	walkinwat
+wanacry	wannacry
+wanderburst	browsefox
+wanna	wannacry
+wannacrypt	wannacry
+wannacryptor	wannacry
+wapnor	shedun
+waps	gappusin
+wapsx	gappusin
+wapz	gappusin
+waren	qumi
+waski	upatre
+wauchos	gamarue
+wbna	vobfus
+webalt	webalta
+webatla	webalta
+webpick	installerex
+websearch	search
+webtoolbar	toolbar
+wedownload	soft32downloader
+weecnaw	netwiredrc
+weiyi	smforw
+whboy	fujacks
+whistle	whistlesoftware
+whistles	whistlesoftware
+widoman	bmmedia
+win	windows
+win32	windows
+win64	windows
+winge	cardserv
+winnt	windows
+winsoft	fosniw
+winsxsbot	sfone
+wipelock	elite
+wipelocker	elite
+wirenet	netwiredrc
+wohis	dowgin
+wondertek	tekwon
+word	msword
+wpay	smsreg
+wplug	slugin
+wplugin	slugin
+wukong	yzhc
+x2km	macro
+x97m	macro
+ximad	plankton
+xloader	wroba
+xpack	packed
+xpiro	expiro
+xsider	jsmshider
+xtoober	karagany
+xtreme	xtrat
+xworm	loveletter
+yangamon	pirates
+yarwi	upatre
+yontoo	browsefox
+yoof	picsys
+yotoon	browsefox
+yourfiledownloader	expressdownloader
+yusttohq	trackplus
+yzhcsms	yzhc
+zaccess	zeroaccess
+zadved	dlhelper
+zango	hotbar
+zangosearch	hotbar
+zawet	masplot
+zbocheman	zbot
+zbomber	zombbomber
+zbotk	zbot
+zebt	hiddenapp
+zeno	zenosearch
+zeus	zbot
+zhelatin	tibs
+zombie	smszombie
+zona	zvuzona
+zpack	packed
+zsone	raden
+zwunzi	zwangi
+zybut	shiz
diff --git a/avclass2/data/taxonomy b/avclass2/data/taxonomy
new file mode 100644
index 0000000..fa6f7ad
--- /dev/null
+++ b/avclass2/data/taxonomy
@@ -0,0 +1,1138 @@
+BEH:alertuser
+BEH:autorun
+BEH:browsermodify
+BEH:browsermodify:toolbar
+BEH:click
+BEH:ddos
+BEH:defaulttab
+BEH:dial
+BEH:dnsmodify
+BEH:execdownload
+BEH:facebook
+BEH:filecrypt
+BEH:filedelete
+BEH:filemodify
+BEH:files
+BEH:hostsmodify
+BEH:infosteal
+BEH:infosteal:gamethief
+BEH:inject
+BEH:irc
+BEH:jswebinject
+BEH:keylog
+BEH:killproc
+BEH:killproc:killsectool
+BEH:locationcheck
+BEH:lockscreen
+BEH:mining
+BEH:mining:bitcoinmining
+BEH:network
+BEH:osmodify
+BEH:phishing
+BEH:proxy
+BEH:proxychanger
+BEH:search
+BEH:selfpropagate
+BEH:server
+BEH:servstart
+BEH:sms
+BEH:sms:readssms
+BEH:sms:sendssms
+BEH:spam
+BEH:tor
+BEH:vmdetect
+CLASS:apt
+CLASS:backdoor
+CLASS:bot
+CLASS:bot:bankbot
+CLASS:clicker
+CLASS:ddoser
+CLASS:dialer
+CLASS:dialer:porndialer
+CLASS:downloader
+CLASS:fakeantivirus
+CLASS:grayware
+CLASS:grayware:adware
+CLASS:grayware:adware:adlibrary
+CLASS:grayware:adware:adlibrary:adpush
+CLASS:grayware:adware:multiplug
+CLASS:grayware:casino
+CLASS:grayware:tool
+CLASS:grayware:tool:remoteadmin
+CLASS:hoax
+CLASS:hoax:smshoax
+CLASS:infector
+CLASS:keylogger
+CLASS:miner
+CLASS:miner:bitcoinminer
+CLASS:ransomware
+CLASS:rogueware
+CLASS:rooter
+CLASS:rootkit
+CLASS:searcher
+CLASS:spyware
+CLASS:virus
+CLASS:virus:prepender
+CLASS:worm
+CLASS:worm:emailworm
+FAM:0052b
+FAM:154b2720de
+FAM:1clickdownload
+FAM:4shared
+FAM:560de1fe9de
+FAM:abeciv
+FAM:accutrack
+FAM:acecard
+FAM:activeinject
+FAM:activshopper
+FAM:adcolony
+FAM:adend
+FAM:adflex
+FAM:adgazelle
+FAM:adialer
+FAM:adinject
+FAM:adir
+FAM:adlock
+FAM:admogo
+FAM:adpeak
+FAM:adpooh
+FAM:adrd
+FAM:adrotator
+FAM:adrotoob
+FAM:adultbrowser
+FAM:adviator
+FAM:adwk
+FAM:adwo
+FAM:aesads
+FAM:agobot
+FAM:agvd
+FAM:ahmyth
+FAM:ahopc
+FAM:airinstaller
+FAM:airpush
+FAM:aiwan
+FAM:aliyuncs
+FAM:allaple
+FAM:alman
+FAM:alyak
+FAM:amonetize
+FAM:androidarmour
+FAM:androidlost
+FAM:androrat
+FAM:anforen
+FAM:angupsh
+FAM:anlost
+FAM:ansmcon
+FAM:anti
+FAM:anubis
+FAM:anydown
+FAM:anzhu
+FAM:aplog
+FAM:apofer
+FAM:appoffer
+FAM:appsad
+FAM:appwiz
+FAM:appwizz
+FAM:aqplay
+FAM:asacub
+FAM:asprox
+FAM:autoins
+FAM:autosus
+FAM:axespy
+FAM:badda
+FAM:badnews
+FAM:badpac
+FAM:baiduprotect
+FAM:bajaspy
+FAM:bamital
+FAM:bancos
+FAM:bandoo
+FAM:banload
+FAM:basbanke
+FAM:basebridge
+FAM:basepay
+FAM:bauts
+FAM:bebeg
+FAM:becou
+FAM:beebone
+FAM:beita
+FAM:beitaad
+FAM:belesak
+FAM:benjamin
+FAM:berbew
+FAM:bertle
+FAM:betterad
+FAM:bettersurf
+FAM:bgserv
+FAM:bicololo
+FAM:bifrose
+FAM:biige
+FAM:binka
+FAM:bips
+FAM:birele
+FAM:bitrep
+FAM:blacklister
+FAM:bladabindi
+FAM:blohi
+FAM:blueguard
+FAM:bmmedia
+FAM:boaxxe
+FAM:bobax
+FAM:bobic
+FAM:boogrdex
+FAM:boomp
+FAM:boqx
+FAM:boyad
+FAM:bredolab
+FAM:brontok
+FAM:browsefox
+FAM:bruad
+FAM:bublik
+FAM:bundlore
+FAM:buzus
+FAM:buzztouch
+FAM:bzub
+FAM:callflakes
+FAM:callpay
+FAM:callrecorder
+FAM:campys
+FAM:carberp
+FAM:cardserv
+FAM:cashon
+FAM:cellshark
+FAM:centim
+FAM:cerekv
+FAM:cheica
+FAM:chir
+FAM:chyapo
+FAM:cinmus
+FAM:cleaman
+FAM:clevernet
+FAM:clientor
+FAM:clinator
+FAM:cmccwm
+FAM:cnbtech
+FAM:cnzz
+FAM:coinhive
+FAM:commplat
+FAM:conduit
+FAM:conficker
+FAM:contactscollector
+FAM:cooee
+FAM:coogos
+FAM:coolmirage
+FAM:coolwall
+FAM:cosmu
+FAM:cossta
+FAM:couponmarvel
+FAM:cova
+FAM:cridex
+FAM:crossrider
+FAM:crusewind
+FAM:cryptodef
+FAM:cryptolocker
+FAM:cryptowall
+FAM:crytex
+FAM:cryxos
+FAM:ctchm
+FAM:cutwail
+FAM:cycbot
+FAM:cyfin
+FAM:dabom
+FAM:dalexis
+FAM:damon
+FAM:dangbei
+FAM:darkkomet
+FAM:darop
+FAM:dasu
+FAM:daws
+FAM:dbtes
+FAM:deblio
+FAM:defmid
+FAM:delbar
+FAM:deshacop
+FAM:detroie
+FAM:dianle
+FAM:dianru
+FAM:dilidi
+FAM:dinwod
+FAM:directdownloader
+FAM:dlhelper
+FAM:dnotua
+FAM:dofoil
+FAM:dogowar
+FAM:domaiq
+FAM:domob
+FAM:dorfdo
+FAM:dorifel
+FAM:dorkbot
+FAM:dougalek
+FAM:dowgin
+FAM:downloadadmin
+FAM:downloadassistant
+FAM:downloadguide
+FAM:dqshell
+FAM:dridex
+FAM:droidcoupon
+FAM:droiddeluxe
+FAM:droiddream
+FAM:droiddreamlight
+FAM:droidkungfu
+FAM:droidsheep
+FAM:drolnux
+FAM:drolock
+FAM:dropdialer
+FAM:drosel
+FAM:drstwex
+FAM:dsploit
+FAM:dusvext
+FAM:dynamer
+FAM:easyroot
+FAM:egame
+FAM:egroupdial
+FAM:ejik
+FAM:elite
+FAM:emudbot
+FAM:eorezo
+FAM:equationdrug
+FAM:esfury
+FAM:etooe
+FAM:ewind
+FAM:expiro
+FAM:expressdownloader
+FAM:f7fa48878f6c
+FAM:faceniff
+FAM:fakeangry
+FAM:fakeapp
+FAM:fakebank
+FAM:fakebanker
+FAM:fakebkupt
+FAM:fakedep
+FAM:fakedoc
+FAM:fakeflash
+FAM:fakefolder
+FAM:fakeins
+FAM:fakeinst
+FAM:fakejob
+FAM:fakekrb
+FAM:fakelogo
+FAM:fakepay
+FAM:fakeplayer
+FAM:fakerateapp
+FAM:fakerean
+FAM:fakesysdef
+FAM:fakesysui
+FAM:faketimer
+FAM:fakevalidation
+FAM:fakgram
+FAM:fareit
+FAM:farfli
+FAM:farmap
+FAM:fateon
+FAM:fearso
+FAM:feejar
+FAM:feiad
+FAM:feiwo
+FAM:fengvi
+FAM:fenomengame
+FAM:fictus
+FAM:finspy
+FAM:firseria
+FAM:fjcon
+FAM:fkav
+FAM:fkdefend
+FAM:float
+FAM:flystudio
+FAM:fobus
+FAM:focobers
+FAM:fogo
+FAM:foncy
+FAM:forav
+FAM:fosniw
+FAM:framaroot
+FAM:freeandroidspy
+FAM:freeandspy
+FAM:freespy
+FAM:frupi
+FAM:fujacks
+FAM:gabas
+FAM:gabpath
+FAM:gamania
+FAM:gamarue
+FAM:gambler
+FAM:gamclk
+FAM:gameguardian
+FAM:gamevance
+FAM:gamex
+FAM:ganga
+FAM:ganlet
+FAM:gapev
+FAM:gappusin
+FAM:gato
+FAM:gbdialer
+FAM:gbqal
+FAM:geinimi
+FAM:general
+FAM:gepew
+FAM:geral
+FAM:getnow
+FAM:gexin
+FAM:ggtrack
+FAM:gibdy
+FAM:gidby
+FAM:gidix
+FAM:ginamster
+FAM:ginko
+FAM:ginmaster
+FAM:gizmo
+FAM:gobot
+FAM:golddream
+FAM:goldentouch
+FAM:gomanag
+FAM:gomunc
+FAM:gonesixty
+FAM:goodnews
+FAM:gorillaprice
+FAM:gpspy
+FAM:grabos
+FAM:graybird
+FAM:griptolo
+FAM:guerrilla
+FAM:gugi
+FAM:gumen
+FAM:gupay
+FAM:gysad
+FAM:hahad
+FAM:hamad
+FAM:harnig
+FAM:hasdk
+FAM:haynu
+FAM:hero
+FAM:hiddad
+FAM:hiddenapp
+FAM:hiddnad
+FAM:highster
+FAM:hilldoor
+FAM:hiloti
+FAM:hipposms
+FAM:honli
+FAM:hotbar
+FAM:hotclip
+FAM:hoverwatch
+FAM:hqowdo
+FAM:hqwar
+FAM:htmlapp
+FAM:humanspy
+FAM:hupigon
+FAM:hypay
+FAM:ibryte
+FAM:icekboy
+FAM:icloader
+FAM:iconhider
+FAM:iconosys
+FAM:icqbomber
+FAM:imali
+FAM:imaut
+FAM:imlog
+FAM:inbox
+FAM:infectionads
+FAM:inor
+FAM:inservice
+FAM:installbrain
+FAM:installcore
+FAM:installerex
+FAM:installiq
+FAM:installmetrix
+FAM:installmonetizer
+FAM:installmonster
+FAM:intersad
+FAM:intexdial
+FAM:invent
+FAM:invis
+FAM:ipamor
+FAM:iqiad
+FAM:iresearch
+FAM:irtard
+FAM:itracker
+FAM:jayqa
+FAM:jeefo
+FAM:jfpush
+FAM:jiagu
+FAM:jiead
+FAM:jifake
+FAM:jinupd
+FAM:jisut
+FAM:joye
+FAM:joynow
+FAM:jsmshider
+FAM:jssms
+FAM:judy
+FAM:juzi
+FAM:kabun
+FAM:kalfere
+FAM:kapratect
+FAM:karagany
+FAM:kasidet
+FAM:katrep
+FAM:kelihos
+FAM:kgbspy
+FAM:kidlogger
+FAM:kimia
+FAM:kingroot
+FAM:kirko
+FAM:kiser
+FAM:klez
+FAM:kmin
+FAM:kolab
+FAM:koobface
+FAM:korgo
+FAM:koutodoor
+FAM:kovter
+FAM:krefel
+FAM:ksapp
+FAM:kuguo
+FAM:kurash
+FAM:kyhub
+FAM:kyview
+FAM:laconic
+FAM:lardlond
+FAM:laroux
+FAM:ldpinch
+FAM:leadbolt
+FAM:leapp
+FAM:lemon
+FAM:letv
+FAM:lien
+FAM:linkular
+FAM:lirose
+FAM:lmir
+FAM:lmmob
+FAM:loadmoney
+FAM:loapi
+FAM:lockactivity
+FAM:locmg
+FAM:loic
+FAM:lolbot
+FAM:lollipop
+FAM:loodos
+FAM:looper
+FAM:loozfon
+FAM:lotuseed
+FAM:lotusid
+FAM:lovefraud
+FAM:loveletter
+FAM:lovetrack
+FAM:lovetrap
+FAM:lucky
+FAM:lxasj
+FAM:lynep
+FAM:mabezat
+FAM:magiccasino
+FAM:mailcab
+FAM:maistealer
+FAM:malwarescope
+FAM:mamianune
+FAM:mankess
+FAM:marcher
+FAM:mars
+FAM:marsdaemon
+FAM:mart
+FAM:masplot
+FAM:masspr
+FAM:maxapp
+FAM:mazarbot
+FAM:mecor
+FAM:medfos
+FAM:mediafinder
+FAM:mediamagnet
+FAM:meftadon
+FAM:megasearch
+FAM:memery
+FAM:menti
+FAM:metasploit
+FAM:mgyun
+FAM:midia
+FAM:migun
+FAM:milipnot
+FAM:minimob
+FAM:mirai
+FAM:mmarketpay
+FAM:mmaro
+FAM:mobby
+FAM:mobcent
+FAM:mobclick
+FAM:mobeleader
+FAM:mobhey
+FAM:mobiad
+FAM:mobidash
+FAM:mobifence
+FAM:mobikok
+FAM:mobile
+FAM:mobilepay
+FAM:mobilespy
+FAM:mobiletracker
+FAM:mobiletx
+FAM:mobwin
+FAM:mocpiad
+FAM:mogap
+FAM:mogosec
+FAM:monitorminor
+FAM:monokle
+FAM:moplus
+FAM:morcut
+FAM:morix
+FAM:morto
+FAM:mprt
+FAM:mseg
+FAM:mspy
+FAM:mtracker
+FAM:multibar
+FAM:murofet
+FAM:mwiam
+FAM:mydoom
+FAM:myfolder
+FAM:myteam
+FAM:mytrackp
+FAM:mywebsearch
+FAM:nandrobox
+FAM:navbar
+FAM:nawiaiad
+FAM:necro
+FAM:necurs
+FAM:neospy
+FAM:neshta
+FAM:netbox
+FAM:netins
+FAM:netwiredrc
+FAM:ngrbot
+FAM:nickyspy
+FAM:nitol
+FAM:nivdort
+FAM:nocoma
+FAM:notifyer
+FAM:nqshield
+FAM:obtes
+FAM:ocikq
+FAM:odpa
+FAM:oimobi
+FAM:oivim
+FAM:oixal
+FAM:omsysd
+FAM:oneclickfraud
+FAM:onexuan
+FAM:onlinegames
+FAM:opencandy
+FAM:openinstall
+FAM:opfake
+FAM:optix
+FAM:outbrowse
+FAM:oveead
+FAM:paccy
+FAM:palevo
+FAM:pandaad
+FAM:parite
+FAM:patacore
+FAM:paycall
+FAM:pcclient
+FAM:penetho
+FAM:penguin
+FAM:perflogger
+FAM:perkel
+FAM:petrolin
+FAM:phonespy
+FAM:picsys
+FAM:piom
+FAM:pioneer
+FAM:pirates
+FAM:pirminay
+FAM:pjapps
+FAM:placms
+FAM:plankton
+FAM:plemood
+FAM:plugx
+FAM:poison
+FAM:pokotus
+FAM:ponmocup
+FAM:poohad
+FAM:pornapp
+FAM:pornoasset
+FAM:pornoblocker
+FAM:pornpay
+FAM:pornvideo
+FAM:presenoker
+FAM:prorat
+FAM:prosti
+FAM:pullupdate
+FAM:pupy
+FAM:purityscan
+FAM:pushad
+FAM:pushe
+FAM:puxis
+FAM:pykspa
+FAM:qbot
+FAM:qexma
+FAM:qplus
+FAM:qqrob
+FAM:qumi
+FAM:quozha
+FAM:qushu
+FAM:raden
+FAM:ramnit
+FAM:ranky
+FAM:rasteal
+FAM:razam
+FAM:rbot
+FAM:rebhip
+FAM:recmads
+FAM:redalert
+FAM:rediassi
+FAM:redmobile
+FAM:redyms
+FAM:reflod
+FAM:refog
+FAM:regon
+FAM:relevantknowledge
+FAM:renocide
+FAM:renos
+FAM:reporo
+FAM:reptilicus
+FAM:resharer
+FAM:reveton
+FAM:riltok
+FAM:rimod
+FAM:robtes
+FAM:rodecap
+FAM:rogueurl
+FAM:root
+FAM:rootagent
+FAM:rootmaster
+FAM:rootnik
+FAM:rootsmart
+FAM:rotexy
+FAM:rufraud
+FAM:rukometa
+FAM:rungbu
+FAM:ruskill
+FAM:rusms
+FAM:sacti
+FAM:sacto
+FAM:sadenav
+FAM:sadpor
+FAM:sahat
+FAM:sakezon
+FAM:sality
+FAM:sanctionedmedia
+FAM:sandr
+FAM:savemy
+FAM:scam
+FAM:sckeylog
+FAM:sdbot
+FAM:seaweth
+FAM:secapk
+FAM:securityshield
+FAM:securityxploded
+FAM:senddroid
+FAM:severs
+FAM:sfone
+FAM:shastrosms
+FAM:shedun
+FAM:sheridroid
+FAM:shixot
+FAM:shiz
+FAM:shodi
+FAM:shuame
+FAM:shylock
+FAM:silentboot
+FAM:silentinst
+FAM:silentinstaller
+FAM:sillyfdc
+FAM:silverpush
+FAM:simbad
+FAM:simbot
+FAM:simda
+FAM:simpatchy
+FAM:simplocker
+FAM:sinowal
+FAM:skeeyah
+FAM:skplanet
+FAM:skymobi
+FAM:slic
+FAM:slocker
+FAM:slugin
+FAM:smammer
+FAM:smartfortress
+FAM:smcc
+FAM:smforw
+FAM:smishing
+FAM:smsagent
+FAM:smsbomber
+FAM:smsbot
+FAM:smscatch
+FAM:smscmd
+FAM:smsfakeinstall
+FAM:smsgol
+FAM:smskey
+FAM:smspay
+FAM:smsreg
+FAM:smssend
+FAM:smsspy
+FAM:smsthief
+FAM:smszombie
+FAM:snowfox
+FAM:socks
+FAM:soft32downloader
+FAM:softcnapp
+FAM:softobase
+FAM:softomate
+FAM:softonic
+FAM:softpulse
+FAM:sohanad
+FAM:sokmi
+FAM:somoto
+FAM:sopes
+FAM:sosceo
+FAM:soundy
+FAM:spbot
+FAM:speedingupmypc
+FAM:spigot
+FAM:spitmo
+FAM:spotad
+FAM:sprovider
+FAM:spyagent
+FAM:spyapp
+FAM:spybubble
+FAM:spydealer
+FAM:spyeye
+FAM:spyhasb
+FAM:spynote
+FAM:spyoo
+FAM:spyset
+FAM:spyzie
+FAM:squarenet
+FAM:stalk
+FAM:stealthcell
+FAM:steek
+FAM:stegvob
+FAM:stopsmsc
+FAM:stoqx
+FAM:strarpay
+FAM:styricka
+FAM:suaban
+FAM:suggestor
+FAM:supking
+FAM:svpeng
+FAM:swisyn
+FAM:swizzor
+FAM:systemmonitor
+FAM:systush
+FAM:sytro
+FAM:tachi
+FAM:talkw
+FAM:taojinstar
+FAM:tapcore
+FAM:target
+FAM:tdss
+FAM:tebak
+FAM:techsnab
+FAM:tedroo
+FAM:teebik
+FAM:tekwon
+FAM:telman
+FAM:tenga
+FAM:terkcop
+FAM:tescrypt
+FAM:teslacrypt
+FAM:tetus
+FAM:tgapp
+FAM:tgpotato
+FAM:tgpush
+FAM:tibs
+FAM:tibser
+FAM:tifamily
+FAM:tigerbot
+FAM:tinba
+FAM:tirrip
+FAM:tispy
+FAM:tocrenu
+FAM:toga
+FAM:toggle
+FAM:toofan
+FAM:tordow
+FAM:toreoc
+FAM:torjok
+FAM:totap
+FAM:towel
+FAM:tracer
+FAM:tracker
+FAM:trackerfree
+FAM:trackplus
+FAM:trclick
+FAM:tridrongo
+FAM:troom
+FAM:truedownloader
+FAM:tufik
+FAM:turkojan
+FAM:tuyoopay
+FAM:twetti
+FAM:txing
+FAM:typstu
+FAM:ultima
+FAM:umpay
+FAM:uniblue
+FAM:unruy
+FAM:upatre
+FAM:updtkiller
+FAM:urelas
+FAM:usatek
+FAM:ussder
+FAM:usteal
+FAM:utchi
+FAM:uupay
+FAM:uuserv
+FAM:vapsup
+FAM:vdloader
+FAM:verti
+FAM:vidro
+FAM:vietsms
+FAM:viking
+FAM:vilsel
+FAM:virlock
+FAM:virusdoctor
+FAM:virut
+FAM:viser
+FAM:vittalia
+FAM:vkemag
+FAM:vktihs
+FAM:vmvol
+FAM:vnapstore
+FAM:vobfus
+FAM:vpsdrop
+FAM:vsas
+FAM:vtflooder
+FAM:vundo
+FAM:wabot
+FAM:wajar
+FAM:waledac
+FAM:walex
+FAM:walien
+FAM:walkinwat
+FAM:wallad
+FAM:wannacry
+FAM:wannalocker
+FAM:wapomi
+FAM:wapron
+FAM:warezov
+FAM:webalta
+FAM:webkey
+FAM:webprefix
+FAM:whatsapp
+FAM:whistlesoftware
+FAM:whiteice
+FAM:whitesmoke
+FAM:wifikill
+FAM:winactivator
+FAM:winfetcher
+FAM:winkad
+FAM:winpump
+FAM:winwebsec
+FAM:wkload
+FAM:wooboo
+FAM:wowlik
+FAM:wqmobile
+FAM:wroba
+FAM:wtaspin
+FAM:xavierad
+FAM:xinhua
+FAM:xolco
+FAM:xorala
+FAM:xtrat
+FAM:xynyin
+FAM:yeahmobi
+FAM:yekrand
+FAM:yoga
+FAM:youku
+FAM:youmi
+FAM:yuchanglou
+FAM:yzhc
+FAM:zadmo
+FAM:zbot
+FAM:zdtad
+FAM:zegost
+FAM:zenosearch
+FAM:zeroaccess
+FAM:zhash
+FAM:zhidian
+FAM:zhui
+FAM:zitmo
+FAM:zlob
+FAM:zniu
+FAM:zombbomber
+FAM:ztorg
+FAM:zusy
+FAM:zvuzona
+FAM:zwangi
+FILE:bundle
+FILE:corrupted
+FILE:exploit
+FILE:exploit:asroot
+FILE:exploit:doidroot
+FILE:exploit:droidrt
+FILE:exploit:enoket
+FILE:exploit:exploid
+FILE:exploit:exynos
+FILE:exploit:fakeroot
+FILE:exploit:gingerbreak
+FILE:exploit:gxox
+FILE:exploit:master
+FILE:exploit:masterkey
+FILE:exploit:ratc
+FILE:exploit:rootor
+FILE:exploit:signaturebypass
+FILE:exploit:stagefright
+FILE:exploit:towelroot
+FILE:fakepdf
+FILE:fakepublisher
+FILE:filetype:flash
+FILE:filetype:html
+FILE:filetype:jpeg
+FILE:filetype:msoffice
+FILE:filetype:msoffice:msexcel
+FILE:filetype:msoffice:msword
+FILE:filetype:pdf
+FILE:filetype:text
+FILE:iframe
+FILE:iframe:iframeref
+FILE:installer
+FILE:installer:installmate
+FILE:installer:nsis
+FILE:installer:smartinstaller
+FILE:installer:wiseinstaller
+FILE:macro
+FILE:msil
+FILE:os:android
+FILE:os:linux
+FILE:os:mac
+FILE:os:windows
+FILE:packed
+FILE:packed:asprotect
+FILE:packed:decrypter
+FILE:packed:execryptor
+FILE:packed:expressor
+FILE:packed:krunchy
+FILE:packed:maskpe
+FILE:packed:molebox
+FILE:packed:nakedpack
+FILE:packed:nspack
+FILE:packed:pearmor
+FILE:packed:pecompact
+FILE:packed:polycrypt
+FILE:packed:rcryptor
+FILE:packed:themida
+FILE:packed:upack
+FILE:packed:vmprotect
+FILE:proglang:autoit
+FILE:proglang:delphi
+FILE:proglang:java
+FILE:proglang:java:genericgba
+FILE:proglang:perl
+FILE:proglang:powershell
+FILE:proglang:python
+FILE:proglang:visualbasic
+FILE:shellcode
+FILE:signed
+FILE:small
+FILE:testvirus
+FILE:webpage
+GEN:abuse
+GEN:access
+GEN:advml
+GEN:agen
+GEN:apk
+GEN:appl
+GEN:application
+GEN:attribute
+GEN:based
+GEN:behav
+GEN:behaveslike
+GEN:bloodhound
+GEN:cloud
+GEN:confidence
+GEN:dangerousobject
+GEN:deepscan
+GEN:eheur
+GEN:encodefeature
+GEN:file
+GEN:gen
+GEN:gena
+GEN:generic
+GEN:generickd
+GEN:genericr
+GEN:generik
+GEN:genetic
+GEN:genfamily:agent
+GEN:genfamily:artemis
+GEN:genfamily:badur
+GEN:genfamily:barys
+GEN:genfamily:dapato
+GEN:genfamily:delf
+GEN:genfamily:eldorado
+GEN:genfamily:foreign
+GEN:genfamily:graftor
+GEN:genfamily:jorik
+GEN:genfamily:katusha
+GEN:genfamily:kazy
+GEN:genfamily:krap
+GEN:genfamily:mikey
+GEN:genfamily:scar
+GEN:genfamily:strictor
+GEN:genfamily:symmi
+GEN:genfamily:yakes
+GEN:genmalicious
+GEN:genome
+GEN:hack
+GEN:heur
+GEN:heuristic
+GEN:high
+GEN:highconfidence
+GEN:igeneric
+GEN:kcloud
+GEN:lookslike
+GEN:malagent
+GEN:maldroid
+GEN:malicious
+GEN:maltrec
+GEN:malware
+GEN:memscan
+GEN:multi
+GEN:normal
+GEN:onion
+GEN:optional
+GEN:other
+GEN:password
+GEN:posible
+GEN:possible
+GEN:probably
+GEN:program
+GEN:reputation
+GEN:sape
+GEN:score
+GEN:securityrisk
+GEN:siggen
+GEN:software
+GEN:static
+GEN:susp
+GEN:suspect
+GEN:suspectcrc
+GEN:suspected
+GEN:suspic
+GEN:suspicious
+GEN:symvt
+GEN:threat
+GEN:trojan
+GEN:tsgeneric
+GEN:unclassifiedmalware
+GEN:undef
+GEN:undefined
+GEN:unknown
+GEN:variant
+GEN:website
diff --git a/avclass2/lib/avclass2_common.py b/avclass2/lib/avclass2_common.py
new file mode 100755
index 0000000..2233242
--- /dev/null
+++ b/avclass2/lib/avclass2_common.py
@@ -0,0 +1,636 @@
+#!/usr/bin/env python
+'''
+Main AVClass class
+'''
+
+import sys
+import re
+import string
+import logging
+from collections import OrderedDict as OrdDict
+from collections import namedtuple
+from operator import itemgetter, attrgetter
+
+# Set logging
+log = logging.getLogger(__name__)
+
+# Prefix to identify platform tags
+platform_prefix = "FILE:os:"
+
+# Default category for tags in taxonomy with no category
+uncategorized_cat  = "UNC"
+
+SampleInfo = namedtuple('SampleInfo', 
+                        ['md5', 'sha1', 'sha256', 'labels', 'vt_tags'])
+
+Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l'])
+
+# AV engines whose labels have the text after the last '.' removed
+suffix_removal_av_set = {'Avast', 'Avira', 'Comodo', 'ESET-NOD32',
+                          'Fortinet', 'GData', 'Jiangmin', 'Kaspersky',
+                          'Microsoft', 'NANO-Antivirus', 'Norman',
+                          'Sophos', 'TrendMicro',
+                          'TrendMicro-HouseCall'}
+
+def create_tag(s):
+    ''' Create a Tag from its string representation, e.g., CAT:prefix:name.
+        The first field is the category (uppercased), the last one is
+        the name (lowercased), and any fields in between are prefixes.
+        A string without ':' is a name in the default uncategorized
+        category, and its path is the name itself '''
+    # Surrounding whitespace (e.g., a trailing newline) is ignored
+    fields = s.strip().split(":")
+    name = fields[-1].lower()
+    if len(fields) == 1:
+        # No category given
+        return Tag(name, uncategorized_cat, name, [])
+    cat = fields[0].upper()
+    prefix_l = [field.lower() for field in fields[1:-1]]
+    # Path joins category, prefixes, and name with ':'
+    path = ':'.join([cat] + prefix_l + [name])
+    return Tag(name, cat, path, prefix_l)
+
+class Taxonomy:
+    '''
+    A taxonomy of tags and generic tokens read from file
+    '''
+    def __init__(self, filepath):
+        ''' Create taxonomy, reading tags from filepath if one is given '''
+        self.__tag_map = {}  # Map tag.name | tag.path -> Tag
+        if filepath:
+            self.read_taxonomy(filepath)
+
+    def __len__(self):
+        ''' Taxonomy length is the number of tags it contains '''
+        return len(self.__tag_map)//2  # tags have two keys; len() needs int
+
+    def is_generic(self, t):
+        ''' Tell whether t is an entry of the taxonomy in category GEN.
+            Inputs that are not in the taxonomy are not generic '''
+        entry = self.__tag_map.get(t)
+        if not entry:
+            return False
+        return entry.cat == "GEN"
+
+    def is_tag(self, t):
+        ''' Return true if input is a tag name or tag path in taxonomy '''
+        return t in self.__tag_map
+
+    def add_tag(self, s, override=False):
+        ''' Add tag given as string (e.g., CAT:prefix:name) to taxonomy.
+            The tag is indexed under both its name and its full path.
+            If a tag with the same name but a different path exists,
+            it is only replaced when override is True '''
+        tag = create_tag(s)
+        t = self.__tag_map.get(tag.name, None)
+        if t and (t.path != tag.path):
+            if (not override):
+                return
+            # Logger.warn is a deprecated alias, removed in Python 3.13
+            log.warning("[Taxonomy] Replacing %s with %s\n" % (
+                              t.path, tag.path))
+            del self.__tag_map[t.path]
+        log.info("[Taxonomy] Adding tag %s" % s)
+        self.__tag_map[tag.name] = tag
+        self.__tag_map[tag.path] = tag
+
+    def remove_tag(self, t):
+        ''' Remove tag from taxonomy. Returns 1 if removed, zero if unknown '''
+        tag = self.__tag_map.get(t, None)
+        if not tag:
+            return 0
+        log.info("[Taxonomy] Removing tag: %s" % tag.path)
+        # pop(): name and path are a single key for uncategorized tags
+        self.__tag_map.pop(tag.name, None)
+        self.__tag_map.pop(tag.path, None)
+        return 1
+
+    def get_category(self, t):
+        ''' Return category of the tag identified by t (name or path).
+            Inputs that are not in the taxonomy get category UNK '''
+        entry = self.__tag_map.get(t)
+        if not entry:
+            return "UNK"
+        return entry.cat
+
+    def get_path(self, t):
+        ''' Return full path for given tag, or UNK:t if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.path
+        else:
+            return ("UNK:" + t)
+
+    def get_prefix_l(self, t):
+        ''' Return prefix list for given tag, or empty list if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.prefix_l
+        else:
+            return []
+
+    def get_prefix(self, t):
+        ''' Return prefix list for given tag (same as get_prefix_l), 
+            or empty list if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.prefix_l
+        else:
+            return []  # t is a plain string here, it has no path to slice
+
+    def get_depth(self, t):
+        ''' Return depth of given tag in taxonomy, i.e., two plus the
+            number of prefixes in its path, so CAT:name has depth two.
+            Inputs that are not in the taxonomy have depth zero '''
+        entry = self.__tag_map.get(t)
+        if not entry:
+            return 0
+        # Two for category and name, plus one per prefix
+        return 2 + len(entry.prefix_l)
+
+    def get_info(self, t):
+        ''' Return (path, category) pair for given tag.
+            Inputs not in the taxonomy yield ("UNK:" + t, "UNK") '''
+        entry = self.__tag_map.get(t)
+        if not entry:
+            return ("UNK:" + t, "UNK")
+        return (entry.path, entry.cat)
+
+    def expand(self, t):
+        ''' Return the prefixes of the given tag that are themselves
+            entries in the taxonomy, empty list if t is not a tag '''
+        entry = self.__tag_map.get(t)
+        if not entry:
+            return []
+        return [p for p in entry.prefix_l if p in self.__tag_map]
+
+    def platform_tags(self):
+        ''' Returns set with the names of the platform tags in taxonomy,
+            i.e., tags whose path starts with the platform prefix '''
+        # Each tag is stored under two keys, the set removes duplicates
+        names = (tag.name for tag in self.__tag_map.values()
+                    if tag.path.startswith(platform_prefix))
+        return set(names)
+
+    def overlaps(self, t1, t2):
+        ''' Returns true if either tag is in the prefix list of the other '''
+        if t1 in self.get_prefix_l(t2):
+            return True
+        return t2 in self.get_prefix_l(t1)
+
+    def remove_overlaps(self, l):
+        ''' Returns list without overlapping tags. Tags are visited by
+            decreasing depth and dropped if they overlap a kept tag '''
+        if not l:
+            return l
+        ranked = sorted(((self.get_depth(t), t) for t in l), reverse=True)
+        out_l = []
+        for _, t in ranked:
+            if all(not self.overlaps(t, e) for e in out_l):
+                out_l.append(t)
+        return out_l
+
+    def read_taxonomy(self, filepath):
+        ''' Add every tag listed in given file, one tag per line.
+            Lines starting with '#' and empty lines are skipped '''
+        with open(filepath, 'r') as fd:
+            for line in fd:
+                is_comment = line.startswith('#')
+                if not (is_comment or line == '\n'):
+                    self.add_tag(line.strip())
+
+    def to_file(self, filepath):
+        ''' Output sorted taxonomy to given file.
+            Writes the full path of each tag, one per line,
+            sorted by path '''
+        # Each tag is indexed by both name and path, but those are a
+        # single key for uncategorized tags (path == name), so taking
+        # every other entry of the sorted map would skip some tags.
+        # Collect the distinct paths instead
+        path_set = set()
+        for tag in self.__tag_map.values():
+            path_set.add(tag.path)
+        # The with statement closes the file even if a write fails
+        with open(filepath, 'w') as fd:
+            for path in sorted(path_set):
+                fd.write(path+"\n")
+
+class Rules:
+    '''
+    Rules are src -> dst1, dst2, ... relations
+    '''
+    def __init__(self, filepath):
+        ''' Create rule set, reading rules from filepath if one is given '''
+        self._rmap = {}  # Map src -> set(dst)
+        if filepath:
+            self.read_rules(filepath)
+
+    def __len__(self):
+        ''' Length is number of rules, i.e., number of src '''
+        return len(self._rmap)
+
+    def add_rule(self, src, dst_l, overwrite=False):
+        ''' Add rule src -> dst_l. Tags are stored by name.
+            If a rule for src already exists:
+              if overwrite==True, replace its destinations with dst_l
+              else add dst_l to its current destinations '''
+        # Remove src from dst_l if it exists
+        # A list is built since the iterator returned by filter() in
+        # Python 3 is always true, which defeats the empty check below
+        dst_l = [dst for dst in dst_l if dst != src]
+        # If no destinations, nothing to do
+        if (not dst_l):
+            return
+        log.info("[Rules] Adding %s -> %s" % (src, dst_l))
+        src_tag = create_tag(src)
+        target_set = set(create_tag(dst).name for dst in dst_l)
+        if overwrite:
+            self._rmap[src_tag.name] = target_set
+        else:
+            # Extend destinations of existing rule, if any
+            self._rmap.setdefault(src_tag.name, set()).update(target_set)
+        return
+
+    def remove_rule(self, src):
+        ''' Remove rule for src. Returns 1 if removed, zero if unknown '''
+        dst_set = self._rmap.get(src, [])
+        if not dst_set:
+            return 0
+        log.info("[Rules] Removing rule: %s -> %s" % (src, dst_set))
+        del self._rmap[src]
+        return 1
+
+    def get_dst(self, src):
+        ''' Returns list with destinations of rule for src, empty if none '''
+        return list(self._rmap.get(src, ()))
+
+    def read_rules(self, filepath):
+        ''' Read rules from given file. A rule line has the source
+            followed by its destinations, separated by whitespace.
+            Comments, empty lines, and lone sources are skipped '''
+        with open(filepath, 'r') as fd:
+            for line in fd:
+                word_list = line.strip().split()
+                skip = line.startswith('#') or line == '\n'
+                if (not skip) and len(word_list) > 1:
+                    self.add_rule(word_list[0], word_list[1:])
+
+    def to_file(self, filepath, taxonomy=None):
+        ''' Output sorted rules to given file.
+            Each line holds a source followed by its sorted destinations,
+            separated by tabs.
+            If taxonomy is provided, it outputs full tag path '''
+        # The with statement closes the file even if a write fails
+        with open(filepath, 'w') as fd:
+            # Rules are written sorted by source
+            for src,dst_set in sorted(self._rmap.items()):
+                dst_l = sorted(dst_set, reverse=False)
+                if taxonomy:
+                    # Destinations keep the order of their names
+                    src = taxonomy.get_path(src)
+                    dst_l = [taxonomy.get_path(t) for t in dst_l]
+                fd.write("%s\t%s\n" % (src, '\t'.join(dst_l)))
+
+    def expand_src_destinations(self, src):
+        ''' Return destination set for given src after recursively
+            following any rules for destinations. Stored rules are kept '''
+        # Copy the set, popping from the stored one would erase the rule
+        dst_set = set(self._rmap.get(src, set()))
+        out, seen = set(), set()  # seen: followed already, ends cycles
+        while dst_set:
+            dst = dst_set.pop()
+            seen.add(dst)
+            l = self._rmap.get(dst, [])
+            if l:
+                dst_set.update(e for e in l if e not in seen)
+            else:
+                out.add(dst)
+        return out
+
+    def expand_all_destinations(self):
+        ''' Replace the destinations of every rule with the result of
+            expand_src_destinations() for its src. Returns nothing '''
+        src_l = self._rmap.keys()
+        for src in src_l:
+            dst_l = self.expand_src_destinations(src)
+            self._rmap[src] = dst_l
+
+class Tagging(Rules):
+    '''
+    Tagging rules have src UNK and dst in taxonomy
+    '''
+    def __init__(self, filepath):
+        Rules.__init__(self, filepath)
+
+    def validate(self, taxonomy):
+        ''' Report on stdout each rule destination not in given taxonomy '''
+        for tag_set in self._rmap.values():
+            unknown_l = [t for t in tag_set if not taxonomy.is_tag(t)]
+            for t in unknown_l:
+                sys.stdout.write("[Tagging] %s not in taxonomy\n" % t)
+
+class Expansion(Rules):
+    '''
+    Expansion rules have src and dst in taxonomy and
+        src.category != dst.category
+    '''
+    def __init__(self, filepath):
+        Rules.__init__(self, filepath)
+
+    def validate(self, taxonomy):
+        ''' Report on stdout each source and destination in the
+            expansion rules that is not in given taxonomy '''
+        for src,dst_set in self._rmap.items():
+            # Source is checked first, then its destinations
+            for t in [src] + list(dst_set):
+                if (not taxonomy.is_tag(t)):
+                    sys.stdout.write("[Expansion] %s not in taxonomy\n" % t)
+
+class AvLabels:
+    '''
+    Class to operate on AV labels, 
+    such as extracting the most likely family name.
+    '''
+    def __init__(self, tag_file, exp_file = None, tax_file = None,
+                 av_file = None, aliasdetect=False):
+        # Read taxonomy (left empty if no file is given)
+        self.taxonomy = Taxonomy(tax_file)
+        # Read tag rules
+        self.tagging = Tagging(tag_file)
+        # Read expansion rules
+        self.expansions = Expansion(exp_file)
+        # Read set of AV engines to use, None means no AV filtering
+        self.avs = self.read_avs(av_file) if av_file else None
+        # When true, get_sample_tags() does not expand the tags
+        self.aliasdetect = aliasdetect
+
+    @staticmethod
+    def read_avs(avs_file):
+        ''' Return set with the AV engine names in given file, one per line '''
+        with open(avs_file) as fd:
+            name_l = [line.strip() for line in fd]
+        return set(name_l)
+
+    @staticmethod
+    def get_sample_info_lb(vt_rep):
+        ''' Build SampleInfo from a parsed JSON line that holds the
+            hashes of the sample and its AV labels under key av_labels.
+            The VT tags of the returned SampleInfo are an empty list '''
+        md5, sha1, sha256 = (vt_rep[k] for k in ('md5', 'sha1', 'sha256'))
+        return SampleInfo(md5, sha1, sha256, vt_rep['av_labels'], [])
+
+    @staticmethod
+    def get_sample_info_vt_v2(vt_rep):
+        ''' Build SampleInfo from a parsed JSON line with top-level keys
+            scans, md5, sha1, and sha256. Returns None if any is missing.
+            Labels are (AV name, label) pairs for detections only,
+            with non-printable characters removed from the label '''
+        # Obtain scan results and hashes, if available
+        try:
+            scans = vt_rep['scans']
+            md5 = vt_rep['md5']
+            sha1 = vt_rep['sha1']
+            sha256 = vt_rep['sha256']
+        except KeyError:
+            return None
+        # Obtain labels from scan results
+        label_pairs = []
+        for av, res in scans.items():
+            if not res['detected']:
+                continue
+            label = res['result']
+            printable = [c for c in label if c in string.printable]
+            label_pairs.append((av, ''.join(printable).strip()))
+        # Obtain VT tags, if available
+        vt_tags = vt_rep.get('tags', [])
+
+        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
+
+    @staticmethod
+    def get_sample_info_vt_v3(vt_rep):
+        ''' Build SampleInfo from a parsed JSON line whose data is under
+            data -> attributes. Returns None if results or hashes miss.
+            Labels are (AV name, label) pairs for engines with a result,
+            with non-printable characters removed from the label '''
+        # Obtain scan results and hashes, if available
+        try:
+            attributes = vt_rep['data']['attributes']
+            scans = attributes['last_analysis_results']
+            md5 = attributes['md5']
+            sha1 = attributes['sha1']
+            sha256 = attributes['sha256']
+        except KeyError:
+            return None
+        # Obtain labels from scan results
+        label_pairs = []
+        for av, res in scans.items():
+            label = res['result']
+            if label is None:
+                continue
+            printable = [c for c in label if c in string.printable]
+            label_pairs.append((av, ''.join(printable).strip()))
+        # Obtain VT tags, if available
+        vt_tags = attributes.get('tags', [])
+        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
+
+
+    @staticmethod
+    def is_pup(tag_pairs, taxonomy):
+        ''' Classify the sample as PUP using its ranked (tag, count) pairs.
+            Returns None if there are no pairs. Otherwise, only the first
+            CLASS tag matters: True if its path contains "grayware" and its
+            count is at least half the count of the first pair, else False
+        '''
+        threshold = 0.5
+        # Without tags no decision can be made
+        if len(tag_pairs) < 1:
+            return None
+        max_ctr = tag_pairs[0][1]
+        for (tag,ctr) in tag_pairs:
+            (path, cat) = taxonomy.get_info(tag)
+            if (cat != "CLASS"):
+                continue
+            # First CLASS tag decides
+            if ("grayware" not in path):
+                return False
+            return (float(ctr) >= float(max_ctr)*threshold)
+        return False
+
+    @staticmethod
+    def __remove_suffixes(av_name, label):
+        ''' Strip the AV specific suffix from given label, if it has one.
+            Returns the (possibly shortened) label '''
+
+        # Engines in suffix_removal_av_set: drop text after last '.'
+        if av_name in suffix_removal_av_set:
+            label = label.rsplit('.', 1)[0]
+
+        # AVG: drop text after last '.' only when it has no lowercase,
+        # i.e., it only contains digits and uppercase characters
+        if av_name == 'AVG':
+            head, dot, tail = label.rpartition('.')
+            if dot and re.match("^[A-Z0-9]+$", tail):
+                label = head
+
+        # Agnitum: drop text after last '!'
+        if av_name == 'Agnitum':
+            label = label.rsplit('!', 1)[0]
+
+        return label
+
+
+    def get_label_tags(self, label, hashes):
+        ''' Return set of tags in given label.
+            Tokenizes label, filters unneeded tokens, and
+            applies tagging rules.
+            label is the AV label to process and hashes is the list
+            with the hashes of the sample.
+            Tokens without tagging rule are returned as tags
+            if they are longer than 3 characters '''
+
+        # Initialize set of tags to return
+        # We use a set to avoid duplicate tokens in the same AV label
+        # This avoids "potentially unwanted" contributing twice BEH:pup
+        tags = set()
+
+        # If empty label, nothing to do
+        if not label:
+            return tags
+
+        # Split label into tokens and process each token
+        for token in re.split("[^0-9a-zA-Z]", label):
+            # Convert token to lowercase
+            token = token.lower()
+
+            # Remove digits at the end
+            # Tokens only hold ASCII alphanumerics after the split, so
+            # stripping string.digits equals removing a trailing \d* match
+            token = token.rstrip(string.digits)
+
+            # Ignore token if prefix of a hash of the sample
+            # Most AVs use MD5 prefixes in labels, 
+            # but we check SHA1 and SHA256 as well
+            # Empty tokens are a prefix of any hash, so they are ignored
+            if any(hash_str.startswith(token) for hash_str in hashes):
+                continue
+
+            # Ignore generic tokens
+            if self.taxonomy.is_generic(token):
+                continue
+
+            # Apply tagging rule
+            dst_l = self.tagging.get_dst(token)
+            if dst_l:
+                # Ignore generic tokens
+                for t in dst_l:
+                    if not self.taxonomy.is_generic(t):
+                        tags.add(t)
+            # Add token if longer than 3 characters and no tagging rule
+            elif len(token) > 3:
+                tags.add(token)
+
+        # Return tags
+        return tags
+
+
+    def __expand(self, tag_set):
+        ''' Return set with the given tags plus the tags they expand to.
+            A tag expands to the destinations of its expansion rule
+            and to whatever the taxonomy returns when expanding it '''
+
+        # Start from the input tags
+        ret = set(tag_set)
+
+        for t in tag_set:
+            # Explicit expansions, i.e., targets of expansion rule
+            ret.update(self.expansions.get_dst(t))
+
+            # Implicit expansions in taxonomy
+            ret.update(self.taxonomy.expand(t))
+        return ret
+
+    def get_sample_tags(self, sample_info):
+        ''' Returns dictionary mapping each tag found for the sample to
+            the list of AV engines whose label produced that tag.
+            Labels go through AV selection, duplicate removal, suffix
+            removal, tokenization/tagging and, unless alias detection
+            is enabled, expansion '''
+
+        # Hashes of the sample, passed along to the label tokenization
+        hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ]
+        # Labels already processed for this sample
+        seen_labels = set()
+        # Output map tag -> list of AV names
+        av_dict = {}
+
+        for (av_name, label) in sample_info.labels:
+            # Nothing to do for empty labels
+            if not label:
+                continue
+
+            ################
+            # AV selection #
+            ################
+            # If a set of AVs was given, only use labels from those AVs
+            if self.avs and av_name not in self.avs:
+                continue
+
+            #####################
+            # Duplicate removal #
+            #####################
+
+            # Emsisoft uses same label as 
+            # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
+            # but suffixes ' (B)' to their label. Remove the suffix.
+            if label.endswith(' (B)'):
+                label = label[:-len(' (B)')]
+
+            # F-Secure uses Avira's engine since Nov. 2018
+            # but prefixes 'Malware.' to Avira's label. Remove the prefix.
+            if label.startswith('Malware.'):
+                label = label[len('Malware.'):]
+
+            # Other engines often use exactly the same label, e.g.,
+            #   AVG/Avast
+            #   K7Antivirus/K7GW
+            #   Kaspersky/ZoneAlarm
+            # Only the first occurrence of a label is processed
+            if label in seen_labels:
+                continue
+            seen_labels.add(label)
+
+            ##################
+            # Suffix removal #
+            ##################
+            label = self.__remove_suffixes(av_name, label)
+
+            ########################################################
+            # Tokenization and tagging                             #
+            ########################################################
+            tags = self.get_label_tags(label, hashes)
+
+            ########################################################
+            # Expansions                                           #
+            ########################################################
+            # NOTE: Expansion is skipped when detecting aliases
+            if not self.aliasdetect:
+                tags = self.__expand(tags)
+
+            ########################################################
+            # Stores information that relates AV vendors with tags #
+            ########################################################
+            for t in tags:
+                if t not in av_dict:
+                    av_dict[t] = []
+                av_dict[t].append(av_name)
+
+        return av_dict
+
+    def rank_tags(self, av_dict, threshold=1):
+        ''' Return list of (tag, count) pairs, where count is the number
+            of AVs for the tag, sorted by decreasing count (ties by
+            decreasing tag). Tags with count <= threshold are dropped '''
+        pairs = [(len(avs), t) for (t, avs) in av_dict.items()
+                    if len(avs) > threshold]
+        return [(t, ctr) for (ctr, t) in sorted(pairs, reverse=True)]
+
diff --git a/avclass2/lib/evaluate_clustering.py b/avclass2/lib/evaluate_clustering.py
new file mode 100755
index 0000000..2196d7a
--- /dev/null
+++ b/avclass2/lib/evaluate_clustering.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+import sys
+
+def tp_fp_fn(CORRECT_SET, GUESS_SET):
+    """
+    Compare one ground-truth cluster against one estimated cluster.
+
+    INPUT: the collection of elements in the cluster from the ground truth
+    (CORRECT_SET) and the collection of elements in the estimated cluster
+    (GUESS_SET). Both only need to support iteration and membership tests.
+
+    OUTPUT: tuple (tp, fp, fn) with the number of True Positives (elements
+    in both clusters), False Positives (elements only in GUESS_SET) and
+    False Negatives (elements only in CORRECT_SET).
+    """
+
+    # Walk the estimated cluster once, recording for each element whether
+    # the ground-truth cluster contains it as well.
+    in_correct = [member in CORRECT_SET for member in GUESS_SET]
+
+    # True Positives: estimated elements confirmed by the ground truth.
+    true_pos = sum(in_correct)
+    # False Positives: the remaining estimated elements.
+    false_pos = len(in_correct) - true_pos
+    # False Negatives: ground-truth elements the estimate missed.
+    false_neg = sum(1 for member in CORRECT_SET if member not in GUESS_SET)
+
+    return true_pos, false_pos, false_neg
+
+
+def eval_precision_recall_fmeasure(GROUNDTRUTH_DICT, ESTIMATED_DICT):
+    """
+    Evaluate an estimated clustering against a ground-truth clustering.
+
+    INPUT: dictionary with the mapping "element:cluster_id" for both the
+    ground truth (GROUNDTRUTH_DICT) and the estimated clustering
+    (ESTIMATED_DICT). Cluster ids must be hashable. Every element of
+    ESTIMATED_DICT must also be a key of GROUNDTRUTH_DICT, otherwise a
+    KeyError is raised.
+
+    OUTPUT: tuple (precision, recall, fmeasure) as percentages. Precision
+    and recall are averaged over the elements of ESTIMATED_DICT; F-Measure
+    is their harmonic mean. An empty ESTIMATED_DICT yields
+    (0.0, 0.0, 0.0) instead of raising ZeroDivisionError.
+
+    Progress is reported on stderr.
+    """
+    # Group the elements of each clustering by cluster id
+    est_clusters = {}
+    for element, cluster_id in ESTIMATED_DICT.items():
+        est_clusters.setdefault(cluster_id, set()).add(element)
+    gt_clusters = {}
+    for element, cluster_id in GROUNDTRUTH_DICT.items():
+        gt_clusters.setdefault(cluster_id, set()).add(element)
+
+    total = len(ESTIMATED_DICT)
+    sum_precision = 0
+    sum_recall = 0
+    # Elements sharing the same (correct, guess) cluster pair get the same
+    # scores, so each pair is compared only once
+    pair_scores = {}
+
+    sys.stderr.write('Calculating precision and recall\n')
+
+    for counter, element in enumerate(ESTIMATED_DICT):
+        # Print progress
+        if counter % 1000 == 0:
+            sys.stderr.write('\r%d out of %d' % (counter, total))
+            sys.stderr.flush()
+
+        pair = (GROUNDTRUTH_DICT[element], ESTIMATED_DICT[element])
+        if pair not in pair_scores:
+            correct_set = gt_clusters[pair[0]]
+            guess_set = est_clusters[pair[1]]
+            # tp: in both clusters; fp: only in the estimated cluster;
+            # fn: only in the correct cluster
+            tp = len(correct_set & guess_set)
+            fp = len(guess_set) - tp
+            fn = len(correct_set) - tp
+            pair_scores[pair] = (1.0*tp/(tp+fp), 1.0*tp/(tp+fn))
+        p, r = pair_scores[pair]
+        sum_precision += p
+        sum_recall += r
+    sys.stderr.write('\r%d out of %d' % (total, total))
+    sys.stderr.write('\n')
+
+    # Nothing to average: avoid dividing by zero
+    if total == 0:
+        return 0.0, 0.0, 0.0
+    precision = 100.0*sum_precision/total
+    recall = 100.0*sum_recall/total
+    fmeasure = (2*precision*recall)/(precision+recall)
+    return precision, recall, fmeasure
+
+
+if __name__ == "__main__":
+
+    # The ground truth.
+    # Dictionary with mapping: "element : cluster_id".
+    diz_grth = {
+        "a": 1,
+        "b": 1,
+        "c": 2,
+        "d": 3
+    }
+
+    # An example of an "estimated cluster".
+    # Dictionary with mapping: "element : cluster_id".
+    diz_estim = {
+        "a": 66,
+        "b": 'malware',
+        "c": 'goodware',
+        "d": 'trojan'
+    }
+
+    # Reference example (not evaluated by the demo below): same partitioning
+    # as the ground truth, but with different cluster labels. It would give
+    # Precision == Recall == F-Measure == 100%.
+    # Dictionary with mapping: "element : cluster_id".
+    diz_estim_grth = {
+        "a": 2,
+        "b": 2,
+        "c": 66,
+        "d": 9
+    }
+
+    # Demo: print both clusterings, then score a sample where estimated != ground truth
+    sys.stdout.write("Ground truth\n")
+    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
+    for k, v in diz_grth.items():
+        sys.stdout.write("%8s --> %10s\n" % (k, v))
+    sys.stdout.write("\nEstimated clustering\n")
+    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
+    for k, v in diz_estim.items():
+        sys.stdout.write("%8s --> %10s\n" % (k, v))
+    # precision, recall, f-measure (each reported as a percentage)
+    p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim)
+    sys.stdout.write("\nPrecision: %s%%\n" % p)
+    sys.stdout.write("Recall: %s%%\n" % r)
+    sys.stdout.write("F-Measure: %s%%\n" % f)
+    
diff --git a/data/malheurReference_gt.tsv b/examples/malheurReference_gt.tsv
similarity index 100%
rename from data/malheurReference_gt.tsv
rename to examples/malheurReference_gt.tsv
diff --git a/data/malheurReference_lb.json b/examples/malheurReference_lb.json
similarity index 100%
rename from data/malheurReference_lb.json
rename to examples/malheurReference_lb.json
diff --git a/examples/vtv3_sample.json b/examples/vtv3_sample.json
new file mode 100644
index 0000000..08dfe77
--- /dev/null
+++ b/examples/vtv3_sample.json
@@ -0,0 +1 @@
+{ "data": { "attributes": { "creation_date": 1584397860, "exiftool": { "Author": "Tatyana", "Characters": 1896, "CharactersWithSpaces": 2224, "CreateDate": "2020:03:16 22:31:00", "FileType": "RTF", "FileTypeExtension": "rtf", "InternalVersionNumber": "57433", "LastModifiedBy": "apcach E", "MIMEType": "text/rtf", "ModifyDate": "2020:03:16 22:31:00", "Pages": 1, "RevisionNumber": "2", "TotalEditTime": "1 minute", "Warning": "Unsupported RTF encoding cp936. Will assume Latin.", "Words": 332 }, "first_submission_date": 1584418873, "last_analysis_date": 1584939766, "last_analysis_results": { "ALYac": { "category": "malicious", "engine_name": "ALYac", "engine_update": "20200323", "engine_version": "1.1.1.5", "method": "blacklist", "result": "Exploit.CVE-2017-11882" }, "APEX": { "category": "type-unsupported", "engine_name": "APEX", "engine_update": "20200322", "engine_version": "6.3", "method": "blacklist", "result": null }, "AVG": { "category": "malicious", "engine_name": "AVG", "engine_update": "20200323", "engine_version": "18.4.3895.0", "method": "blacklist", "result": "Other:Malware-gen [Trj]" }, "Acronis": { "category": "type-unsupported", "engine_name": "Acronis", "engine_update": "20200315", "engine_version": "1.1.1.73", "method": "blacklist", "result": null }, "Ad-Aware": { "category": "malicious", "engine_name": "Ad-Aware", "engine_update": "20200323", "engine_version": "3.0.5.370", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "AegisLab": { "category": "malicious", "engine_name": "AegisLab", "engine_update": "20200323", "engine_version": "4.2", "method": "blacklist", "result": "Hacktool.RTF.CVE-2018-0802.3!c" }, "AhnLab-V3": { "category": "undetected", "engine_name": "AhnLab-V3", "engine_update": "20200323", "engine_version": "3.17.3.26870", "method": "blacklist", "result": null }, "Alibaba": { "category": "type-unsupported", "engine_name": "Alibaba", "engine_update": "20190527", "engine_version": "0.3.0.5", "method": "blacklist", "result": null }, 
"Antiy-AVL": { "category": "malicious", "engine_name": "Antiy-AVL", "engine_update": "20200323", "engine_version": "3.0.0.1", "method": "blacklist", "result": "Trojan[Exploit]/RTF.Obscure.Gen" }, "Arcabit": { "category": "malicious", "engine_name": "Arcabit", "engine_update": "20200323", "engine_version": "1.0.0.870", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "Avast": { "category": "malicious", "engine_name": "Avast", "engine_update": "20200323", "engine_version": "18.4.3895.0", "method": "blacklist", "result": "Other:Malware-gen [Trj]" }, "Avast-Mobile": { "category": "undetected", "engine_name": "Avast-Mobile", "engine_update": "20200319", "engine_version": "200319-00", "method": "blacklist", "result": null }, "Avira": { "category": "malicious", "engine_name": "Avira", "engine_update": "20200323", "engine_version": "8.3.3.8", "method": "blacklist", "result": "EXP/CVE-2017-11882.zfknn" }, "Baidu": { "category": "undetected", "engine_name": "Baidu", "engine_update": "20190318", "engine_version": "1.0.0.2", "method": "blacklist", "result": null }, "BitDefender": { "category": "malicious", "engine_name": "BitDefender", "engine_update": "20200323", "engine_version": "7.2", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "BitDefenderTheta": { "category": "undetected", "engine_name": "BitDefenderTheta", "engine_update": "20200311", "engine_version": "7.2.37796.0", "method": "blacklist", "result": null }, "Bkav": { "category": "undetected", "engine_name": "Bkav", "engine_update": "20200321", "engine_version": "1.3.0.9899", "method": "blacklist", "result": null }, "CAT-QuickHeal": { "category": "malicious", "engine_name": "CAT-QuickHeal", "engine_update": "20200323", "engine_version": "14.00", "method": "blacklist", "result": "RTF.Agent.37108" }, "CMC": { "category": "undetected", "engine_name": "CMC", "engine_update": "20190321", "engine_version": "1.1.0.977", "method": "blacklist", "result": null }, "ClamAV": { "category": "malicious", 
"engine_name": "ClamAV", "engine_update": "20200322", "engine_version": "0.102.2.0", "method": "blacklist", "result": "Rtf.Dropper.Agent-7624526-0" }, "Comodo": { "category": "undetected", "engine_name": "Comodo", "engine_update": "20200323", "engine_version": "32234", "method": "blacklist", "result": null }, "CrowdStrike": { "category": "type-unsupported", "engine_name": "CrowdStrike", "engine_update": "20180202", "engine_version": "1.0", "method": "blacklist", "result": null }, "Cybereason": { "category": "type-unsupported", "engine_name": "Cybereason", "engine_update": "20180308", "engine_version": null, "method": "blacklist", "result": null }, "Cylance": { "category": "type-unsupported", "engine_name": "Cylance", "engine_update": "20200323", "engine_version": "2.3.1.101", "method": "blacklist", "result": null }, "Cyren": { "category": "malicious", "engine_name": "Cyren", "engine_update": "20200323", "engine_version": "6.2.2.2", "method": "blacklist", "result": "RTF/CVE1711882" }, "DrWeb": { "category": "malicious", "engine_name": "DrWeb", "engine_update": "20200323", "engine_version": "7.0.44.12030", "method": "blacklist", "result": "Exploit.Rtf.CVE2012-0158" }, "ESET-NOD32": { "category": "malicious", "engine_name": "ESET-NOD32", "engine_update": "20200323", "engine_version": "21042", "method": "blacklist", "result": "Win32/Exploit.CVE-2017-11882.AWP" }, "Emsisoft": { "category": "malicious", "engine_name": "Emsisoft", "engine_update": "20200323", "engine_version": "2018.12.0.1641", "method": "blacklist", "result": "Trojan.Agent.ENJC (B)" }, "Endgame": { "category": "type-unsupported", "engine_name": "Endgame", "engine_update": "20200226", "engine_version": "3.0.17", "method": "blacklist", "result": null }, "F-Prot": { "category": "malicious", "engine_name": "F-Prot", "engine_update": "20200323", "engine_version": "4.7.1.166", "method": "blacklist", "result": "RTF/CVE1711882" }, "F-Secure": { "category": "malicious", "engine_name": "F-Secure", "engine_update": 
"20200323", "engine_version": "12.0.86.52", "method": "blacklist", "result": "Exploit.EXP/CVE-2017-11882.zfknn" }, "FireEye": { "category": "undetected", "engine_name": "FireEye", "engine_update": "20200316", "engine_version": "32.31.0.0", "method": "blacklist", "result": null }, "Fortinet": { "category": "malicious", "engine_name": "Fortinet", "engine_update": "20200323", "engine_version": "6.2.142.0", "method": "blacklist", "result": "RTF/Dropper.A879!tr" }, "GData": { "category": "malicious", "engine_name": "GData", "engine_update": "20200323", "engine_version": "A:25.25222B:26.18109", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "Ikarus": { "category": "malicious", "engine_name": "Ikarus", "engine_update": "20200322", "engine_version": "0.1.5.2", "method": "blacklist", "result": "Exploit.CVE-2017-11882" }, "Invincea": { "category": "type-unsupported", "engine_name": "Invincea", "engine_update": "20200219", "engine_version": "6.3.6.26157", "method": "blacklist", "result": null }, "Jiangmin": { "category": "undetected", "engine_name": "Jiangmin", "engine_update": "20200322", "engine_version": "16.0.100", "method": "blacklist", "result": null }, "K7AntiVirus": { "category": "undetected", "engine_name": "K7AntiVirus", "engine_update": "20200323", "engine_version": "11.100.33608", "method": "blacklist", "result": null }, "K7GW": { "category": "undetected", "engine_name": "K7GW", "engine_update": "20200322", "engine_version": "11.100.33607", "method": "blacklist", "result": null }, "Kaspersky": { "category": "malicious", "engine_name": "Kaspersky", "engine_update": "20200323", "engine_version": "15.0.1.13", "method": "blacklist", "result": "HEUR:Exploit.RTF.CVE-2018-0802.gen" }, "Kingsoft": { "category": "undetected", "engine_name": "Kingsoft", "engine_update": "20200323", "engine_version": "2013.8.14.323", "method": "blacklist", "result": null }, "MAX": { "category": "undetected", "engine_name": "MAX", "engine_update": "20200323", "engine_version": 
"2019.9.16.1", "method": "blacklist", "result": null }, "Malwarebytes": { "category": "undetected", "engine_name": "Malwarebytes", "engine_update": "20200323", "engine_version": "3.6.4.335", "method": "blacklist", "result": null }, "MaxSecure": { "category": "undetected", "engine_name": "MaxSecure", "engine_update": "20200320", "engine_version": "1.0.0.1", "method": "blacklist", "result": null }, "McAfee": { "category": "malicious", "engine_name": "McAfee", "engine_update": "20200322", "engine_version": "6.0.6.653", "method": "blacklist", "result": "RTFObfustream.a!5E31D16D6BF3" }, "McAfee-GW-Edition": { "category": "malicious", "engine_name": "McAfee-GW-Edition", "engine_update": "20200322", "engine_version": "v2017.3010", "method": "blacklist", "result": "RTFObfustream.a!5E31D16D6BF3" }, "MicroWorld-eScan": { "category": "malicious", "engine_name": "MicroWorld-eScan", "engine_update": "20200323", "engine_version": "14.0.409.0", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "Microsoft": { "category": "malicious", "engine_name": "Microsoft", "engine_update": "20200323", "engine_version": "1.1.16800.2", "method": "blacklist", "result": "Exploit:O97M/CVE-2017-11882.G!MTB" }, "NANO-Antivirus": { "category": "malicious", "engine_name": "NANO-Antivirus", "engine_update": "20200323", "engine_version": "1.0.134.25032", "method": "blacklist", "result": "Exploit.Rtf.Heuristic-rtf.dinbqn" }, "Paloalto": { "category": "type-unsupported", "engine_name": "Paloalto", "engine_update": "20200323", "engine_version": "1.0", "method": "blacklist", "result": null }, "Panda": { "category": "undetected", "engine_name": "Panda", "engine_update": "20200322", "engine_version": "4.6.4.2", "method": "blacklist", "result": null }, "Qihoo-360": { "category": "malicious", "engine_name": "Qihoo-360", "engine_update": "20200323", "engine_version": "1.0.0.1120", "method": "blacklist", "result": "heur.rtf.obfuscated.1" }, "Rising": { "category": "undetected", "engine_name": "Rising", 
"engine_update": "20200322", "engine_version": "25.0.0.24", "method": "blacklist", "result": null }, "SUPERAntiSpyware": { "category": "undetected", "engine_name": "SUPERAntiSpyware", "engine_update": "20200317", "engine_version": "5.6.0.1032", "method": "blacklist", "result": null }, "Sangfor": { "category": "undetected", "engine_name": "Sangfor", "engine_update": "20200320", "engine_version": "1.0", "method": "blacklist", "result": null }, "SentinelOne": { "category": "type-unsupported", "engine_name": "SentinelOne", "engine_update": "20200220", "engine_version": "2.0.0.2603", "method": "blacklist", "result": null }, "Sophos": { "category": "undetected", "engine_name": "Sophos", "engine_update": "20200323", "engine_version": "4.98.0", "method": "blacklist", "result": null }, "Symantec": { "category": "malicious", "engine_name": "Symantec", "engine_update": "20200322", "engine_version": "1.11.0.0", "method": "blacklist", "result": "Trojan.Mdropper" }, "SymantecMobileInsight": { "category": "type-unsupported", "engine_name": "SymantecMobileInsight", "engine_update": "20200210", "engine_version": "2.0", "method": "blacklist", "result": null }, "TACHYON": { "category": "malicious", "engine_name": "TACHYON", "engine_update": "20200323", "engine_version": "2020-03-23.01", "method": "blacklist", "result": "Trojan-Exploit/RTF.CVE-2018-0798" }, "Tencent": { "category": "malicious", "engine_name": "Tencent", "engine_update": "20200323", "engine_version": "1.0.0.1", "method": "blacklist", "result": "Win32.Exploit.Cve-2018-0802.Sxen" }, "Trapmine": { "category": "type-unsupported", "engine_name": "Trapmine", "engine_update": "20200123", "engine_version": "3.2.22.914", "method": "blacklist", "result": null }, "TrendMicro": { "category": "malicious", "engine_name": "TrendMicro", "engine_update": "20200323", "engine_version": "11.0.0.1006", "method": "blacklist", "result": "TROJ_FRS.VSNTCH20" }, "TrendMicro-HouseCall": { "category": "malicious", "engine_name": 
"TrendMicro-HouseCall", "engine_update": "20200323", "engine_version": "10.0.0.1040", "method": "blacklist", "result": "TROJ_FRS.VSNTCH20" }, "Trustlook": { "category": "type-unsupported", "engine_name": "Trustlook", "engine_update": "20200323", "engine_version": "1.0", "method": "blacklist", "result": null }, "VBA32": { "category": "undetected", "engine_name": "VBA32", "engine_update": "20200320", "engine_version": "4.3.0", "method": "blacklist", "result": null }, "VIPRE": { "category": "undetected", "engine_name": "VIPRE", "engine_update": "20200323", "engine_version": "82430", "method": "blacklist", "result": null }, "ViRobot": { "category": "undetected", "engine_name": "ViRobot", "engine_update": "20200323", "engine_version": "2014.3.20.0", "method": "blacklist", "result": null }, "Webroot": { "category": "type-unsupported", "engine_name": "Webroot", "engine_update": "20200323", "engine_version": "1.0.0.403", "method": "blacklist", "result": null }, "Yandex": { "category": "malicious", "engine_name": "Yandex", "engine_update": "20200320", "engine_version": "5.5.2.24", "method": "blacklist", "result": "Trojan.ARicher.bSxJ5m" }, "Zillya": { "category": "undetected", "engine_name": "Zillya", "engine_update": "20200320", "engine_version": "2.0.0.4051", "method": "blacklist", "result": null }, "ZoneAlarm": { "category": "malicious", "engine_name": "ZoneAlarm", "engine_update": "20200323", "engine_version": "1.0", "method": "blacklist", "result": "HEUR:Exploit.RTF.CVE-2018-0802.gen" }, "Zoner": { "category": "malicious", "engine_name": "Zoner", "engine_update": "20200323", "engine_version": "1.0.0.1", "method": "blacklist", "result": "Probably RTFObfuscationD" }, "eGambit": { "category": "type-unsupported", "engine_name": "eGambit", "engine_update": "20200323", "engine_version": null, "method": "blacklist", "result": null } }, "last_analysis_stats": { "confirmed-timeout": 0, "failure": 0, "harmless": 0, "malicious": 35, "suspicious": 0, "timeout": 0, 
"type-unsupported": 15, "undetected": 24 }, "last_modification_date": 1584939782, "last_submission_date": 1584418873, "magic": "Rich Text Format data, version 1, unknown character set", "md5": "5e31d16d6bf35ea117d6d2c4d42ea879", "meaningful_name": "President discusses budget savings due to coronavirus with Finance Minister.rtf", "names": [ "President discusses budget savings due to coronavirus with Finance Minister.rtf" ], "reputation": 0, "rtf_info": { "document_properties": { "custom_xml_data_properties": 1, "default_ansi_codepage": "Simplified Chinese", "default_character_set": "ANSI", "default_languages": [ "English - United States", "Arabic - Saudi Arabia", "Chinese - People's Republic of China" ], "dos_stubs": 0, "embedded_drawings": 0, "embedded_pictures": 0, "longest_hex_string": 508408, "non_ascii_characters": 0, "objects": [ { "class": null, "type": "OLE embedded" }, { "class": null, "type": "OLE control" } ], "read_only_protection": false, "rtf_header": "rtf1", "user_protection": false }, "summary_info": { "author": "Tatyana", "creation_time": "2020-03-16 22:31:00", "editing_time": 1, "number_of_characters": 1896, "number_of_non_whitespace_characters": 2224, "number_of_pages": 1, "number_of_words": 332, "operator": "apcach E", "revision_time": "2020-03-16 22:31:00", "version": 2, "version_number": 57433 } }, "sha1": "f8fb81d0a0acf5815190e1c85d937e49bc1dfec7", "sha256": "1527f7b9bdea7752f72ffcd8b0a97e9f05092fed2cb9909a463e5775e12bd2d6", "size": 574379, "ssdeep": "12288:v2D2vD2k+tSycQFfJyLhWr95EWV9kFyTDDpRosvcVdwA0:OD2vD2k+tcQFfyhWr95EFF+LosvOwF", "tags": [ "ole-embedded", "rtf", "cve-2017-11882", "cve-2012-0158", "ole-control", "exploit", "cve-2018-0802", "cve-2018-0798" ], "times_submitted": 1, "total_votes": { "harmless": 0, "malicious": 0 }, "trid": [ { "file_type": "file seems to be plain text/ASCII", "probability": 0.0 } ], "type_description": "Rich Text Format", "type_tag": "rtf", "unique_sources": 1, "vhash": "8596f9f7a194270fb5b3a2677abd4de52" }, 
"id": "1527f7b9bdea7752f72ffcd8b0a97e9f05092fed2cb9909a463e5775e12bd2d6", "links": { "self": "https://www.virustotal.com/api/v3/files/1527f7b9bdea7752f72ffcd8b0a97e9f05092fed2cb9909a463e5775e12bd2d6" }, "type": "file" } }