From 652ef7bc0c8e0fe9d27d4d690dade23335d836dd Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 11:51:00 -0600
Subject: [PATCH 01/36] add setup.py and a better .gitignore

---
 .gitignore | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 setup.py   |  17 +++++++
 2 files changed, 158 insertions(+), 1 deletion(-)
 create mode 100644 setup.py

diff --git a/.gitignore b/.gitignore
index 0d20b64..4b38bd9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,141 @@
-*.pyc
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+.idea/
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..d87b69e
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,17 @@
+from setuptools import find_packages, setup
+
+
+setup(
+    name='AVClass',
+    version='0.0.1',
+    description='Tag and label malware samples',
+    license='LICENSE',
+    packages=find_packages(),
+    install_requires=[],
+    setup_requires=[
+        'pytest-runner',
+    ],
+    tests_require=[
+        'pytest',
+    ],
+)

From 107bb8d1c2f4b3fb8a32c7bcb81612ae749c5534 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 12:20:10 -0600
Subject: [PATCH 02/36] implement package structure

---
 avclass/README.md                             | 450 +++++--------
 avclass/__init__.py                           |   0
 avclass/avclass_alias_detect.py               |  89 ---
 avclass/avclass_generic_detect.py             |  83 ---
 avclass/avclass_labeler.py                    | 459 -------------
 avclass/data/default.aliases                  | 559 ---------------
 avclass/data/default.generics                 | 418 ------------
 .../input_checker.py                          |  12 +-
 .../avclass2_labeler.py => avclass/labeler.py |  22 +-
 avclass/lib/avclass_common.py                 | 337 ----------
 .../update.py                                 |  28 +-
 avclass2/README.md                            | 261 -------
 avclass2/lib/avclass2_common.py               | 636 ------------------
 {avclass2/data => data}/andropup.expansion    |   0
 {avclass2/data => data}/default.expansion     |   0
 {avclass2/data => data}/default.tagging       |   0
 {avclass2/data => data}/default.taxonomy      |   0
 setup.py                                      |   2 +-
 shared/evaluate_clustering.py                 | 140 ----
 19 files changed, 178 insertions(+), 3318 deletions(-)
 create mode 100644 avclass/__init__.py
 delete mode 100755 avclass/avclass_alias_detect.py
 delete mode 100755 avclass/avclass_generic_detect.py
 delete mode 100755 avclass/avclass_labeler.py
 delete mode 100644 avclass/data/default.aliases
 delete mode 100644 avclass/data/default.generics
 rename avclass2/avclass2_input_checker.py => avclass/input_checker.py (86%)
 rename avclass2/avclass2_labeler.py => avclass/labeler.py (98%)
 delete mode 100755 avclass/lib/avclass_common.py
 rename avclass2/avclass2_update_module.py => avclass/update.py (97%)
 delete mode 100644 avclass2/README.md
 delete mode 100755 avclass2/lib/avclass2_common.py
 rename {avclass2/data => data}/andropup.expansion (100%)
 rename {avclass2/data => data}/default.expansion (100%)
 rename {avclass2/data => data}/default.tagging (100%)
 rename {avclass2/data => data}/default.taxonomy (100%)
 delete mode 100755 shared/evaluate_clustering.py

diff --git a/avclass/README.md b/avclass/README.md
index 07fb2ec..83dfaad 100644
--- a/avclass/README.md
+++ b/avclass/README.md
@@ -1,93 +1,101 @@
-# AVClass
-
-AVClass is a malware labeling tool.
-
-You give it as input the AV labels for a large number of 
-malware samples (e.g., VirusTotal JSON reports) and it outputs the most 
-likely family name for each sample that it can extract from the AV labels. 
-It can also output a ranking of all alternative names it found for each sample.
-
-The design and evaluation of AVClass is detailed in our 
-[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf):
-
-> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. 
-AVClass: A Tool for Massive Malware Labeling. 
-In Proceedings of the International Symposium on Research in 
-Attacks, Intrusions and Defenses,
-September 2016.
-
-In a nutshell, AVClass comprises two phases: 
-preparation (optional) and labeling.
-Code for both is included, 
-but most users will be only interested in the labeling, which outputs the 
-family name for the samples. 
-The preparation produces a list of aliases and generic tokens 
-used by the labeling. 
-If you use our default aliases and generic tokens lists, 
-you do not need to run the preparation.
-
-
-## Labeling 
-   
-The labeler takes as input 
-a JSON file with the AV labels of malware samples (-vt or -lb options), 
-a file with generic tokens (-gen option), 
-and a file with aliases (-alias option). 
-It outputs the most likely family name for each sample.
-If you do not provide alias or generic tokens files, 
-the default ones in the *data* folder are used.
+# AVClass2
+
+AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
+
+You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports)
+and it outputs tags observed in the AV labels, ranked by decreasing popularity. 
+
+The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper.
+
+> Silvia Sebastián, Juan Caballero. 
+AVClass2: Massive Malware Tag Extraction from AV Labels. 
+In Proceedings of the Annual Computer Security Applications Conference, December 2020.
+
+In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module.
+
+
+## Labeling
+
+The labeler takes as input a JSON file with the AV labels of malware samples 
+(-vt or -lb options), 
+a file with the taxonomy (-tax option), 
+a file with tagging rules (-tag option), and
+a file with expansion rules (-exp option). 
+It outputs a set of ranked tags. 
+If you do not provide taxonomy, expansion or tagging files, 
+the default ones in the data folder are used.
 
 ```shell
-$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v > malheurReference.labels
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json
 ```
-  
-The above command labels the samples whose AV labels are in the 
-*../examples/malheurReference_lb.json* file.
-It prints the results to stdout, 
-which we redirect to the *malheurReference.labels* file.
-The output looks like this:
+
+The above command labels the samples whose AV labels are in 
+the ../examples/malheurReference_lb.json file. 
+It prints the results to stdout. 
+The output looks like this: 
 
 ```
-aca2d12934935b070df8f50e06a20539 adrotator
-67d15459e1f85898851148511c86d88d adultbrowser
+aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2
+67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2
 ```
 
-which means sample aca2d12934935b070df8f50e06a20539 is most likely 
-from the *adrotator* family and 
-67d15459e1f85898851148511c86d88d from the *adultbrowser* family.
+which means sample *aca2d12934935b070df8f50e06a20539* 
+was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, 
+8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, 
+3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family.
+Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
+consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
 
-The verbose (-v) option makes it output an extra 
-*malheurReference_lb.verbose* file
-with all families extracted for each sample ranked by the number of AV 
-engines that use that family.
-The file looks like this:
+The -p option outputs the full path of each tag in the taxonomy: 
 
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p
 ```
-aca2d12934935b070df8f50e06a20539  [(u'adrotator', 8), (u'zlob', 2)]
-ee90a64fcfaa54a314a7b5bfe9b57357  [(u'swizzor', 19)]
-f465a2c1b852373c72a1ccd161fbe94c  SINGLETON:f465a2c1b852373c72a1ccd161fbe94c
+
+The above command line outputs:
+
+```
+aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2
+67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2
+```
+
+where each tag has been replaced by its taxonomy path, which starts with the category in capitals, 
+followed by the path in the category (if any), and the tag itself, all separated by colons. 
+For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, 
+*CLASS:grayware* that *grayware* is a malware class, and 
+*CLASS:grayware:adware* that *adware* is a subclass of *grayware*.
+
+**Compatibility mode**
+
+The compatibility -c option makes AVClass2 output the same format as AVClass. 
+
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c
 ```
 
-which means that for sample aca2d12934935b070df8f50e06a20539 
-there are 8 AV engines assigning *adrotator* as the family and  
-another 2 assigning *zlob*.
-Thus, *adrotator* is the most likely family.
-On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV 
-engines assigning *swizzor* as family, 
-and no other family was found.
-The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c
-no family name was found in the AV labels. 
-Thus, the sample is placed by himself in a singleton cluster 
-with the name of the cluster being the sample's hash.
-
-Note that the sum of the number of AV engines may not equal the number 
-of AV engines with a label for that sample in the input file 
-because the labels of some AV engines may only include generic tokens 
-that are removed by AVClass.
+outputs:
+
+```
+bb23e1d296cf01bbaf32ed3938f9b0b8 allaple
+cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349
+```
+
+As in AVClass, the output contains only the family name, 
+which corresponds to the highest ranked family tag; all other tags are ignored.
+Samples for which a family cannot be obtained are labeled as singletons with their hash.
+ 
+It is important to note that AVClass2 compatibility mode results can differ from AVClass results
+on the same input file.
+The differences in family names are due to differences between the generics and aliases files 
+used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. 
+In the future, we may change AVClass to use the taxonomy and rules from AVClass2 
+as input (instead of the generics and aliases files) 
+to minimize such differences and avoid maintaining different data files.
+
 
 ## Input JSON format
 
-AVClass supports three input JSON formats: 
+AVClass2 supports three input JSON formats:
 
 1. VirusTotal v2 API JSON reports (*-vt file*), 
 where each line in the input *file* should be the full JSON of a 
@@ -109,141 +117,53 @@ There is an example of such input file in *examples/malheurReference_lb.json*
 
 **Multiple input files**
 
-AVClass can handle multiple input files putting the results in the same output files 
+AVClass2 can handle multiple input files putting the results in the same output files 
 (if you want results in separate files, process each input file separately).
 
 It is possible to provide the -vt and -lb input options multiple times.
 
 ```shell
-$./avclass_labeler.py -vt <file1> -vt <file2>
+$./avclass2_labeler.py -vt <file1> -vt <file2>
 ```
 ```shell
-$./avclass_labeler.py -lb <file1> -lb <file2>
+$./avclass2_labeler.py -lb <file1> -lb <file2>
 ```
 
 There are also -vtdir and -lbdir options that can be used to provide 
 an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports:
 
 ```shell
-$./avclass_labeler.py -vtdir <directory>
+$./avclass2_labeler.py -vtdir <directory>
 ```
 
 It is also possible to combine -vt with -vtdir and -lb with -lbdir, 
 but you cannot combine input files of different format. Thus, this command works:
 
 ```shell
-$./avclass_labeler.py -vt <file> -vtdir <directory>
+$./avclass2_labeler.py -vt <file> -vtdir <directory>
 ```
 
 But, this one throws an error:
 
 ```shell
-$./avclass_labeler.py -vt <file1> -lb <file2>
-```
-
-## Labeling: Family Ranking
-
-AVClass has a -fam option to output a file with a ranking of the 
-families assigned to the input samples. 
-
-```shell
-$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -fam > malheurReference.labels
-```
-
-will produce a file called *malheurReference_lb.families* with two columns:
-
-```
-virut 441
-allaple 301
-podnuha 300
-```
-
-indicating that 441 samples were classified in the virut family, 
-301 as allaple, and 300 as podnuha.
-
-This option is very similar to using the following shell command:
-
-```shell
-$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr
+$./avclass2_labeler.py -vt <file1> -lb <file2>
 ```
 
-The main difference is that using the -fam option all SINGLETON samples, 
-i.e., those for which no label was found, 
-are grouped into a fake *SINGLETONS* family, 
-while the shell command would leave each singleton as a separate family.
-
-
-## Labeling: PUP Classification
-
-AVClass also has a -pup option to classify a sample as
-Potentially Unwanted Program (PUP) or malware.
-This classification looks for PUP-related keywords
-(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our
-[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf):
-
-> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero.
-Certified PUP: Abuse in Authenticode Code Signing.
-In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015
-
-```shell
-$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup > malheurReference.labels
-```
-
-With the -pup option the output of the *malheurReference.labels* file
-looks like this:
-
-```
-aca2d12934935b070df8f50e06a20539 adrotator 1
-67d15459e1f85898851148511c86d88d adultbrowser 0
-```
-
-The digit at the end is a Boolean flag that 
-indicates sample aca2d12934935b070df8f50e06a20539 is
-(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not.
-
-In our experience the PUP classification is conservative,
-i.e., if it says the sample is PUP, it most likely is.
-But, if it says that it is not PUP, it could still be PUP if the AV labels
-do not contain PUP-related keywords.
-Note that it is possible that some samples from a family get 
-the PUP flag while other samples from the same family do not
-because the PUP-related keywords may not appear in the labels of 
-all samples from the same family. 
-To address this issue, you can combine the -pup option with the -fam option.
-This combination will add into the families file the classification of the 
-family as malware or PUP, based on a majority vote among the samples in a 
-family.
-
-```shell
-$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup -fam > malheurReference.labels
-```
-
-will produce a file called *malheurReference_lb.families* with five columns:
-
-```
-# Family  Total Malware PUP FamType
-virut 441 441 0 malware
-magiccasino 173 0 173 pup
-ejik  168 124 44  malware
-```
-
-For virut, the numbers indicate all the 441 virut samples are classified 
-as malware, and thus the last column states that virut is a malware family. 
-For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP.
-For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, 
-so the family is classified as malware.
-
+At this point you have read the most important information on how to use AVClass2. 
+The following sections describe steps that most users will not need.
 
 ## Labeling: Ground Truth Evaluation
 
-If you have ground truth for some malware samples, 
-i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that 
-ground truth.
-The evaluation metrics used are precision, recall, and F1 measure.
-See our RAID 2016 paper above for their definition.
+If you have family ground truth for some malware samples, i.e., 
+you know the true family for those samples, you can evaluate the accuracy 
+of the family tags output by AVClass2 on those samples with respect to that ground truth. 
+The evaluation metrics used are precision, recall, and F1 measure. 
+See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition.
+Note that the ground truth evaluation does not apply to non-family tags, 
+i.e., it only evaluates the output of the compatibility mode.
 
 ```shell
-$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -gt ../examples/malheurReference_gt.tsv -eval > malheurReference.labels
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels
 ```
 
 The output includes these lines:
@@ -254,148 +174,88 @@ Calculating precision and recall
 Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
 ```
 
-The last line corresponds to the accuracy metrics obtained by 
-comparing AVClass results with the provided ground truth.
-
-Each line in the *../examples/malheurReference_gt.tsv* file has 
-two **tab-separated** columns:
+Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns:
 
 ```
-0058780b175c3ce5e244f595951f611b8a24bee2 CASINO
+aca2d12934935b070df8f50e06a20539 ADROTATOR
 ```
 
-which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 
-is known to be of the *CASINO* family.
-Each sample in the input file should also appear in the ground truth file.
+which indicates that sample aca2d12934935b070df8f50e06a20539 is known 
+to be of the *ADROTATOR* family. 
+Each sample in the input file should also appear in the ground truth file. 
 Note that the particular label assigned to each family does not matter. 
-What matters is that all samples in the same family are assigned the 
-same family name (i.e., the same string in the second column) 
+What matters is that all samples in the same family are assigned 
+the same family name (i.e., the same string in the second column).
 
-The ground truth can be obtained from publicly available malware 
-datasets. 
+The ground truth can be obtained from publicly available malware datasets. 
 The one in *../examples/malheurReference_gt.tsv* comes from the 
 [Malheur](http://www.mlsec.org/malheur/) dataset. 
 There are other public datasets with ground truth such as 
 [Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
 [Malicia](http://malicia-project.com/dataset.html).
 
+## Update Module
+
+The update module can be used to suggest additions and changes to the input 
+taxonomy, tagging rules, and expansion rules. 
+Using the update module comprises two steps.
+The first step is obtaining an alias file from the labeler:
 
-## Preparation: Generic Token Detection
+```shell
+$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect
+```
+
+The above command will create a file named \<file\>.alias, 
+malheurReference_lb.alias in our example. This file has 7 columns:
 
-The labeling takes as input a file with generic tokens that should be 
-ignored in the AV labels, e.g., trojan, virus, generic, linux.
-By default, the labeling uses the *data/default.generics* 
-generic tokens file.
-You can edit that file to add additional generic tokens you feel 
-we are missing.
+1. t1: token that is an alias
+2. t2: tag for which t1 is an alias
+3. |t1|: number of input samples where t1 was observed
+4. |t2|: number of input samples where t2 was observed
+5. |t1^t2|: number of input samples where both t1 and t2 were observed
+6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed.
+7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed.
 
-In our RAID 2016 paper we describe an automatic approach to 
-identify generic tokens, which **requires ground truth**, 
-i.e., it requires knowing the true family for each input sample.
-Not only that, but **the ground truth should be large**, 
-i.e., contain at least one hundred thousand samples. 
-In our work we identified generic tokens using as ground truth 
-the concatenation of all datasets for which we had ground truth.
-This requirement of a large ground truth dataset is why we expect most users 
-will skip this step and simply use our provided default file.
 
-If you want to test generic token detection you can do:
+The Update Module takes the above file as input with the -alias option, 
+as well as the default taxonomy, tagging, and expansion files in the data directory. 
+It outputs updated taxonomy, tagging, and expansion files that include the 
+suggested additions and changes. 
 
 ```shell
- $./avclass_generic_detect.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv -tgen 10 > malheurReference.gen 
+$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix
 ```
 
-Each line in the *../examples/malheurReference_gt.tsv* file has 
-two **tab-separated** columns:
+This will produce three files: 
+output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. 
+You can diff the output and input files to analyze the proposed changes.
 
-```
-0058780b175c3ce5e244f595951f611b8a24bee2 CASINO
-```
+You can also modify the input taxonomy, tagging, and expansion rules in place, 
+rather than producing new files:
 
-which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 
-is known to be of the *CASINO* family.
-
-The *-tgen 10* option is a threshold for the minimum number of families 
-where a token has to be observed to be considered generic. 
-If the option is ommitted, the default threshold of 8 is used.
-
-The above command outputs two files: 
-*malheurReference.gen* and *malheurReference_lb.gen*. 
-Each of them has 2 columns: token and number of families where the token 
-was observed.
-File *malheurReference.gen* is the final output with the detected 
-generic tokens for which the number of families is above 
-the given threshold. 
-The file *malheurReference_lb.gen* has this information for all tokens.
-Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. 
-
-However, note that in the above command you are trying to identify generic 
-tokens from a small dataset since Drebin only contains 3K labeled samples. 
-Thus, *malheurReference.gen* only contains 25 identified generic tokens. 
-Using those 25 generic tokens will produce significantly worse results 
-than using the generic tokens in *data/default.generics*. 
-For more details you can refer to our RAID 2016 paper.
-
-
-## Preparation: Alias Detection
-
-Different vendors may assign different names (i.e., aliases) for the same
-family. For example, some vendors may use *zeus* and others *zbot* 
-as aliases for the same malware family. 
-The labeling takes as input a file with aliases that should be merged.
-By default, the labeling uses the *data/default.aliases* aliases file.
-You can edit that file to add additional aliases you feel we are missing.
-
-In our RAID 2016 paper we describe an automatic approach 
-to identify aliases.
-Our alias detection approach 
-**requires as input the AV labels for large set of samples**, 
-e.g., several million samples. 
-In contrast with the generic token detection, the input samples for 
-alias detection **do not need to be labeled**, 
-i.e., no need to know their family.
-In our work we identified aliases using as input the largest of our 
-unlabeled datasets, which contained nearly 8M samples. 
-This requirement of a large input dataset is why we expect most users
-will skip this step and simply use our provided default file.
-
-If you want to test alias detection you can do:
 
 ```shell
-$./avclass_alias_detect.py -lb ../examples/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases
+$./avclass2_update_module.py -alias malheurReference_lb.alias -update
 ```
 
-The -nalias threshold provides the minimum number of samples two tokens 
-need to be observed in to be considered aliases. 
-If the option is not provided the default is 20.
 
-The -talias threshold provides the minimum fraction of times that 
-the samples appear together.
-If the  is not provided the default is 0.94 (94%).
+## Customizing AVClass2
 
-The above command outputs two files:
-*malheurReference.aliases* and *malheurReference_lb.alias*.
-Each of them has 6 columns: 
-1. t1: token that is an alias
-2. t2: family for which t1 is an alias
-3. |t1|: number of input samples where t1 was observed
-4. |t2|: number of input samples where t2 was observed
-5. |t1^t2|: number of input samples where both t1 and t2 were observed
-6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 
-were observed over the number of input samples where t1 was observed.
-
-File *malheurReference.aliases* is the final output with the 
-detected aliases that satisfy the -nalias and -talias thresholds.
-The file *malheurReference_lb.alias* has this information for all tokens.
-Thus, *malheurReference.aliases* is a subset 
-of *malheurReference_lb.alias*.
-
-However, note that in the above command you are trying to identify aliases
-from a small dataset since Drebin only contains 3K samples.
-Thus, *malheurReference.aliases* only contains 6 identified aliases. 
-Using those 6 aliases will produce significantly worse results than using 
-the aliases in *data/default.aliases*.
-As mentioned, to improve the identified aliases you should provide as 
-input several million samples.
-For more details you can refer to our RAID 2016 paper.
+AVClass2 is fully customizable: 
+Tagging, Expansion and Taxonomy files can be easily modified by the analyst 
+either manually or by running the update module. 
+
+If you change those files manually, we recommend running 
+afterwards the input checker script to keep them tidy. 
+It sorts the tags in the taxonomy and performs some basic cleaning like 
+removing redundant entries:
+
+```shell
+$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansion_file
+```
+
+If the modifications are in the default files in the data directory you can simply run: 
 
+```shell
+$./avclass2_input_checker.py 
+```
diff --git a/avclass/__init__.py b/avclass/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/avclass/avclass_alias_detect.py b/avclass/avclass_alias_detect.py
deleted file mode 100755
index 6624d97..0000000
--- a/avclass/avclass_alias_detect.py
+++ /dev/null
@@ -1,89 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-'''
-AVClass Alias detect
-'''
-import sys
-import argparse
-import subprocess
-import os
-
-
-def main(args):
-    # Set input switch
-    itype = '-vt' if args.vt else '-lb'
-    ifile = args.vt if args.vt else args.lb
-
-    # Set generic tokens file if provided
-    gen_switch = "-gen " + args.gen if args.gen else ""
-    sys.stderr.write('Switch: %s\n' % (gen_switch))
-
-    # Run avclass_labeler
-    sys.stderr.write('[-] Running avclass_labeler on %s\n' % (ifile))
-    FNULL = open(os.devnull, 'w')
-    labeler = subprocess.Popen(\
-       "python avclass_labeler.py %s %s %s -alias /dev/null -aliasdetect" %
-       (itype, ifile, gen_switch), shell=True, stdout=FNULL)
-    labeler.wait()
-
-    # Process alias file
-    sys.stderr.write('[-] Processing token pairs.\n')
-    alias_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.alias'
-    with open(alias_fname, 'r') as fr:
-        for pos, line in enumerate(fr):
-            cline = line.strip('\n')
-            # Print headers
-            if not pos:
-                sys.stdout.write("%s\n" % cline)
-                continue
-            t1, t2, t1_num, t2_num, nalias_num, talias_num = cline.split('\t')
-            if int(nalias_num) > args.nalias and\
-              float(talias_num) > args.talias:
-                sys.stdout.write("%s\n" % cline)
-
-    # Done
-    sys.stderr.write('[-] Done.\n')
-
-
-if __name__=='__main__':
-    argparser = argparse.ArgumentParser(prog='avclass_alias_detect',
-        description='''Given a collection of VT reports it detects aliases
-        used by AVs. It runs the avclass_labeler with specific arguments
-        and processes the output.''')
-
-    argparser.add_argument('-vt',
-        help='file to parse with full VT reports '
-             '(REQUIRED if -lb argument not present)')
-
-    argparser.add_argument('-lb',
-        help='file to parse with subset of VT reports'
-             '{md5,sha1,sha256,scan_date,av_labels} '
-             '(REQUIRED if -vt not present)')
-
-    argparser.add_argument('-gen',
-        help='file with generic tokens.')
-
-    argparser.add_argument('-nalias',
-        help='Minimum number of times that a pair of tokes have been seen.'
-             'Default: 20',
-        type=int,
-        default = 20)
-
-    argparser.add_argument('-talias',
-        help='Minimum percentage of times two tokens appear together.'
-             'Default: 0.94',
-        type=float,
-        default = 0.94)
-
-    args = argparser.parse_args()
-
-    if not args.vt and not args.lb:
-        sys.stderr.write('Argument -vt or -lb is required\n')
-        exit(1)
-
-    if args.vt and args.lb:
-        sys.stderr.write('Use either -vt or -lb argument, not both.\n')
-        exit(1)
-
-    main(args)
-
diff --git a/avclass/avclass_generic_detect.py b/avclass/avclass_generic_detect.py
deleted file mode 100755
index cfdcaa8..0000000
--- a/avclass/avclass_generic_detect.py
+++ /dev/null
@@ -1,83 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-'''
-AVClass Generic detect
-'''
-import sys
-import argparse
-import subprocess
-import os
-
-
-def main(args):
-    # Set input switch
-    itype = '-vt' if args.vt else '-lb'
-    ifile = args.vt if args.vt else args.lb
-
-    # Run avclass_labeler
-    sys.stderr.write('[-] Running avclass_labeler on %s\n' % (ifile))
-    FNULL = open(os.devnull, 'w')
-    labeler = subprocess.Popen(\
-       "python avclass_labeler.py %s %s -alias /dev/null"\
-       " -gen /dev/null -gendetect -gt %s" % 
-       (itype, ifile, args.gt), shell=True, stdout=FNULL)
-    labeler.wait()
-
-    # Process generic tokens file
-    sys.stderr.write('[-] Processing results.\n')
-    gen_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.gen'
-    with open(gen_fname, 'r') as fr:
-        for pos, line in enumerate(fr):
-            cline = line.strip('\n')
-            # Print headers
-            if not pos:
-                sys.stdout.write("%s\n" % cline)
-                continue
-            token, fam_num = cline.split('\t')
-            if int(fam_num) > args.tgen:
-                sys.stdout.write("%s\n" % cline)
-
-    # Done
-    sys.stderr.write('[-] Done.\n')
-
-
-if __name__=='__main__':
-    argparser = argparse.ArgumentParser(prog='avclass_alias_detect',
-        description='''Given a collection of VT reports and the family
-        names of these samples (i.e., groundtruth) it generates a list
-        of generic tokens to be excluded from labeling.''')
-
-    argparser.add_argument('-vt',
-        help='file to parse with full VT reports '
-             '(REQUIRED if -lb argument not present)')
-
-    argparser.add_argument('-lb',
-        help='file to parse with subset of VT reports'
-             '{md5,sha1,sha256,scan_date,av_labels} '
-             '(REQUIRED if -vt not present)')
-
-    argparser.add_argument('-tgen',
-        help='Minimum number of families that a token appears. '
-             'Default: 8',
-        type=int,
-        default = 8)
-
-    argparser.add_argument('-gt',
-        help='file with ground truth')
-
-    args = argparser.parse_args()
-
-    if not args.vt and not args.lb:
-        sys.stderr.write('Argument -vt or -lb is required\n')
-        exit(1)
-
-    if args.vt and args.lb:
-        sys.stderr.write('Use either -vt or -lb argument, not both.\n')
-        exit(1)
-
-    if not args.gt:
-        sys.stderr.write('Generic token detection needs groundtruth (-gt)\n')
-        exit(1)
-
-    main(args)
-
diff --git a/avclass/avclass_labeler.py b/avclass/avclass_labeler.py
deleted file mode 100755
index 21ff9b5..0000000
--- a/avclass/avclass_labeler.py
+++ /dev/null
@@ -1,459 +0,0 @@
-#!/usr/bin/env python
-'''
-AVClass labeler
-'''
-
-import os
-import sys
-path = os.path.dirname(os.path.abspath(__file__))
-libpath = os.path.join(path, 'lib/')
-sharedpath = os.path.join(path, '../shared/')
-sys.path.insert(1, libpath)
-sys.path.insert(1, sharedpath)
-import argparse
-from avclass_common import AvLabels
-from operator import itemgetter
-import evaluate_clustering as ec
-import json
-import traceback
-
-# Default alias file
-default_alias_file = os.path.join(path, "data/default.aliases")
-# Default generic tokens file
-default_gen_file = os.path.join(path, "data/default.generics")
-
-def guess_hash(h):
-    '''Given a hash string, guess the hash type based on the string length'''
-    hlen = len(h)
-    if hlen == 32:
-        return 'md5'
-    elif hlen == 40:
-        return 'sha1'
-    elif hlen == 64:
-        return 'sha256'
-    else:
-        return None
-
-def main(args):
-    # Select hash used to identify sample, by default MD5
-    hash_type = args.hash if args.hash else 'md5'
-
-    # If ground truth provided, read it from file
-    gt_dict = {}
-    if args.gt:
-        with open(args.gt, 'r') as gt_fd:
-            for line in gt_fd:
-                gt_hash, family = map(str.lower, line.strip().split('\t', 1))
-                gt_dict[gt_hash] = family
-
-        # Guess type of hash in ground truth file
-        hash_type = guess_hash(list(gt_dict.keys())[0])
-
-    # Create AvLabels object
-    av_labels = AvLabels(args.gen, args.alias, args.av)
-
-    # Build list of input files
-    # NOTE: duplicate input files are not removed
-    ifile_l = []
-    if (args.vt):
-        ifile_l += args.vt
-        ifile_are_vt = True
-    if (args.lb):
-        ifile_l += args.lb
-        ifile_are_vt = False
-    if (args.vtdir): 
-        ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)]
-        ifile_are_vt = True
-    if (args.lbdir):
-        ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)]
-        ifile_are_vt = False
-
-    # Select correct sample info extraction function
-    if not ifile_are_vt:
-        get_sample_info = av_labels.get_sample_info_lb
-    elif args.vt3:
-        get_sample_info = av_labels.get_sample_info_vt_v3
-    else:
-        get_sample_info = av_labels.get_sample_info_vt_v2
-
-    # Select output prefix
-    out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0])
-
-    # If verbose, open log file
-    if args.verbose:
-        log_filename = out_prefix + '.verbose'
-        verb_fd = open(log_filename, 'w+')
-
-    # Initialize state
-    first_token_dict = {}
-    token_count_map = {}
-    pair_count_map = {}
-    token_family_map = {}
-    fam_stats = {}
-    vt_all = 0
-    vt_empty = 0
-    singletons = 0
-
-    # Process each input file
-    for ifile in ifile_l:
-        # Open file
-        fd = open(ifile, 'r')
-
-        # Debug info, file processed
-        sys.stderr.write('[-] Processing input file %s\n' % ifile)
-
-        # Process all lines in file
-        for line in fd:
-
-            # If blank line, skip
-            if line == '\n':
-                continue
-
-            # Debug info
-            if vt_all % 100 == 0:
-                sys.stderr.write('\r[-] %d JSON read' % vt_all)
-                sys.stderr.flush()
-            vt_all += 1
-
-            # Read JSON line and extract sample info (i.e., hashes and labels)
-            vt_rep = json.loads(line)
-            sample_info = get_sample_info(vt_rep)
-            if sample_info is None:
-                try:
-                    name = vt_rep['md5']
-                    sys.stderr.write('\nNo AV labels for %s\n' % name)
-                except KeyError:
-                    sys.stderr.write('\nCould not process: %s\n' % line)
-                sys.stderr.flush()
-                vt_empty += 1
-                continue
-
-            # Sample's name is selected hash type (md5 by default)
-            name = getattr(sample_info, hash_type)
-
-            # If the VT report has no AV labels, continue
-            if not sample_info[3]:
-                vt_empty += 1
-                sys.stderr.write('\nNo AV labels for %s\n' % name)
-                sys.stderr.flush()
-                continue
-            
-            # Get the distinct tokens from all the av labels in the report
-            # And print them. If not verbose, print the first token.
-            # If verbose, print the whole list
-            try:
-                # Get distinct tokens from AV labels
-                tokens = list(av_labels.get_family_ranking(sample_info).items())
-
-                # If alias detection, populate maps
-                if args.aliasdetect:
-                    prev_tokens = set()
-                    for entry in tokens:
-                        curr_tok = entry[0]
-                        curr_count = token_count_map.get(curr_tok)
-                        if curr_count:
-                            token_count_map[curr_tok] = curr_count + 1
-                        else:
-                            token_count_map[curr_tok] = 1
-                        for prev_tok in prev_tokens:
-                            if prev_tok < curr_tok:
-                                pair = (prev_tok,curr_tok) 
-                            else: 
-                                pair = (curr_tok,prev_tok)
-                            pair_count = pair_count_map.get(pair)
-                            if pair_count:
-                                pair_count_map[pair] = pair_count + 1
-                            else:
-                                pair_count_map[pair] = 1
-                        prev_tokens.add(curr_tok)
-
-                # If generic token detection, populate map
-                if args.gendetect and args.gt:
-                    for entry in tokens:
-                        curr_tok = entry[0]
-                        curr_fam_set = token_family_map.get(curr_tok)
-                        family = gt_dict[name] if name in gt_dict else None
-                        if curr_fam_set and family:
-                            curr_fam_set.add(family)
-                        elif family:
-                            token_family_map[curr_tok] = set(family)
-
-                # Top candidate is most likely family name
-                if tokens:
-                    family = tokens[0][0]
-                    is_singleton = False
-                else:
-                    family = "SINGLETON:" + name
-                    is_singleton = True
-                    singletons += 1
-
-                # Check if sample is PUP, if requested
-                if args.pup:
-                    is_pup = av_labels.is_pup(sample_info[3])
-                    if is_pup:
-                        is_pup_str = "\t1"
-                    else:
-                        is_pup_str = "\t0"
-                else:
-                    is_pup = None
-                    is_pup_str =  ""
-
-                # Build family map for precision, recall, computation
-                first_token_dict[name] = family
-
-                # Get ground truth family, if available
-                if args.gt:
-                    gt_family = '\t' + gt_dict[name] if name in gt_dict else ""
-                else:
-                    gt_family = ""
-
-                # Print family (and ground truth if available) to stdout
-                sys.stdout.write('%s\t%s%s%s\n' % (name, family, gt_family, 
-                                                    is_pup_str))
-
-                # If verbose, print tokens (and ground truth if available) 
-                # to log file
-                if args.verbose:
-                    verb_fd.write('%s\t%s%s%s\n' % (
-                        name, tokens, gt_family, is_pup_str))
-
-                # Store family stats (if required)
-                if args.fam:
-                    if is_singleton:
-                        ff = 'SINGLETONS'
-                    else:
-                        ff = family
-                    try:
-                        numAll, numMal, numPup = fam_stats[ff]
-                    except KeyError:
-                        numAll = 0
-                        numMal = 0
-                        numPup = 0
-
-                    numAll += 1
-                    if args.pup:
-                        if is_pup:
-                            numPup += 1
-                        else:
-                            numMal += 1
-                    fam_stats[ff] = (numAll, numMal, numPup)
-
-            except:
-                traceback.print_exc(file=sys.stderr)
-                continue
-
-        # Debug info
-        sys.stderr.write('\r[-] %d JSON read' % vt_all)
-        sys.stderr.flush()
-        sys.stderr.write('\n')
-
-        # Close file
-        fd.close()
-
-    # Print statistics
-    sys.stderr.write(
-            "[-] Samples: %d NoLabels: %d Singletons: %d "
-            "GroundTruth: %d\n" % (
-                vt_all, vt_empty, singletons, len(gt_dict)))
-
-    # If ground truth, print precision, recall, and F1-measure
-    if args.gt and args.eval:
-        precision, recall, fmeasure = \
-                    ec.eval_precision_recall_fmeasure(gt_dict,
-                                                      first_token_dict)
-        sys.stderr.write( \
-            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \
-                          (precision, recall, fmeasure))
-
-    # If generic token detection, print map
-    if args.gendetect:
-        # Open generic tokens file
-        gen_filename = out_prefix + '.gen'
-        gen_fd = open(gen_filename, 'w+')
-        # Output header line
-        gen_fd.write("Token\t#Families\n")
-        sorted_pairs = sorted(token_family_map.items(), 
-                              key=lambda x: len(x[1]) if x[1] else 0, 
-                              reverse=True)
-        for (t,fset) in sorted_pairs:
-            gen_fd.write("%s\t%d\n" % (t, len(fset)))
-
-        # Close generic tokens file
-        gen_fd.close()
-        sys.stderr.write('[-] Generic token data in %s\n' % (gen_filename))
-
-    # If alias detection, print map
-    if args.aliasdetect:
-        # Open alias file
-        alias_filename = out_prefix + '.alias'
-        alias_fd = open(alias_filename, 'w+')
-        # Sort token pairs by number of times they appear together
-        sorted_pairs = sorted(
-                pair_count_map.items(), key=itemgetter(1))
-        # Output header line
-        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n")
-        # Compute token pair statistic and output to alias file
-        for (t1,t2),c in sorted_pairs:
-            n1 = token_count_map[t1]
-            n2 = token_count_map[t2]
-            if (n1 < n2):
-                x = t1
-                y = t2
-                xn = n1
-                yn = n2
-            else:
-                x = t2
-                y = t1
-                xn = n2
-                yn = n1
-            f = float(c) / float(xn)
-            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" % (
-                x,y,xn,yn,c,f))
-        # Close alias file
-        alias_fd.close()
-        sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))
-
-    # If family statistics, output to file
-    if args.fam:
-        # Open family file
-        fam_filename = out_prefix + '.families'
-        fam_fd = open(fam_filename, 'w+')
-        # Output header line
-        if args.pup:
-            fam_fd.write("# Family\tTotal\tMalware\tPUP\tFamType\n")
-        else:
-            fam_fd.write("# Family\tTotal\n")
-        # Sort map
-        sorted_pairs = sorted(fam_stats.items(), key=itemgetter(1),
-                              reverse=True)
-        # Print map contents
-        for (f,fstat) in sorted_pairs:
-            if args.pup:
-                if fstat[1] > fstat[2]:
-                    famType = "malware"
-                else:
-                    famType = "pup"
-                fam_fd.write("%s\t%d\t%d\t%d\t%s\n" % (f, fstat[0], fstat[1],
-                                                fstat[2], famType))
-            else:
-                fam_fd.write("%s\t%d\n" % (f, fstat[0]))
-        # Close file
-        fam_fd.close()
-        sys.stderr.write('[-] Family data in %s\n' % (fam_filename))
-
-    # Close log file
-    if args.verbose:
-        sys.stderr.write('[-] Verbose output in %s\n' % (log_filename))
-        verb_fd.close()
-
-
-
-if __name__=='__main__':
-    argparser = argparse.ArgumentParser(prog='avclass_labeler',
-        description='''Extracts the family of a set of samples.
-            Also calculates precision and recall if ground truth available''')
-
-    argparser.add_argument('-vt', action='append',
-        help='file with VT reports '
-             '(Can be provided multiple times)')
-
-    argparser.add_argument('-lb', action='append',
-        help='file with simplified JSON reports '
-             '{md5,sha1,sha256,scan_date,av_labels} '
-             '(Can be provided multiple times)')
-
-    argparser.add_argument('-vtdir',
-        help='existing directory with VT reports')
-
-    argparser.add_argument('-lbdir',
-        help='existing directory with simplified JSON reports')
-
-    argparser.add_argument('-gt',
-        help='file with ground truth')
-
-    argparser.add_argument('-eval',
-        action='store_true',
-        help='if used it evaluates clustering accuracy.'
-             ' Prints precision, recall, F1-measure. Requires -gt parameter')
-
-    argparser.add_argument('-alias',
-        help='file with aliases.',
-        default = default_alias_file)
-
-    argparser.add_argument('-gen',
-        help='file with generic tokens.',
-        default = default_gen_file)
-
-    argparser.add_argument('-av',
-        help='file with list of AVs to use')
-
-    argparser.add_argument('-pup',
-        action='store_true',
-        help='if used each sample is classified as PUP or not')
-
-    argparser.add_argument('-gendetect',
-        action='store_true',
-        help='if used produce generics file at end. Requires -gt parameter')
-
-    argparser.add_argument('-aliasdetect',
-        action='store_true',
-        help='if used produce aliases file at end')
-
-    argparser.add_argument('-v', '--verbose',
-        action='store_true',
-        help='output .verbose file with distinct tokens')
-
-    argparser.add_argument('-hash',
-        help='hash used to name samples. Should match ground truth',
-        choices=['md5', 'sha1', 'sha256'])
-
-    argparser.add_argument('-fam',
-        action='store_true',
-        help='if used produce families file with PUP/malware counts per family')
-
-    argparser.add_argument('-vt3', action='store_true',
-        help='input are VT v3 files')
-
-    args = argparser.parse_args()
-
-    if not args.vt and not args.lb and not args.vtdir and not args.lbdir:
-        sys.stderr.write('One of the following 4 arguments is required: '
-                          '-vt,-lb,-vtdir,-lbdir\n')
-        exit(1)
-
-    if (args.vt or args.vtdir) and (args.lb or args.lbdir):
-        sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. '
-                          'Both types of input files cannot be combined.\n')
-        exit(1)
-
-    if args.gendetect and not args.gt:
-        sys.stderr.write('Generic token detection requires -gt param\n')
-        exit(1)
-
-    if args.eval and not args.gt:
-        sys.stderr.write('Evaluating clustering accuracy needs -gt param\n')
-        exit(1)
-
-    if args.alias:
-        if args.alias == '/dev/null':
-            sys.stderr.write('[-] Using no aliases\n')
-            args.alias = None
-        else:
-            sys.stderr.write('[-] Using aliases in %s\n' % (
-                              args.alias))
-    else:
-        sys.stderr.write('[-] Using generic aliases in %s\n' % (
-                          default_alias_file))
-
-    if args.gen:
-        if args.gen == '/dev/null':
-            sys.stderr.write('[-] Using no generic tokens\n')
-            args.gen = None
-        else:
-            sys.stderr.write('[-] Using generic tokens in %s\n' % (
-                              args.gen))
-    else:
-        sys.stderr.write('[-] Using default generic tokens in %s\n' % (
-                          default_gen_file))
-        
-    main(args)
diff --git a/avclass/data/default.aliases b/avclass/data/default.aliases
deleted file mode 100644
index d9ed41c..0000000
--- a/avclass/data/default.aliases
+++ /dev/null
@@ -1,559 +0,0 @@
-oneclickdownload 1clickdownload
-4share 4shared
-getfaster 4shared
-activshop activshopper
-adgazele adgazelle
-smabo adialer
-dealcabby adpeak
-adswo adwo
-gaobot agobot
-airad airinstaller
-airadinstaller airinstaller
-airinstall airinstaller
-rahack allaple
-starman allaple
-almanahe alman
-kanav alyak
-adfltnet amonetize
-easydl amonetize
-filesearch amonetize
-imonetize amonetize
-armour androidarmour
-climap androrat
-arcparlor arcadeparlor
-badday badda
-bearshare bandoo
-ilivid bandoo
-koyotelab bandoo
-musictoolbar bandoo
-searchsuite bandoo
-seasuite bandoo
-torchmedia bandoo
-basebrid basebridge
-batteryd batterydoctor
-fakebattscar batterydoctor
-klezer beebone
-selfdel beebone
-kazaa benjamin
-qukart berbew
-padodor berbew
-bertle bertle
-bertlea bertle
-serbg bgserv
-midgare bifrose
-egbii biige
-widoman bmmedia
-bobic bobax
-boxersms boxer
-smsboxer boxer
-browsepulse browsefox
-dragonbranch browsefox
-expressfind browsefox
-glassbottle browsefox
-greatfind browsefox
-liteweb browsefox
-positivefinds browsefox
-recordpage browsefox
-rollaround browsefox
-salecharger browsefox
-strongsignal browsefox
-swiftbrowse browsefox
-wanderburst browsefox
-yontoo browsefox
-yotoon browsefox
-bundl bundlore
-installvibe bundlore
-buzb bzub
-desktoplightning cashon
-dowcen centim
-chinesehacker chir
-runonce chir
-runouce chir
-cinmeng cinmus
-clemag cleaman
-clientconnect conduit
-searchprotect conduit
-kucirc cosmu
-overdoom cosmu
-dalamodo cossta
-putalol couponmarvel
-crori crossrider
-geksone crytex
-hublo crytex
-cybota cycbot
-gbot cycbot
-goolbot cycbot
-cabby dalexis
-ctblocker dalexis
-elenoocka dalexis
-comet darkkomet
-cometsys darkkomet
-cometsystems darkkomet
-finloski darkkomet
-fynloski darkkomet
-krademok darkkomet
-montiera delbar
-cheval detroie
-detroi detroie
-detroia detroie
-eydrop dinwod
-directdown directdownloader
-indirect directdownloader
-zadved dlhelper
-dogbite dogowar
-dogwar dogowar
-rabidog dogowar
-domainiq domaiq
-domalq domaiq
-domlq domaiq
-payint domaiq
-tugspay domaiq
-downloadmin downloadadmin
-downloadasist downloadassistant
-downloaderguide downloadguide
-drdelux droiddeluxe
-ddlight droiddreamlight
-lightdd droiddreamlight
-fokonge droidkungfu
-kongfu droidkungfu
-kungfu droidkungfu
-ibashade drolnux
-dialpass egroupdial
-egroup egroupdial
-exedial egroupdial
-instantaccess egroupdial
-emud emudbot
-adwareeorezo eorezo
-getextension eorezo
-tuto4pc eorezo
-eqdrug equationdrug
-equation equationdrug
-xpiro expiro
-yourfiledownloader expressdownloader
-fakerecovery fakesysdef
-prodatect fakesysdef
-systemfix fakesysdef
-tepfer fareit
-farex fearso
-nofear fearso
-nofer fearso
-fenomen fenomengame
-fenomengamet fenomengame
-condestil firseria
-downloadmr firseria
-firser firseria
-firseriainstaller firseria
-fiseria firseria
-morstar firseria
-morstars firseria
-popeler firseria
-rapiddown firseria
-solimba firseria
-sventore firseria
-flyagent flystudio
-flystud flystudio
-cobbler focobers
-cobblerone focobers
-cudos fosniw
-regie fosniw
-winsoft fosniw
-emerleox fujacks
-fujack fujacks
-whboy fujacks
-gaba gabpath
-androm gamarue
-andromeda gamarue
-bundpil gamarue
-debris gamarue
-dromedan gamarue
-lilu gamarue
-wauchos gamarue
-arcadeparlor gamevance
-arcadeweb gamevance
-epicgames gamevance
-epicplay gamevance
-gamevancecs gamevance
-gvance gamevance
-rivalgame gamevance
-juched ganelp
-waps gappusin
-wapsx gappusin
-geimini geinimi
-geinim geinimi
-kernelpatch geral
-livesoft getnow
-livesoftaction getnow
-frogonal ginmaster
-gingermaster ginmaster
-gmaster ginmaster
-ghostbot gobot
-gdream golddream
-glodream golddream
-gprice gorillaprice
-spysheriff harnig
-helldoor hilldoor
-hippo hipposms
-hipsmser hipposms
-hispo hipposms
-banach hotbar
-clickpotato hotbar
-clkpotato hotbar
-pinball hotbar
-rugo hotbar
-screensaver hotbar
-zango hotbar
-freepds hotclip
-huigezi hupigon
-pigeon hupigon
-optimum ibryte
-optimuminstall ibryte
-optimuminstaller ibryte
-optinstall ibryte
-optiuminstaller ibryte
-ickboy icekboy
-iceboy icekboy
-installcube icloader
-iconos iconosys
-iconosis iconosys
-inboxtoolbar inbox
-dowins inservice
-inservc inservice
-braininst installbrain
-brantall installbrain
-ibrain installbrain
-clickrun installcore
-clickrunsoftware installcore
-cryptinno installcore
-installco installcore
-installrex installerex
-sneakytrail installerex
-tdownloader installerex
-tsuploader installerex
-webpick installerex
-installq installiq
-installmet installmetrix
-instmonetizer installmonetizer
-installmon installmonster
-installmonst installmonster
-installmonstr installmonster
-monstruos installmonster
-tovkater installmonster
-intex intexdial
-intexus intexdial
-neteyes ipamor
-mswdm ipamor
-amorba ipamor
-hidrag jeefo
-jackpos jinupd
-plosa karagany
-xtoober karagany
-kgbkeylogger kgbspy
-elkern klez
-padobot korgo
-rkdoor koutodoor
-hyteod kovter
-lacon laconic
-escape laroux
-escop laroux
-manalo laroux
-linkun linkular
-powerpack linkular
-legendmir lmir
-legmir lmir
-lemir lmir
-biez loadmoney
-gldct loadmoney
-ldmon loadmoney
-loadmoneyent loadmoney
-odyssey loadmoney
-ogimant loadmoney
-plocust loadmoney
-duptwux lolbot
-duel loveletter
-mixor loveletter
-xworm loveletter
-tazebama mabezat
-ratab mamianune
-midhos medfos
-magmedia mediamagnet
-mmag mediamagnet
-downloadnsave megasearch
-fastsave megasearch
-fastsaveapp megasearch
-preloader megasearch
-saveshare megasearch
-morefi memery
-lohmys midia
-marketpay mmarketpay
-mmarket mmarketpay
-mmarketp mmarketpay
-fipp morto
-serpip morto
-mspyonline mspy
-multibardown multibar
-multibardownloader multibar
-mutibar multibar
-ticno multibar
-mplug multiplug
-licat murofet
-funweb mywebsearch
-mindspark mywebsearch
-nandrob nandrobox
-neshuta neshta
-netboxserver netbox
-bespal netins
-netweird netwiredrc
-weecnaw netwiredrc
-wirenet netwiredrc
-nickispy nickyspy
-nickspy nickyspy
-conduit opencandy
-optixp optix
-optixpro optix
-bflient palevo
-pilleuz palevo
-rimecud palevo
-pate parite
-pinfi parite
-perfectkeylogger perflogger
-perfkey perflogger
-perfloger perflogger
-petrolan petrolin
-yoof picsys
-fixflo pioneer
-flofix pioneer
-floxif pioneer
-floxlib pioneer
-apperhand plankton
-plangton plankton
-pupil plemood
-purplemood plemood
-purple plemood
-gulpix plugx
-poisonivy poison
-polipos polip
-screenblaze prosti
-acute pullupdate
-clickspring purityscan
-clspring purityscan
-purity purityscan
-chydo pykspa
-dwonk pykspa
-pykse pykspa
-qakbot qbot
-qqrobber qqrob
-zsone raden
-protexor ramnit
-rmnet ramnit
-ranck ranky
-dracur rebhip
-spatet rebhip
-spyrat rebhip
-refogkeylogger refog
-relevant relevantknowledge
-rknowledge relevantknowledge
-arto renos
-codecpack renos
-codepack renos
-banloader rimod
-mutopy rodecap
-ggsmart rootsmart
-kometa rukometa
-gnurbulf rungbu
-overt sadenav
-overtls sadenav
-sahagent sahat
-shopathome sahat
-safekidzone sakezon
-kashu sality
-kuku sality
-saldrop sality
-salicode sality
-salitystub sality
-salload sality
-salpack sality
-salrenmetie sality
-stubofsality sality
-sancmed sanctionedmedia
-contrand sckeylog
-controlrandom sckeylog
-sckeylogger sckeylog
-sclog sckeylog
-softcentral sckeylog
-secxplod securityxploded
-secxploded securityxploded
-winsxsbot sfone
-ibank shiz
-pinny shiz
-shifu shiz
-zybut shiz
-shohdi shodi
-caphaw shylock
-opclose sillyfdc
-cson simbot
-rodricter simda
-avalod sinowal
-sinodo sinowal
-wplug slugin
-wplugin slugin
-koceg socks
-mandaph socks
-pace socks
-fakromup soft32downloader
-popuppers soft32downloader
-soft32down soft32downloader
-soft32download soft32downloader
-wedownload soft32downloader
-softbase softobase
-bxib softonic
-softonicdownloader softonic
-driverupd softpulse
-sambamedia softpulse
-softpules softpulse
-betterinstaller somoto
-mazel somoto
-somato somoto
-somotobetterinstaller somoto
-somotoltd somoto
-optimizerpro speedingupmypc
-spdupmypc speedingupmypc
-superoptimizer speedingupmypc
-superpctools speedingupmypc
-spyeyes spyeye
-spyweep spyeye
-square squarenet
-javak suggestor
-steekt steek
-tophos stegvob
-mofksys swisyn
-c2lop swizzor
-electron sytro
-soltern sytro
-systro sytro
-taojin taojinstar
-alureon tdss
-olmarik tdss
-tidserv tdss
-tdssrt tdss
-jelbrus techsnab
-privitize techsnab
-joleee tedroo
-tedro tedroo
-gael tenga
-gaelicum tenga
-licum tenga
-nuwar tibs
-peacomm tibs
-tibspk tibs
-zhelatin tibs
-tinbakd tinba
-pirrit tirrip
-pirritsuggestor tirrip
-inffinity toggle
-inffinityinternet toggle
-stufik tufik
-tufei tufik
-twetty twetti
-speedupmypc uniblue
-bandito unruy
-banito unruy
-cycler unruy
-spacer unruy
-cryptodef upatre
-daytre upatre
-ipatre upatre
-waski upatre
-yarwi upatre
-gupboot urelas
-plite urelas
-ruftar usteal
-nextup verti
-lavandos vidro
-spakrab vidro
-gavir viking
-looked viking
-philis viking
-multiinstall vilsel
-ultradownload vilsel
-ultradownloads vilsel
-vils vilsel
-nabucur virlock
-polyransom virlock
-virransom virlock
-angel virut
-angryangel virut
-guarder virut
-madanf virut
-madang virut
-madangel virut
-vetor virut
-virtob virut
-vserv viser
-vitallia vittalia
-changeup vobfus
-chinky vobfus
-diple vobfus
-meredrop vobfus
-pronny vobfus
-purora vobfus
-vbccrypt vobfus
-vbna vobfus
-vbobfus vobfus
-wbna vobfus
-vflood vtflooder
-vflooder vtflooder
-wanna wannacry
-wanacry wannacry
-wannacrypt wannacry
-wannacryptor wannacry
-jadtre wapomi
-loorp wapomi
-mikcer wapomi
-nimnul wapomi
-otwycal wapomi
-pikor wapomi
-pikorms wapomi
-protil wapomi
-qvod wapomi
-simfect wapomi
-vjadtre wapomi
-wali wapomi
-stration warezov
-webalt webalta
-bulknet webprefix
-klevate webprefix
-blackice whiteice
-blic whiteice
-darksnow whiteice
-autokms winactivator
-kmsauto winactivator
-hackkms winactivator
-statblaster winfetcher
-akan winwebsec
-livesecurity winwebsec
-mbro winwebsec
-systemsecurity winwebsec
-poweliks wowlik
-powerliks wowlik
-powessere wowlik
-appquanta wkload
-valla xorala
-valhalla xorala
-extrat xtrat
-remtasu xtrat
-xtreme xtrat
-zbomber zombbomber
-panda zbot
-zbocheman zbot
-zeus zbot
-bjlog zegost
-zeno zenosearch
-maxplus zeroaccess
-maxplusent zeroaccess
-pmax zeroaccess
-sirefef zeroaccess
-smadow zeroaccess
-zaccess zeroaccess
-zona zvuzona
-onestep zwangi
-zwunzi zwangi
diff --git a/avclass/data/default.generics b/avclass/data/default.generics
deleted file mode 100644
index 1fbef42..0000000
--- a/avclass/data/default.generics
+++ /dev/null
@@ -1,418 +0,0 @@
-# Architecture / OS
-win
-win32
-w32
-win64
-w64
-winnt
-linux
-unix
-android
-androidos
-andr
-macosx
-osx
-osx32
-
-# Malicious software
-malware
-malicious
-malagent
-maldroid
-dangerousobject
-
-# Heuristic detection
-generic
-generik
-gen
-agen
-genmalicious
-generickd
-tsgeneric
-genericr
-heuristic
-heur
-siggen
-genetic
-genome
-cloud
-kcloud
-memscan
-high
-score
-attribute
-advml
-bloodhound
-sape
-maltrec
-symvt
-igeneric
-eheur
-posible
-undefined
-static
-
-# Malware classes
-trojan
-horse
-troj
-trj
-trojanhorse
-trojware
-trojanransom
-trojanspy
-trojanapt
-trojanclicker
-trojanfakeav
-trojanpsw
-worm
-networm
-hllw
-virus
-fileinfector
-infector
-prepender
-hllp
-rootkit
-spyware
-ddos
-flooder
-dialer
-porndialer
-porn
-backdoor
-bkdr
-keylog
-keylogger
-datastealer
-stealer
-infostealer
-pwstealer
-banker
-monitor
-mailer
-email
-emailworm
-massmailer
-smtp
-stmp
-spam
-spammer
-spambot
-ransom
-ransomlock
-ransomcrypt
-ransomware
-filecoder
-filecryptor
-rogue
-fakeav
-fakealert
-clicker
-adclicker
-click
-miner
-coinmine
-coinminer
-bitcoinminer
-bitcoin
-btcmine
-bitminer
-trojansms
-smssend
-searcher
-phishing
-
-# Macro
-macro
-badmacro
-maliciousmacro
-w97m
-o97m
-x97m
-pp97m
-mw97
-w2km
-mo97
-x2km
-
-# Downloader
-downloader
-downldr
-dloader
-dwnldr
-dldr
-dloadr
-dloade
-download
-dload
-downware
-downagent
-dropper
-drop
-dropr
-dldrop
-exedrop
-mdropper
-muldrop
-droppr
-trojandropper
-trojandownloader
-trojandwnldr
-trjndwnlder
-exedown
-downldexe
-dropped
-docdl
-docdrop
-docdrp
-macrodown
-downloadware
-dloadware
-
-# PUP
-pup
-pua
-adware
-potentially
-unwanted
-not-a-virus
-riskware
-risk
-grayware
-unwnt
-addisplay
-adknowledge
-adload
-applicunwnt
-adplugin
-plugin
-downad
-toolbar
-webtoolbar
-casino
-casonline
-install
-installer
-bundle
-bundler
-bundled
-bundleapp
-bundleinstaller
-softwarebundler
-nsis
-browsermodifier
-unsafe
-securityrisk
-
-# Suspicious
-suspected
-suspect
-suspicious
-susp
-suspic
-suspectcrc
-reputation
-behaveslike
-lookslike
-variant
-based
-possible
-threat
-probably
-confidence
-highconfidence
-
-# Unclassified
-unknown
-unclassifiedmalware
-undef
-
-# Behavior: injection
-injector
-inject
-injecter
-vbinject
-injcrypt
-injected
-
-# Behavior: homepage modification
-homepage
-startpage
-
-# Behavior: kill
-avkill
-killav
-antiav
-antifw
-blocker
-
-# Behavior: signed
-fakems
-signed
-
-# Behavior: proxy
-proxy
-trojanproxy
-
-# Behavior: autorun
-autorun
-autoruner
-starter
-
-# Behavior: network
-netfilter
-redirector
-sniffer
-portscan
-
-# Behavior: files
-killfiles
-renamer
-
-# Behavior: services
-servstart
-server
-
-# Behavior: VM detect
-vmdetect
-vmdetector
-
-# Packer
-packer
-cryptor
-crypter
-obfuscator
-msilobfuscator
-encoder
-
-# Packed
-packed
-malpack
-encpk
-malob
-cryp
-crypt
-crypted
-cryptic
-genpack
-krypt
-kryptk
-kryptik
-obfuscated
-obfus
-obfusc
-obfuscate
-malcrypt
-vbcrypt
-vbkrypt
-vbpack
-xpack
-zpack
-susppack
-suspiciouspacker
-
-# Packed (specific packers)
-asprotect
-nspack
-pecompact
-upack
-themida
-vmprotect
-
-# Program
-program
-application
-appl
-software
-file
-
-# File types
-text
-html
-script
-word
-msword
-excel
-msexcel
-office
-msoffice
-shellcode
-shellkode
-msil
-java
-j2me
-fakedoc
-fakepdf
-webpage
-iframe
-powershell
-perl
-python
-flash
-jpeg
-autoit
-
-# Patch
-pepatch
-patchfile
-patched
-patcher
-
-# Exploit
-exploit
-expl
-
-# Corrupted
-damaged
-corrupt
-pemalform
-malpe
-
-# Tools
-tool
-risktool
-securitytool
-fraudtool
-virtool
-keygen
-hack
-hacktool
-hktl
-spamtool
-crack
-cracktool
-
-# Small
-small
-tiny
-
-# Generic families
-agent
-eldorado
-artemis
-krap
-kazy
-katusha
-pornoasset
-foreign
-symmi
-jorik
-graftor
-strictor
-
-# Test
-test
-testvirus
-
-# Misc
-password
-website
-encodefeature
-multi
-normal
-other
-optional
-access
-onion
- 
diff --git a/avclass2/avclass2_input_checker.py b/avclass/input_checker.py
similarity index 86%
rename from avclass2/avclass2_input_checker.py
rename to avclass/input_checker.py
index 0b8dc35..ca7c381 100755
--- a/avclass2/avclass2_input_checker.py
+++ b/avclass/input_checker.py
@@ -1,19 +1,13 @@
-#!/usr/bin/env python
-'''
-AVClass2 input checker
-'''
-
-import os
 import sys
 import argparse
-script_dir = os.path.dirname(os.path.abspath(__file__))
-sys.path.insert(1, os.path.join(script_dir, 'lib/'))
-from avclass2_common import Taxonomy, Tagging, Expansion
+from avclass.lib import Taxonomy, Tagging, Expansion
+
 
 default_tag_file = "data/default.tagging"
 default_tax_file = "data/default.taxonomy"
 default_exp_file = "data/default.expansion"
 
+
 if __name__ == '__main__':
     argparser = argparse.ArgumentParser(prog='input_checker',
         description='Checks format of files Tagging, Expansion and Taxonomy.')
diff --git a/avclass2/avclass2_labeler.py b/avclass/labeler.py
similarity index 98%
rename from avclass2/avclass2_labeler.py
rename to avclass/labeler.py
index ed4996c..c64a636 100755
--- a/avclass2/avclass2_labeler.py
+++ b/avclass/labeler.py
@@ -1,20 +1,15 @@
-#!/usr/bin/env python
-'''
-AVClass2 labeler
-'''
-
-import os
-import sys
-script_dir = os.path.dirname(os.path.abspath(__file__))
-sys.path.insert(1, os.path.join(script_dir, 'lib/'))
-sys.path.insert(1, os.path.join(script_dir, '../shared/'))
 import argparse
-from avclass2_common import AvLabels
-from operator import itemgetter
-import evaluate_clustering as ec
+import os
 import json
+import sys
 import traceback
 
+from operator import itemgetter
+
+from avclass.lib import AvLabels, clustering as ec
+
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
 # Default tagging file
 default_tag_file = os.path.join(script_dir, "data/default.tagging")
 # Default expansion file
@@ -22,6 +17,7 @@
 # Default taxonomy file
 default_tax_file = os.path.join(script_dir, "data/default.taxonomy")
 
+
 def guess_hash(h):
     ''' Given a hash string, guess the hash type based on the string length '''
     hlen = len(h)
diff --git a/avclass/lib/avclass_common.py b/avclass/lib/avclass_common.py
deleted file mode 100755
index 5145d99..0000000
--- a/avclass/lib/avclass_common.py
+++ /dev/null
@@ -1,337 +0,0 @@
-#!/usr/bin/env python
-'''
-Main AVClass class
-'''
-
-import re
-import string
-from collections import OrderedDict as OrdDict
-from collections import namedtuple
-from operator import itemgetter, attrgetter
-
-SampleInfo = namedtuple('SampleInfo', 
-                        ['md5', 'sha1', 'sha256', 'labels'])
-
-# AVs to use in is_pup method
-pup_av_set = {'Malwarebytes', 'K7AntiVirus', 'Avast',
-              'AhnLab-V3', 'Kaspersky', 'K7GW', 'Ikarus',
-              'Fortinet', 'Antiy-AVL', 'Agnitum', 'ESET-NOD32'}
-
-# Tokens that indicate PUP used by is_pup method
-pup_tokens = {'PUA', 'Adware', 'PUP', 'Unwanted', 'Riskware', 'grayware',
-              'Unwnt', 'Adknowledge', 'toolbar', 'casino', 'casonline',
-              'AdLoad', 'not-a-virus'}
-
-# AVs to use in suffix removal
-suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky',
-                          'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
-                          'GData', 'Avast', 'Sophos',
-                          'TrendMicro-HouseCall', 'TrendMicro',
-                          'NANO-Antivirus', 'Microsoft'}
-
-class AvLabels:
-    '''
-    Class to operate on AV labels, 
-    such as extracting the most likely family name.
-    '''
-    def __init__(self, gen_file = None, alias_file = None, av_file = None):
-
-        # Read generic token set from file
-        self.gen_set = self.read_generics(gen_file) if gen_file else set()
-
-        # Read aliases map from file
-        self.aliases_map = self.read_aliases(alias_file) if alias_file else {}
-
-        # Read AV engine set from file
-        self.avs = self.read_avs(av_file) if av_file else None
-
-    @staticmethod
-    def read_aliases(alfile):
-        '''Read aliases map from given file'''
-        if alfile is None:
-            return {}
-        almap = {}
-        with open(alfile, 'r') as fd:
-            for line in fd:
-                alias, token = line.strip().split()[0:2]
-                almap[alias] = token
-        return almap
-
-    @staticmethod
-    def read_generics(generics_file):
-        '''Read generic token set from given file'''
-        gen_set = set()
-        with open(generics_file) as gen_fd:
-            for line in gen_fd:
-                if line.startswith('#') or line == '\n':
-                    continue
-                gen_set.add(line.strip())
-        return gen_set
-
-    @staticmethod
-    def read_avs(avs_file):
-        '''Read AV engine set from given file'''
-        with open(avs_file) as fd:
-            avs = set(map(str.strip, fd.readlines()))
-        return avs
-
-    @staticmethod
-    def get_sample_info_lb(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'],
-                          vt_rep['av_labels'])
-
-    @staticmethod
-    def get_sample_info_vt_v2(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        label_pairs = []
-        # Obtain scan results, if available
-        try:
-            scans = vt_rep['scans']
-            md5 = vt_rep['md5']
-            sha1 = vt_rep['sha1']
-            sha256 = vt_rep['sha256']
-        except KeyError:
-            return None
-        # Obtain labels from scan results
-        for av, res in scans.items():
-            if res['detected']:
-                label = res['result']
-                clean_label = ''.join(filter(
-                                  lambda x: x in string.printable,
-                                    label)).strip()
-                label_pairs.append((av, clean_label))
-
-        return SampleInfo(md5, sha1, sha256, label_pairs)
-
-    @staticmethod
-    def get_sample_info_vt_v3(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        label_pairs = []
-        # Obtain scan results, if available
-        try:
-            scans = vt_rep['data']['attributes']['last_analysis_results']
-            md5 = vt_rep['data']['attributes']['md5']
-            sha1 = vt_rep['data']['attributes']['sha1']
-            sha256 = vt_rep['data']['attributes']['sha256']
-        except KeyError:
-            return None
-        # Obtain labels from scan results
-        for av, res in scans.items():
-            label = res['result']
-            if label is not None:
-                clean_label = ''.join(filter(
-                                  lambda x: x in string.printable,
-                                    label)).strip()
-                label_pairs.append((av, clean_label))
-
-        return SampleInfo(md5, sha1, sha256, label_pairs)
-
-    @staticmethod
-    def is_pup(av_label_pairs):
-        '''This function classifies the sample as PUP or not 
-           using the AV labels as explained in the paper:
-           "Certified PUP: Abuse in Authenticode Code Signing" 
-           (ACM CCS 2015)
-           It uses the AV labels of 11 specific AVs. 
-           The function checks for 13 keywords used to indicate PUP.
-           Return:
-              True/False/None
-        '''
-        # If no AV labels, nothing to do, return
-        if not av_label_pairs:
-            return None
-        # Initialize
-        pup = False
-        threshold = 0.5
-        # Set with (AV name, Flagged/not flagged as PUP), for AVs in pup_av_set
-        bool_set = set([(pair[0], t.lower() in pair[1].lower()) 
-                        for t in pup_tokens
-                        for pair in av_label_pairs
-                        if pair[0] in pup_av_set])
-
-        # Number of AVs that had a label for the sample
-        av_detected = len([p[0] for p in av_label_pairs
-                           if p[0] in pup_av_set])
-
-        # Number of AVs that flagged the sample as PUP
-        av_pup = list(map(lambda x: x[1], bool_set)).count(True)
-
-        # Flag as PUP according to a threshold
-        if (float(av_pup) >= float(av_detected)*threshold) and av_pup != 0:
-            pup = True
-        return pup
-
-
-    @staticmethod
-    def __remove_suffixes(av_name, label):
-        '''Remove AV specific suffixes from given label
-           Returns updated label'''
-
-        # Truncate after last '.'
-        if av_name in suffix_removal_av_set:
-            label = label.rsplit('.', 1)[0]
-
-        # Truncate after last '.' 
-        # if suffix only contains digits or uppercase (no lowercase) chars
-        if av_name == 'AVG':
-            tokens = label.rsplit('.', 1)
-            if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]):
-                label = tokens[0]
-
-        # Truncate after last '!'
-        if av_name in set(['Agnitum','McAffee','McAffee-GW-Edition']):
-            label = label.rsplit('!', 1)[0]
-
-        # Truncate after last '('
-        if av_name in set(['K7AntiVirus', 'K7GW']):
-            label = label.rsplit('(', 1)[0]
-
-        # Truncate after last '@'
-        # GData would belong here, but already trimmed earlier
-        if av_name in set(['Ad-Aware', 'BitDefender', 'Emsisoft', 'F-Secure', 
-                          'Microworld-eScan']):
-            label = label.rsplit('(', 1)[0]
-
-        return label
-
-
-    def __normalize(self, label, hashes):
-        '''Tokenize label, filter tokens, and replace aliases'''
-
-        # If empty label, nothing to do
-        if not label:
-            return []
-
-        # Initialize list of tokens to return
-        ret = []
-
-        # Split label into tokens and process each token
-        for token in re.split("[^0-9a-zA-Z]", label):
-            # Convert to lowercase
-            token = token.lower()
-
-            # Remove digits at the end
-            end_len = len(re.findall("\d*$", token)[0])
-            if end_len:
-                token = token[:-end_len]
-
-            # Ignore short token
-            if len(token) < 4:
-                continue
-
-            # Remove generic tokens
-            if token in self.gen_set:
-                continue
-
-            # Ignore token if prefix of a hash of the sample 
-            # Most AVs use MD5 prefixes in labels, 
-            # but we check SHA1 and SHA256 as well
-            hash_token = False
-            for hash_str in hashes:
-                if hash_str[0:len(token)] == token:
-                  hash_token = True
-                  break
-            if hash_token:
-                continue
-
-            # Replace alias
-            token = self.aliases_map[token] if token in self.aliases_map \
-                                            else token
-
-            # Add token
-            ret.append(token)
-        return ret
-
-    def get_family_ranking(self, sample_info):
-        '''
-        Returns sorted dictionary of most likely family names for sample
-        '''
-        # Extract info from named tuple
-        av_label_pairs = sample_info[3]
-        hashes = [ sample_info[0], sample_info[1], sample_info[2] ]
-
-        # Whitelist the AVs to filter the ones with meaningful labels
-        av_whitelist = self.avs
-
-        # Initialize auxiliary data structures
-        labels_seen = set()
-        token_map = {}
-
-        # Process each AV label
-        for (av_name, label) in av_label_pairs:
-            # If empty label, nothing to do
-            if not label:
-                continue
-
-            ################
-            # AV selection #
-            ################
-            if av_whitelist and av_name not in av_whitelist:
-                continue
-
-            #####################
-            # Duplicate removal #
-            #####################
-
-            # Emsisoft uses same label as 
-            # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
-            # but suffixes ' (B)' to their label. Remove the suffix.
-            if label.endswith(' (B)'):
-                label = label[:-4]
-
-            # F-Secure uses Avira's engine since Nov. 2018
-            # but prefixes 'Malware.' to Avira's label. Remove the prefix.
-            if label.startswith('Malware.'):
-                label = label[8:]
-
-            # Other engines often use exactly the same label, e.g.,
-            #   AVG/Avast
-            #   K7Antivirus/K7GW
-            #   Kaspersky/ZoneAlarm
-
-            # If we have seen the exact same label before, skip
-            if label in labels_seen:
-                continue
-            # If not, we add it to the set of labels seen
-            else:
-                labels_seen.add(label)
-
-            ##################
-            # Suffix removal #
-            ##################
-            label = self.__remove_suffixes(av_name, label)
-
-            ########################################################
-            # Tokenization, token filtering, and alias replacement #
-            ########################################################
-            tokens = self.__normalize(label, hashes)
-
-            # Increase token count in map
-            for t in tokens:
-                c = token_map[t] if t in token_map else 0
-                token_map[t] = c + 1
-
-        ##################################################################
-        # Token ranking: sorts tokens by decreasing count and then token #
-        ##################################################################
-        sorted_tokens = sorted(token_map.items(), 
-                                key=itemgetter(1,0), 
-                                reverse=True)
-
-        # Delete the tokens appearing only in one AV, add rest to output
-        sorted_dict = OrdDict()
-        for t, c in sorted_tokens:
-            if c > 1:
-                sorted_dict[t] = c
-            else:
-                break
-        
-        return sorted_dict
-
diff --git a/avclass2/avclass2_update_module.py b/avclass/update.py
similarity index 97%
rename from avclass2/avclass2_update_module.py
rename to avclass/update.py
index 1ca9e87..14bb1c5 100755
--- a/avclass2/avclass2_update_module.py
+++ b/avclass/update.py
@@ -1,19 +1,13 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-'''
-AVClass2 Update module
-'''
-import sys
-import os
 import argparse
 import logging
-# Make sure paths are relative to execution path
-script_dir = os.path.dirname(os.path.abspath(__file__))
-sys.path.insert(0, os.path.join(script_dir, 'lib/'))
-from operator import itemgetter
+import os
+import sys
+
 from collections import namedtuple
-from avclass2_common import Taxonomy, Expansion, Tagging
-# from Levenshtein import ratio as levenshtein_ratio
+from operator import itemgetter
+
+from avclass.lib import Taxonomy, Expansion, Tagging
+
 
 # Set logging
 log = logging.getLogger(__name__)
@@ -28,6 +22,7 @@
 root.addHandler(handler_stderr)
 
 
+script_dir = os.path.dirname(os.path.abspath(__file__))
 # Default tagging file
 default_tagging_file = os.path.join(script_dir, "data/default.tagging")
 # Default expansion file
@@ -42,6 +37,7 @@
 Rel = namedtuple('Rel', ['t1', 't2', 't1_num', 't2_num', 
                          'nalias_num', 'talias_num', 'tinv_alias_num'])
 
+
 class Update:
     ''' Update Module '''
     def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, 
@@ -487,11 +483,7 @@ def output(self, out_prefix):
                         len(expansion), args.exp))
 
     # Build update object
-    if not args.alias:
-        alias_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.alias'
-    else:
-        alias_fname = args.alias
-    update = Update(alias_fname, taxonomy, tagging, expansion, args.n, args.t)
+    update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t)
 
     log.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % (
                         update.num_rules(), args.t, args.n))
diff --git a/avclass2/README.md b/avclass2/README.md
deleted file mode 100644
index 83dfaad..0000000
--- a/avclass2/README.md
+++ /dev/null
@@ -1,261 +0,0 @@
-# AVClass2
-
-AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
-
-You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports)
-and it outputs tags observed in the AV labels, ranked by decreasing popularity. 
-
-The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper.
-
-> Silvia Sebastián, Juan Caballero. 
-AVClass2: Massive Malware Tag Extraction from AV Labels. 
-In proceedings of the Annual Computer Security Applications Conference, December 2020.
-
-In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module.
-
-
-## Labeling
-
-The labeler takes as input a JSON file with the AV labels of malware samples 
-(-vt or -lb options), 
-a file with the taxonomy (-tax option), 
-a file with tagging rules (-tag option), and
-a file with expansion rules (-exp option). 
-It outputs a set of ranked tags. 
-If you do not provide taxonomy, expansion or tagging files, 
-the default ones in the data folder are used.
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json
-```
-
-The above command labels the samples whose AV labels are in 
-the ../examples/malheurReference_lb.json file. 
-It prints the results to stdout. 
-The output looks like this: 
-
-```
-aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2
-67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2
-```
-
-which means sample *aca2d12934935b070df8f50e06a20539* 
-was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, 
-8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, 
-3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family.
-Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
-consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
-
-The -p option outputs the full path of each tag in the taxonomy: 
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p
-```
-
-The above command line outputs:
-
-```
-aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2
-67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2
-```
-
-where each tag has been replaced by its taxonomy path, which starts with the category in capitals, 
-followed by the path in the category (if any), and the tag itself, all separated by colons. 
-For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, 
-*CLASS:grayware* that *grayware* is a malware class, and 
-*CLASS:grayware:adware* that *adware* is a subclass of *grayware*.
-
-**Compatibility mode**
-
-The compatibility -c option makes AVClass2 output the same format as AVClass. 
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c
-```
-
-outputs:
-
-```
-bb23e1d296cf01bbaf32ed3938f9b0b8 allaple
-cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349
-```
-
-As in AVClass, the output contains only the family name, 
-which corresponds to the highest ranked family tag, all other tags are ignored.
-Samples for which a family cannot be obtained are labeled as singletons with their hash.
- 
-It is important to note that AVClass2 compatibility mode results can differ from AVClass results
-on the same input file.
-The differences in family names are due to differences between the generics and aliases files 
-used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. 
-In the future, we may change AVClass to use the taxonomy and rules from AVClass2 
-as input (instead of the generics and aliases files) 
-to minimize such differences and avoid maintaining different data files.
-
-
-## Input JSON format
-
-AVClass2 supports three input JSON formats:
-
-1. VirusTotal v2 API JSON reports (*-vt file*), 
-where each line in the input *file* should be the full JSON of a 
-VirusTotal v2 API response to the */file/report* endpoint,
-e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash}
-There is an example VirusTotal v2 input file in examples/vtv2_sample.json
-
-2. VirusTotal v3 API JSON reports (*-vt file -vt3*), 
-where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, 
-e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash}
-There is an example VirusTotal v3 input file in examples/vtv3_sample.json
-
-3. Simplified JSON (*-lb file*),
-where each line in *file* should be a JSON 
-with (at least) these fields:
-{md5, sha1, sha256, av_labels}. 
-There is an example of such input file in *examples/malheurReference_lb.json*
-
-
-**Multiple input files**
-
-AVClass2 can handle multiple input files putting the results in the same output files 
-(if you want results in separate files, process each input file separately).
-
-It is possible to provide the -vt and -lb input options multiple times.
-
-```shell
-$./avclass2_labeler.py -vt <file1> -vt <file2>
-```
-```shell
-$./avclass2_labeler.py -lb <file1> -lb <file2>
-```
-
-There are also -vtdir and -lbdir options that can be used to provide 
-an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports:
-
-```shell
-$./avclass2_labeler.py -vtdir <directory>
-```
-
-It is also possible to combine -vt with -vtdir and -lb with -lbdir, 
-but you cannot combine input files of different format. Thus, this command works:
-
-```shell
-$./avclass2_labeler.py -vt <file> -vtdir <directory>
-```
-
-But, this one throws an error:
-
-```shell
-$./avclass2_labeler.py -vt <file1> -lb <file2>
-```
-
-At this point you have read the most important information on how to use AVClass2. 
-The following sections describe steps that most users will not need.
-
-## Labeling: Ground Truth Evaluation
-
-If you have family ground truth for some malware samples, i.e., 
-you know the true family for those samples, you can evaluate the accuracy 
-of the family tags output by AVClass2 on those samples with respect to that ground truth. 
-The evaluation metrics used are precision, recall, and F1 measure. 
-See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition.
-Note that the ground truth evaluation does not apply to non-family tags, 
-i.e., it only evaluates the output of the compatibility mode.
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels
-```
-
-The output includes these lines:
-
-```
-Calculating precision and recall
-3131 out of 3131
-Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
-```
-
-Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns:
-
-```
-aca2d12934935b070df8f50e06a20539 ADROTATOR
-```
-
-which indicates that sample aca2d12934935b070df8f50e06a20539 is known 
-to be of the *ADROTATOR* family. 
-Each sample in the input file should also appear in the ground truth file. 
-Note that the particular label assigned to each family does not matter. 
-What matters is that all samples in the same family are assigned 
-the same family name (i.e., the same string in the second column)
-
-The ground truth can be obtained from publicly available malware datasets. 
-The one in *../examples/malheurReference_gt.tsv* comes from the 
-[Malheur](http://www.mlsec.org/malheur/) dataset. 
-There are other public datasets with ground truth such as 
-[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
-[Malicia](http://malicia-project.com/dataset.html).
-
-## Update Module
-
-The update module can be used to suggest additions and changes to the input 
-taxonomy, tagging rules, and expansion rules. 
-Using the update module comprises of two steps.
-The first step is obtaining an alias file from the labeler:
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect
-```
-
-The above command will create a file named \<file\>.alias, 
-malheurReference_lb.alias in our example. This file has 7 columns:
-
-1. t1: token that is an alias
-2. t2: tag for which t1 is an alias
-3. |t1|: number of input samples where t1 was observed
-4. |t2|: number of input samples where t2 was observed
-5. |t1^t2|: number of input samples where both t1 and t2 were observed
-6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed.
-7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed.
-
-
-The Update Module takes the above file as input with the -alias option, 
-as well as the default taxonomy, tagging, and expansion files in the data directory. 
-It outputs updated taxonomy, tagging, and expansion files that include the 
-suggested additions and changes. 
-
-```shell
-$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix
-```
-
-This will produce three files: 
-output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. 
-You can diff the output and input files to analyze the proposed changes.
-
-You can also modify the input taxonomy, tagging, and expansion rules in place, 
-rather than producing new files:
-
-
-```shell
-$./avclass2_update_module.py -alias malheurReference_lb.alias -update
-```
-
-
-## Customizing AVClass2
-
-AVClass2 is fully customizable: 
-Tagging, Expansion and Taxonomy files can be easily modified by the analyst 
-either manually or by running the update module. 
-
-If you change those files manually, we recommend running 
-afterwards the input checker script to keep them tidy. 
-It sorts the tags in the taxonomy and performs some basic cleaning like 
-removing redundant entries:
-
-```shell
-$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file
-```
-
-If the modifications are in the default files in the data directory you can simply run: 
-
-```shell
-$./avclass2_input_checker.py 
-```
diff --git a/avclass2/lib/avclass2_common.py b/avclass2/lib/avclass2_common.py
deleted file mode 100755
index adf74a8..0000000
--- a/avclass2/lib/avclass2_common.py
+++ /dev/null
@@ -1,636 +0,0 @@
-#!/usr/bin/env python
-'''
-Main AVClass class
-'''
-
-import sys
-import re
-import string
-import logging
-from collections import OrderedDict as OrdDict
-from collections import namedtuple
-from operator import itemgetter, attrgetter
-
-# Set logging
-log = logging.getLogger(__name__)
-
-# Prefix to identify platform tags
-platform_prefix = "FILE:os:"
-
-# Default category for tags in taxonomy with no category
-uncategorized_cat  = "UNC"
-
-SampleInfo = namedtuple('SampleInfo', 
-                        ['md5', 'sha1', 'sha256', 'labels', 'vt_tags'])
-
-Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l'])
-
-# AVs to use in suffix removal
-suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky',
-                          'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
-                          'GData', 'Avast', 'Sophos',
-                          'TrendMicro-HouseCall', 'TrendMicro',
-                          'NANO-Antivirus', 'Microsoft'}
-
-def create_tag(s):
-    ''' Create a Tag from its string representation '''
-    word_list = s.strip().split(":")
-    if len(word_list) > 1:
-        name = word_list[-1].lower()
-        cat = word_list[0].upper()
-        prefix_l = [x.lower() for x in word_list[1:-1]]
-        path = cat
-        for x in prefix_l:
-            path = path + ':' + x
-        path = path + ':' + name
-    else:
-        name = word_list[0].lower()
-        cat = uncategorized_cat
-        prefix_l = []
-        path = name
-    return Tag(name, cat, path, prefix_l)
-
-class Taxonomy:
-    '''
-    A taxonomy of tags and generic tokens read from file
-    '''
-    def __init__(self, filepath):
-        ''' Map tag.name | tag.path -> Tag '''
-        self.__tag_map = {}
-        if filepath:
-            self.read_taxonomy(filepath)
-
-    def __len__(self):
-        ''' Taxonomy length is the number of tags it contains '''
-        return len(self.__tag_map)//2
-
-    def is_generic(self, t):
-        ''' Return true if input is generic, false otherwise '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.cat == "GEN"
-        else:
-            return False
-
-    def is_tag(self, t):
-        ''' Return true if input is tag, false otherwise '''
-        return t in self.__tag_map
-
-    def add_tag(self, s, override=False):
-        ''' Add tag to taxonomy 
-            If tag already exists with different path, 
-              only replaces if override True '''
-        tag = create_tag(s)
-        t = self.__tag_map.get(tag.name, None)
-        if t and (t.path != tag.path):
-            if (not override):
-                return
-            else:
-                log.warn("[Taxonomy] Replacing %s with %s\n" % (
-                                  t.path, tag.path))
-                del self.__tag_map[t.path]
-        log.debug("[Taxonomy] Adding tag %s" % s)
-        self.__tag_map[tag.name] = tag
-        self.__tag_map[tag.path] = tag
-        return
-
-    def remove_tag(self, t):
-        ''' Remove tag from taxonomy. Returns 1 if removed, zero if unknown '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            log.debug("[Taxonomy] Removing tag: %s" % tag.path)
-            del self.__tag_map[tag.name]
-            del self.__tag_map[tag.path]
-            return 1
-        else:
-            return 0
-
-    def get_category(self, t):
-        ''' Return category of input tag, UNK if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.cat
-        else:
-            return "UNK"
-
-    def get_path(self, t):
-        ''' Return full path for given tag, or empty string if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.path
-        else:
-            return ("UNK:" + t)
-
-    def get_prefix_l(self, t):
-        ''' Return prefix list for given tag, or empty string if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.prefix_l
-        else:
-            return []
-
-    def get_prefix(self, t):
-        ''' Return prefix string for given tag, 
-            or empty string if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.prefix_l
-        else:
-            return t.path[0:t.path.rfind(':')]
-
-    def get_depth(self, t):
-        ''' Return depth of tag in taxonomy. 
-            Returns zero if tag not in taxonomy. 
-            A normal tag CAT:name has depth two '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return len(tag.prefix_l) + 2
-        else:
-            return 0
-
-    def get_info(self, t):
-        ''' Return (path,category) for given tag, or UNK:t if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.path, tag.cat
-        else:
-            return "UNK:" + t, "UNK"
-
-    def expand(self, t):
-        ''' Return list of tags in prefix list that are leaves '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return [t for t in tag.prefix_l if t in self.__tag_map]
-        else:
-            return []
-
-    def platform_tags(self): 
-        ''' Returns list with platform tags in taxonomy '''
-        acc = set()
-        for idx,tag in self.__tag_map.items():
-            if tag.path.startswith(platform_prefix):
-                acc.add(tag.name)
-        return acc
-
-    def overlaps(self, t1, t2):
-        ''' Returns true if the path of the given tags overlaps '''
-        m1 = self.get_prefix_l(t1)
-        m2 = self.get_prefix_l(t2)
-        return (t1 in m2) or (t2 in m1)
-
-    def remove_overlaps(self, l): 
-        ''' Returns list with overlapping tags removed '''
-        if not l:
-            return l
-        pair_l = sorted([(self.get_depth(t),t) for t in l])
-        out_l = [pair_l.pop()[1]]
-        while pair_l:
-            t = pair_l.pop()[1]
-            if (not any(self.overlaps(t, e) for e in out_l)):
-                out_l.append(t)
-        return out_l
-
-    def read_taxonomy(self, filepath):
-        '''Read taxonomy from given file '''
-        with open(filepath, 'r') as fd:
-            for line in fd:
-                if line.startswith('#') or line == '\n':
-                    continue
-                self.add_tag(line.strip())
-        return
-
-    def to_file(self, filepath):
-        ''' Output sorted taxonomy to given file '''
-        # Open output file
-        fd = open(filepath, 'w')
-        # Write sorted tags
-        tag_l = sorted(self.__tag_map.items(), 
-                                key=lambda item : item[1].path, 
-                                reverse=False)
-        idx = 0
-        for name,tag in tag_l:
-            if (idx % 2) == 0:
-                fd.write(tag.path+"\n")
-            idx+=1
-        # Close output file
-        fd.close()
-
-class Rules:
-    '''
-    Rules are src -> dst1, dst2, ... relations
-    '''
-    def __init__(self, filepath):
-        ''' Map src -> set(dst) '''
-        self._rmap = {}
-        if filepath:
-            self.read_rules(filepath)
-
-    def __len__(self):
-        ''' Length is number of rules, i.e., number of src '''
-        return len(self._rmap)
-
-    def add_rule(self, src, dst_l, overwrite=False):
-        ''' Add rule. If rule exists:
-            if overwrite==True, replace destination list
-            else append dst_l to current target set  '''
-        # Remove src from dst_l if it exists
-        dst_l = filter(lambda x: x != src, dst_l)
-        # If no destinations, nothing to do
-        if (not dst_l):
-            return
-        log.debug("[Rules] Adding %s -> %s" % (src, dst_l))
-        src_tag = create_tag(src)
-        if overwrite:
-            target_l = [create_tag(dst).name for dst in dst_l]
-            self._rmap[src_tag.name] = set(target_l)
-        else:
-            curr_dst = self._rmap.get(src_tag.name, set())
-            for dst in dst_l:
-                dst_tag = create_tag(dst)
-                curr_dst.add(dst_tag.name)
-            self._rmap[src_tag.name] = curr_dst
-        return
-
-    def remove_rule(self, src):
-        l = self._rmap.get(src, [])
-        if l:
-            log.debug("[Rules] Removing rule: %s -> %s" % (src, l))
-            del self._rmap[src]
-            return 1
-        else:
-            return 0
-
-    def get_dst(self, src):
-        ''' Returns dst list for given src, or empty list if no expansion '''
-        return list(self._rmap.get(src, []))
-
-    def read_rules(self, filepath):
-        '''Read rules from given file'''
-        with open(filepath, 'r') as fd:
-            for line in fd:
-                if line.startswith('#') or line == '\n':
-                    continue
-                word_list = line.strip().split()
-                if len(word_list) > 1:
-                    self.add_rule(word_list[0],word_list[1:])
-        return
-
-    def to_file(self, filepath, taxonomy=None):
-        ''' Output sorted rules to given file 
-            If taxonomy is provided, it outputs full tag path '''
-        fd = open(filepath, 'w')
-        for src,dst_set in sorted(self._rmap.items()):
-            dst_l = sorted(dst_set, reverse=False)
-            if taxonomy:
-                src_path = taxonomy.get_path(src)
-                path_l = [taxonomy.get_path(t) for t in dst_l]
-                dst_str = '\t'.join(path_l)
-                fd.write("%s\t%s\n" % (src_path,dst_str))
-            else:
-                dst_str = '\t'.join(dst_l)
-                fd.write("%s\t%s\n" % (src,dst_str))
-        fd.close()
-
-    def expand_src_destinations(self, src):
-        ''' Return destination list for given src after recursively 
-            following any rules for destinations '''
-        dst_set = self._rmap.get(src, set())
-        out = set()
-        while dst_set:
-            dst = dst_set.pop()
-            l = self._rmap.get(dst, [])
-            if l:
-                for e in l:
-                    if (e not in out) and (e != dst):
-                        dst_set.add(e)
-            else:
-                out.add(dst)
-        return out
-
-    def expand_all_destinations(self):
-        ''' Return destination list for given src after recursively 
-            following any rules for destinations '''
-        src_l = self._rmap.keys()
-        for src in src_l:
-            dst_l = self.expand_src_destinations(src)
-            self._rmap[src] = dst_l
-
-class Tagging(Rules):
-    '''
-    Tagging rules have src UNK and dst in taxonomy
-    '''
-    def __init__(self, filepath):
-        Rules.__init__(self, filepath)
-
-    def validate(self, taxonomy):
-        ''' Check that tags in tagging rules are in given taxonomy '''
-        for tok,tag_l in self._rmap.items():
-            for t in tag_l:
-                if (not taxonomy.is_tag(t)):
-                    sys.stdout.write("[Tagging] %s not in taxonomy\n" % t)
-
-class Expansion(Rules):
-    '''
-    Expansion rules have src and dst in taxonomy and
-        src.category != dst.category
-    '''
-    def __init__(self, filepath):
-        Rules.__init__(self, filepath)
-
-    def validate(self, taxonomy):
-        ''' Check that tags in expansion rules are in given taxonomy '''
-        for src,dst_set in self._rmap.items():
-            if (not taxonomy.is_tag(src)):
-                sys.stdout.write("[Expansion] %s not in taxonomy\n" % src)
-            for dst in dst_set:
-                if (not taxonomy.is_tag(dst)):
-                    sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst)
-
-class AvLabels:
-    '''
-    Class to operate on AV labels, 
-    such as extracting the most likely family name.
-    '''
-    def __init__(self, tag_file, exp_file = None, tax_file = None,
-                 av_file = None, aliasdetect=False):
-        # Read taxonomy
-        self.taxonomy = Taxonomy(tax_file)
-        # Read tag rules
-        self.tagging = Tagging(tag_file)
-        # Read expansion rules
-        self.expansions = Expansion(exp_file)
-        # Read AV engines
-        self.avs = self.read_avs(av_file) if av_file else None
-        # Alias statistics initialization
-        self.aliasdetect = aliasdetect
-
-    @staticmethod
-    def read_avs(avs_file):
-        '''Read AV engine set from given file'''
-        with open(avs_file) as fd:
-            avs = set(map(str.strip, fd.readlines()))
-        return avs
-
-    @staticmethod
-    def get_sample_info_lb(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'],
-                          vt_rep['av_labels'], [])
-
-    @staticmethod
-    def get_sample_info_vt_v2(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        label_pairs = []
-        # Obtain scan results, if available
-        try:
-            scans = vt_rep['scans']
-            md5 = vt_rep['md5']
-            sha1 = vt_rep['sha1']
-            sha256 = vt_rep['sha256']
-        except KeyError:
-            return None
-        # Obtain labels from scan results
-        for av, res in scans.items():
-            if res['detected']:
-                label = res['result']
-                clean_label = ''.join(filter(
-                                  lambda x: x in string.printable,
-                                    label)).strip()
-                label_pairs.append((av, clean_label))
-        # Obtain VT tags, if available
-        vt_tags = vt_rep.get('tags', [])
-
-        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
-
-    @staticmethod
-    def get_sample_info_vt_v3(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        label_pairs = []
-        # Obtain scan results, if available
-        try:
-            scans = vt_rep['data']['attributes']['last_analysis_results']
-            md5 = vt_rep['data']['attributes']['md5']
-            sha1 = vt_rep['data']['attributes']['sha1']
-            sha256 = vt_rep['data']['attributes']['sha256']
-        except KeyError:
-            return None
-        # Obtain labels from scan results
-        for av, res in scans.items():
-            label = res['result']
-            if label is not None:
-                clean_label = ''.join(filter(
-                                  lambda x: x in string.printable,
-                                    label)).strip()
-                label_pairs.append((av, clean_label))
-        # Obtain VT tags, if available
-        vt_tags = vt_rep['data']['attributes'].get('tags', [])
-
-        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
-
-
-    @staticmethod
-    def is_pup(tag_pairs, taxonomy):
-        '''This function classifies the sample as PUP or not 
-           by checking if highest ranked CLASS tag contains "grayware"
-           and is above a predefined threshold
-           Return:
-              True/False/None
-        '''
-        threshold = 0.5
-        # If no tags, return false
-        if len(tag_pairs) < 1:
-            return None
-        max_ctr = tag_pairs[0][1]
-        for (tag,ctr) in tag_pairs:
-            (path, cat) = taxonomy.get_info(tag)
-            if (cat == "CLASS"):
-                if ("grayware" in path):
-                    return (float(ctr) >= float(max_ctr)*threshold)
-                else:
-                    return False
-        return False
-
-    @staticmethod
-    def __remove_suffixes(av_name, label):
-        '''Remove AV specific suffixes from given label
-           Returns updated label'''
-
-        # Truncate after last '.'
-        if av_name in suffix_removal_av_set:
-            label = label.rsplit('.', 1)[0]
-
-        # Truncate after last '.' 
-        # if suffix only contains digits or uppercase (no lowercase) chars
-        if av_name == 'AVG':
-            tokens = label.rsplit('.', 1)
-            if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]):
-                label = tokens[0]
-
-        # Truncate after last '!'
-        if av_name == 'Agnitum':
-            label = label.rsplit('!', 1)[0]
-
-        return label
-
-
-    def get_label_tags(self, label, hashes):
-        ''' Return list of tags in given label 
-            Tokenizes label, filters unneeded tokens, and 
-            applies tagging rules '''
-
-        # Initialize set of tags to return
-        # We use a set to avoid duplicate tokens in the same AV label
-        # This avoids "potentially unwanted" contributing twice BEH:pup
-        tags = set()
-
-        # If empty label, nothing to do
-        if not label:
-            return tags
-
-        # Split label into tokens and process each token
-        for token in re.split("[^0-9a-zA-Z]", label):
-            # Convert token to lowercase
-            token = token.lower()
-
-            # Remove digits at the end
-            end_len = len(re.findall("\d*$", token)[0])
-            if end_len:
-                token = token[:-end_len]
-
-            # Ignore token if prefix of a hash of the sample
-            # Most AVs use MD5 prefixes in labels, 
-            # but we check SHA1 and SHA256 as well
-            hash_token = False
-            for hash_str in hashes:
-                if hash_str[0:len(token)] == token:
-                  hash_token = True
-                  break
-            if hash_token:
-                continue
-
-            # Ignore generic tokens
-            if self.taxonomy.is_generic(token):
-                continue
-
-            # Apply tagging rule
-            dst_l = self.tagging.get_dst(token)
-            if dst_l:
-                # Ignore generic tokens
-                for t in dst_l:
-                    if not self.taxonomy.is_generic(t):
-                        tags.add(t)
-            # Add token if longer than 3 characters and no tagging rule
-            elif len(token) > 3:
-                tags.add(token)
-
-        # Return tags
-        return tags
-
-
-    def __expand(self, tag_set):
-        ''' Return expanded set of tags '''
-        ret = set()
-        for t in tag_set:
-            # Include tag
-            ret.add(t)
-
-            # Include target of expansion rule in output
-            ret.update(self.expansions.get_dst(t))
-
-            # Include implicit expansions in taxonomy
-            ret.update(self.taxonomy.expand(t))
-
-        # Return a list for backwards compatibility 
-        return ret
-
-    def get_sample_tags(self, sample_info):
-        ''' Returns dictionary tag -> AV list of tags for the given sample '''
-
-        # Whitelist the AVs to filter the ones with meaningful labels
-        av_whitelist = self.avs
-        # Initialize auxiliary data structures
-        duplicates = set()
-        av_dict = {}
-
-        # Process each AV label
-        for (av_name, label) in sample_info.labels:
-            # If empty label, nothing to do
-            if not label:
-                continue
-
-            ################
-            # AV selection #
-            ################
-            if av_whitelist and av_name not in av_whitelist:
-                continue
-
-            #####################
-            # Duplicate removal #
-            #####################
-
-            # Emsisoft uses same label as 
-            # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
-            # but suffixes ' (B)' to their label. Remove the suffix.
-            if label.endswith(' (B)'):
-                label = label[:-4]
-
-            # F-Secure uses Avira's engine since Nov. 2018
-            # but prefixes 'Malware.' to Avira's label. Remove the prefix.
-            if label.startswith('Malware.'):
-                label = label[8:]
-
-            # Other engines often use exactly the same label, e.g.,
-            #   AVG/Avast
-            #   K7Antivirus/K7GW
-            #   Kaspersky/ZoneAlarm
-
-            # If we have seen the exact same label before, skip
-            if label in duplicates:
-                continue
-            # If not, we add it to duplicates
-            else:
-                duplicates.add(label)
-
-            ##################
-            # Suffix removal #
-            ##################
-            label = self.__remove_suffixes(av_name, label)
-
-            ########################################################
-            # Tokenization and tagging                             #
-            ########################################################
-            hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ]
-            tags = self.get_label_tags(label, hashes)
-
-            ########################################################
-            # Expansions                                           #
-            ########################################################
-            # NOTE: Avoiding to do expansion when aliases
-            if self.aliasdetect:
-                expanded_tags = tags
-            else:
-                expanded_tags = self.__expand(tags)
-
-            ########################################################
-            # Stores information that relates AV vendors with tags #
-            ########################################################
-            for t in expanded_tags:
-                av_dict.setdefault(t, []).append(av_name)
-
-
-        return av_dict
-
-    def rank_tags(self, av_dict, threshold=1):
-        ''' Return list of (tag, confidence) ranked by decreasing confidence 
-            and filter tags with less or equal threshold confidence '''
-
-        pairs = ((t, len(avs)) for (t,avs) in av_dict.items() 
-                    if len(avs) > threshold)
-        return sorted(pairs, key=itemgetter(1,0), reverse=True)
-
diff --git a/avclass2/data/andropup.expansion b/data/andropup.expansion
similarity index 100%
rename from avclass2/data/andropup.expansion
rename to data/andropup.expansion
diff --git a/avclass2/data/default.expansion b/data/default.expansion
similarity index 100%
rename from avclass2/data/default.expansion
rename to data/default.expansion
diff --git a/avclass2/data/default.tagging b/data/default.tagging
similarity index 100%
rename from avclass2/data/default.tagging
rename to data/default.tagging
diff --git a/avclass2/data/default.taxonomy b/data/default.taxonomy
similarity index 100%
rename from avclass2/data/default.taxonomy
rename to data/default.taxonomy
diff --git a/setup.py b/setup.py
index d87b69e..ff933d6 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 setup(
     name='AVClass',
-    version='0.0.1',
+    version='2.0.0',
     description='Tag and label malware samples',
     license='LICENSE',
     packages=find_packages(),
diff --git a/shared/evaluate_clustering.py b/shared/evaluate_clustering.py
deleted file mode 100755
index c841d3d..0000000
--- a/shared/evaluate_clustering.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#!/usr/bin/env python
-import sys
-
-def tp_fp_fn(CORRECT_SET, GUESS_SET):
-    """
-    INPUT: dictionary with the elements in the cluster from the ground truth
-    (CORRECT_SET) and dictionary with the elements from the estimated cluster
-    (ESTIMATED_SET).
-
-    OUTPUT: number of True Positives (elements in both clusters), False
-    Positives (elements only in the ESTIMATED_SET), False Negatives (elements
-    only in the CORRECT_SET).
-    """
-    tp = 0
-    fp = 0
-    fn = 0
-    for elem in GUESS_SET:
-        # True Positives (elements in both clusters)
-        if elem in CORRECT_SET:
-            tp += 1
-        else:
-            # False Positives (elements only in the "estimated cluster")
-            fp += 1
-    for elem in CORRECT_SET:
-        if elem not in GUESS_SET:
-            # False Negatives (elements only in the "correct cluster")
-            fn += 1
-    return tp, fp, fn
-
-
-def eval_precision_recall_fmeasure(GROUNDTRUTH_DICT, ESTIMATED_DICT):
-    """
-    INPUT: dictionary with the mapping "element:cluster_id" for both the ground
-    truth and the ESTIMATED_DICT clustering.
-
-    OUTPUT: average values of Precision, Recall and F-Measure.
-    """
-    # eval: precision, recall, f-measure
-    tmp_precision = 0
-    tmp_recall = 0
-
-    # build reverse dictionary of ESTIMATED_DICT
-    rev_est_dict = {}
-    for k, v in ESTIMATED_DICT.items():
-        if v not in rev_est_dict:
-            rev_est_dict[v] = { k }
-        else:
-            rev_est_dict[v].add(k)
-
-    # build reverse dictionary of GROUNDTRUTH_DICT
-    gt_rev_dict = {}
-    for k, v in GROUNDTRUTH_DICT.items():
-        if v not in gt_rev_dict:
-            gt_rev_dict[v] = { k }
-        else:
-            gt_rev_dict[v].add(k)
-
-    
-    counter, l = 0, len(ESTIMATED_DICT)
-
-    sys.stderr.write('Calculating precision and recall\n')
-
-    # For each element
-    for element in ESTIMATED_DICT:
-        
-        # Print progress
-        if counter % 1000 == 0:
-            sys.stderr.write('\r%d out of %d' % (counter, l))
-            sys.stderr.flush()
-        counter += 1
-
-        # Get elements in the same cluster (for "ESTIMATED_DICT cluster")
-        guess_cluster_id = ESTIMATED_DICT[element]
-
-        # Get the list of elements in the same cluster ("correct cluster")
-        correct_cluster_id = GROUNDTRUTH_DICT[element]
-
-        # Calculate TP, FP, FN
-        tp, fp, fn = tp_fp_fn(gt_rev_dict[correct_cluster_id],
-                              rev_est_dict[guess_cluster_id])
-
-        # tmp_precision
-        p = 1.0*tp/(tp+fp)
-        tmp_precision += p
-        # tmp_recall
-        r = 1.0*tp/(tp+fn)
-        tmp_recall += r
-    sys.stderr.write('\r%d out of %d' % (counter, l))
-    sys.stderr.write('\n')
-    precision = 100.0*tmp_precision/len(ESTIMATED_DICT)
-    recall = 100.0*tmp_recall/len(ESTIMATED_DICT)
-    fmeasure = (2*precision*recall)/(precision+recall)
-    return precision, recall, fmeasure
-
-
-if __name__ == "__main__":
-
-    # The ground truth.
-    # Dictionary with mapping: "element : cluster_id".
-    diz_grth = {
-        "a": 1,
-        "b": 1,
-        "c": 2,
-        "d": 3
-    }
-
-    # An example of an "estimated cluster".
-    # Dictionary with mapping: "element : cluster_id".
-    diz_estim = {
-        "a": 66,
-        "b": 'malware',
-        "c": 'goodware',
-        "d": 'trojan'
-    }
-
-    # An example of an "estimated cluster": same partitioning as for the ground
-    # truth, but just different cluster labels. Precision == Recall ==
-    # F-Measure == 100%.
-    # Dictionary with mapping: "element : cluster_id".
-    diz_estim_grth = {
-        "a": 2,
-        "b": 2,
-        "c": 66,
-        "d": 9
-    }
-
-    # a sample where estimated != ground truth
-    sys.stdout.write("Ground truth\n")
-    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
-    for k, v in diz_grth.items():
-        sys.stdout.write("%8s --> %10s\n" % (k, v))
-    sys.stdout.write("\nEstimated clustering\n")
-    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
-    for k, v in diz_estim.items():
-        sys.stdout.write("%8s --> %10s\n" % (k, v))
-    # precision, recall, f-measure
-    p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim)
-    sys.stdout.write("\nPrecison: %s%%\n" % p)
-    sys.stdout.write("Recall: %s%%\n" % r)
-    sys.stdout.write("F-Measure: %s%%\n" % f)

From b32ace4db8bfa7f9440832b92e85446f03d669ed Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 12:28:35 -0600
Subject: [PATCH 03/36] entry points

---
 avclass/input_checker.py |  5 ++++-
 avclass/labeler.py       | 12 +++++++++---
 setup.py                 |  7 ++++++-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/avclass/input_checker.py b/avclass/input_checker.py
index ca7c381..1547742 100755
--- a/avclass/input_checker.py
+++ b/avclass/input_checker.py
@@ -8,7 +8,7 @@
 default_exp_file = "data/default.expansion"
 
 
-if __name__ == '__main__':
+def main():
     argparser = argparse.ArgumentParser(prog='input_checker',
         description='Checks format of files Tagging, Expansion and Taxonomy.')
 
@@ -48,3 +48,6 @@
     sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % (
                         len(expansion), args.exp))
 
+
+if __name__ == '__main__':
+    main()
diff --git a/avclass/labeler.py b/avclass/labeler.py
index c64a636..7bc9b88 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -56,7 +56,8 @@ def list_str(l, sep=", ", prefix=""):
         out = out + sep + s
     return out
 
-def main(args):
+def main():
+    args = parse_args()
     # Select hash used to identify sample, by default MD5
     hash_type = args.hash if args.hash else 'md5'
 
@@ -351,7 +352,7 @@ def main(args):
         sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))
 
 
-if __name__=='__main__':
+def parse_args():
     argparser = argparse.ArgumentParser(prog='avclass2_labeler',
         description='''Extracts tags for a set of samples.
             Also calculates precision and recall if ground truth available''')
@@ -471,4 +472,9 @@ def main(args):
         sys.stderr.write('[-] Using default expansion tags in %s\n' % (
                           default_exp_file))
 
-    main(args)
+    return args
+
+
+if __name__=='__main__':
+    main()
+    
\ No newline at end of file
diff --git a/setup.py b/setup.py
index ff933d6..fada7e7 100644
--- a/setup.py
+++ b/setup.py
@@ -14,4 +14,9 @@
     tests_require=[
         'pytest',
     ],
-)
+    entry_points={
+        'console_scripts': [
+            'avclass = avclass.labeler:main',
+            'avclass-validate = avclass.input_checker:main',
+        ],
+    })

From 4bf29c1210e405802999578840b65edb07ed3946 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 12:57:03 -0600
Subject: [PATCH 04/36] flatten package; clustering cleanup

---
 avclass/clustering.py    | 132 ++++++++
 avclass/common.py        | 637 +++++++++++++++++++++++++++++++++++++++
 avclass/input_checker.py |   2 +-
 avclass/labeler.py       |   4 +-
 avclass/update.py        |   2 +-
 5 files changed, 773 insertions(+), 4 deletions(-)
 create mode 100755 avclass/clustering.py
 create mode 100755 avclass/common.py

diff --git a/avclass/clustering.py b/avclass/clustering.py
new file mode 100755
index 0000000..c5a349c
--- /dev/null
+++ b/avclass/clustering.py
@@ -0,0 +1,132 @@
+import sys
+
+from collections import defaultdict
+from typing import Dict, Set
+
+
+def tp_fp_fn(expected: Set, guess: Set):
+    """
+    Calculate the true-positives, false-positives, and false-negatives between ``expected`` and ``guess``
+
+    :param expected: Ground truth set
+    :param guess: Estimated set
+    :return: Tuple containing true positive count, false positive count, false negative count
+    """
+    tp = len(guess.intersection(expected))
+    fp = len(guess.difference(expected))
+    fn = len(expected.difference(guess))
+
+    return tp, fp, fn
+
+
+def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
+    """
+    Evaluate the precision, recall, and f-measure for the comparison of ``expected`` to ``guess``
+
+    :param expected: Dictionary mapping an element to a cluster_id
+    :param guess: Dictionary mapping an element to a cluster_id
+    :return: Tuple containing precision, recall, and f-measure values (all 0.0 if ``guess`` is empty)
+    """
+    # Nothing to score: avoid dividing by len(guess) == 0 below
+    if not guess:
+        return 0.0, 0.0, 0.0
+
+    tmp_precision = 0
+    tmp_recall = 0
+
+    # build reverse dictionary of guess
+    rev_est_dict = defaultdict(set)
+    for k, v in guess.items():
+        rev_est_dict[v].add(k)
+
+    # build reverse dictionary of expected
+    gt_rev_dict = defaultdict(set)
+    for k, v in expected.items():
+        gt_rev_dict[v].add(k)
+
+    counter, total = 0, len(guess)
+
+    sys.stderr.write('Calculating precision and recall\n')
+
+    # For each element
+    for element in guess:
+        # Print progress
+        if counter % 1000 == 0:
+            sys.stderr.write('\r%d out of %d' % (counter, total))
+            sys.stderr.flush()
+        counter += 1
+
+        # Cluster the element was assigned to ("guess cluster")
+        guess_cluster_id = guess[element]
+
+        # Cluster the element really belongs to ("expected cluster")
+        correct_cluster_id = expected[element]
+
+        # Calculate TP, FP, FN
+        tp, fp, fn = tp_fp_fn(gt_rev_dict[correct_cluster_id],
+                              rev_est_dict[guess_cluster_id])
+
+        # Both denominators are >= 1: element belongs to both clusters
+        tmp_precision += 1.0*tp/(tp+fp)
+        tmp_recall += 1.0*tp/(tp+fn)
+
+    sys.stderr.write('\r%d out of %d\n' % (counter, total))
+
+    # Average the per-element scores and express them as percentages
+    precision = 100.0 * tmp_precision / total
+    recall = 100.0 * tmp_recall / total
+    # Harmonic mean; the sum is > 0 since every element has tp >= 1
+    fmeasure = (2 * precision * recall) / (precision + recall)
+
+    return precision, recall, fmeasure
+
+
+if __name__ == "__main__":
+    # The ground truth.
+    # Dictionary with mapping: "element : cluster_id".
+    diz_grth = {
+        "a": 1,
+        "b": 1,
+        "c": 2,
+        "d": 3
+    }
+
+    # An example of an "estimated cluster".
+    # Dictionary with mapping: "element : cluster_id".
+    diz_estim = {
+        "a": 66,
+        "b": 'malware',
+        "c": 'goodware',
+        "d": 'trojan'
+    }
+
+    # An example of an "estimated cluster": same partitioning as for the ground
+    # truth, but just different cluster labels. Precision == Recall ==
+    # F-Measure == 100%.
+    # Dictionary with mapping: "element : cluster_id".
+    diz_estim_grth = {
+        "a": 2,
+        "b": 2,
+        "c": 66,
+        "d": 9
+    }
+
+    # a sample where estimated != ground truth
+    sys.stdout.write("Ground truth\n")
+    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
+
+    for k, v in diz_grth.items():
+        sys.stdout.write("%8s --> %10s\n" % (k, v))
+
+    sys.stdout.write("\nEstimated clustering\n")
+    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
+
+    for k, v in diz_estim.items():
+        sys.stdout.write("%8s --> %10s\n" % (k, v))
+
+    # precision, recall, f-measure
+    p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim)
+
+    sys.stdout.write("\nPrecison: %s%%\n" % p)
+    sys.stdout.write("Recall: %s%%\n" % r)
+    sys.stdout.write("F-Measure: %s%%\n" % f)
diff --git a/avclass/common.py b/avclass/common.py
new file mode 100755
index 0000000..dc28ff4
--- /dev/null
+++ b/avclass/common.py
@@ -0,0 +1,637 @@
+import logging
+import re
+import string
+import sys
+
+from collections import namedtuple
+from operator import itemgetter
+
+
+# Set logging
+log = logging.getLogger(__name__)
+
+# Prefix to identify platform tags
+platform_prefix = "FILE:os:"
+
+# Default category for tags in taxonomy with no category
+uncategorized_cat = "UNC"
+
+SampleInfo = namedtuple('SampleInfo', 
+                        ['md5', 'sha1', 'sha256', 'labels', 'vt_tags'])
+
+Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l'])
+
+# AVs to use in suffix removal
+suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky',
+                          'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
+                          'GData', 'Avast', 'Sophos',
+                          'TrendMicro-HouseCall', 'TrendMicro',
+                          'NANO-Antivirus', 'Microsoft'}
+
+
+def create_tag(s):
+    ''' Create a Tag from its string representation '''
+    word_list = s.strip().split(":")
+    if len(word_list) > 1:
+        name = word_list[-1].lower()
+        cat = word_list[0].upper()
+        prefix_l = [x.lower() for x in word_list[1:-1]]
+        path = cat
+        for x in prefix_l:
+            path = path + ':' + x
+        path = path + ':' + name
+    else:
+        name = word_list[0].lower()
+        cat = uncategorized_cat
+        prefix_l = []
+        path = name
+    return Tag(name, cat, path, prefix_l)
+
+
+class Taxonomy:
+    '''
+    A taxonomy of tags and generic tokens read from file
+    '''
+    def __init__(self, filepath):
+        ''' Map tag.name | tag.path -> Tag '''
+        self.__tag_map = {}
+        if filepath:
+            self.read_taxonomy(filepath)
+
+    def __len__(self):
+        ''' Number of tags; a tag whose name equals its path has one key '''
+        return len({tag.path for tag in self.__tag_map.values()})
+
+    def is_generic(self, t):
+        ''' Return true if input is generic, false otherwise '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.cat == "GEN"
+        else:
+            return False
+
+    def is_tag(self, t):
+        ''' Return true if input is tag, false otherwise '''
+        return t in self.__tag_map
+
+    def add_tag(self, s, override=False):
+        ''' Add tag to taxonomy
+            If tag already exists with different path,
+              only replaces if override True '''
+        tag = create_tag(s)
+        t = self.__tag_map.get(tag.name, None)
+        if t and (t.path != tag.path):
+            if (not override):
+                return
+            else:
+                # Logger.warn() is a deprecated alias removed in Python 3.13
+                log.warning("[Taxonomy] Replacing %s with %s\n" % (
+                                  t.path, tag.path))
+                del self.__tag_map[t.path]
+        log.debug("[Taxonomy] Adding tag %s" % s)
+        self.__tag_map[tag.name] = tag
+        self.__tag_map[tag.path] = tag
+
+    def remove_tag(self, t):
+        ''' Remove tag from taxonomy. Returns 1 if removed, zero if unknown '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            log.debug("[Taxonomy] Removing tag: %s" % tag.path)
+            # pop(): name and path are the same key for uncategorized tags
+            self.__tag_map.pop(tag.name, None)
+            self.__tag_map.pop(tag.path, None)
+            return 1
+        return 0
+
+    def get_category(self, t):
+        ''' Return category of input tag, UNK if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.cat
+        else:
+            return "UNK"
+
+    def get_path(self, t):
+        ''' Return full path for given tag, or "UNK:" + t if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.path
+        else:
+            return ("UNK:" + t)
+
+    def get_prefix_l(self, t):
+        ''' Return prefix list for given tag, or empty list if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.prefix_l
+        else:
+            return []
+
+    def get_prefix(self, t):
+        ''' Return prefix string for given tag (its path without the
+            trailing ":name"), or empty string if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.path.rpartition(':')[0]
+        else:
+            return ''
+
+    def get_depth(self, t):
+        ''' Return depth of tag in taxonomy. 
+            Returns zero if tag not in taxonomy. 
+            A normal tag CAT:name has depth two '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return len(tag.prefix_l) + 2
+        else:
+            return 0
+
+    def get_info(self, t):
+        ''' Return (path,category) for given tag, or UNK:t if not a tag '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return tag.path, tag.cat
+        else:
+            return "UNK:" + t, "UNK"
+
+    def expand(self, t):
+        ''' Return list of tags in prefix list that are leaves '''
+        tag = self.__tag_map.get(t, None)
+        if tag:
+            return [t for t in tag.prefix_l if t in self.__tag_map]
+        else:
+            return []
+
+    def platform_tags(self): 
+        ''' Returns list with platform tags in taxonomy '''
+        acc = set()
+        for idx,tag in self.__tag_map.items():
+            if tag.path.startswith(platform_prefix):
+                acc.add(tag.name)
+        return acc
+
+    def overlaps(self, t1, t2):
+        ''' Returns true if the path of the given tags overlaps '''
+        m1 = self.get_prefix_l(t1)
+        m2 = self.get_prefix_l(t2)
+        return (t1 in m2) or (t2 in m1)
+
+    def remove_overlaps(self, l): 
+        ''' Returns list with overlapping tags removed '''
+        if not l:
+            return l
+        pair_l = sorted([(self.get_depth(t),t) for t in l])
+        out_l = [pair_l.pop()[1]]
+        while pair_l:
+            t = pair_l.pop()[1]
+            if (not any(self.overlaps(t, e) for e in out_l)):
+                out_l.append(t)
+        return out_l
+
+    def read_taxonomy(self, filepath):
+        '''Read taxonomy from given file '''
+        with open(filepath, 'r') as fd:
+            for line in fd:
+                if line.startswith('#') or line == '\n':
+                    continue
+                self.add_tag(line.strip())
+        return
+
+    def to_file(self, filepath):
+        ''' Output sorted taxonomy to given file, one tag path per line '''
+        # Each tag is normally indexed twice (by name and by path), but a
+        # tag whose name equals its path has a single key. De-duplicate on
+        # the path instead of assuming every tag shows up exactly twice,
+        # which silently dropped some single-key tags.
+        path_set = set()
+        for tag in self.__tag_map.values():
+            path_set.add(tag.path)
+        tag_path_l = sorted(path_set)
+        # Write sorted tags; the context manager closes the file even if a
+        # write fails
+        with open(filepath, 'w') as fd:
+            for path in tag_path_l:
+                fd.write(path + "\n")
+
+
+class Rules:
+    '''
+    Rules are src -> dst1, dst2, ... relations
+    '''
+    def __init__(self, filepath):
+        ''' Map src -> set(dst) '''
+        self._rmap = {}
+        if filepath:
+            self.read_rules(filepath)
+
+    def __len__(self):
+        ''' Length is number of rules, i.e., number of src '''
+        return len(self._rmap)
+
+    def add_rule(self, src, dst_l, overwrite=False):
+        ''' Add rule. If rule exists:
+            if overwrite==True, replace destination list
+            else append dst_l to current target set  '''
+        # Remove src from dst_l if it exists. Build a real list: a lazy
+        # filter object is always truthy, defeating the emptiness check
+        dst_l = [x for x in dst_l if x != src]
+        # If no destinations, nothing to do
+        if (not dst_l):
+            return
+        log.debug("[Rules] Adding %s -> %s" % (src, dst_l))
+        src_tag = create_tag(src)
+        if overwrite:
+            target_l = [create_tag(dst).name for dst in dst_l]
+            self._rmap[src_tag.name] = set(target_l)
+        else:
+            curr_dst = self._rmap.get(src_tag.name, set())
+            for dst in dst_l:
+                dst_tag = create_tag(dst)
+                curr_dst.add(dst_tag.name)
+            self._rmap[src_tag.name] = curr_dst
+
+    def remove_rule(self, src):
+        l = self._rmap.get(src, [])
+        if l:
+            log.debug("[Rules] Removing rule: %s -> %s" % (src, l))
+            del self._rmap[src]
+            return 1
+        else:
+            return 0
+
+    def get_dst(self, src):
+        ''' Returns dst list for given src, or empty list if no expansion '''
+        return list(self._rmap.get(src, []))
+
+    def read_rules(self, filepath):
+        '''Read rules from given file'''
+        with open(filepath, 'r') as fd:
+            for line in fd:
+                if line.startswith('#') or line == '\n':
+                    continue
+                word_list = line.strip().split()
+                if len(word_list) > 1:
+                    self.add_rule(word_list[0],word_list[1:])
+        return
+
+    def to_file(self, filepath, taxonomy=None):
+        ''' Output sorted rules to given file
+            If taxonomy is provided, it outputs full tag path '''
+        # The context manager closes the file even if a write fails
+        with open(filepath, 'w') as fd:
+            for src,dst_set in sorted(self._rmap.items()):
+                dst_l = sorted(dst_set, reverse=False)
+                if taxonomy:
+                    src_path = taxonomy.get_path(src)
+                    path_l = [taxonomy.get_path(t) for t in dst_l]
+                    dst_str = '\t'.join(path_l)
+                    fd.write("%s\t%s\n" % (src_path,dst_str))
+                else:
+                    dst_str = '\t'.join(dst_l)
+                    fd.write("%s\t%s\n" % (src,dst_str))
+
+    def expand_src_destinations(self, src):
+        ''' Return destination set for src, recursively following rules '''
+        # Work on a copy: popping from the stored set would erase the rule
+        pending = set(self._rmap.get(src, set()))
+        out, seen = set(), set()
+        while pending:
+            dst = pending.pop()
+            seen.add(dst)
+            l = self._rmap.get(dst, [])
+            if l:
+                # Skip visited tags so cyclic rules cannot loop forever
+                pending.update(e for e in l if e not in seen)
+            else:
+                out.add(dst)
+        return out
+
+    def expand_all_destinations(self):
+        ''' Replace the destinations of every rule with the result of
+            expand_src_destinations() for its src. Returns nothing '''
+        src_l = self._rmap.keys()
+        for src in src_l:
+            dst_l = self.expand_src_destinations(src)
+            self._rmap[src] = dst_l
+
+
+class Tagging(Rules):
+    '''
+    Tagging rules have src UNK and dst in taxonomy
+    '''
+    def __init__(self, filepath):
+        Rules.__init__(self, filepath)
+
+    def validate(self, taxonomy):
+        ''' Check that tags in tagging rules are in given taxonomy '''
+        for tok,tag_l in self._rmap.items():
+            for t in tag_l:
+                if (not taxonomy.is_tag(t)):
+                    sys.stdout.write("[Tagging] %s not in taxonomy\n" % t)
+
+
+class Expansion(Rules):
+    '''
+    Expansion rules have src and dst in taxonomy and
+        src.category != dst.category
+    '''
+    def __init__(self, filepath):
+        Rules.__init__(self, filepath)
+
+    def validate(self, taxonomy):
+        ''' Check that tags in expansion rules are in given taxonomy '''
+        for src,dst_set in self._rmap.items():
+            if (not taxonomy.is_tag(src)):
+                sys.stdout.write("[Expansion] %s not in taxonomy\n" % src)
+            for dst in dst_set:
+                if (not taxonomy.is_tag(dst)):
+                    sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst)
+
+
+class AvLabels:
+    '''
+    Class to operate on AV labels, 
+    such as extracting the most likely family name.
+    '''
+    def __init__(self, tag_file, exp_file = None, tax_file = None,
+                 av_file = None, aliasdetect=False):
+        # Read taxonomy
+        self.taxonomy = Taxonomy(tax_file)
+        # Read tag rules
+        self.tagging = Tagging(tag_file)
+        # Read expansion rules
+        self.expansions = Expansion(exp_file)
+        # Read AV engines
+        self.avs = self.read_avs(av_file) if av_file else None
+        # Alias statistics initialization
+        self.aliasdetect = aliasdetect
+
+    @staticmethod
+    def read_avs(avs_file):
+        '''Read AV engine set from given file'''
+        with open(avs_file) as fd:
+            avs = set(map(str.strip, fd.readlines()))
+        return avs
+
+    @staticmethod
+    def get_sample_info_lb(vt_rep):
+        '''Parse and extract sample information from JSON line
+           Returns a SampleInfo named tuple
+        '''
+        return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'],
+                          vt_rep['av_labels'], [])
+
+    @staticmethod
+    def get_sample_info_vt_v2(vt_rep):
+        '''Parse and extract sample information from JSON line
+           Returns a SampleInfo named tuple
+        '''
+        label_pairs = []
+        # Obtain scan results, if available
+        try:
+            scans = vt_rep['scans']
+            md5 = vt_rep['md5']
+            sha1 = vt_rep['sha1']
+            sha256 = vt_rep['sha256']
+        except KeyError:
+            return None
+        # Obtain labels from scan results
+        for av, res in scans.items():
+            if res['detected']:
+                label = res['result']
+                clean_label = ''.join(filter(
+                                  lambda x: x in string.printable,
+                                    label)).strip()
+                label_pairs.append((av, clean_label))
+        # Obtain VT tags, if available
+        vt_tags = vt_rep.get('tags', [])
+
+        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
+
+    @staticmethod
+    def get_sample_info_vt_v3(vt_rep):
+        '''Parse and extract sample information from JSON line
+           Returns a SampleInfo named tuple
+        '''
+        label_pairs = []
+        # Obtain scan results, if available
+        try:
+            scans = vt_rep['data']['attributes']['last_analysis_results']
+            md5 = vt_rep['data']['attributes']['md5']
+            sha1 = vt_rep['data']['attributes']['sha1']
+            sha256 = vt_rep['data']['attributes']['sha256']
+        except KeyError:
+            return None
+        # Obtain labels from scan results
+        for av, res in scans.items():
+            label = res['result']
+            if label is not None:
+                clean_label = ''.join(filter(
+                                  lambda x: x in string.printable,
+                                    label)).strip()
+                label_pairs.append((av, clean_label))
+        # Obtain VT tags, if available
+        vt_tags = vt_rep['data']['attributes'].get('tags', [])
+
+        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
+
+
+    @staticmethod
+    def is_pup(tag_pairs, taxonomy):
+        '''Classify the sample as PUP or not by checking whether the first
+           CLASS tag in tag_pairs contains "grayware" and its count is at
+           least threshold times the count of the first pair
+           Return:
+              True/False, or None when tag_pairs is empty
+        '''
+        threshold = 0.5
+        # If no tags, the sample cannot be classified: return None
+        if len(tag_pairs) < 1:
+            return None
+        max_ctr = tag_pairs[0][1]
+        for (tag,ctr) in tag_pairs:
+            (path, cat) = taxonomy.get_info(tag)
+            if (cat == "CLASS"):
+                if ("grayware" in path):
+                    return (float(ctr) >= float(max_ctr)*threshold)
+                else:
+                    return False
+        return False
+
+    @staticmethod
+    def __remove_suffixes(av_name, label):
+        '''Remove AV specific suffixes from given label
+           Returns updated label'''
+
+        # Truncate after last '.'
+        if av_name in suffix_removal_av_set:
+            label = label.rsplit('.', 1)[0]
+
+        # Truncate after last '.' 
+        # if suffix only contains digits or uppercase (no lowercase) chars
+        if av_name == 'AVG':
+            tokens = label.rsplit('.', 1)
+            if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]):
+                label = tokens[0]
+
+        # Truncate after last '!'
+        if av_name == 'Agnitum':
+            label = label.rsplit('!', 1)[0]
+
+        return label
+
+
+    def get_label_tags(self, label, hashes):
+        ''' Return set of tags in given label
+            Tokenizes label, filters unneeded tokens, and
+            applies tagging rules '''
+
+        # Initialize set of tags to return
+        # We use a set to avoid duplicate tokens in the same AV label
+        # This avoids "potentially unwanted" contributing twice BEH:pup
+        tags = set()
+
+        # If empty label, nothing to do
+        if not label:
+            return tags
+
+        # Split label into tokens and process each token
+        for token in re.split("[^0-9a-zA-Z]", label):
+            # Convert token to lowercase
+            token = token.lower()
+
+            # Remove digits at the end (raw string: "\d" is a regex escape)
+            end_len = len(re.findall(r"\d*$", token)[0])
+            if end_len:
+                token = token[:-end_len]
+
+            # Ignore token if prefix of a hash of the sample
+            # Most AVs use MD5 prefixes in labels,
+            # but we check SHA1 and SHA256 as well
+            hash_token = False
+            for hash_str in hashes:
+                if hash_str[0:len(token)] == token:
+                    hash_token = True
+                    break
+            if hash_token:
+                continue
+
+            # Ignore generic tokens
+            if self.taxonomy.is_generic(token):
+                continue
+
+            # Apply tagging rule
+            dst_l = self.tagging.get_dst(token)
+            if dst_l:
+                # Ignore generic tokens
+                for t in dst_l:
+                    if not self.taxonomy.is_generic(t):
+                        tags.add(t)
+            # Add token if longer than 3 characters and no tagging rule
+            elif len(token) > 3:
+                tags.add(token)
+
+        # Return tags
+        return tags
+
+
+    def __expand(self, tag_set):
+        ''' Return expanded set of tags '''
+        ret = set()
+        for t in tag_set:
+            # Include tag
+            ret.add(t)
+
+            # Include target of expansion rule in output
+            ret.update(self.expansions.get_dst(t))
+
+            # Include implicit expansions in taxonomy
+            ret.update(self.taxonomy.expand(t))
+
+        # Return the expanded tags as a set (not a list)
+        return ret
+
+    def get_sample_tags(self, sample_info):
+        ''' Returns dictionary tag -> AV list of tags for the given sample '''
+
+        # Whitelist the AVs to filter the ones with meaningful labels
+        av_whitelist = self.avs
+        # Initialize auxiliary data structures
+        duplicates = set()
+        av_dict = {}
+
+        # Process each AV label
+        for (av_name, label) in sample_info.labels:
+            # If empty label, nothing to do
+            if not label:
+                continue
+
+            ################
+            # AV selection #
+            ################
+            if av_whitelist and av_name not in av_whitelist:
+                continue
+
+            #####################
+            # Duplicate removal #
+            #####################
+
+            # Emsisoft uses same label as 
+            # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
+            # but suffixes ' (B)' to their label. Remove the suffix.
+            if label.endswith(' (B)'):
+                label = label[:-4]
+
+            # F-Secure uses Avira's engine since Nov. 2018
+            # but prefixes 'Malware.' to Avira's label. Remove the prefix.
+            if label.startswith('Malware.'):
+                label = label[8:]
+
+            # Other engines often use exactly the same label, e.g.,
+            #   AVG/Avast
+            #   K7Antivirus/K7GW
+            #   Kaspersky/ZoneAlarm
+
+            # If we have seen the exact same label before, skip
+            if label in duplicates:
+                continue
+            # If not, we add it to duplicates
+            else:
+                duplicates.add(label)
+
+            ##################
+            # Suffix removal #
+            ##################
+            label = self.__remove_suffixes(av_name, label)
+
+            ########################################################
+            # Tokenization and tagging                             #
+            ########################################################
+            hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ]
+            tags = self.get_label_tags(label, hashes)
+
+            ########################################################
+            # Expansions                                           #
+            ########################################################
+            # NOTE: Avoiding to do expansion when aliases
+            if self.aliasdetect:
+                expanded_tags = tags
+            else:
+                expanded_tags = self.__expand(tags)
+
+            ########################################################
+            # Stores information that relates AV vendors with tags #
+            ########################################################
+            for t in expanded_tags:
+                av_dict.setdefault(t, []).append(av_name)
+
+        return av_dict
+
+    def rank_tags(self, av_dict, threshold=1):
+        ''' Return list of (tag, confidence) ranked by decreasing confidence 
+            and filter tags with less or equal threshold confidence '''
+
+        pairs = ((t, len(avs)) for (t,avs) in av_dict.items() 
+                    if len(avs) > threshold)
+        return sorted(pairs, key=itemgetter(1,0), reverse=True)
+
diff --git a/avclass/input_checker.py b/avclass/input_checker.py
index 1547742..7ccf5bc 100755
--- a/avclass/input_checker.py
+++ b/avclass/input_checker.py
@@ -1,6 +1,6 @@
 import sys
 import argparse
-from avclass.lib import Taxonomy, Tagging, Expansion
+from avclass.common import Taxonomy, Tagging, Expansion
 
 
 default_tag_file = "data/default.tagging"
diff --git a/avclass/labeler.py b/avclass/labeler.py
index 7bc9b88..0649c47 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -6,7 +6,8 @@
 
 from operator import itemgetter
 
-from avclass.lib import AvLabels, clustering as ec
+from avclass.common import AvLabels
+from avclass import clustering as ec
 
 
 script_dir = os.path.dirname(os.path.abspath(__file__))
@@ -477,4 +478,3 @@ def parse_args():
 
 if __name__=='__main__':
     main()
-    
\ No newline at end of file
diff --git a/avclass/update.py b/avclass/update.py
index 14bb1c5..525bc1d 100755
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -6,7 +6,7 @@
 from collections import namedtuple
 from operator import itemgetter
 
-from avclass.lib import Taxonomy, Expansion, Tagging
+from avclass.common import Taxonomy, Expansion, Tagging
 
 
 # Set logging

From 6c58d9e3e6fb960de4e1cebfbc11b0ac54838e25 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 15:47:55 -0600
Subject: [PATCH 05/36] add cli entry points; rework validator

---
 avclass/cli.py                            | 22 ++++++
 avclass/data/__init__.py                  |  0
 {data => avclass/data}/andropup.expansion |  0
 {data => avclass/data}/default.expansion  |  0
 {data => avclass/data}/default.tagging    |  0
 {data => avclass/data}/default.taxonomy   |  0
 avclass/input_checker.py                  | 53 --------------
 avclass/util.py                           | 86 +++++++++++++++++++++++
 setup.py                                  |  5 +-
 9 files changed, 112 insertions(+), 54 deletions(-)
 create mode 100644 avclass/cli.py
 create mode 100644 avclass/data/__init__.py
 rename {data => avclass/data}/andropup.expansion (100%)
 rename {data => avclass/data}/default.expansion (100%)
 rename {data => avclass/data}/default.tagging (100%)
 rename {data => avclass/data}/default.taxonomy (100%)
 delete mode 100755 avclass/input_checker.py
 create mode 100755 avclass/util.py

diff --git a/avclass/cli.py b/avclass/cli.py
new file mode 100644
index 0000000..76e2ad3
--- /dev/null
+++ b/avclass/cli.py
@@ -0,0 +1,22 @@
+import argparse
+
+from avclass import util
+
+
+def validate_files():
+    parser = argparse.ArgumentParser(description='Checks format of files Tagging, Expansion and Taxonomy.')
+    parser.add_argument('-exp',
+                        help='expansion file',
+                        default=util.DEFAULT_EXP_PATH)
+    parser.add_argument('-tag',
+                        help='tagging file',
+                        default=util.DEFAULT_TAG_PATH)
+    parser.add_argument('-tax',
+                        help='taxonomy file',
+                        default=util.DEFAULT_TAX_PATH)
+
+    args = parser.parse_args()
+
+    taxonomy = util.validate_taxonomy(args.tax)
+    util.validate_tagging(args.tag, taxonomy)
+    util.validate_expansion(args.exp, taxonomy)
diff --git a/avclass/data/__init__.py b/avclass/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/data/andropup.expansion b/avclass/data/andropup.expansion
similarity index 100%
rename from data/andropup.expansion
rename to avclass/data/andropup.expansion
diff --git a/data/default.expansion b/avclass/data/default.expansion
similarity index 100%
rename from data/default.expansion
rename to avclass/data/default.expansion
diff --git a/data/default.tagging b/avclass/data/default.tagging
similarity index 100%
rename from data/default.tagging
rename to avclass/data/default.tagging
diff --git a/data/default.taxonomy b/avclass/data/default.taxonomy
similarity index 100%
rename from data/default.taxonomy
rename to avclass/data/default.taxonomy
diff --git a/avclass/input_checker.py b/avclass/input_checker.py
deleted file mode 100755
index 7ccf5bc..0000000
--- a/avclass/input_checker.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import sys
-import argparse
-from avclass.common import Taxonomy, Tagging, Expansion
-
-
-default_tag_file = "data/default.tagging"
-default_tax_file = "data/default.taxonomy"
-default_exp_file = "data/default.expansion"
-
-
-def main():
-    argparser = argparse.ArgumentParser(prog='input_checker',
-        description='Checks format of files Tagging, Expansion and Taxonomy.')
-
-    argparser.add_argument('-tag',
-        help='tagging file',
-        default=default_tag_file)
-
-    argparser.add_argument('-tax',
-        help='taxonomy file',
-        default=default_tax_file)
-
-    argparser.add_argument('-exp',
-        help='expansion file',
-        default=default_exp_file)
-
-    # Parse arguments
-    args = argparser.parse_args()
-
-    # Normalize taxonomy
-    taxonomy = Taxonomy(args.tax)
-    taxonomy.to_file(args.tax)
-    sys.stdout.write('[-] Normalized %d tags in taxonomy %s\n' % (
-                        len(taxonomy), args.tax))
-
-    # Normalize tagging rules
-    tagging = Tagging(args.tag)
-    tagging.validate(taxonomy)
-    # tagging.expand_all_destinations()
-    tagging.to_file(args.tag)
-    sys.stdout.write('[-] Normalized %d tagging rules in %s\n' % (
-                        len(tagging), args.tag))
-
-    # Normalize expansion rules
-    expansion = Expansion(args.exp)
-    expansion.validate(taxonomy)
-    expansion.to_file(args.exp)
-    sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % (
-                        len(expansion), args.exp))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/avclass/util.py b/avclass/util.py
new file mode 100755
index 0000000..7b4bba4
--- /dev/null
+++ b/avclass/util.py
@@ -0,0 +1,86 @@
+import atexit
+import logging
+import pkg_resources
+
+from avclass import data
+from avclass.common import Taxonomy, Tagging, Expansion
+
+from typing import AnyStr
+
+
+__all__ = (
+    'DEFAULT_EXP_PATH',
+    'DEFAULT_TAG_PATH',
+    'DEFAULT_TAX_PATH',
+    'validate_expansion',
+    'validate_tagging',
+    'validate_taxonomy',
+)
+
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_EXP = "default.expansion"
+DEFAULT_TAG = "default.tagging"
+DEFAULT_TAX = "default.taxonomy"
+
+DEFAULT_TAG_PATH = None
+DEFAULT_TAX_PATH = None
+DEFAULT_EXP_PATH = None
+
+if pkg_resources.resource_exists(data, DEFAULT_EXP):
+    DEFAULT_EXP_PATH = pkg_resources.resource_filename(data, DEFAULT_EXP)
+
+if pkg_resources.resource_exists(data, DEFAULT_TAG):
+    DEFAULT_TAG_PATH = pkg_resources.resource_filename(data, DEFAULT_TAG)
+
+if pkg_resources.resource_exists(data, DEFAULT_TAX):
+    DEFAULT_TAX_PATH = pkg_resources.resource_filename(data, DEFAULT_TAX)
+
+atexit.register(pkg_resources.cleanup_resources)
+
+
+def validate_taxonomy(path: AnyStr):
+    """
+    Validate and normalize a Taxonomy created from ``path``
+
+    :param path: Location on disk of a Taxonomy file
+    :return: Taxonomy object
+    """
+    taxonomy = Taxonomy(path)
+    taxonomy.to_file(path)
+
+    logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path))
+
+    return taxonomy
+
+
+def validate_tagging(path: AnyStr, taxonomy: Taxonomy):
+    """
+    Validate and normalize Tagging created from ``path`` and verified against ``taxonomy``
+
+    :param path: Location on disk of a Tagging file
+    :param taxonomy: Valid Taxonomy object
+    :return: None
+    """
+    tagging = Tagging(path)
+    tagging.validate(taxonomy)
+    # tagging.expand_all_destinations()
+    tagging.to_file(path)
+
+    logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path))
+
+
+def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
+    """
+    Validate and normalize Expansion created from ``path`` and verified against ``taxonomy``
+
+    :param path: Location on disk of an Expansion file
+    :param taxonomy: Valid Taxonomy object
+    :return: None
+    """
+    expansion = Expansion(path)
+    expansion.validate(taxonomy)
+    expansion.to_file(path)
+
+    logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path))
diff --git a/setup.py b/setup.py
index fada7e7..6bcc101 100644
--- a/setup.py
+++ b/setup.py
@@ -7,6 +7,9 @@
     description='Tag and label malware samples',
     license='LICENSE',
     packages=find_packages(),
+    package_data={
+        'avclass': ['data/default.*'],
+    },
     install_requires=[],
     setup_requires=[
         'pytest-runner',
@@ -17,6 +20,6 @@
     entry_points={
         'console_scripts': [
             'avclass = avclass.labeler:main',
-            'avclass-validate = avclass.input_checker:main',
+            'avclass-validate = avclass.util:validate_files',
         ],
     })

From c845baff2a4eedce7a742dd93b81d53358e97265 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 16:07:18 -0600
Subject: [PATCH 06/36] update cleanup

---
 avclass/update.py | 84 +++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 47 deletions(-)
 mode change 100755 => 100644 avclass/update.py

diff --git a/avclass/update.py b/avclass/update.py
old mode 100755
new mode 100644
index 525bc1d..6d0558c
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -6,12 +6,11 @@
 from collections import namedtuple
 from operator import itemgetter
 
+from avclass import util
 from avclass.common import Taxonomy, Expansion, Tagging
 
 
-# Set logging
-log = logging.getLogger(__name__)
-
+logger = logging.getLogger(__name__)
 # Log warn and above to stderr
 formatter = logging.Formatter(u'%(message)s')
 handler_stderr = logging.StreamHandler(sys.stderr)
@@ -21,26 +20,17 @@
 root.setLevel(logging.DEBUG)
 root.addHandler(handler_stderr)
 
-
-script_dir = os.path.dirname(os.path.abspath(__file__))
-# Default tagging file
-default_tagging_file = os.path.join(script_dir, "data/default.tagging")
-# Default expansion file
-default_expansion_file = os.path.join(script_dir, "data/default.expansion")
-# Default taxonomy file
-default_taxonomy_file = os.path.join(script_dir, "data/default.taxonomy")
-
 # Threshold for string similarity
 # sim_threshold = 0.6
 
-# Relation
-Rel = namedtuple('Rel', ['t1', 't2', 't1_num', 't2_num', 
-                         'nalias_num', 'talias_num', 'tinv_alias_num'])
+Relation = namedtuple('Relation', ['t1', 't2', 't1_num', 't2_num',
+                                   'nalias_num', 'talias_num', 'tinv_alias_num'])
+
 
 
 class Update:
     ''' Update Module '''
-    def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, 
+    def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion,
                     n, t):
         # Initialize inputs
         self.__out_taxonomy = in_taxonomy
@@ -59,7 +49,7 @@ def num_rules(self):
         return len(self.rel_set)
 
     def is_weak_rel(self, rel):
-        ''' Return true if relationship is weak, 
+        ''' Return true if relationship is weak,
             i.e., does not meet thresholds '''
         return ((int(rel.nalias_num) < self.__n) or
                 (float(rel.talias_num) < self.__t))
@@ -124,7 +114,7 @@ def add_alias(self, src, dst, dst_prefix):
                 cnt = self.src_map.get(e, 0)
                 if cnt > cnt_max:
                     target = e
-        # If dst is in tagging, update tagging rule destination, 
+        # If dst is in tagging, update tagging rule destination,
         l = self.__out_tagging.get_dst(dst)
         if l:
             target_l = l
@@ -151,11 +141,11 @@ def find_expansions(self):
         for rel in self.rel_set:
             p1 = self.__out_taxonomy.get_path(rel.t1)
             p2 = self.__out_taxonomy.get_path(rel.t2)
-            log.debug("Processing %s\t%s" % (p1, p2))
+            logger.debug("Processing %s\t%s" % (p1, p2))
             # Ignore relations where t1 is an alias
             l = self.__out_tagging.get_dst(rel.t1)
             if l:
-                log.debug("Ignoring relation for alias %s" % p1)
+                logger.debug("Ignoring relation for alias %s" % p1)
                 continue
             if self.is_expansion_rel(rel):
                 self.add_expansion(rel.t1, [rel.t2])
@@ -191,7 +181,7 @@ def process_relation(self, rel):
         p1,c1 = self.__out_taxonomy.get_info(rel.t1)
         p2,c2 = self.__out_taxonomy.get_info(rel.t2)
 
-        log.debug("Processing %s\t%s" % (p1, p2))
+        logger.debug("Processing %s\t%s" % (p1, p2))
 
         # If both directions strong, then equivalent, i.e., alias
         if (float(rel.tinv_alias_num) >= args.t):
@@ -204,7 +194,7 @@ def process_relation(self, rel):
             elif (c1 == c2):
                 prefix = p1[0:p1.rfind(':')]
             else:
-                log.warn("Equivalent rule with different categories: %s\t%s" % 
+                logger.warn("Equivalent rule with different categories: %s\t%s" %
                             (p1, p2))
                 return -1
             self.add_alias(t1, t2, prefix)
@@ -278,7 +268,7 @@ def run(self):
             # Do a pass in remaining relations
             cnt = 0
             new_set = set()
-            log.debug("[-] %03d Processing relations" % num_iter)
+            logger.debug("[-] %03d Processing relations" % num_iter)
             while self.rel_set:
                 rel = self.rel_set.pop()
                 # If known relation, continue
@@ -306,12 +296,12 @@ def run(self):
         # self.find_aliases()
 
         # Find expansions
-        log.debug("[-] Finding expansions")
+        logger.debug("[-] Finding expansions")
         self.find_expansions()
 
 
     def read_relations(self, filepath):
-        ''' Returns relations in file as a set 
+        ''' Returns relations in file as a set
             Filters weak and blacklisted relations '''
         rel_set = set()
         with open(filepath, 'r') as fd:
@@ -323,8 +313,8 @@ def read_relations(self, filepath):
                 t1, t2, t1_num, t2_num, nalias_num, talias_num, \
                   tinv_alias_num = line.strip().split('\t')
                 # Build relation
-                rel = Rel(t1, t2, t1_num, t2_num, nalias_num,
-                          talias_num, tinv_alias_num)
+                rel = Relation(t1, t2, t1_num, t2_num, nalias_num,
+                               talias_num, tinv_alias_num)
                 # Ignore weak relations
                 if self.is_weak_rel(rel):
                     continue
@@ -332,7 +322,7 @@ def read_relations(self, filepath):
                 if self.is_blacklisted_rel(rel):
                     continue
                 # Ignore known relations
-                # NOTE: commented since we check if a 
+                # NOTE: commented since we check if a
                 # relation is known before processing it
                 #if self.is_known_rel(rel):
                 #    continue
@@ -348,16 +338,16 @@ def output_relations(self, filepath):
         fd = open(filepath, 'w')
         fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t"
                   "|t1^t2|/|t2|\n")
-        sorted_rules = sorted(self.rel_set, 
+        sorted_rules = sorted(self.rel_set,
                               key=(lambda r: (
                                 self.__out_taxonomy.get_category(r.t1),
-                                self.__out_taxonomy.get_category(r.t2))), 
+                                self.__out_taxonomy.get_category(r.t2))),
                               reverse=False)
         for rel in sorted_rules:
             p1,c1 = self.__out_taxonomy.get_info(rel.t1)
             p2,c2 = self.__out_taxonomy.get_info(rel.t2)
             fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(
-                p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num, 
+                p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num,
                 rel.talias_num, rel.tinv_alias_num))
         fd.close()
 
@@ -373,35 +363,35 @@ def output_rule_stats(self, fd):
                                                                   c2), 0) + 1
             self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1
         # Output statistics
-        cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1,0), 
+        cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1,0),
                             reverse=True)
         for (c1,c2), cnt in cat_pairs:
             fd.write("%s\t%s\t%03d\n" % (c1, c2, cnt))
 
         # Print dst statistics
-        dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1,0), 
+        dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1,0),
                             reverse=False)
         for dst, cnt in dst_pairs:
             fd.write("%s\t%03d\n" % (taxonomy.get_path(dst), cnt))
 
     def output(self, out_prefix):
         if (not out_prefix):
-            tax_filepath = default_taxonomy_file
-            tag_filepath = default_tagging_file
-            exp_filepath = default_expansion_file
+            tax_filepath = util.DEFAULT_TAX_PATH
+            tag_filepath = util.DEFAULT_TAG_PATH
+            exp_filepath = util.DEFAULT_EXP_PATH
         else:
             tax_filepath = out_prefix + ".taxonomy"
             tag_filepath = out_prefix + ".tagging"
             exp_filepath = out_prefix + ".expansion"
         taxonomy.to_file(tax_filepath)
-        log.info('[-] Output %d taxonomy tags to %s' % (
+        logger.info('[-] Output %d taxonomy tags to %s' % (
                         len(taxonomy), tax_filepath))
         tagging.expand_all_destinations()
         tagging.to_file(tag_filepath)
-        log.info('[-] Output %d tagging rules to %s' % (
+        logger.info('[-] Output %d tagging rules to %s' % (
                         len(tagging), tag_filepath))
         expansion.to_file(exp_filepath)
-        log.info('[-] Output %d expansion rules to %s' % (
+        logger.info('[-] Output %d expansion rules to %s' % (
                         len(expansion), exp_filepath))
 
 
@@ -435,15 +425,15 @@ def output(self, out_prefix):
 
     argparser.add_argument('-tag',
         help='file with tagging rules.',
-        default = default_tagging_file)
+        default = util.DEFAULT_TAG_PATH)
 
     argparser.add_argument('-tax',
         help='file with taxonomy.',
-        default = default_taxonomy_file)
+        default = util.DEFAULT_TAX_PATH)
 
     argparser.add_argument('-exp',
         help='file with expansion rules.',
-        default = default_expansion_file)
+        default = util.DEFAULT_EXP_PATH)
 
     argparser.add_argument('-v', '--verbose',
         action='store_true',
@@ -454,7 +444,7 @@ def output(self, out_prefix):
 
     # Check we have the input
     if not args.alias:
-        log.error('[-] Please provide an alias file with -alias')
+        logger.error('[-] Please provide an alias file with -alias')
         exit(1)
 
     # Set logging level
@@ -469,23 +459,23 @@ def output(self, out_prefix):
 
     # Read taxonomy
     taxonomy = Taxonomy(args.tax)
-    log.info('[-] Read %d taxonomy tags from %s' % (
+    logger.info('[-] Read %d taxonomy tags from %s' % (
                         len(taxonomy), args.tax))
 
     # Read tagging rules
     tagging = Tagging(args.tag)
-    log.info('[-] Read %d tagging rules from %s' % (
+    logger.info('[-] Read %d tagging rules from %s' % (
                         len(tagging), args.tag))
 
     # Read expansion rules
     expansion = Expansion(args.exp)
-    log.info('[-] Read %d expansion rules from %s' % (
+    logger.info('[-] Read %d expansion rules from %s' % (
                         len(expansion), args.exp))
 
     # Build update object
     update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t)
 
-    log.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % (
+    logger.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % (
                         update.num_rules(), args.t, args.n))
 
     # Output initial rules

From e8df430b2748514a7e509b8db446cbdd096b004c Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Mon, 11 Jan 2021 16:24:28 -0600
Subject: [PATCH 07/36] fix pkg stuff in util; cleanup labeler

---
 avclass/labeler.py | 44 ++++++++++++++++----------------------------
 avclass/util.py    | 18 +++++++++---------
 2 files changed, 25 insertions(+), 37 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index 0649c47..035d4af 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -7,16 +7,7 @@
 from operator import itemgetter
 
 from avclass.common import AvLabels
-from avclass import clustering as ec
-
-
-script_dir = os.path.dirname(os.path.abspath(__file__))
-# Default tagging file
-default_tag_file = os.path.join(script_dir, "data/default.tagging")
-# Default expansion file
-default_exp_file = os.path.join(script_dir, "data/default.expansion")
-# Default taxonomy file
-default_tax_file = os.path.join(script_dir, "data/default.taxonomy")
+from avclass import clustering as ec, util
 
 
 def guess_hash(h):
@@ -31,6 +22,7 @@ def guess_hash(h):
     else:
         return None
 
+
 def format_tag_pairs(l, taxonomy=None):
     ''' Return ranked tags as string '''
     if not l:
@@ -48,6 +40,7 @@ def format_tag_pairs(l, taxonomy=None):
         out += ",%s|%d" % (p, s)
     return out
 
+
 def list_str(l, sep=", ", prefix=""):
     ''' Return list as a string '''
     if not l:
@@ -57,6 +50,7 @@ def list_str(l, sep=", ", prefix=""):
         out = out + sep + s
     return out
 
+
 def main():
     args = parse_args()
     # Select hash used to identify sample, by default MD5
@@ -354,7 +348,7 @@ def main():
 
 
 def parse_args():
-    argparser = argparse.ArgumentParser(prog='avclass2_labeler',
+    argparser = argparse.ArgumentParser(prog='avclass',
         description='''Extracts tags for a set of samples.
             Also calculates precision and recall if ground truth available''')
 
@@ -387,15 +381,15 @@ def parse_args():
 
     argparser.add_argument('-tag',
         help='file with tagging rules.',
-        default = default_tag_file)
+        default = util.DEFAULT_TAG_PATH)
 
     argparser.add_argument('-tax',
         help='file with taxonomy.',
-        default = default_tax_file)
+        default = util.DEFAULT_TAX_PATH)
 
     argparser.add_argument('-exp',
         help='file with expansion rules.',
-        default = default_exp_file)
+        default = util.DEFAULT_EXP_PATH)
 
     argparser.add_argument('-av',
         help='file with list of AVs to use')
@@ -435,43 +429,37 @@ def parse_args():
 
     if not args.vt and not args.lb and not args.vtdir and not args.lbdir:
         sys.stderr.write('One of the following 4 arguments is required: '
-                          '-vt,-lb,-vtdir,-lbdir\n')
+                         '-vt,-lb,-vtdir,-lbdir\n')
         exit(1)
 
     if (args.vt or args.vtdir) and (args.lb or args.lbdir):
         sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. '
-                          'Both types of input files cannot be combined.\n')
+                         'Both types of input files cannot be combined.\n')
         exit(1)
 
     if args.tag:
         if args.tag == '/dev/null':
             sys.stderr.write('[-] Using no tagging rules\n')
         else:
-            sys.stderr.write('[-] Using tagging rules in %s\n' % (
-                              args.tag))
+            sys.stderr.write('[-] Using tagging rules in %s\n' % (args.tag))
     else:
-        sys.stderr.write('[-] Using default tagging rules in %s\n' % (
-                          default_tag_file))
+        sys.stderr.write('[-] Using default tagging rules in %s\n' % (util.DEFAULT_TAG_PATH))
 
     if args.tax:
         if args.tax == '/dev/null':
             sys.stderr.write('[-] Using no taxonomy\n')
         else:
-            sys.stderr.write('[-] Using taxonomy in %s\n' % (
-                              args.tax))
+            sys.stderr.write('[-] Using taxonomy in %s\n' % (args.tax))
     else:
-        sys.stderr.write('[-] Using default taxonomy in %s\n' % (
-                          default_tax_file))
+        sys.stderr.write('[-] Using default taxonomy in %s\n' % (util.DEFAULT_TAX_PATH))
 
     if args.exp:
         if args.exp == '/dev/null':
             sys.stderr.write('[-] Using no expansion tags\n')
         else:
-            sys.stderr.write('[-] Using expansion tags in %s\n' % (
-                              args.exp))
+            sys.stderr.write('[-] Using expansion tags in %s\n' % (args.exp))
     else:
-        sys.stderr.write('[-] Using default expansion tags in %s\n' % (
-                          default_exp_file))
+        sys.stderr.write('[-] Using default expansion tags in %s\n' % (util.DEFAULT_EXP_PATH))
 
     return args
 
diff --git a/avclass/util.py b/avclass/util.py
index 7b4bba4..ceaf071 100755
--- a/avclass/util.py
+++ b/avclass/util.py
@@ -20,22 +20,22 @@
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_EXP = "default.expansion"
-DEFAULT_TAG = "default.tagging"
-DEFAULT_TAX = "default.taxonomy"
+RESOURCE_EXP = "default.expansion"
+RESOURCE_TAG = "default.tagging"
+RESOURCE_TAX = "default.taxonomy"
 
 DEFAULT_TAG_PATH = None
 DEFAULT_TAX_PATH = None
 DEFAULT_EXP_PATH = None
 
-if pkg_resources.resource_exists(data, DEFAULT_EXP):
-    DEFAULT_EXP_PATH = pkg_resources.resource_filename(data, DEFAULT_EXP)
+if pkg_resources.resource_exists(data.__name__, RESOURCE_EXP):
+    DEFAULT_EXP_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_EXP)
 
-if pkg_resources.resource_exists(data, DEFAULT_TAG):
-    DEFAULT_TAG_PATH = pkg_resources.resource_filename(data, DEFAULT_TAG)
+if pkg_resources.resource_exists(data.__name__, RESOURCE_TAG):
+    DEFAULT_TAG_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_TAG)
 
-if pkg_resources.resource_exists(data, DEFAULT_TAX):
-    DEFAULT_TAX_PATH = pkg_resources.resource_filename(data, DEFAULT_TAX)
+if pkg_resources.resource_exists(data.__name__, RESOURCE_TAX):
+    DEFAULT_TAX_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_TAX)
 
 atexit.register(pkg_resources.cleanup_resources)
 

From 992178724222d1f33d0d18eb180609d382a3af8d Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Wed, 13 Jan 2021 11:37:16 -0600
Subject: [PATCH 08/36] incremental

---
 avclass/common.py | 688 +++++++++++++++++++++++++++-------------------
 avclass/update.py |   4 +-
 avclass/util.py   |   4 +-
 3 files changed, 403 insertions(+), 293 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index dc28ff4..dbc292b 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -5,10 +5,10 @@
 
 from collections import namedtuple
 from operator import itemgetter
+from typing import Any, AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union
 
 
-# Set logging
-log = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)
 
 # Prefix to identify platform tags
 platform_prefix = "FILE:os:"
@@ -23,14 +23,19 @@
 
 # AVs to use in suffix removal
 suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky',
-                          'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
-                          'GData', 'Avast', 'Sophos',
-                          'TrendMicro-HouseCall', 'TrendMicro',
-                          'NANO-Antivirus', 'Microsoft'}
+                         'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
+                         'GData', 'Avast', 'Sophos',
+                         'TrendMicro-HouseCall', 'TrendMicro',
+                         'NANO-Antivirus', 'Microsoft'}
 
 
-def create_tag(s):
-    ''' Create a Tag from its string representation '''
+def create_tag(s: AnyStr):
+    """
+    Create a Tag from its string representation (path)
+
+    :param s: The string
+    :return: A Tag object
+    """
     word_list = s.strip().split(":")
     if len(word_list) > 1:
         name = word_list[-1].lower()
@@ -49,195 +54,264 @@ def create_tag(s):
 
 
 class Taxonomy:
-    '''
-    A taxonomy of tags and generic tokens read from file
-    '''
-    def __init__(self, filepath):
-        ''' Map tag.name | tag.path -> Tag '''
+    """
+    Contains tags and generic tokens read from filesystem
+    """
+    def __init__(self, filepath: Optional[AnyStr]):
+        """
+        Initialize and populate the Tag map from ``filepath``
+
+        :param filepath: Path to taxonomy data
+        """
         self.__tag_map = {}
         if filepath:
             self.read_taxonomy(filepath)
 
-    def __len__(self):
-        ''' Taxonomy length is the number of tags it contains '''
-        return len(self.__tag_map)//2
-
-    def is_generic(self, t):
-        ''' Return true if input is generic, false otherwise '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.cat == "GEN"
-        else:
-            return False
-
-    def is_tag(self, t):
-        ''' Return true if input is tag, false otherwise '''
-        return t in self.__tag_map
-
-    def add_tag(self, s, override=False):
-        ''' Add tag to taxonomy 
-            If tag already exists with different path, 
-              only replaces if override True '''
+    def __len__(self) -> int:
+        """
+        The number of tags contained in __tag_map (divided by 2 because we store paths there too)
+
+        :return: The length (int) of the Taxonomy
+        """
+        return len(self.__tag_map)//2  # TODO - perhaps there should be two dicts, one for names, one for paths?
+
+    def is_generic(self, tag: AnyStr) -> bool:
+        """
+        Whether or not the input ``tag`` is generic
+
+        :param tag: The tag
+        :return: Boolean
+        """
+        t = self.__tag_map.get(tag, None)
+        return getattr(t, 'cat', None) == 'GEN'
+
+    def is_tag(self, tag: AnyStr) -> bool:
+        """
+        Whether this Taxonomy is aware of ``tag``
+
+        :param tag: The tag
+        :return: Boolean
+        """
+        return tag in self.__tag_map
+
+    def add_tag(self, s: AnyStr, override: bool = False):
+        """
+        Add a tag (``s``) to the Taxonomy.  Collisions are only replaced if ``override`` is truthy.
+
+        :param s: A string to create a Tag from
+        :param override: Whether to replace an existing tag of the same name but a different path
+        :return: None
+        """
         tag = create_tag(s)
         t = self.__tag_map.get(tag.name, None)
+
         if t and (t.path != tag.path):
-            if (not override):
-                return
-            else:
-                log.warn("[Taxonomy] Replacing %s with %s\n" % (
-                                  t.path, tag.path))
+            if override:
+                logger.warning("[Taxonomy] Replacing %s with %s\n" % (t.path, tag.path))
                 del self.__tag_map[t.path]
-        log.debug("[Taxonomy] Adding tag %s" % s)
+            else:
+                return
+
+        logger.debug("[Taxonomy] Adding tag %s" % s)
         self.__tag_map[tag.name] = tag
         self.__tag_map[tag.path] = tag
-        return
-
-    def remove_tag(self, t):
-        ''' Remove tag from taxonomy. Returns 1 if removed, zero if unknown '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            log.debug("[Taxonomy] Removing tag: %s" % tag.path)
-            del self.__tag_map[tag.name]
-            del self.__tag_map[tag.path]
-            return 1
-        else:
-            return 0
 
-    def get_category(self, t):
-        ''' Return category of input tag, UNK if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.cat
-        else:
-            return "UNK"
-
-    def get_path(self, t):
-        ''' Return full path for given tag, or empty string if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.path
-        else:
-            return ("UNK:" + t)
-
-    def get_prefix_l(self, t):
-        ''' Return prefix list for given tag, or empty string if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.prefix_l
-        else:
-            return []
+    def remove_tag(self, tag: AnyStr) -> bool:
+        """
+        Remove a Tag from the Taxonomy.
 
-    def get_prefix(self, t):
-        ''' Return prefix string for given tag, 
-            or empty string if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.prefix_l
-        else:
-            return t.path[0:t.path.rfind(':')]
-
-    def get_depth(self, t):
-        ''' Return depth of tag in taxonomy. 
-            Returns zero if tag not in taxonomy. 
-            A normal tag CAT:name has depth two '''
-        tag = self.__tag_map.get(t, None)
+        :param tag: Name or path of the tag to remove
+        :return: True if the tag was present (and removed), False otherwise
+        """
+        t = self.__tag_map.get(tag, None)
-        if tag:
+        if t:  # guard on the looked-up Tag; the input string is truthy even when unknown
+            logger.debug("[Taxonomy] Removing tag: %s" % t.path)
+            del self.__tag_map[t.name]
+            del self.__tag_map[t.path]
+        return t is not None
+
+    def get_category(self, tag: AnyStr) -> AnyStr:
+        """
+        Return the tag's category or "UNK" if it's not a tag.
+
+        :param tag: The tag
+        :return: The category
+        """
+        t = self.__tag_map.get(tag, None)
+        return getattr(t, 'cat', 'UNK')
+
+    def get_path(self, tag: AnyStr) -> AnyStr:
+        """
+        Get a tag's full path.
+
+        :param tag: The tag
+        :return: The tag's path
+        """
+        t = self.__tag_map.get(tag, None)
+        return getattr(t, 'path', f'UNK:{tag}')
+
+    def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]:
+        """
+        Get a tag's prefix list.
+
+        :param tag: The tag
+        :return: The tag's prefix list
+        """
+        t = self.__tag_map.get(tag, None)
+        return getattr(t, 'prefix_l', [])
+
+    def get_prefix(self, tag: AnyStr) -> List[AnyStr]:
+        """
+        Get a tag's prefixes.
+
+        :param tag: The tag (name or path)
+        :return: The known tag's prefix list, else the input's ':'-separated components minus the last
+        """
+        t = self.__tag_map.get(tag, None)
+        # ``tag`` is a plain string (no ``.path``); for unknown tags split the input itself
+        return t.prefix_l if t else tag.split(':')[:-1]
+
+    def get_depth(self, tag: AnyStr) -> int:
+        """
+        Determine the "depth" (token count) of the tag
+
+        :param tag: The tag (name or path)
+        :return: ``len(prefix_l) + 2`` for a known tag, 0 for an unknown one
+        """
+        t = self.__tag_map.get(tag, None)
+        if t:
-            return len(tag.prefix_l) + 2
+            return len(t.prefix_l) + 2  # measure the looked-up Tag, not the input string
-        else:
-            return 0
-
-    def get_info(self, t):
-        ''' Return (path,category) for given tag, or UNK:t if not a tag '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return tag.path, tag.cat
-        else:
-            return "UNK:" + t, "UNK"
-
-    def expand(self, t):
-        ''' Return list of tags in prefix list that are leaves '''
-        tag = self.__tag_map.get(t, None)
-        if tag:
-            return [t for t in tag.prefix_l if t in self.__tag_map]
-        else:
-            return []
-
-    def platform_tags(self): 
-        ''' Returns list with platform tags in taxonomy '''
-        acc = set()
-        for idx,tag in self.__tag_map.items():
-            if tag.path.startswith(platform_prefix):
-                acc.add(tag.name)
-        return acc
-
-    def overlaps(self, t1, t2):
-        ''' Returns true if the path of the given tags overlaps '''
+        return 0
+
+    def get_info(self, tag: AnyStr) -> Tuple[AnyStr, AnyStr]:
+        """
+        Get tag info (path, category) or "UNK:tag"
+
+        :param tag: The tag
+        :return: Tuple containing tag.path and tag.cat
+        """
+        t = self.__tag_map.get(tag, None)
+        if t:
+            return t.path, t.cat
+        return f"UNK:{tag}", "UNK"
+
+    def expand(self, tag: AnyStr) -> List[AnyStr]:
+        """
+        Return tag prefixes that are leaf-nodes
+
+        :param tag: The tag
+        :return: A list of prefixes
+        """
+        t = self.__tag_map.get(tag, None)
+        if t:
+            return [x for x in t.prefix_l if x in self.__tag_map]
+        return []
+
+    def platform_tags(self) -> Set[AnyStr]:
+        """
+        Returns a set of platform tags in the Taxonomy
+
+        :return: Set of names of the tags whose path starts with ``platform_prefix``
+        """
+        return {tag.name for _, tag in self.__tag_map.items() if tag.path.startswith(platform_prefix)}
+
+    def overlaps(self, t1: AnyStr, t2: AnyStr) -> bool:
+        """
+        Whether or not the two tags overlap
+
+        :param t1: The first Tag
+        :param t2: The second Tag
+        :return: Boolean
+        """
         m1 = self.get_prefix_l(t1)
         m2 = self.get_prefix_l(t2)
-        return (t1 in m2) or (t2 in m1)
+        return t1 in m2 or t2 in m1
 
-    def remove_overlaps(self, l): 
-        ''' Returns list with overlapping tags removed '''
+    def remove_overlaps(self, l: Collection[AnyStr]) -> Union[Collection[AnyStr], List[AnyStr]]:
+        """
+        Returns list with overlapping tags removed
+
+        :param l: The list
+        :return: Deduped list
+        """
+        # TODO - code smell
         if not l:
             return l
-        pair_l = sorted([(self.get_depth(t),t) for t in l])
+        pair_l = sorted([(self.get_depth(t), t) for t in l])
         out_l = [pair_l.pop()[1]]
         while pair_l:
             t = pair_l.pop()[1]
-            if (not any(self.overlaps(t, e) for e in out_l)):
+            if not any(self.overlaps(t, e) for e in out_l):
                 out_l.append(t)
         return out_l
 
-    def read_taxonomy(self, filepath):
-        '''Read taxonomy from given file '''
+    def read_taxonomy(self, filepath: AnyStr):
+        """
+        Create Taxonomy from file (tab-separated lines)
+
+        :param filepath: The path of the file to read
+        :return: None
+        """
         with open(filepath, 'r') as fd:
             for line in fd:
-                if line.startswith('#') or line == '\n':
-                    continue
-                self.add_tag(line.strip())
-        return
-
-    def to_file(self, filepath):
-        ''' Output sorted taxonomy to given file '''
-        # Open output file
-        fd = open(filepath, 'w')
-        # Write sorted tags
-        tag_l = sorted(self.__tag_map.items(), 
-                                key=lambda item : item[1].path, 
-                                reverse=False)
-        idx = 0
-        for name,tag in tag_l:
-            if (idx % 2) == 0:
-                fd.write(tag.path+"\n")
-            idx+=1
-        # Close output file
-        fd.close()
+                line = line.strip()
+                if not line.startswith('#') and line:
+                    self.add_tag(line)
+
+    def to_file(self, filepath: AnyStr):
+        """
+        Write sorted Taxonomy to a file (tab-separated lines)
+
+        :param filepath: The path to write
+        :return: None
+        """
+        with open(filepath, 'w') as fd:
+            tag_l = sorted(self.__tag_map.items(),
+                           key=lambda item: item[1].path)
+            idx = 0
+            for name, tag in tag_l:
+                if (idx % 2) == 0:
+                    fd.write(tag.path + "\n")
+                idx += 1
 
 
 class Rules:
-    '''
-    Rules are src -> dst1, dst2, ... relations
-    '''
-    def __init__(self, filepath):
-        ''' Map src -> set(dst) '''
+    """
+    Map a single source with one or more destinations
+    """
+    def __init__(self, filepath: Optional[AnyStr]):
+        """
+        Initialize the rule-map and read rules from ``filepath``
+
+        :param filepath: The file to read from
+        """
         self._rmap = {}
         if filepath:
             self.read_rules(filepath)
 
     def __len__(self):
-        ''' Length is number of rules, i.e., number of src '''
+        """
+        The number of rules/src in the rule-map
+
+        :return: Number of rules
+        """
         return len(self._rmap)
 
-    def add_rule(self, src, dst_l, overwrite=False):
-        ''' Add rule. If rule exists:
-            if overwrite==True, replace destination list
-            else append dst_l to current target set  '''
+    def add_rule(self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False):
+        """
+        Add a rule to the map.  On duplicate, append destinations.  If ``overwrite`` is set, replace rule src/dst.
+
+        :param src: The source tag
+        :param dst_l: The destination list
+        :param overwrite: Whether or not to overwrite duplicates
+        :return: None
+        """
         # Remove src from dst_l if it exists
         dst_l = filter(lambda x: x != src, dst_l)
-        # If no destinations, nothing to do
-        if (not dst_l):
+        if not dst_l:
             return
-        log.debug("[Rules] Adding %s -> %s" % (src, dst_l))
+
+        logger.debug("[Rules] Adding %s -> %s" % (src, dst_l))
         src_tag = create_tag(src)
         if overwrite:
             target_l = [create_tag(dst).name for dst in dst_l]
@@ -248,212 +322,248 @@ def add_rule(self, src, dst_l, overwrite=False):
                 dst_tag = create_tag(dst)
                 curr_dst.add(dst_tag.name)
             self._rmap[src_tag.name] = curr_dst
-        return
 
-    def remove_rule(self, src):
-        l = self._rmap.get(src, [])
-        if l:
-            log.debug("[Rules] Removing rule: %s -> %s" % (src, l))
+    def remove_rule(self, src: AnyStr) -> bool:
+        dst = self._rmap.get(src, [])
+        if dst:
+            logger.debug("[Rules] Removing rule: %s -> %s" % (src, dst))
             del self._rmap[src]
-            return 1
-        else:
-            return 0
+            return True
+        return False
+
+    def get_dst(self, src: AnyStr) -> List[AnyStr]:
+        """
+        Returns the dst list belonging to src, or an empty list if src has no rule.
 
-    def get_dst(self, src):
-        ''' Returns dst list for given src, or empty list if no expansion '''
+        :param src: The source rule
+        :return: List of dst
+        """
         return list(self._rmap.get(src, []))
 
-    def read_rules(self, filepath):
-        '''Read rules from given file'''
+    def read_rules(self, filepath: AnyStr):
+        """
+        Read rules from a file and create the rule-map.
+
+        :param filepath: The path of the file to read
+        :return: None
+        """
         with open(filepath, 'r') as fd:
             for line in fd:
-                if line.startswith('#') or line == '\n':
-                    continue
-                word_list = line.strip().split()
-                if len(word_list) > 1:
-                    self.add_rule(word_list[0],word_list[1:])
-        return
-
-    def to_file(self, filepath, taxonomy=None):
-        ''' Output sorted rules to given file 
-            If taxonomy is provided, it outputs full tag path '''
-        fd = open(filepath, 'w')
-        for src,dst_set in sorted(self._rmap.items()):
-            dst_l = sorted(dst_set, reverse=False)
-            if taxonomy:
-                src_path = taxonomy.get_path(src)
-                path_l = [taxonomy.get_path(t) for t in dst_l]
-                dst_str = '\t'.join(path_l)
-                fd.write("%s\t%s\n" % (src_path,dst_str))
-            else:
-                dst_str = '\t'.join(dst_l)
-                fd.write("%s\t%s\n" % (src,dst_str))
-        fd.close()
+                line = line.strip()
+                if not line.startswith('#') and line:
+                    word_list = line.split()
+                    if len(word_list) > 1:
+                        self.add_rule(word_list[0], word_list[1:])
+
+    def to_file(self, filepath: AnyStr, taxonomy: Taxonomy = None):
+        """
+        Write current rules to the file at ``filepath``.
+
+        :param filepath: The path of the file to write
+        :param taxonomy: A Taxonomy to optionally resolve full tag paths
+        :return: None
+        """
+        with open(filepath, 'w') as fd:
+            for src, dst_set in sorted(self._rmap.items()):
+                dst_l = sorted(dst_set)
+                if taxonomy:
+                    src_path = taxonomy.get_path(src)
+                    path_l = [taxonomy.get_path(t) for t in dst_l]
+                    dst_str = '\t'.join(path_l)
+                    fd.write("%s\t%s\n" % (src_path, dst_str))
+                else:
+                    dst_str = '\t'.join(dst_l)
+                    fd.write("%s\t%s\n" % (src, dst_str))
+
+    def expand_src_destinations(self, src: AnyStr) -> Set[AnyStr]:
+        """
+        Return the set of all expanded destinations for ``src``
 
-    def expand_src_destinations(self, src):
-        ''' Return destination list for given src after recursively 
-            following any rules for destinations '''
+        :param src: The source
+        :return: Set of expanded destinations
+        """
+        # TODO - this only goes one layer deep it seems.  Not actually recursive
         dst_set = self._rmap.get(src, set())
         out = set()
         while dst_set:
             dst = dst_set.pop()
-            l = self._rmap.get(dst, [])
-            if l:
-                for e in l:
-                    if (e not in out) and (e != dst):
-                        dst_set.add(e)
+            dst_l = self._rmap.get(dst, [])
+            if dst_l:
+                for d in dst_l:
+                    if d not in out and d != dst:
+                        dst_set.add(d)
             else:
                 out.add(dst)
         return out
 
     def expand_all_destinations(self):
-        ''' Return destination list for given src after recursively 
-            following any rules for destinations '''
+        """
+        Expand/resolve all sources in the rule-map
+
+        :return: None
+        """
         src_l = self._rmap.keys()
         for src in src_l:
             dst_l = self.expand_src_destinations(src)
             self._rmap[src] = dst_l
 
 
-class Tagging(Rules):
-    '''
-    Tagging rules have src UNK and dst in taxonomy
-    '''
-    def __init__(self, filepath):
-        Rules.__init__(self, filepath)
+class Translation(Rules):
+    """
+    Translations are a set of rules that convert between unknown labels and labels that are in our Taxonomy
+    """
+    def __init__(self, filepath: AnyStr):
+        super().__init__(filepath)
 
-    def validate(self, taxonomy):
-        ''' Check that tags in tagging rules are in given taxonomy '''
-        for tok,tag_l in self._rmap.items():
+    def validate(self, taxonomy: Taxonomy):
+        """
+        Ensure all "destination" labels are in the Taxonomy.
+
+        :param taxonomy: The Taxonomy to use for checking
+        :return: None
+        """
+        for tok, tag_l in self._rmap.items():
             for t in tag_l:
-                if (not taxonomy.is_tag(t)):
+                if not taxonomy.is_tag(t):
                     sys.stdout.write("[Tagging] %s not in taxonomy\n" % t)
+                    # TODO - raise or return False?
 
 
 class Expansion(Rules):
-    '''
-    Expansion rules have src and dst in taxonomy and
-        src.category != dst.category
-    '''
-    def __init__(self, filepath):
-        Rules.__init__(self, filepath)
-
-    def validate(self, taxonomy):
-        ''' Check that tags in expansion rules are in given taxonomy '''
-        for src,dst_set in self._rmap.items():
-            if (not taxonomy.is_tag(src)):
+    """
+    Expansions are rules that allow us to map a single label (src) to all explicit and implicit labels
+    """
+    def __init__(self, filepath: AnyStr):
+        super().__init__(filepath)
+
+    def validate(self, taxonomy: Taxonomy):
+        """
+        Ensure all "source" and "destination" labels are in the Taxonomy.
+
+        :param taxonomy: The Taxonomy to use for checking
+        :return: None
+        """
+        for src, dst_set in self._rmap.items():
+            if not taxonomy.is_tag(src):
                 sys.stdout.write("[Expansion] %s not in taxonomy\n" % src)
+                # TODO - raise or return False?
             for dst in dst_set:
-                if (not taxonomy.is_tag(dst)):
+                if not taxonomy.is_tag(dst):
                     sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst)
+                    # TODO - raise or return False?
 
 
 class AvLabels:
-    '''
-    Class to operate on AV labels, 
-    such as extracting the most likely family name.
-    '''
-    def __init__(self, tag_file, exp_file = None, tax_file = None,
-                 av_file = None, aliasdetect=False):
-        # Read taxonomy
+    """
+    Primary class used to interpret AV Labels
+    """
+    def __init__(self, tag_file: AnyStr, exp_file: AnyStr = None, tax_file: AnyStr = None, av_file: AnyStr = None,
+                 alias_detect: bool = False):
         self.taxonomy = Taxonomy(tax_file)
-        # Read tag rules
-        self.tagging = Tagging(tag_file)
-        # Read expansion rules
+        self.translations = Translation(tag_file)
         self.expansions = Expansion(exp_file)
-        # Read AV engines
         self.avs = self.read_avs(av_file) if av_file else None
         # Alias statistics initialization
-        self.aliasdetect = aliasdetect
+        self.alias_detect = alias_detect
 
     @staticmethod
-    def read_avs(avs_file):
-        '''Read AV engine set from given file'''
+    def read_avs(avs_file: AnyStr) -> Set[AnyStr]:
+        """
+        Read AV engines from ``avs_file``
+
+        :param avs_file: The file to read
+        :return: A set containing the engines
+        """
         with open(avs_file) as fd:
             avs = set(map(str.strip, fd.readlines()))
         return avs
 
     @staticmethod
-    def get_sample_info_lb(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'],
-                          vt_rep['av_labels'], [])
+    def get_sample_info_lb(record: Dict) -> SampleInfo:
+        """
+        Convert simplified JSON to a SampleInfo object
+
+        :param record: The JSON record
+        :return: An instance of SampleInfo
+        """
+        return SampleInfo(record['md5'], record['sha1'], record['sha256'], record['av_labels'], [])
 
     @staticmethod
-    def get_sample_info_vt_v2(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        label_pairs = []
-        # Obtain scan results, if available
+    def get_sample_info_vt_v2(record):
+        """
+        Convert VT (v2) JSON to a SampleInfo object
+
+        :param record: The JSON record
+        :return: An instance of SampleInfo
+        """
         try:
-            scans = vt_rep['scans']
-            md5 = vt_rep['md5']
-            sha1 = vt_rep['sha1']
-            sha256 = vt_rep['sha256']
+            scans = record['scans']
+            md5 = record['md5']
+            sha1 = record['sha1']
+            sha256 = record['sha256']
         except KeyError:
             return None
+
         # Obtain labels from scan results
+        label_pairs = []
         for av, res in scans.items():
             if res['detected']:
                 label = res['result']
-                clean_label = ''.join(filter(
-                                  lambda x: x in string.printable,
-                                    label)).strip()
+                clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip()
                 label_pairs.append((av, clean_label))
-        # Obtain VT tags, if available
-        vt_tags = vt_rep.get('tags', [])
+
+        vt_tags = record.get('tags', [])
 
         return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 
     @staticmethod
-    def get_sample_info_vt_v3(vt_rep):
-        '''Parse and extract sample information from JSON line
-           Returns a SampleInfo named tuple
-        '''
-        label_pairs = []
-        # Obtain scan results, if available
+    def get_sample_info_vt_v3(record):
+        """
+        Convert VT (v3) JSON to a SampleInfo object
+
+        :param record: The JSON record
+        :return: An instance of SampleInfo
+        """
         try:
-            scans = vt_rep['data']['attributes']['last_analysis_results']
-            md5 = vt_rep['data']['attributes']['md5']
-            sha1 = vt_rep['data']['attributes']['sha1']
-            sha256 = vt_rep['data']['attributes']['sha256']
+            scans = record['data']['attributes']['last_analysis_results']
+            md5 = record['data']['attributes']['md5']
+            sha1 = record['data']['attributes']['sha1']
+            sha256 = record['data']['attributes']['sha256']
         except KeyError:
             return None
+
         # Obtain labels from scan results
+        label_pairs = []
         for av, res in scans.items():
             label = res['result']
             if label is not None:
-                clean_label = ''.join(filter(
-                                  lambda x: x in string.printable,
-                                    label)).strip()
+                clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip()
                 label_pairs.append((av, clean_label))
-        # Obtain VT tags, if available
-        vt_tags = vt_rep['data']['attributes'].get('tags', [])
 
-        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
+        vt_tags = record['data']['attributes'].get('tags', [])
 
+        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 
     @staticmethod
-    def is_pup(tag_pairs, taxonomy):
-        '''This function classifies the sample as PUP or not 
-           by checking if highest ranked CLASS tag contains "grayware"
-           and is above a predefined threshold
-           Return:
-              True/False/None
-        '''
+    def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]:
+        """
+        Attempts to classify a sample (represented by ``tag_pairs``) as a PUP.  We accomplish this by checking for the
+        "grayware" label in the highest ranked CLASS.
+
+        :param tag_pairs: List of tuples containing a label, and rank (int)
+        :param taxonomy: The Taxonomy
+        :return: bool or None
+        """
         threshold = 0.5
-        # If no tags, return false
         if len(tag_pairs) < 1:
             return None
+
         max_ctr = tag_pairs[0][1]
-        for (tag,ctr) in tag_pairs:
-            (path, cat) = taxonomy.get_info(tag)
-            if (cat == "CLASS"):
-                if ("grayware" in path):
-                    return (float(ctr) >= float(max_ctr)*threshold)
+        for tag, ctr in tag_pairs:
+            path, cat = taxonomy.get_info(tag)
+            if cat == "CLASS":
+                if "grayware" in path:
+                    return float(ctr) >= float(max_ctr)*threshold
                 else:
                     return False
         return False
@@ -521,7 +631,7 @@ def get_label_tags(self, label, hashes):
                 continue
 
             # Apply tagging rule
-            dst_l = self.tagging.get_dst(token)
+            dst_l = self.translations.get_dst(token)
             if dst_l:
                 # Ignore generic tokens
                 for t in dst_l:
@@ -614,7 +724,7 @@ def get_sample_tags(self, sample_info):
             # Expansions                                           #
             ########################################################
             # NOTE: Avoiding to do expansion when aliases
-            if self.aliasdetect:
+            if self.alias_detect:
                 expanded_tags = tags
             else:
                 expanded_tags = self.__expand(tags)
diff --git a/avclass/update.py b/avclass/update.py
index 6d0558c..d19ef0f 100644
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -7,7 +7,7 @@
 from operator import itemgetter
 
 from avclass import util
-from avclass.common import Taxonomy, Expansion, Tagging
+from avclass.common import Taxonomy, Expansion, Translation
 
 
 logger = logging.getLogger(__name__)
@@ -463,7 +463,7 @@ def output(self, out_prefix):
                         len(taxonomy), args.tax))
 
     # Read tagging rules
-    tagging = Tagging(args.tag)
+    tagging = Translation(args.tag)
     logger.info('[-] Read %d tagging rules from %s' % (
                         len(tagging), args.tag))
 
diff --git a/avclass/util.py b/avclass/util.py
index ceaf071..028bc36 100755
--- a/avclass/util.py
+++ b/avclass/util.py
@@ -3,7 +3,7 @@
 import pkg_resources
 
 from avclass import data
-from avclass.common import Taxonomy, Tagging, Expansion
+from avclass.common import Taxonomy, Translation, Expansion
 
 from typing import AnyStr
 
@@ -63,7 +63,7 @@ def validate_tagging(path: AnyStr, taxonomy: Taxonomy):
     :param taxonomy: Valid Taxonomy object
     :return: None
     """
-    tagging = Tagging(path)
+    tagging = Translation(path)
     tagging.validate(taxonomy)
     # tagging.expand_all_destinations()
     tagging.to_file(path)

From e1a00a56c4a78f502936d705bcd5a86283009877 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Wed, 13 Jan 2021 14:24:16 -0600
Subject: [PATCH 09/36] fix common

---
 avclass/common.py | 133 +++++++++++++++++++---------------------------
 1 file changed, 55 insertions(+), 78 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index dbc292b..cf79a21 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -1,11 +1,11 @@
 import logging
+import operator
 import re
 import string
 import sys
 
-from collections import namedtuple
-from operator import itemgetter
-from typing import Any, AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union
+from collections import defaultdict, namedtuple
+from typing import AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union
 
 
 logger = logging.getLogger(__name__)
@@ -569,10 +569,14 @@ def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]:
         return False
 
     @staticmethod
-    def __remove_suffixes(av_name, label):
-        '''Remove AV specific suffixes from given label
-           Returns updated label'''
+    def __remove_suffixes(av_name: AnyStr, label: AnyStr) -> AnyStr:
+        """
+        Remove vendor-specific suffixes from the label
 
+        :param av_name: The AV engine that produced the label (selects which suffix rules apply)
+        :param label: The label to change
+        :return: The new label
+        """
         # Truncate after last '.'
         if av_name in suffix_removal_av_set:
             label = label.rsplit('.', 1)[0]
@@ -590,15 +594,15 @@ def __remove_suffixes(av_name, label):
 
         return label
 
+    def get_label_tags(self, label: AnyStr, hashes: Collection[AnyStr]) -> Set[AnyStr]:
+        """
+        Tokenize, translate, and filter a label into tags.  ``hashes`` are used to provide a dynamic filter of sorts.
+        We don't want to tokenize parts of the sample's hash which is a common thing for some AV vendors.
 
-    def get_label_tags(self, label, hashes):
-        ''' Return list of tags in given label 
-            Tokenizes label, filters unneeded tokens, and 
-            applies tagging rules '''
-
-        # Initialize set of tags to return
-        # We use a set to avoid duplicate tokens in the same AV label
-        # This avoids "potentially unwanted" contributing twice BEH:pup
+        :param label: The label to convert
+        :param hashes: A list of hashes to be used as dynamic filters
+        :return: A set of tags that were extracted from the label
+        """
         tags = set()
 
         # If empty label, nothing to do
@@ -618,12 +622,7 @@ def get_label_tags(self, label, hashes):
             # Ignore token if prefix of a hash of the sample
             # Most AVs use MD5 prefixes in labels, 
             # but we check SHA1 and SHA256 as well
-            hash_token = False
-            for hash_str in hashes:
-                if hash_str[0:len(token)] == token:
-                  hash_token = True
-                  break
-            if hash_token:
+            if any([h.startswith(token) for h in hashes]):
                 continue
 
             # Ignore generic tokens
@@ -644,9 +643,13 @@ def get_label_tags(self, label, hashes):
         # Return tags
         return tags
 
+    def __expand(self, tag_set: Set[AnyStr]) -> Set[AnyStr]:
+        """
+        Expand tags into more tags using expansion rules and the Taxonomy
 
-    def __expand(self, tag_set):
-        ''' Return expanded set of tags '''
+        :param tag_set: Starting set of tags
+        :return: Expanded set of tags
+        """
         ret = set()
         for t in tag_set:
             # Include tag
@@ -658,90 +661,64 @@ def __expand(self, tag_set):
             # Include implicit expansions in taxonomy
             ret.update(self.taxonomy.expand(t))
 
-        # Return a list for backwards compatibility 
         return ret
 
-    def get_sample_tags(self, sample_info):
-        ''' Returns dictionary tag -> AV list of tags for the given sample '''
+    def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]:
+        """
+        Get a dictionary where the key is a tag and the value is a list of AV engines that confirmed that tag.
 
-        # Whitelist the AVs to filter the ones with meaningful labels
-        av_whitelist = self.avs
-        # Initialize auxiliary data structures
+        :param sample_info: The SampleInfo object to inspect
+        :return: A dictionary where k,v = tag,[av, ...]
+        """
         duplicates = set()
-        av_dict = {}
+        av_dict = defaultdict(list)
 
         # Process each AV label
-        for (av_name, label) in sample_info.labels:
-            # If empty label, nothing to do
-            if not label:
-                continue
-
-            ################
-            # AV selection #
-            ################
-            if av_whitelist and av_name not in av_whitelist:
+        for av_name, label in sample_info.labels:
+            if not label or (self.avs and av_name not in self.avs):  # self.avs is None when no AV file was given
                 continue
 
-            #####################
-            # Duplicate removal #
-            #####################
-
-            # Emsisoft uses same label as 
+            # Emsisoft uses same label as
             # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
             # but suffixes ' (B)' to their label. Remove the suffix.
-            if label.endswith(' (B)'):
-                label = label[:-4]
+            label = label[:-4] if label.endswith(' (B)') else label  # rstrip() strips a char set, not a suffix
 
             # F-Secure uses Avira's engine since Nov. 2018
             # but prefixes 'Malware.' to Avira's label. Remove the prefix.
-            if label.startswith('Malware.'):
-                label = label[8:]
+            label = label[8:] if label.startswith('Malware.') else label  # lstrip() strips chars, not a prefix
 
             # Other engines often use exactly the same label, e.g.,
             #   AVG/Avast
             #   K7Antivirus/K7GW
             #   Kaspersky/ZoneAlarm
 
-            # If we have seen the exact same label before, skip
             if label in duplicates:
                 continue
-            # If not, we add it to duplicates
-            else:
-                duplicates.add(label)
 
-            ##################
-            # Suffix removal #
-            ##################
-            label = self.__remove_suffixes(av_name, label)
+            duplicates.add(label)
 
-            ########################################################
-            # Tokenization and tagging                             #
-            ########################################################
-            hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ]
+            label = self.__remove_suffixes(av_name, label)
+            hashes = [sample_info.md5, sample_info.sha1, sample_info.sha256]
             tags = self.get_label_tags(label, hashes)
 
-            ########################################################
-            # Expansions                                           #
-            ########################################################
-            # NOTE: Avoiding to do expansion when aliases
-            if self.alias_detect:
-                expanded_tags = tags
-            else:
-                expanded_tags = self.__expand(tags)
+            # NOTE: Avoid expansion when aliases are set
+            expanded_tags = tags if self.alias_detect else self.__expand(tags)
 
-            ########################################################
-            # Stores information that relates AV vendors with tags #
-            ########################################################
+            # store av vendors for each tag
             for t in expanded_tags:
-                av_dict.setdefault(t, []).append(av_name)
+                av_dict[t].append(av_name)
 
         return av_dict
 
-    def rank_tags(self, av_dict, threshold=1):
-        ''' Return list of (tag, confidence) ranked by decreasing confidence 
-            and filter tags with less or equal threshold confidence '''
-
-        pairs = ((t, len(avs)) for (t,avs) in av_dict.items() 
-                    if len(avs) > threshold)
-        return sorted(pairs, key=itemgetter(1,0), reverse=True)
+    @staticmethod
+    def rank_tags(av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1) -> List[Tuple[AnyStr, int]]:
+        """
+        Get a list of tuples containing a tag and the number of AV that confirmed that tag sorted by number of AV
+        (descending).
 
+        :param av_dict: The AV dictionary (from ``get_sample_tags()``)
+        :param threshold: Only tags confirmed by more than ``threshold`` AV engines are included
+        :return: A sorted list of tag, av-count pairs
+        """
+        pairs = ((t, len(avs)) for t, avs in av_dict.items() if len(avs) > threshold)
+        return sorted(pairs, key=operator.itemgetter(1, 0), reverse=True)

From 1f5ccedc21d651a03cb1393aaab0dced72d19439 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Wed, 13 Jan 2021 14:26:53 -0600
Subject: [PATCH 10/36] fix clustering

---
 avclass/clustering.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/avclass/clustering.py b/avclass/clustering.py
index c5a349c..20d3fe5 100755
--- a/avclass/clustering.py
+++ b/avclass/clustering.py
@@ -41,7 +41,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
     for k, v in expected.items():
         gt_rev_dict[v].add(k)
 
-    counter, l = 0, len(guess)
+    counter, gl = 0, len(guess)
 
     sys.stderr.write('Calculating precision and recall\n')
 
@@ -49,7 +49,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
     for element in guess:
         # Print progress
         if counter % 1000 == 0:
-            sys.stderr.write('\r%d out of %d' % (counter, l))
+            sys.stderr.write('\r%d out of %d' % (counter, gl))
             sys.stderr.flush()
         counter += 1
 
@@ -71,7 +71,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
         r = 1.0*tp/(tp+fn)
         tmp_recall += r
 
-    sys.stderr.write('\r%d out of %d' % (counter, l))
+    sys.stderr.write('\r%d out of %d' % (counter, gl))
     sys.stderr.write('\n')
 
     precision = 100.0 * tmp_precision / len(guess)
@@ -81,7 +81,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
     return precision, recall, fmeasure
 
 
-if __name__ == "__main__":
+def main():
     # The ground truth.
     # Dictionary with mapping: "element : cluster_id".
     diz_grth = {
@@ -104,12 +104,12 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
     # truth, but just different cluster labels. Precision == Recall ==
     # F-Measure == 100%.
     # Dictionary with mapping: "element : cluster_id".
-    diz_estim_grth = {
-        "a": 2,
-        "b": 2,
-        "c": 66,
-        "d": 9
-    }
+    # diz_estim_grth = {
+    #     "a": 2,
+    #     "b": 2,
+    #     "c": 66,
+    #     "d": 9
+    # }
 
     # a sample where estimated != ground truth
     sys.stdout.write("Ground truth\n")
@@ -130,3 +130,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
     sys.stdout.write("\nPrecison: %s%%\n" % p)
     sys.stdout.write("Recall: %s%%\n" % r)
     sys.stdout.write("F-Measure: %s%%\n" % f)
+
+
+if __name__ == "__main__":
+    main()

From f68836d254b6593fa91526c0761858cfda68b1a7 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Wed, 13 Jan 2021 14:28:35 -0600
Subject: [PATCH 11/36] typing

---
 avclass/clustering.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/avclass/clustering.py b/avclass/clustering.py
index 20d3fe5..f6a9b4b 100755
--- a/avclass/clustering.py
+++ b/avclass/clustering.py
@@ -1,10 +1,10 @@
 import sys
 
 from collections import defaultdict
-from typing import Dict, Set
+from typing import Dict, Set, Tuple
 
 
-def tp_fp_fn(expected: Set, guess: Set):
+def tp_fp_fn(expected: Set, guess: Set) -> Tuple[int, int, int]:
     """
     Calculate the true-positives, false-positives, and false-negatives between ``expected`` and ``guess``
 
@@ -19,7 +19,7 @@ def tp_fp_fn(expected: Set, guess: Set):
     return tp, fp, fn
 
 
-def eval_precision_recall_fmeasure(expected: Dict, guess: Dict):
+def eval_precision_recall_fmeasure(expected: Dict, guess: Dict) -> Tuple[float, float, float]:
     """
     Evaluate the precision, recall, and f-measure for the comparison of ``expected`` to ``guess``
 

From 402757ce0563d2a86f1fd0ec26b2e10ca221af50 Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Wed, 13 Jan 2021 15:12:40 -0600
Subject: [PATCH 12/36] cleanup labeler

---
 avclass/labeler.py | 253 ++++++++++++++++++++-------------------------
 1 file changed, 110 insertions(+), 143 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index 035d4af..b96a28f 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -5,12 +5,19 @@
 import traceback
 
 from operator import itemgetter
+from typing import AnyStr, Optional
 
-from avclass.common import AvLabels
+from avclass.common import AvLabels, Taxonomy
 from avclass import clustering as ec, util
 
 
-def guess_hash(h):
+def guess_hash(h: AnyStr) -> Optional[AnyStr]:
+    """
+    Guess hash type based on ``len(h)``
+
+    :param h: The hash
+    :return: The hash type (str)
+    """
     ''' Given a hash string, guess the hash type based on the string length '''
     hlen = len(h)
     if hlen == 32:
@@ -19,30 +26,48 @@ def guess_hash(h):
         return 'sha1'
     elif hlen == 64:
         return 'sha256'
-    else:
-        return None
 
+    return None
+
+
+def format_tag_pairs(l, taxonomy: Taxonomy = None) -> AnyStr:
+    """
+    Get ranked tags as a string.
 
-def format_tag_pairs(l, taxonomy=None):
-    ''' Return ranked tags as string '''
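+    :param l: Ranked list of ``(tag, count)`` pairs
+    :param taxonomy: Optional Taxonomy used to resolve each tag to its full path
+    :return: Comma-separated ``tag|count`` entries, or an empty string if ``l`` is empty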
+    """
+    # TODO - rename ``l`` to something descriptive (it is the ranked list of (tag, count) pairs)
     if not l:
         return ""
+
     if taxonomy is not None:
         p = taxonomy.get_path(l[0][0])
     else:
         p = l[0][0]
+
     out = "%s|%d" % (p, l[0][1])
-    for (t,s) in l[1:]:
+    for t, s in l[1:]:
         if taxonomy is not None:
             p = taxonomy.get_path(t) 
         else:
             p = t
         out += ",%s|%d" % (p, s)
+
     return out
 
 
-def list_str(l, sep=", ", prefix=""):
-    ''' Return list as a string '''
+def list_str(l, sep: AnyStr = ", ", prefix: AnyStr = "") -> AnyStr:
+    """
+    Return list as a string
+
+    :param l: The list
+    :param sep: The separator
+    :param prefix: The prefix
+    :return: A string representation of the list
+    """
+    # TODO - rename ``l`` to something descriptive (it is the list of strings to join)
     if not l:
         return ""
     out = prefix + l[0]
@@ -52,9 +77,10 @@ def list_str(l, sep=", ", prefix=""):
 
 
 def main():
+    # TODO - break this function up.
     args = parse_args()
     # Select hash used to identify sample, by default MD5
-    hash_type = args.hash if args.hash else 'md5'
+    hash_type = args.hash or 'md5'
 
     # If ground truth provided, read it from file
     gt_dict = {}
@@ -68,26 +94,26 @@ def main():
         hash_type = guess_hash(list(gt_dict.keys())[0])
 
     # Create AvLabels object
-    av_labels = AvLabels(args.tag, args.exp, args.tax,
-                         args.av, args.aliasdetect)
+    av_labels = AvLabels(args.tag, args.exp, args.tax, args.av, args.aliasdetect)
 
     # Build list of input files
     # NOTE: duplicate input files are not removed
     ifile_l = []
-    if (args.vt):
+    if args.vt:
         ifile_l += args.vt
         ifile_are_vt = True
-    if (args.lb):
+    elif args.lb:
         ifile_l += args.lb
         ifile_are_vt = False
-    if (args.vtdir):
-        ifile_l += [os.path.join(args.vtdir, 
-                                  f) for f in os.listdir(args.vtdir)]
+    elif args.vtdir:
+        ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)]
         ifile_are_vt = True
-    if (args.lbdir):
-        ifile_l += [os.path.join(args.lbdir, 
-                                  f) for f in os.listdir(args.lbdir)]
+    elif args.lbdir:
+        ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)]
         ifile_are_vt = False
+    else:
+        # TODO - is this reachable?
+        sys.exit(1)
 
     # Select correct sample info extraction function
     if not ifile_are_vt:
@@ -109,19 +135,12 @@ def main():
     stats = {'samples': 0, 'noscans': 0, 'tagged': 0, 'maltagged': 0,
              'FAM': 0, 'CLASS': 0, 'BEH': 0, 'FILE': 0, 'UNK': 0}
 
-    # Process each input file
     for ifile in ifile_l:
-        # Open file
         fd = open(ifile, 'r')
-
-        # Debug info, file processed
         sys.stderr.write('[-] Processing input file %s\n' % ifile)
 
-        # Process all lines in file
         for line in fd:
-
-            # If blank line, skip
-            if line == '\n':
+            if not line.strip():
                 continue
 
             # Debug info
@@ -130,19 +149,16 @@ def main():
                 sys.stderr.flush()
             vt_all += 1
 
-            # Read JSON line
             vt_rep = json.loads(line)
-
-            # Extract sample info
             sample_info = get_sample_info(vt_rep)
 
-            # If no sample info, log error and continue
             if sample_info is None:
                 try:
                     name = vt_rep['md5']
                     sys.stderr.write('\nNo scans for %s\n' % name)
                 except KeyError:
                     sys.stderr.write('\nCould not process: %s\n' % line)
+
                 sys.stderr.flush()
                 stats['noscans'] += 1
                 continue
@@ -152,7 +168,7 @@ def main():
 
             # If the VT report has no AV labels, output and continue
             if not sample_info.labels:
-                sys.stdout.write('%s\t-\t[]\n' % (name))
+                sys.stdout.write('%s\t-\t[]\n' % name)
                 # sys.stderr.write('\nNo AV labels for %s\n' % name)
                 # sys.stderr.flush()
                 continue
@@ -160,8 +176,7 @@ def main():
             # Compute VT_Count
             vt_count = len(sample_info.labels)
 
-            # Get the distinct tokens from all the av labels in the report
-            # And print them. 
+            # Get the distinct tokens from all the av labels in the report and print them.
             try:
                 av_tmp = av_labels.get_sample_tags(sample_info)
                 tags = av_labels.rank_tags(av_tmp)
@@ -183,24 +198,21 @@ def main():
                         token_count_map[curr_tok] = curr_count + 1
                         for prev_tok in prev_tokens:
                             if prev_tok < curr_tok:
-                                pair = (prev_tok,curr_tok)
+                                pair = prev_tok, curr_tok
                             else:
-                                pair = (curr_tok,prev_tok)
+                                pair = curr_tok, prev_tok
                             pair_count = pair_count_map.get(pair, 0)
                             pair_count_map[pair] = pair_count + 1
                         prev_tokens.add(curr_tok)
 
                 # Collect stats
-                # FIX: should iterate once over tags, 
-                # for both stats and aliasdetect
+                # TODO - should iterate once over tags for both stats and aliasdetect
                 if tags:
                     stats["tagged"] += 1
                     if args.stats:
-                        if (vt_count > 3):
+                        if vt_count > 3:
                             stats["maltagged"] += 1
-                            cat_map = {'FAM': False, 'CLASS': False,
-                                       'BEH': False, 'FILE': False, 'UNK':
-                                           False}
+                            cat_map = {'FAM': False, 'CLASS': False, 'BEH': False, 'FILE': False, 'UNK': False}
                             for t in tags:
                                 path, cat = av_labels.taxonomy.get_info(t[0])
                                 cat_map[cat] = True
@@ -215,21 +227,18 @@ def main():
                     else:
                         is_pup_str = "\t0"
                 else:
-                    is_pup_str =  ""
+                    is_pup_str = ""
 
                 # Select family for sample if needed,
                 # i.e., for compatibility mode or for ground truth
+                fam = "SINGLETON:" + name
                 if args.c or args.gt:
-                    fam = "SINGLETON:" + name
-                    # fam = ''
-                    for (t,s) in tags:
+                    for t, s in tags:
                         cat = av_labels.taxonomy.get_category(t)
-                        if (cat == "UNK") or (cat == "FAM"):
+                        if cat in ["UNK", "FAM"]:
                             fam = t
                             break
 
-                # Get ground truth family, if available
-                if args.gt:
                     first_token_dict[name] = fam
                     gt_family = '\t' + gt_dict.get(name, "")
                 else:
@@ -247,38 +256,27 @@ def main():
                         tag_str = format_tag_pairs(tags, av_labels.taxonomy)
                     else:
                         tag_str = format_tag_pairs(tags)
-                    sys.stdout.write('%s\t%d\t%s%s%s%s\n' %
-                                     (name, vt_count, tag_str, gt_family,
-                                      is_pup_str, vtt))
+                    sys.stdout.write('%s\t%d\t%s%s%s%s\n' % (name, vt_count, tag_str, gt_family, is_pup_str, vtt))
                 else:
-                    sys.stdout.write('%s\t%s%s%s\n' %
-                                     (name, fam, gt_family, is_pup_str))
+                    sys.stdout.write('%s\t%s%s%s\n' % (name, fam, gt_family, is_pup_str))
             except:
                 traceback.print_exc(file=sys.stderr)
                 continue
 
-        # Debug info
         sys.stderr.write('\r[-] %d JSON read' % vt_all)
         sys.stderr.flush()
         sys.stderr.write('\n')
 
-        # Close file
         fd.close()
 
     # Print statistics
-    sys.stderr.write(
-            "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" % (
-                vt_all, stats['noscans'], vt_all - stats['tagged'], 
-                len(gt_dict)))
+    sys.stderr.write("[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" %
+                     (vt_all, stats['noscans'], vt_all - stats['tagged'], len(gt_dict)))
 
     # If ground truth, print precision, recall, and F1-measure
     if args.gt:
-        precision, recall, fmeasure = \
-                    ec.eval_precision_recall_fmeasure(gt_dict,
-                                                      first_token_dict)
-        sys.stderr.write(
-            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \
-                          (precision, recall, fmeasure))
+        precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
+        sys.stderr.write("Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % (precision, recall, fmeasure))
 
     # Output stats
     if args.stats:
@@ -291,7 +289,7 @@ def main():
         num_maltagged = stats['maltagged']
         frac = float(num_maltagged) / float(num_samples) * 100
         stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (num_maltagged, frac))
-        for c in ['FILE','CLASS','BEH','FAM','UNK']:
+        for c in ['FILE', 'CLASS', 'BEH', 'FAM', 'UNK']:
             count = stats[c]
             frac = float(count) / float(num_maltagged) * 100
             stats_fd.write('%s: %d (%.01f%%)\n' % (c, stats[c], frac))
@@ -302,9 +300,8 @@ def main():
         avtags_fd = open("%s.avtags" % out_prefix, 'w')
         for t in sorted(avtags_dict.keys()):
             avtags_fd.write('%s\t' % t)
-            pairs = sorted(avtags_dict[t].items(),
-                            key=lambda pair : pair[1],
-                            reverse=True)
+            pairs = sorted(avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True)
+
             for pair in pairs:
                 avtags_fd.write('%s|%d,' % (pair[0], pair[1]))
             avtags_fd.write('\n')
@@ -312,7 +309,6 @@ def main():
 
     # If alias detection, print map
     if args.aliasdetect:
-        # Open alias file
         alias_filename = out_prefix + '.alias'
         alias_fd = open(alias_filename, 'w+')
         # Sort token pairs by number of times they appear together
@@ -322,13 +318,12 @@ def main():
         #     pair_count_map.items())
 
         # Output header line
-        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t"
-                       "|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
+        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
         # Compute token pair statistic and output to alias file
-        for (t1, t2), c in sorted_pairs:
+        for (t1, t2), c in sorted_pairs:
             n1 = token_count_map[t1]
             n2 = token_count_map[t2]
-            if (n1 < n2):
+            if n1 < n2:
                 x = t1
                 y = t2
                 xn = n1
@@ -340,129 +335,101 @@ def main():
                 yn = n1
             f = float(c) / float(xn)
             finv = float(c) / float(yn)
-            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (
-                x, y, xn, yn, c, f, finv))
+            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv))
         # Close alias file
         alias_fd.close()
-        sys.stderr.write('[-] Alias data in %s\n' % (alias_filename))
+        sys.stderr.write('[-] Alias data in %s\n' % alias_filename)
 
 
 def parse_args():
     argparser = argparse.ArgumentParser(prog='avclass',
-        description='''Extracts tags for a set of samples.
-            Also calculates precision and recall if ground truth available''')
+                                        description='Extracts tags for a set of samples.  Also calculates precision and'
+                                                    ' recall if ground truth available')
 
-    argparser.add_argument('-vt', action='append',
-        help='file with VT reports '
-             '(Can be provided multiple times)')
+    argparser.add_argument('-vt', action='append', help='file with VT reports (Can be provided multiple times)')
 
-    argparser.add_argument('-lb', action='append',
-        help='file with simplified JSON reports'
-             '{md5,sha1,sha256,scan_date,av_labels} '
-             '(Can be provided multiple times)')
+    argparser.add_argument('-lb', action='append', help='file with simplified JSON reports '
+                                                        '{md5,sha1,sha256,scan_date,av_labels} (Can be provided '
+                                                        'multiple times)')
 
-    argparser.add_argument('-vtdir',
-        help='existing directory with VT reports')
+    argparser.add_argument('-vtdir', help='existing directory with VT reports')
 
-    argparser.add_argument('-lbdir',
-        help='existing directory with simplified JSON reports')
+    argparser.add_argument('-lbdir', help='existing directory with simplified JSON reports')
 
-    argparser.add_argument('-vt3', action='store_true',
-        help='input are VT v3 files')
+    argparser.add_argument('-vt3', action='store_true', help='input are VT v3 files')
 
-    argparser.add_argument('-gt',
-        help='file with ground truth. '
-             'If provided it evaluates clustering accuracy. '
-             'Prints precision, recall, F1-measure.')
+    argparser.add_argument('-gt', help='file with ground truth. If provided it evaluates clustering accuracy. '
+                                       'Prints precision, recall, F1-measure.')
 
-    argparser.add_argument('-vtt',
-        help='Include VT tags in the output.',
-        action='store_true')
+    argparser.add_argument('-vtt', help='Include VT tags in the output.', action='store_true')
 
-    argparser.add_argument('-tag',
-        help='file with tagging rules.',
-        default = util.DEFAULT_TAG_PATH)
+    argparser.add_argument('-tag', help='file with tagging rules.', default=util.DEFAULT_TAG_PATH)
 
-    argparser.add_argument('-tax',
-        help='file with taxonomy.',
-        default = util.DEFAULT_TAX_PATH)
+    argparser.add_argument('-tax', help='file with taxonomy.', default=util.DEFAULT_TAX_PATH)
 
-    argparser.add_argument('-exp',
-        help='file with expansion rules.',
-        default = util.DEFAULT_EXP_PATH)
+    argparser.add_argument('-exp', help='file with expansion rules.', default=util.DEFAULT_EXP_PATH)
 
-    argparser.add_argument('-av',
-        help='file with list of AVs to use')
+    argparser.add_argument('-av', help='file with list of AVs to use')
 
-    argparser.add_argument('-avtags',
-        help='extracts tags per av vendor',
-        action='store_true')
+    argparser.add_argument('-avtags', help='extracts tags per av vendor', action='store_true')
 
-    argparser.add_argument('-pup',
-        action='store_true',
-        help='if used each sample is classified as PUP or not')
+    argparser.add_argument('-pup', action='store_true', help='if used each sample is classified as PUP or not')
 
-    argparser.add_argument('-p', '--path',
-        help='output.full path for tags',
-        action='store_true')
+    argparser.add_argument('-p', '--path', help='output.full path for tags', action='store_true')
 
-    argparser.add_argument('-hash',
-        help='hash used to name samples. Should match ground truth',
-        choices=['md5', 'sha1', 'sha256'])
+    argparser.add_argument('-hash', help='hash used to name samples. Should match ground truth',
+                           choices=['md5', 'sha1', 'sha256'])
 
-    argparser.add_argument('-c',
-        help='Compatibility mode. Outputs results in AVClass format.',
-        action='store_true')
+    argparser.add_argument('-c', help='Compatibility mode. Outputs results in AVClass format.', action='store_true')
 
-    argparser.add_argument('-aliasdetect',
-        action='store_true',
-        help='if used produce aliases file at end')
+    argparser.add_argument('-aliasdetect', action='store_true', help='if used produce aliases file at end')
 
-    argparser.add_argument('-stats',
-                           action='store_true',
-                           help='if used produce 1 file '
-                                'with stats per category '
-                                '(File, Class, '
-                                'Behavior, Family, Unclassified)')
+    argparser.add_argument('-stats', action='store_true', help='if used produce 1 file with stats per category '
+                                                               '(File, Class, Behavior, Family, Unclassified)')
 
     args = argparser.parse_args()
 
+    # TODO - use non-exclusive group to ensure at least one is selected instead of this
     if not args.vt and not args.lb and not args.vtdir and not args.lbdir:
         sys.stderr.write('One of the following 4 arguments is required: '
                          '-vt,-lb,-vtdir,-lbdir\n')
         exit(1)
 
+    # TODO - use mutex group for this instead of manual check
     if (args.vt or args.vtdir) and (args.lb or args.lbdir):
         sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. '
                          'Both types of input files cannot be combined.\n')
         exit(1)
 
+    # TODO - consider letting argparse handle this?
     if args.tag:
         if args.tag == '/dev/null':
             sys.stderr.write('[-] Using no tagging rules\n')
         else:
-            sys.stderr.write('[-] Using tagging rules in %s\n' % (args.tag))
+            sys.stderr.write('[-] Using tagging rules in %s\n' % args.tag)
     else:
-        sys.stderr.write('[-] Using default tagging rules in %s\n' % (util.DEFAULT_TAG_PATH))
+        sys.stderr.write('[-] Using default tagging rules in %s\n' % util.DEFAULT_TAG_PATH)
 
+    # TODO - consider letting argparse handle this?
     if args.tax:
         if args.tax == '/dev/null':
             sys.stderr.write('[-] Using no taxonomy\n')
         else:
-            sys.stderr.write('[-] Using taxonomy in %s\n' % (args.tax))
+            sys.stderr.write('[-] Using taxonomy in %s\n' % args.tax)
     else:
-        sys.stderr.write('[-] Using default taxonomy in %s\n' % (util.DEFAULT_TAX_PATH))
+        sys.stderr.write('[-] Using default taxonomy in %s\n' % util.DEFAULT_TAX_PATH)
 
+    # TODO - consider letting argparse handle this?
     if args.exp:
         if args.exp == '/dev/null':
             sys.stderr.write('[-] Using no expansion tags\n')
         else:
-            sys.stderr.write('[-] Using expansion tags in %s\n' % (args.exp))
+            sys.stderr.write('[-] Using expansion tags in %s\n' % args.exp)
     else:
-        sys.stderr.write('[-] Using default expansion tags in %s\n' % (util.DEFAULT_EXP_PATH))
+        sys.stderr.write('[-] Using default expansion tags in %s\n' % util.DEFAULT_EXP_PATH)
 
     return args
 
 
-if __name__=='__main__':
+if __name__ == '__main__':
     main()

From f37a47da9f8766215da448fdf8e3ffc4d9595b3a Mon Sep 17 00:00:00 2001
From: Matt Miller <usr.bin.bourbon@gmail.com>
Date: Wed, 13 Jan 2021 15:44:18 -0600
Subject: [PATCH 13/36] cleanup update

---
 avclass/update.py | 334 ++++++++++++++++++++++++----------------------
 1 file changed, 173 insertions(+), 161 deletions(-)

diff --git a/avclass/update.py b/avclass/update.py
index d19ef0f..a2bc73b 100644
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -5,6 +5,7 @@
 
 from collections import namedtuple
 from operator import itemgetter
+from typing import AnyStr, Collection, Optional, Set, TextIO
 
 from avclass import util
 from avclass.common import Taxonomy, Expansion, Translation
@@ -23,18 +24,14 @@
 # Threshold for string similarity
 # sim_threshold = 0.6
 
-Relation = namedtuple('Relation', ['t1', 't2', 't1_num', 't2_num',
-                                   'nalias_num', 'talias_num', 'tinv_alias_num'])
-
+Relation = namedtuple('Relation', ['t1', 't2', 't1_num', 't2_num', 'nalias_num', 'talias_num', 'tinv_alias_num'])
 
 
 class Update:
-    ''' Update Module '''
-    def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion,
-                    n, t):
-        # Initialize inputs
+    def __init__(self, rel_filepath: AnyStr, in_taxonomy: Taxonomy, in_translation: Translation,
+                 in_expansion: Expansion, n, t):
         self.__out_taxonomy = in_taxonomy
-        self.__out_tagging = in_tagging
+        self.__out_translation = in_translation
         self.__out_expansion = in_expansion
         self.__n = n
         self.__t = t
@@ -44,80 +41,117 @@ def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion,
         self.src_map = {}
         # Read relations from file
         self.rel_set = self.read_relations(rel_filepath)
+        self.dst_map = {}
+        self.cat_pairs_map = {}
 
-    def num_rules(self):
+    # TODO - @property decorator
+    def num_rules(self) -> int:
         return len(self.rel_set)
 
-    def is_weak_rel(self, rel):
-        ''' Return true if relationship is weak,
-            i.e., does not meet thresholds '''
+    def is_weak_rel(self, rel: Relation) -> bool:
+        """
+        Boolean whether or not the relationship is considered weak (doesn't meet thresholds).
+
+        :param rel: The relationship
+        :return: Boolean
+        """
         return ((int(rel.nalias_num) < self.__n) or
                 (float(rel.talias_num) < self.__t))
 
-    def is_blacklisted_rel(self, rel):
-        ''' Return true if relationship is blacklisted '''
-        return (rel.t1 in self.blist) or (rel.t2 in self.blist)
+    def is_blacklisted_rel(self, rel: Relation) -> bool:
+        """
+        Boolean whether or not the relationship is blacklisted.
+
+        :param rel: The relationship
+        :return: Boolean
+        """
+        return rel.t1 in self.blist or rel.t2 in self.blist
 
-    def is_known_rel(self, rel):
-        ''' Return true if relationship is known '''
+    def is_known_rel(self, rel: Relation) -> bool:
+        """
+        Boolean whether or not the relationship is known.
+
+        :param rel: The relationship
+        :return: Boolean
+        """
         t1 = rel.t1
         t2 = rel.t2
         # Known taxonomy relation
-        if self.__out_taxonomy.overlaps(t1,t2):
+        if self.__out_taxonomy.overlaps(t1, t2):
             return True
         # Known expansion rule
         t1_dst = self.__out_expansion.get_dst(t1)
         t2_dst = self.__out_expansion.get_dst(t2)
-        if (t2 in t1_dst) or (t1 in t2_dst):
+        if t2 in t1_dst or t1 in t2_dst:
             return True
         # Known tagging rule
-        t1_dst = sorted(self.__out_tagging.get_dst(t1))
-        t2_dst = sorted(self.__out_tagging.get_dst(t2))
-        if (t2 in t1_dst) or (t1 in t2_dst):
+        t1_dst = sorted(self.__out_translation.get_dst(t1))
+        t2_dst = sorted(self.__out_translation.get_dst(t2))
+        if t2 in t1_dst or t1 in t2_dst:
             return True
         # Known alias in tagging
-        if t1_dst and (t1_dst == t2_dst):
+        if t1_dst and t1_dst == t2_dst:
             return True
         return False
 
-    def add_tag(self, name, path):
-        ''' Add tag to taxonomy if not in tagging '''
-        l = self.__out_tagging.get_dst(name)
-        if (not l):
+    def add_tag(self, name: AnyStr, path: AnyStr):
+        """
+        Add tag to Taxonomy if it's not in Translation rules
+
+        :param name: The name of the tag
+        :param path: The full path
+        :return: None
+        """
+        dst = self.__out_translation.get_dst(name)
+        if not dst:
             self.__out_taxonomy.add_tag(path)
 
-    def add_expansion(self, src, dst_l):
+    def add_expansion(self, src: AnyStr, dst_l: Collection[AnyStr]):
+        """
+        Add expansion rule to fix destination if the source is in Translation.
+
+        :param src: The source label
+        :param dst_l: A list of destination labels
+        :return: None
+        """
         ''' Add expansion rule fixing destination if src in tagging '''
         # Select source handling aliases
-        l = self.__out_tagging.get_dst(src)
-        if l:
-            new_src = l[0]
+        dst = self.__out_translation.get_dst(src)
+        if dst:
+            new_src = dst[0]
         else:
             new_src = src
         # Select destinations removing overlaps with existing rule
-        l = self.__out_expansion.get_dst(src)
-        if l:
-            l.extend(dst_l)
-            target_l = self.__out_taxonomy.remove_overlaps(l)
+        dst = self.__out_expansion.get_dst(src)
+        if dst:
+            dst.extend(dst_l)
+            target_l = self.__out_taxonomy.remove_overlaps(dst)
             self.__out_expansion.add_rule(new_src, target_l, True)
         else:
             self.__out_expansion.add_rule(new_src, dst_l, True)
 
-    def add_alias(self, src, dst, dst_prefix):
-        ''' Add alias relation to taxonomy, tagging '''
+    def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr):
+        """
+        Add alias relation to the Taxonomy and Translation
+
+        :param src: Source alias
+        :param dst: Destination alias
+        :param dst_prefix: Destination prefix
+        :return: None
+        """
         # If src in tagging, use most popular target
-        l = self.__out_tagging.get_dst(src)
+        tr_dst = self.__out_translation.get_dst(src)
         target = dst
-        if l:
+        if tr_dst:
             cnt_max = self.src_map[dst]
-            for e in l:
+            for e in tr_dst:
                 cnt = self.src_map.get(e, 0)
                 if cnt > cnt_max:
                     target = e
         # If dst is in tagging, update tagging rule destination,
-        l = self.__out_tagging.get_dst(dst)
-        if l:
-            target_l = l
+        tr_dst = self.__out_translation.get_dst(dst)
+        if tr_dst:
+            target_l = tr_dst
         # else add dst to taxonomy
         else:
             target_l = [target]
@@ -125,10 +159,15 @@ def add_alias(self, src, dst, dst_prefix):
         # Remove src from taxonomy
         self.__out_taxonomy.remove_tag(src)
         # Replace tagging rule
-        self.__out_tagging.add_rule(src, target_l, True)
+        self.__out_translation.add_rule(src, target_l, True)
+
+    def is_expansion_rel(self, rel: Relation) -> bool:
+        """
+        Boolean whether or not the relation implies an expansion
 
-    def is_expansion_rel(self, rel):
-        ''' Return true if relation implies expansion rule '''
+        :param rel: The relation
+        :return: Boolean
+        """
         c1 = self.__out_taxonomy.get_category(rel.t1)
         c2 = self.__out_taxonomy.get_category(rel.t2)
         return (((c1 == "FAM") and (c2 != c1) and (c2 != "UNK")) or
@@ -136,15 +175,19 @@ def is_expansion_rel(self, rel):
                 ((c1 == "UNK") and ((c2 == "BEH") or (c2 == "CLASS"))))
 
     def find_expansions(self):
-        ''' Find expansions among relations '''
+        """
+        Resolve relations that are expansions
+
+        :return: None
+        """
         acc = []
         for rel in self.rel_set:
             p1 = self.__out_taxonomy.get_path(rel.t1)
             p2 = self.__out_taxonomy.get_path(rel.t2)
             logger.debug("Processing %s\t%s" % (p1, p2))
             # Ignore relations where t1 is an alias
-            l = self.__out_tagging.get_dst(rel.t1)
-            if l:
+            dst = self.__out_translation.get_dst(rel.t1)
+            if dst:
                 logger.debug("Ignoring relation for alias %s" % p1)
                 continue
             if self.is_expansion_rel(rel):
@@ -153,15 +196,14 @@ def find_expansions(self):
         for rel in acc:
             self.rel_set.remove(rel)
 
-    #def is_alias_rel(self, rel):
+    # def is_alias_rel(self, rel):
     #    ''' Return true if relation implies alias rule '''
     #    c1 = self.__out_taxonomy.get_category(rel.t1)
     #    c2 = self.__out_taxonomy.get_category(rel.t2)
     #    return (((c1 == "UNK") and (c2 == "FAM")) or
     #            ((c1 == "UNK") and (c2 == "UNK")))
 
-
-    #def find_aliases(self):
+    # def find_aliases(self):
     #    ''' Find aliases among relations '''
     #    for rel in self.rel_set:
     #        c1 = self.__out_taxonomy.get_category(rel.t1)
@@ -172,30 +214,32 @@ def find_expansions(self):
     #            self.G.add_edge(rel.t1, rel.t2, score=rel.talias_num)
     #    self.output_components("comp")
 
-    def process_relation(self, rel):
-        ''' Process relation and update taxonomy/tagging correspondingly '''
+    def process_relation(self, rel: Relation):
+        """
+        Process relation and update Taxonomy/Translation
 
-        # Obtain tag info
+        :param rel: The relation
+        :return:
+        """
         t1 = rel.t1
         t2 = rel.t2
-        p1,c1 = self.__out_taxonomy.get_info(rel.t1)
-        p2,c2 = self.__out_taxonomy.get_info(rel.t2)
+        p1, c1 = self.__out_taxonomy.get_info(rel.t1)
+        p2, c2 = self.__out_taxonomy.get_info(rel.t2)
 
         logger.debug("Processing %s\t%s" % (p1, p2))
 
         # If both directions strong, then equivalent, i.e., alias
-        if (float(rel.tinv_alias_num) >= args.t):
-            if (c1 != "UNK") and (c2 == "UNK"):
+        if float(rel.tinv_alias_num) >= self.__t:
+            if c1 != "UNK" and c2 == "UNK":
                 prefix = p1[0:p1.rfind(':')]
-            elif (c1 == "UNK") and (c2 != "UNK"):
+            elif c1 == "UNK" and c2 != "UNK":
                 prefix = p2[0:p2.rfind(':')]
-            elif (c1 == "UNK") and (c2 == "UNK"):
+            elif c1 == "UNK" and c2 == "UNK":
                 prefix = "FAM"
-            elif (c1 == c2):
+            elif c1 == c2:
                 prefix = p1[0:p1.rfind(':')]
             else:
-                logger.warn("Equivalent rule with different categories: %s\t%s" %
-                            (p1, p2))
+                logger.warning("Equivalent rule with different categories: %s\t%s" % (p1, p2))
                 return -1
             self.add_alias(t1, t2, prefix)
             return 1
@@ -232,7 +276,7 @@ def process_relation(self, rel):
             self.add_alias(t1, t2, "FAM")
             return 1
 
-         # FILE -> UNK : alias-file
+        # FILE -> UNK : alias-file
         elif (c1 == "FILE") and (c2 == "UNK"):
             prefix = p1[0:p1.rfind(':')]
             self.add_alias(t1, t2, prefix)
@@ -240,13 +284,12 @@ def process_relation(self, rel):
 
         # Same category : alias
         elif (c1 == "FAM") and (c2 == "FAM"):
-        #elif c1 == c2:
             prefix = p2[0:p2.rfind(':')]
             self.add_alias(t1, t2, prefix)
             return 1
 
         # Target unknown
-        elif (c2 == "UNK"):
+        elif c2 == "UNK":
             # If tokens are similar, likely family aliases
             # log.info("Similarity: %.02f" % levenshtein_ratio(t1, t2))
             # if (levenshtein_ratio(t1, t2) > sim_threshold):
@@ -258,11 +301,14 @@ def process_relation(self, rel):
             return 0
 
         # Default: review taxonomy
-        else:
-            return 0
-
+        return 0
 
     def run(self):
+        """
+        Run the updater.
+
+        :return: None
+        """
         num_iter = 0
         while self.rel_set:
             # Do a pass in remaining relations
@@ -299,8 +345,13 @@ def run(self):
         logger.debug("[-] Finding expansions")
         self.find_expansions()
 
+    def read_relations(self, filepath: AnyStr) -> Set[Relation]:
+        """
+        Filters weak and blacklisted relations
 
-    def read_relations(self, filepath):
+        :param filepath: The path of the file to read
+        :return: A set of Relation objects
+        """
         ''' Returns relations in file as a set
             Filters weak and blacklisted relations '''
         rel_set = set()
@@ -310,8 +361,7 @@ def read_relations(self, filepath):
                 if line.startswith('#'):
                     continue
                 # Parse line
-                t1, t2, t1_num, t2_num, nalias_num, talias_num, \
-                  tinv_alias_num = line.strip().split('\t')
+                t1, t2, t1_num, t2_num, nalias_num, talias_num, tinv_alias_num = line.strip().split('\t')
                 # Build relation
                 rel = Relation(t1, t2, t1_num, t2_num, nalias_num,
                                talias_num, tinv_alias_num)
@@ -324,8 +374,8 @@ def read_relations(self, filepath):
                 # Ignore known relations
                 # NOTE: commented since we check if a
                 # relation is known before processing it
-                #if self.is_known_rel(rel):
-                #    continue
+                # if self.is_known_rel(rel):
+                #     continue
                 # Add relation to set
                 rel_set.add(rel)
                 # Add to src_map
@@ -334,113 +384,81 @@ def read_relations(self, filepath):
 
         return rel_set
 
-    def output_relations(self, filepath):
-        fd = open(filepath, 'w')
-        fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t"
-                  "|t1^t2|/|t2|\n")
-        sorted_rules = sorted(self.rel_set,
-                              key=(lambda r: (
-                                self.__out_taxonomy.get_category(r.t1),
-                                self.__out_taxonomy.get_category(r.t2))),
-                              reverse=False)
-        for rel in sorted_rules:
-            p1,c1 = self.__out_taxonomy.get_info(rel.t1)
-            p2,c2 = self.__out_taxonomy.get_info(rel.t2)
-            fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %(
-                p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num,
-                rel.talias_num, rel.tinv_alias_num))
-        fd.close()
-
-    def output_rule_stats(self, fd):
-        # Initialize maps for statistics
-        self.dst_map = {}
-        self.cat_pairs_map = {}
+    def output_relations(self, filepath: AnyStr):
+        with open(filepath, 'w') as fd:
+            fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
+            sorted_rules = sorted(self.rel_set,
+                                  key=lambda r: (self.__out_taxonomy.get_category(r.t1),
+                                                 self.__out_taxonomy.get_category(r.t2)))
+            for rel in sorted_rules:
+                p1, c1 = self.__out_taxonomy.get_info(rel.t1)
+                p2, c2 = self.__out_taxonomy.get_info(rel.t2)
+                fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num,
+                                                           rel.talias_num, rel.tinv_alias_num))
+
+    def output_rule_stats(self, fd: TextIO):
         # Compute rule statistics
         for rel in self.rel_set:
             c1 = self.__out_taxonomy.get_category(rel.t1)
             c2 = self.__out_taxonomy.get_category(rel.t2)
-            self.cat_pairs_map[(c1,c2)] = self.cat_pairs_map.get((c1,
-                                                                  c2), 0) + 1
+            self.cat_pairs_map[(c1, c2)] = self.cat_pairs_map.get((c1, c2), 0) + 1
             self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1
         # Output statistics
-        cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1,0),
-                            reverse=True)
-        for (c1,c2), cnt in cat_pairs:
+        cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1, 0), reverse=True)
+        for (c1, c2), cnt in cat_pairs:
             fd.write("%s\t%s\t%03d\n" % (c1, c2, cnt))
 
         # Print dst statistics
-        dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1,0),
-                            reverse=False)
+        dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1, 0))
         for dst, cnt in dst_pairs:
             fd.write("%s\t%03d\n" % (taxonomy.get_path(dst), cnt))
 
-    def output(self, out_prefix):
-        if (not out_prefix):
+    @staticmethod
+    def output(prefix: Optional[AnyStr] = None):
+        if not prefix:
             tax_filepath = util.DEFAULT_TAX_PATH
             tag_filepath = util.DEFAULT_TAG_PATH
             exp_filepath = util.DEFAULT_EXP_PATH
         else:
-            tax_filepath = out_prefix + ".taxonomy"
-            tag_filepath = out_prefix + ".tagging"
-            exp_filepath = out_prefix + ".expansion"
+            tax_filepath = prefix + ".taxonomy"
+            tag_filepath = prefix + ".tagging"
+            exp_filepath = prefix + ".expansion"
+
         taxonomy.to_file(tax_filepath)
-        logger.info('[-] Output %d taxonomy tags to %s' % (
-                        len(taxonomy), tax_filepath))
+        logger.info('[-] Output %d taxonomy tags to %s' % (len(taxonomy), tax_filepath))
         tagging.expand_all_destinations()
         tagging.to_file(tag_filepath)
-        logger.info('[-] Output %d tagging rules to %s' % (
-                        len(tagging), tag_filepath))
+        logger.info('[-] Output %d tagging rules to %s' % (len(tagging), tag_filepath))
         expansion.to_file(exp_filepath)
-        logger.info('[-] Output %d expansion rules to %s' % (
-                        len(expansion), exp_filepath))
+        logger.info('[-] Output %d expansion rules to %s' % (len(expansion), exp_filepath))
 
 
 if __name__ == '__main__':
-    argparser = argparse.ArgumentParser(
-        description='''Given a .alias file from the labeler, 
-        generates updates for the taxonomy, tagging, and expansion files.''')
+    parser = argparse.ArgumentParser(description='Given a .alias file from the labeler, generates updates for the '
+                                                 'taxonomy, tagging, and expansion files.')
 
-    argparser.add_argument('-alias',
-        help='file to parse with alias from labeler'
-             'Labeler will run if -alias not present')
+    parser.add_argument('-alias', help='file to parse with alias from labeler which runs if -alias not present')
 
-    argparser.add_argument('-n',
-        help='Minimum number of times that a pair of tokes have been seen.'
-             'Default: 20',
-        type=int,
-        default=20)
+    parser.add_argument('-n', help='Minimum number of times that a pair of tokens has been seen. Default: 20',
+                        type=int, default=20)
 
-    argparser.add_argument('-t',
-        help='Minimum percentage of times two tokens appear together.'
-             'Default: 1.94',
-        type=float,
-        default=0.94)
+    parser.add_argument('-t', help='Minimum percentage of times two tokens appear together. Default: 0.94',
+                        type=float, default=0.94)
 
-    argparser.add_argument('-o',
-        help='output prefix for files')
+    parser.add_argument('-o', help='output prefix for files')
 
-    argparser.add_argument('-update',
-        action='store_true',
-        help='update default taxonomy,tagging,expansion files in place')
+    parser.add_argument('-update', action='store_true', help='update default taxonomy,tagging,expansion files in place')
 
-    argparser.add_argument('-tag',
-        help='file with tagging rules.',
-        default = util.DEFAULT_TAG_PATH)
+    parser.add_argument('-tag', help='file with tagging rules.', default=util.DEFAULT_TAG_PATH)
 
-    argparser.add_argument('-tax',
-        help='file with taxonomy.',
-        default = util.DEFAULT_TAX_PATH)
+    parser.add_argument('-tax', help='file with taxonomy.', default=util.DEFAULT_TAX_PATH)
 
-    argparser.add_argument('-exp',
-        help='file with expansion rules.',
-        default = util.DEFAULT_EXP_PATH)
+    parser.add_argument('-exp', help='file with expansion rules.', default=util.DEFAULT_EXP_PATH)
 
-    argparser.add_argument('-v', '--verbose',
-        action='store_true',
-        help='verbose, prints debugging statements.')
+    parser.add_argument('-v', '--verbose', action='store_true', help='verbose, prints debugging statements.')
 
     # Parse arguments
-    args = argparser.parse_args()
+    args = parser.parse_args()
 
     # Check we have the input
     if not args.alias:
@@ -448,35 +466,31 @@ def output(self, out_prefix):
         exit(1)
 
     # Set logging level
-    if (args.verbose):
+    if args.verbose:
         handler_stderr.setLevel(logging.DEBUG)
 
     # Set output prefix
     if args.o:
-      out_prefix = args.o
+        out_prefix = args.o
     else:
-      out_prefix = os.path.splitext(args.alias)[0]
+        out_prefix = os.path.splitext(args.alias)[0]
 
     # Read taxonomy
     taxonomy = Taxonomy(args.tax)
-    logger.info('[-] Read %d taxonomy tags from %s' % (
-                        len(taxonomy), args.tax))
+    logger.info('[-] Read %d taxonomy tags from %s' % (len(taxonomy), args.tax))
 
     # Read tagging rules
     tagging = Translation(args.tag)
-    logger.info('[-] Read %d tagging rules from %s' % (
-                        len(tagging), args.tag))
+    logger.info('[-] Read %d tagging rules from %s' % (len(tagging), args.tag))
 
     # Read expansion rules
     expansion = Expansion(args.exp)
-    logger.info('[-] Read %d expansion rules from %s' % (
-                        len(expansion), args.exp))
+    logger.info('[-] Read %d expansion rules from %s' % (len(expansion), args.exp))
 
     # Build update object
     update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t)
 
-    logger.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % (
-                        update.num_rules(), args.t, args.n))
+    logger.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % (update.num_rules(), args.t, args.n))
 
     # Output initial rules
     update.output_relations(out_prefix + ".orig.rules")
@@ -493,6 +507,4 @@ def output(self, out_prefix):
     else:
         update.output(out_prefix)
 
-    # Output final rules
     update.output_relations(out_prefix + ".final.rules")
-

From dd591b1ac7bb7eb74fc9ba3860f0ccdc67291aea Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 1 Feb 2021 14:51:33 -0500
Subject: [PATCH 14/36] Fixed some issues for console execution, black
 formatting

---
 avclass/common.py  | 157 ++++++++++++++---------
 avclass/labeler.py | 305 +++++++++++++++++++++++++++++----------------
 2 files changed, 293 insertions(+), 169 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index cf79a21..9cbe4bc 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -16,17 +16,28 @@
 # Default category for tags in taxonomy with no category
 uncategorized_cat = "UNC"
 
-SampleInfo = namedtuple('SampleInfo', 
-                        ['md5', 'sha1', 'sha256', 'labels', 'vt_tags'])
+SampleInfo = namedtuple("SampleInfo", ["md5", "sha1", "sha256", "labels", "vt_tags"])
 
-Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l'])
+Tag = namedtuple("Tag", ["name", "cat", "path", "prefix_l"])
 
 # AVs to use in suffix removal
-suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky',
-                         'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
-                         'GData', 'Avast', 'Sophos',
-                         'TrendMicro-HouseCall', 'TrendMicro',
-                         'NANO-Antivirus', 'Microsoft'}
+suffix_removal_av_set = {
+    "Norman",
+    "Avast",
+    "Avira",
+    "Kaspersky",
+    "ESET-NOD32",
+    "Fortinet",
+    "Jiangmin",
+    "Comodo",
+    "GData",
+    "Avast",
+    "Sophos",
+    "TrendMicro-HouseCall",
+    "TrendMicro",
+    "NANO-Antivirus",
+    "Microsoft",
+}
 
 
 def create_tag(s: AnyStr):
@@ -43,8 +54,8 @@ def create_tag(s: AnyStr):
         prefix_l = [x.lower() for x in word_list[1:-1]]
         path = cat
         for x in prefix_l:
-            path = path + ':' + x
-        path = path + ':' + name
+            path = path + ":" + x
+        path = path + ":" + name
     else:
         name = word_list[0].lower()
         cat = uncategorized_cat
@@ -57,6 +68,7 @@ class Taxonomy:
     """
     Contains tags and generic tokens read from filesystem
     """
+
     def __init__(self, filepath: Optional[AnyStr]):
         """
         Initialize and populate the Tag map from ``filepath``
@@ -73,7 +85,9 @@ def __len__(self) -> int:
 
         :return: The length (int) of the Taxonomy
         """
-        return len(self.__tag_map)//2  # TODO - perhaps there should be two dicts, one for names, one for paths?
+        return (
+            len(self.__tag_map) // 2
+        )  # TODO - perhaps there should be two dicts, one for names, one for paths?
 
     def is_generic(self, tag: AnyStr) -> bool:
         """
@@ -83,7 +97,7 @@ def is_generic(self, tag: AnyStr) -> bool:
         :return: Boolean
         """
         t = self.__tag_map.get(tag, None)
-        return getattr(t, 'cat', None) == 'GEN'
+        return getattr(t, "cat", None) == "GEN"
 
     def is_tag(self, tag: AnyStr) -> bool:
         """
@@ -138,7 +152,7 @@ def get_category(self, tag: AnyStr) -> AnyStr:
         :return: The category
         """
         t = self.__tag_map.get(tag, None)
-        return getattr(t, 'cat', 'UNK')
+        return getattr(t, "cat", "UNK")
 
     def get_path(self, tag: AnyStr) -> AnyStr:
         """
@@ -148,7 +162,7 @@ def get_path(self, tag: AnyStr) -> AnyStr:
         :return: The tag's path
         """
         t = self.__tag_map.get(tag, None)
-        return getattr(t, 'path', f'UNK:{tag}')
+        return getattr(t, "path", f"UNK:{tag}")
 
     def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]:
         """
@@ -158,7 +172,7 @@ def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]:
         :return: The tag's prefix list
         """
         t = self.__tag_map.get(tag, None)
-        return getattr(t, 'prefix_l', [])
+        return getattr(t, "prefix_l", [])
 
     def get_prefix(self, tag: AnyStr) -> List[AnyStr]:
         """
@@ -168,7 +182,7 @@ def get_prefix(self, tag: AnyStr) -> List[AnyStr]:
         :return: String representation of the tag's full prefix
         """
         t = self.__tag_map.get(tag, None)
-        tag_pfx = tag.path.split(':')[:-1]
+        tag_pfx = tag.path.split(":")[:-1]
         return t.prefix_l if t else tag_pfx
 
     def get_depth(self, tag: AnyStr) -> int:
@@ -213,7 +227,11 @@ def platform_tags(self) -> Set[AnyStr]:
 
         :return: Set of platformn tags
         """
-        return {tag.name for _, tag in self.__tag_map.items() if tag.path.startswith(platform_prefix)}
+        return {
+            tag.name
+            for _, tag in self.__tag_map.items()
+            if tag.path.startswith(platform_prefix)
+        }
 
     def overlaps(self, t1: AnyStr, t2: AnyStr) -> bool:
         """
@@ -227,7 +245,9 @@ def overlaps(self, t1: AnyStr, t2: AnyStr) -> bool:
         m2 = self.get_prefix_l(t2)
         return t1 in m2 or t2 in m1
 
-    def remove_overlaps(self, l: Collection[AnyStr]) -> Union[Collection[AnyStr], List[AnyStr]]:
+    def remove_overlaps(
+        self, l: Collection[AnyStr]
+    ) -> Union[Collection[AnyStr], List[AnyStr]]:
         """
         Returns list with overlapping tags removed
 
@@ -252,10 +272,10 @@ def read_taxonomy(self, filepath: AnyStr):
         :param filepath: The path of the file to read
         :return: None
         """
-        with open(filepath, 'r') as fd:
+        with open(filepath, "r") as fd:
             for line in fd:
                 line = line.strip()
-                if not line.startswith('#') and line:
+                if not line.startswith("#") and line:
                     self.add_tag(line)
 
     def to_file(self, filepath: AnyStr):
@@ -265,9 +285,8 @@ def to_file(self, filepath: AnyStr):
         :param filepath: The path to write
         :return: None
         """
-        with open(filepath, 'w') as fd:
-            tag_l = sorted(self.__tag_map.items(),
-                           key=lambda item: item[1].path)
+        with open(filepath, "w") as fd:
+            tag_l = sorted(self.__tag_map.items(), key=lambda item: item[1].path)
             idx = 0
             for name, tag in tag_l:
                 if (idx % 2) == 0:
@@ -279,6 +298,7 @@ class Rules:
     """
     Map a single source with one or more destinations
     """
+
     def __init__(self, filepath: Optional[AnyStr]):
         """
         Initialize the rule-map and read rules from ``filepath``
@@ -297,7 +317,9 @@ def __len__(self):
         """
         return len(self._rmap)
 
-    def add_rule(self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False):
+    def add_rule(
+        self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False
+    ):
         """
         Add a rule to the map.  On duplicate, append destinations.  If ``overwrite`` is set, replace rule src/dst.
 
@@ -347,10 +369,10 @@ def read_rules(self, filepath: AnyStr):
         :param filepath: The path of the file to read
         :return: None
         """
-        with open(filepath, 'r') as fd:
+        with open(filepath, "r") as fd:
             for line in fd:
                 line = line.strip()
-                if not line.startswith('#') and line:
+                if not line.startswith("#") and line:
                     word_list = line.split()
                     if len(word_list) > 1:
                         self.add_rule(word_list[0], word_list[1:])
@@ -363,16 +385,16 @@ def to_file(self, filepath: AnyStr, taxonomy: Taxonomy = None):
         :param taxonomy: A Taxonomy to optionally resolve full tag paths
         :return: None
         """
-        with open(filepath, 'w') as fd:
+        with open(filepath, "w") as fd:
             for src, dst_set in sorted(self._rmap.items()):
                 dst_l = sorted(dst_set)
                 if taxonomy:
                     src_path = taxonomy.get_path(src)
                     path_l = [taxonomy.get_path(t) for t in dst_l]
-                    dst_str = '\t'.join(path_l)
+                    dst_str = "\t".join(path_l)
                     fd.write("%s\t%s\n" % (src_path, dst_str))
                 else:
-                    dst_str = '\t'.join(dst_l)
+                    dst_str = "\t".join(dst_l)
                     fd.write("%s\t%s\n" % (src, dst_str))
 
     def expand_src_destinations(self, src: AnyStr) -> Set[AnyStr]:
@@ -412,6 +434,7 @@ class Translation(Rules):
     """
     Translations are a set of rules that convert between unknown labels and labels that are in our Taxonomy
     """
+
     def __init__(self, filepath: AnyStr):
         super().__init__(filepath)
 
@@ -433,6 +456,7 @@ class Expansion(Rules):
     """
     Expansions are rules that allow us to map a single label (src) to all explicit and implicit labels
     """
+
     def __init__(self, filepath: AnyStr):
         super().__init__(filepath)
 
@@ -457,8 +481,15 @@ class AvLabels:
     """
     Primary class used to interpret AV Labels
     """
-    def __init__(self, tag_file: AnyStr, exp_file: AnyStr = None, tax_file: AnyStr = None, av_file: AnyStr = None,
-                 alias_detect: AnyStr = False):
+
+    def __init__(
+        self,
+        tag_file: AnyStr,
+        exp_file: AnyStr = None,
+        tax_file: AnyStr = None,
+        av_file: AnyStr = None,
+        alias_detect: bool = False,
+    ):
         self.taxonomy = Taxonomy(tax_file)
         self.translations = Translation(tag_file)
         self.expansions = Expansion(exp_file)
@@ -486,7 +517,9 @@ def get_sample_info_lb(record: Dict) -> SampleInfo:
         :param record: The JSON record
         :return: An instance of SampleInfo
         """
-        return SampleInfo(record['md5'], record['sha1'], record['sha256'], record['av_labels'], [])
+        return SampleInfo(
+            record["md5"], record["sha1"], record["sha256"], record["av_labels"], []
+        )
 
     @staticmethod
     def get_sample_info_vt_v2(record):
@@ -497,22 +530,24 @@ def get_sample_info_vt_v2(record):
         :return: An instance of SampleInfo
         """
         try:
-            scans = record['scans']
-            md5 = record['md5']
-            sha1 = record['sha1']
-            sha256 = record['sha256']
+            scans = record["scans"]
+            md5 = record["md5"]
+            sha1 = record["sha1"]
+            sha256 = record["sha256"]
         except KeyError:
             return None
 
         # Obtain labels from scan results
         label_pairs = []
         for av, res in scans.items():
-            if res['detected']:
-                label = res['result']
-                clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip()
+            if res["detected"]:
+                label = res["result"]
+                clean_label = "".join(
+                    filter(lambda x: x in string.printable, label)
+                ).strip()
                 label_pairs.append((av, clean_label))
 
-        vt_tags = record.get('tags', [])
+        vt_tags = record.get("tags", [])
 
         return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 
@@ -525,22 +560,24 @@ def get_sample_info_vt_v3(record):
         :return: An instance of SampleInfo
         """
         try:
-            scans = record['data']['attributes']['last_analysis_results']
-            md5 = record['data']['attributes']['md5']
-            sha1 = record['data']['attributes']['sha1']
-            sha256 = record['data']['attributes']['sha256']
+            scans = record["data"]["attributes"]["last_analysis_results"]
+            md5 = record["data"]["attributes"]["md5"]
+            sha1 = record["data"]["attributes"]["sha1"]
+            sha256 = record["data"]["attributes"]["sha256"]
         except KeyError:
             return None
 
         # Obtain labels from scan results
         label_pairs = []
         for av, res in scans.items():
-            label = res['result']
+            label = res["result"]
             if label is not None:
-                clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip()
+                clean_label = "".join(
+                    filter(lambda x: x in string.printable, label)
+                ).strip()
                 label_pairs.append((av, clean_label))
 
-        vt_tags = record['data']['attributes'].get('tags', [])
+        vt_tags = record["data"]["attributes"].get("tags", [])
 
         return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 
@@ -563,7 +600,7 @@ def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]:
             path, cat = taxonomy.get_info(tag)
             if cat == "CLASS":
                 if "grayware" in path:
-                    return float(ctr) >= float(max_ctr)*threshold
+                    return float(ctr) >= float(max_ctr) * threshold
                 else:
                     return False
         return False
@@ -579,18 +616,18 @@ def __remove_suffixes(av_name: AnyStr, label: AnyStr) -> AnyStr:
         """
         # Truncate after last '.'
         if av_name in suffix_removal_av_set:
-            label = label.rsplit('.', 1)[0]
+            label = label.rsplit(".", 1)[0]
 
-        # Truncate after last '.' 
+        # Truncate after last '.'
         # if suffix only contains digits or uppercase (no lowercase) chars
-        if av_name == 'AVG':
-            tokens = label.rsplit('.', 1)
+        if av_name == "AVG":
+            tokens = label.rsplit(".", 1)
             if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]):
                 label = tokens[0]
 
         # Truncate after last '!'
-        if av_name == 'Agnitum':
-            label = label.rsplit('!', 1)[0]
+        if av_name == "Agnitum":
+            label = label.rsplit("!", 1)[0]
 
         return label
 
@@ -620,7 +657,7 @@ def get_label_tags(self, label: AnyStr, hashes: Collection[AnyStr]) -> Set[AnySt
                 token = token[:-end_len]
 
             # Ignore token if prefix of a hash of the sample
-            # Most AVs use MD5 prefixes in labels, 
+            # Most AVs use MD5 prefixes in labels,
             # but we check SHA1 and SHA256 as well
             if any([h.startswith(token) for h in hashes]):
                 continue
@@ -675,17 +712,17 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]
 
         # Process each AV label
         for av_name, label in sample_info.labels:
-            if not label or av_name not in self.avs:
+            if not label or (self.avs and av_name not in self.avs):
                 continue
 
             # Emsisoft uses same label as
             # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
             # but suffixes ' (B)' to their label. Remove the suffix.
-            label = label.rstrip(' (B)')
+            label = label.rstrip(" (B)")
 
             # F-Secure uses Avira's engine since Nov. 2018
             # but prefixes 'Malware.' to Avira's label. Remove the prefix.
-            label = label.lstrip('Malware.')
+            label = label.lstrip("Malware.")
 
             # Other engines often use exactly the same label, e.g.,
             #   AVG/Avast
@@ -711,7 +748,9 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]
         return av_dict
 
     @staticmethod
-    def rank_tags(av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1) -> List[Tuple[AnyStr, int]]:
+    def rank_tags(
+        av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1
+    ) -> List[Tuple[AnyStr, int]]:
         """
         Get a list of tuples containing a tag and the number of AV that confirmed that tag sorted by number of AV
         (descending).
diff --git a/avclass/labeler.py b/avclass/labeler.py
index b96a28f..175e798 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -7,8 +7,14 @@
 from operator import itemgetter
 from typing import AnyStr, Optional
 
-from avclass.common import AvLabels, Taxonomy
-from avclass import clustering as ec, util
+try:
+    from avclass.common import AvLabels, Taxonomy
+    from avclass import clustering as ec, util
+except ModuleNotFoundError:
+    # Helps find the avclasses when run from console
+    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    from avclass.common import AvLabels, Taxonomy
+    from avclass import clustering as ec, util
 
 
 def guess_hash(h: AnyStr) -> Optional[AnyStr]:
@@ -18,14 +24,14 @@ def guess_hash(h: AnyStr) -> Optional[AnyStr]:
     :param h: The hash
     :return: The hash type (str)
     """
-    ''' Given a hash string, guess the hash type based on the string length '''
+    """ Given a hash string, guess the hash type based on the string length """
     hlen = len(h)
     if hlen == 32:
-        return 'md5'
+        return "md5"
     elif hlen == 40:
-        return 'sha1'
+        return "sha1"
     elif hlen == 64:
-        return 'sha256'
+        return "sha256"
 
     return None
 
@@ -50,7 +56,7 @@ def format_tag_pairs(l, taxonomy: Taxonomy = None) -> AnyStr:
     out = "%s|%d" % (p, l[0][1])
     for t, s in l[1:]:
         if taxonomy is not None:
-            p = taxonomy.get_path(t) 
+            p = taxonomy.get_path(t)
         else:
             p = t
         out += ",%s|%d" % (p, s)
@@ -80,14 +86,14 @@ def main():
     # TODO - break this function up.
     args = parse_args()
     # Select hash used to identify sample, by default MD5
-    hash_type = args.hash or 'md5'
+    hash_type = args.hash or "md5"
 
     # If ground truth provided, read it from file
     gt_dict = {}
     if args.gt:
-        with open(args.gt, 'r') as gt_fd:
+        with open(args.gt, "r") as gt_fd:
             for line in gt_fd:
-                gt_hash, family = map(str, line.strip().split('\t', 1))
+                gt_hash, family = map(str, line.strip().split("\t", 1))
                 gt_dict[gt_hash] = family
 
         # Guess type of hash in ground truth file
@@ -132,12 +138,21 @@ def main():
     pair_count_map = {}
     vt_all = 0
     avtags_dict = {}
-    stats = {'samples': 0, 'noscans': 0, 'tagged': 0, 'maltagged': 0,
-             'FAM': 0, 'CLASS': 0, 'BEH': 0, 'FILE': 0, 'UNK': 0}
+    stats = {
+        "samples": 0,
+        "noscans": 0,
+        "tagged": 0,
+        "maltagged": 0,
+        "FAM": 0,
+        "CLASS": 0,
+        "BEH": 0,
+        "FILE": 0,
+        "UNK": 0,
+    }
 
     for ifile in ifile_l:
-        fd = open(ifile, 'r')
-        sys.stderr.write('[-] Processing input file %s\n' % ifile)
+        fd = open(ifile, "r")
+        sys.stderr.write("[-] Processing input file %s\n" % ifile)
 
         for line in fd:
             if not line.strip():
@@ -145,7 +160,7 @@ def main():
 
             # Debug info
             if vt_all % 100 == 0:
-                sys.stderr.write('\r[-] %d JSON read' % vt_all)
+                sys.stderr.write("\r[-] %d JSON read\n" % vt_all)
                 sys.stderr.flush()
             vt_all += 1
 
@@ -154,13 +169,13 @@ def main():
 
             if sample_info is None:
                 try:
-                    name = vt_rep['md5']
-                    sys.stderr.write('\nNo scans for %s\n' % name)
+                    name = vt_rep["md5"]
+                    sys.stderr.write("\nNo scans for %s\n" % name)
                 except KeyError:
-                    sys.stderr.write('\nCould not process: %s\n' % line)
+                    sys.stderr.write("\nCould not process: %s\n" % line)
 
                 sys.stderr.flush()
-                stats['noscans'] += 1
+                stats["noscans"] += 1
                 continue
 
             # Sample's name is selected hash type (md5 by default)
@@ -168,7 +183,7 @@ def main():
 
             # If the VT report has no AV labels, output and continue
             if not sample_info.labels:
-                sys.stdout.write('%s\t-\t[]\n' % name)
+                sys.stdout.write("%s\t-\t[]\n" % name)
                 # sys.stderr.write('\nNo AV labels for %s\n' % name)
                 # sys.stderr.flush()
                 continue
@@ -212,7 +227,13 @@ def main():
                     if args.stats:
                         if vt_count > 3:
                             stats["maltagged"] += 1
-                            cat_map = {'FAM': False, 'CLASS': False, 'BEH': False, 'FILE': False, 'UNK': False}
+                            cat_map = {
+                                "FAM": False,
+                                "CLASS": False,
+                                "BEH": False,
+                                "FILE": False,
+                                "UNK": False,
+                            }
                             for t in tags:
                                 path, cat = av_labels.taxonomy.get_info(t[0])
                                 cat_map[cat] = True
@@ -240,7 +261,7 @@ def main():
                             break
 
                     first_token_dict[name] = fam
-                    gt_family = '\t' + gt_dict.get(name, "")
+                    gt_family = "\t" + gt_dict.get(name, "")
                 else:
                     gt_family = ""
 
@@ -256,64 +277,75 @@ def main():
                         tag_str = format_tag_pairs(tags, av_labels.taxonomy)
                     else:
                         tag_str = format_tag_pairs(tags)
-                    sys.stdout.write('%s\t%d\t%s%s%s%s\n' % name, vt_count, tag_str, gt_family, is_pup_str, vtt)
+                    sys.stdout.write(
+                        "%s\t%d\t%s%s%s%s\n"
+                        % (name, vt_count, tag_str, gt_family, is_pup_str, vtt)
+                    )
                 else:
-                    sys.stdout.write('%s\t%s%s%s\n' % name, fam, gt_family, is_pup_str)
+                    sys.stdout.write("%s\t%s%s%s\n" % name, fam, gt_family, is_pup_str)
             except:
                 traceback.print_exc(file=sys.stderr)
                 continue
 
-        sys.stderr.write('\r[-] %d JSON read' % vt_all)
+        sys.stderr.write("\r[-] %d JSON read" % vt_all)
         sys.stderr.flush()
-        sys.stderr.write('\n')
+        sys.stderr.write("\n")
 
         fd.close()
 
     # Print statistics
-    sys.stderr.write("[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" %
-                     (vt_all, stats['noscans'], vt_all - stats['tagged'], len(gt_dict)))
+    sys.stderr.write(
+        "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n"
+        % (vt_all, stats["noscans"], vt_all - stats["tagged"], len(gt_dict))
+    )
 
     # If ground truth, print precision, recall, and F1-measure
     if args.gt:
-        precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict)
-        sys.stderr.write("Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % (precision, recall, fmeasure))
+        precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(
+            gt_dict, first_token_dict
+        )
+        sys.stderr.write(
+            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n"
+            % (precision, recall, fmeasure)
+        )
 
     # Output stats
     if args.stats:
-        stats_fd = open("%s.stats" % out_prefix, 'w')
+        stats_fd = open("%s.stats" % out_prefix, "w")
         num_samples = vt_all
-        stats_fd.write('Samples: %d\n' % num_samples)
-        num_tagged = stats['tagged']
+        stats_fd.write("Samples: %d\n" % num_samples)
+        num_tagged = stats["tagged"]
         frac = float(num_tagged) / float(num_samples) * 100
-        stats_fd.write('Tagged (all): %d (%.01f%%)\n' % (num_tagged, frac))
-        num_maltagged = stats['maltagged']
+        stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac))
+        num_maltagged = stats["maltagged"]
         frac = float(num_maltagged) / float(num_samples) * 100
-        stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (num_maltagged, frac))
-        for c in ['FILE', 'CLASS', 'BEH', 'FAM', 'UNK']:
+        stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac))
+        for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]:
             count = stats[c]
             frac = float(count) / float(num_maltagged) * 100
-            stats_fd.write('%s: %d (%.01f%%)\n' % (c, stats[c], frac))
+            stats_fd.write("%s: %d (%.01f%%)\n" % (c, stats[c], frac))
         stats_fd.close()
 
     # Output vendor info
     if args.avtags:
-        avtags_fd = open("%s.avtags" % out_prefix, 'w')
+        avtags_fd = open("%s.avtags" % out_prefix, "w")
         for t in sorted(avtags_dict.keys()):
-            avtags_fd.write('%s\t' % t)
-            pairs = sorted(avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True)
+            avtags_fd.write("%s\t" % t)
+            pairs = sorted(
+                avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True
+            )
 
             for pair in pairs:
-                avtags_fd.write('%s|%d,' % (pair[0], pair[1]))
-            avtags_fd.write('\n')
+                avtags_fd.write("%s|%d," % (pair[0], pair[1]))
+            avtags_fd.write("\n")
         avtags_fd.close()
 
     # If alias detection, print map
     if args.aliasdetect:
-        alias_filename = out_prefix + '.alias'
-        alias_fd = open(alias_filename, 'w+')
+        alias_filename = out_prefix + ".alias"
+        alias_fd = open(alias_filename, "w+")
         # Sort token pairs by number of times they appear together
-        sorted_pairs = sorted(
-            pair_count_map.items(), key=itemgetter(1))
+        sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1))
         # sorted_pairs = sorted(
         #     pair_count_map.items())
 
@@ -335,101 +367,154 @@ def main():
                 yn = n1
             f = float(c) / float(xn)
             finv = float(c) / float(yn)
-            alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv))
+            alias_fd.write(
+                "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv)
+            )
         # Close alias file
         alias_fd.close()
-        sys.stderr.write('[-] Alias data in %s\n' % alias_filename)
+        sys.stderr.write("[-] Alias data in %s\n" % alias_filename)
 
 
 def parse_args():
-    argparser = argparse.ArgumentParser(prog='avclass',
-                                        description='Extracts tags for a set of samples.  Also calculates precision and'
-                                                    ' recall if ground truth available')
-
-    argparser.add_argument('-vt', action='append', help='file with VT reports (Can be provided multiple times)')
-
-    argparser.add_argument('-lb', action='append', help='file with simplified JSON reports '
-                                                        '{md5,sha1,sha256,scan_date,av_labels} (Can be provided '
-                                                        'multiple times)')
-
-    argparser.add_argument('-vtdir', help='existing directory with VT reports')
-
-    argparser.add_argument('-lbdir', help='existing directory with simplified JSON reports')
-
-    argparser.add_argument('-vt3', action='store_true', help='input are VT v3 files')
-
-    argparser.add_argument('-gt', help='file with ground truth. If provided it evaluates clustering accuracy. '
-                                       'Prints precision, recall, F1-measure.')
-
-    argparser.add_argument('-vtt', help='Include VT tags in the output.', action='store_true')
-
-    argparser.add_argument('-tag', help='file with tagging rules.', default=util.DEFAULT_TAG_PATH)
-
-    argparser.add_argument('-tax', help='file with taxonomy.', default=util.DEFAULT_TAX_PATH)
-
-    argparser.add_argument('-exp', help='file with expansion rules.', default=util.DEFAULT_EXP_PATH)
-
-    argparser.add_argument('-av', help='file with list of AVs to use')
-
-    argparser.add_argument('-avtags', help='extracts tags per av vendor', action='store_true')
-
-    argparser.add_argument('-pup', action='store_true', help='if used each sample is classified as PUP or not')
-
-    argparser.add_argument('-p', '--path', help='output.full path for tags', action='store_true')
-
-    argparser.add_argument('-hash', help='hash used to name samples. Should match ground truth',
-                           choices=['md5', 'sha1', 'sha256'])
-
-    argparser.add_argument('-c', help='Compatibility mode. Outputs results in AVClass format.', action='store_true')
-
-    argparser.add_argument('-aliasdetect', action='store_true', help='if used produce aliases file at end')
-
-    argparser.add_argument('-stats', action='store_true', help='if used produce 1 file with stats per category '
-                                                               '(File, Class, Behavior, Family, Unclassified)')
+    argparser = argparse.ArgumentParser(
+        prog="avclass",
+        description="Extracts tags for a set of samples.  Also calculates precision and"
+        " recall if ground truth available",
+    )
+
+    argparser.add_argument(
+        "-vt",
+        action="append",
+        help="file with VT reports (Can be provided multiple times)",
+    )
+
+    argparser.add_argument(
+        "-lb",
+        action="append",
+        help="file with simplified JSON reports "
+        "{md5,sha1,sha256,scan_date,av_labels} (Can be provided "
+        "multiple times)",
+    )
+
+    argparser.add_argument("-vtdir", help="existing directory with VT reports")
+
+    argparser.add_argument(
+        "-lbdir", help="existing directory with simplified JSON reports"
+    )
+
+    argparser.add_argument("-vt3", action="store_true", help="input are VT v3 files")
+
+    argparser.add_argument(
+        "-gt",
+        help="file with ground truth. If provided it evaluates clustering accuracy. "
+        "Prints precision, recall, F1-measure.",
+    )
+
+    argparser.add_argument(
+        "-vtt", help="Include VT tags in the output.", action="store_true"
+    )
+
+    argparser.add_argument(
+        "-tag", help="file with tagging rules.", default=util.DEFAULT_TAG_PATH
+    )
+
+    argparser.add_argument(
+        "-tax", help="file with taxonomy.", default=util.DEFAULT_TAX_PATH
+    )
+
+    argparser.add_argument(
+        "-exp", help="file with expansion rules.", default=util.DEFAULT_EXP_PATH
+    )
+
+    argparser.add_argument("-av", help="file with list of AVs to use")
+
+    argparser.add_argument(
+        "-avtags", help="extracts tags per av vendor", action="store_true"
+    )
+
+    argparser.add_argument(
+        "-pup",
+        action="store_true",
+        help="if used each sample is classified as PUP or not",
+    )
+
+    argparser.add_argument(
+        "-p", "--path", help="output.full path for tags", action="store_true"
+    )
+
+    argparser.add_argument(
+        "-hash",
+        help="hash used to name samples. Should match ground truth",
+        choices=["md5", "sha1", "sha256"],
+    )
+
+    argparser.add_argument(
+        "-c",
+        help="Compatibility mode. Outputs results in AVClass format.",
+        action="store_true",
+    )
+
+    argparser.add_argument(
+        "-aliasdetect", action="store_true", help="if used produce aliases file at end"
+    )
+
+    argparser.add_argument(
+        "-stats",
+        action="store_true",
+        help="if used produce 1 file with stats per category "
+        "(File, Class, Behavior, Family, Unclassified)",
+    )
 
     args = argparser.parse_args()
 
     # TODO - use non-exclusive group to ensure at least one is selected instead of this
     if not args.vt and not args.lb and not args.vtdir and not args.lbdir:
-        sys.stderr.write('One of the following 4 arguments is required: '
-                         '-vt,-lb,-vtdir,-lbdir\n')
+        sys.stderr.write(
+            "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n"
+        )
         exit(1)
 
     # TODO - use mutex group for this instead of manual check
     if (args.vt or args.vtdir) and (args.lb or args.lbdir):
-        sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. '
-                         'Both types of input files cannot be combined.\n')
+        sys.stderr.write(
+            "Use either -vt/-vtdir or -lb/-lbdir. "
+            "Both types of input files cannot be combined.\n"
+        )
         exit(1)
 
     # TODO - consider letting argparse handle this?
     if args.tag:
-        if args.tag == '/dev/null':
-            sys.stderr.write('[-] Using no tagging rules\n')
+        if args.tag == "/dev/null":
+            sys.stderr.write("[-] Using no tagging rules\n")
         else:
-            sys.stderr.write('[-] Using tagging rules in %s\n' % args.tag)
+            sys.stderr.write("[-] Using tagging rules in %s\n" % args.tag)
     else:
-        sys.stderr.write('[-] Using default tagging rules in %s\n' % util.DEFAULT_TAG_PATH)
+        sys.stderr.write(
+            "[-] Using default tagging rules in %s\n" % util.DEFAULT_TAG_PATH
+        )
 
     # TODO - consider letting argparse handle this?
     if args.tax:
-        if args.tax == '/dev/null':
-            sys.stderr.write('[-] Using no taxonomy\n')
+        if args.tax == "/dev/null":
+            sys.stderr.write("[-] Using no taxonomy\n")
         else:
-            sys.stderr.write('[-] Using taxonomy in %s\n' % args.tax)
+            sys.stderr.write("[-] Using taxonomy in %s\n" % args.tax)
     else:
-        sys.stderr.write('[-] Using default taxonomy in %s\n' % util.DEFAULT_TAX_PATH)
+        sys.stderr.write("[-] Using default taxonomy in %s\n" % util.DEFAULT_TAX_PATH)
 
     # TODO - consider letting argparse handle this?
     if args.exp:
-        if args.exp == '/dev/null':
-            sys.stderr.write('[-] Using no expansion tags\n')
+        if args.exp == "/dev/null":
+            sys.stderr.write("[-] Using no expansion tags\n")
         else:
-            sys.stderr.write('[-] Using expansion tags in %s\n' % args.exp)
+            sys.stderr.write("[-] Using expansion tags in %s\n" % args.exp)
     else:
-        sys.stderr.write('[-] Using default expansion tags in %s\n' % util.DEFAULT_EXP_PATH)
+        sys.stderr.write(
+            "[-] Using default expansion tags in %s\n" % util.DEFAULT_EXP_PATH
+        )
 
     return args
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()

From b832ab9cb4abbb4bb06c656c71549abf40d0ab88 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 1 Feb 2021 16:13:45 -0500
Subject: [PATCH 15/36] Turn Labeler into Class, cleanup

---
 avclass/common.py  |  22 +-
 avclass/labeler.py | 885 ++++++++++++++++++++++++++++++---------------
 2 files changed, 608 insertions(+), 299 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index 9cbe4bc..5533b43 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -5,7 +5,7 @@
 import sys
 
 from collections import defaultdict, namedtuple
-from typing import AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union
+from typing import AnyStr, Callable, Collection, Dict, List, Optional, Set, Tuple, Union
 
 
 logger = logging.getLogger(__name__)
@@ -497,6 +497,26 @@ def __init__(
         # Alias statistics initialization
         self.alias_detect = alias_detect
 
+    def get_sample_call(self, data_type: AnyStr) -> Callable:
+        """
+        Return the correct parser for the report type
+        
+        :param data_type: the type of file vt2, vt3, lb
+        :return: Callable function that returns SampleInfo
+        """
+        if data_type == "lb":
+            return self.get_sample_info_lb
+        elif data_type == "vt" or data_type == "vt2":
+            return self.get_sample_info_vt_v2
+        elif data_type == "vt3":
+            return self.get_sample_info_vt_v3
+        else:
+            sys.stderr.write(
+                "Invalid data type for sample: %s (should be vt, vt2, vt3, lb)"
+                % data_type
+            )
+            return self.get_sample_info_vt_v3
+
     @staticmethod
     def read_avs(avs_file: AnyStr) -> Set[AnyStr]:
         """
diff --git a/avclass/labeler.py b/avclass/labeler.py
index 175e798..dbf2202 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -4,8 +4,11 @@
 import sys
 import traceback
 
+
+from io import StringIO
 from operator import itemgetter
-from typing import AnyStr, Optional
+from pathlib import Path
+from typing import AnyStr, Dict, List, NamedTuple, Optional, Tuple, Union
 
 try:
     from avclass.common import AvLabels, Taxonomy
@@ -17,127 +20,25 @@
     from avclass import clustering as ec, util
 
 
-def guess_hash(h: AnyStr) -> Optional[AnyStr]:
-    """
-    Guess hash type based on ``len(h)``
-
-    :param h: The hash
-    :return: The hash type (str)
-    """
-    """ Given a hash string, guess the hash type based on the string length """
-    hlen = len(h)
-    if hlen == 32:
-        return "md5"
-    elif hlen == 40:
-        return "sha1"
-    elif hlen == 64:
-        return "sha256"
-
-    return None
-
-
-def format_tag_pairs(l, taxonomy: Taxonomy = None) -> AnyStr:
-    """
-    Get ranked tags as a string.
-
-    :param l:
-    :param taxonomy:
-    :return:
-    """
-    # TODO - wtf is ``l``?
-    if not l:
-        return ""
-
-    if taxonomy is not None:
-        p = taxonomy.get_path(l[0][0])
-    else:
-        p = l[0][0]
-
-    out = "%s|%d" % (p, l[0][1])
-    for t, s in l[1:]:
-        if taxonomy is not None:
-            p = taxonomy.get_path(t)
-        else:
-            p = t
-        out += ",%s|%d" % (p, s)
-
-    return out
-
-
-def list_str(l, sep: AnyStr = ", ", prefix: AnyStr = "") -> AnyStr:
-    """
-    Return list as a string
-
-    :param l: The list
-    :param sep: The separator
-    :param prefix: The prefix
-    :return: A string representation of the list
-    """
-    # TODO - wtf is ``l``?
-    if not l:
-        return ""
-    out = prefix + l[0]
-    for s in l[1:]:
-        out = out + sep + s
-    return out
-
-
-def main():
-    # TODO - break this function up.
-    args = parse_args()
-    # Select hash used to identify sample, by default MD5
-    hash_type = args.hash or "md5"
-
-    # If ground truth provided, read it from file
-    gt_dict = {}
-    if args.gt:
-        with open(args.gt, "r") as gt_fd:
-            for line in gt_fd:
-                gt_hash, family = map(str, line.strip().split("\t", 1))
-                gt_dict[gt_hash] = family
-
-        # Guess type of hash in ground truth file
-        hash_type = guess_hash(list(gt_dict.keys())[0])
-
-    # Create AvLabels object
-    av_labels = AvLabels(args.tag, args.exp, args.tax, args.av, args.aliasdetect)
-
-    # Build list of input files
-    # NOTE: duplicate input files are not removed
-    ifile_l = []
-    if args.vt:
-        ifile_l += args.vt
-        ifile_are_vt = True
-    elif args.lb:
-        ifile_l += args.lb
-        ifile_are_vt = False
-    elif args.vtdir:
-        ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)]
-        ifile_are_vt = True
-    elif args.lbdir:
-        ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)]
-        ifile_are_vt = False
-    else:
-        # TODO - is this reachable?
-        sys.exit(1)
-
-    # Select correct sample info extraction function
-    if not ifile_are_vt:
-        get_sample_info = av_labels.get_sample_info_lb
-    elif args.vt3:
-        get_sample_info = av_labels.get_sample_info_vt_v3
-    else:
-        get_sample_info = av_labels.get_sample_info_vt_v2
-
-    # Select output prefix
-    out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0])
-
-    # Initialize state
+class AVClass2:
+    output = []
+    av_labels = None
+    hash_type = None
+    ground_truth = None
+    get_sample_info = None
+    console = False
+    av_tags = False
+    stats_export = False
+    compatibility_v1 = False
+    pup_classify = False
+    path_export = False
+    vt_tags = False
+    vt_all = 0
     first_token_dict = {}
     token_count_map = {}
     pair_count_map = {}
-    vt_all = 0
     avtags_dict = {}
+    gt_dict = {}
     stats = {
         "samples": 0,
         "noscans": 0,
@@ -150,211 +51,391 @@ def main():
         "UNK": 0,
     }
 
-    for ifile in ifile_l:
-        fd = open(ifile, "r")
-        sys.stderr.write("[-] Processing input file %s\n" % ifile)
-
-        for line in fd:
-            if not line.strip():
+    def __init__(self, av_labels: AvLabels):
+        self.av_labels = av_labels
+
+    def run(
+        self,
+        files: Union[
+            AnyStr,
+            List[AnyStr],
+            Path,
+            List[Path],
+            StringIO,
+            List[StringIO],
+            Dict,
+            List[Dict],
+        ],
+        data_type: str = "vt3",
+        hash_type: Optional[AnyStr] = "md5",
+        ground_truth: Optional[AnyStr] = None,
+        stats_export: bool = False,
+        vt_tags: bool = False,
+        av_tags: bool = False,
+        pup_classify: bool = False,
+        path_export: bool = False,
+        compatibility_v1: bool = False,
+        console: bool = False,
+    ) -> List[Dict]:
+        # Set class arguments
+        self.console = console
+        self.ground_truth = ground_truth
+        self.av_tags = av_tags
+        self.stats_export = stats_export
+        self.compatibility_v1 = compatibility_v1
+        self.pup_classify = pup_classify
+        self.path_export = path_export
+        self.vt_tags = vt_tags
+
+        # Select hash used to identify sample, by default MD5
+        self.hash_type = self.get_hash_type(hash_type)
+
+        # Select file type used for sampling
+        self.get_sample_info = self.av_labels.get_sample_call(data_type)
+
+        # Select output prefix
+        out_prefix = os.path.basename(os.path.splitext(files[0])[0])
+
+        # Process each input file
+        if not isinstance(files, list):
+            files = [files]
+        for ifile in files:
+            # Open file
+            if isinstance(ifile, dict):
+                self.process_line(ifile)
                 continue
+            elif isinstance(ifile, StringIO):
+                fd = ifile
+            else:
+                fd = open(ifile, "r")
+
+                # Debug info, file processed
+                self.print_error("[-] Processing input file %s\n" % ifile)
+
+            # Process all lines in file
+            for line in fd:
+                self.process_line(line)
 
             # Debug info
-            if vt_all % 100 == 0:
-                sys.stderr.write("\r[-] %d JSON read\n" % vt_all)
-                sys.stderr.flush()
-            vt_all += 1
+            self.print_error("\r[-] %d JSON read" % self.vt_all, flush=True)
+            self.print_error("\n")
 
-            vt_rep = json.loads(line)
-            sample_info = get_sample_info(vt_rep)
+            # Close file
+            fd.close()
 
-            if sample_info is None:
-                try:
-                    name = vt_rep["md5"]
-                    sys.stderr.write("\nNo scans for %s\n" % name)
-                except KeyError:
-                    sys.stderr.write("\nCould not process: %s\n" % line)
+        # Print statistics
+        self.print_statistics()
 
-                sys.stderr.flush()
-                stats["noscans"] += 1
-                continue
+        # If ground truth, print precision, recall, and F1-measure
+        if self.ground_truth:
+            self.ground_truth_print()
 
-            # Sample's name is selected hash type (md5 by default)
-            name = getattr(sample_info, hash_type)
+        # Output stats
+        if self.stats_export:
+            self.out_stats(out_prefix)
 
-            # If the VT report has no AV labels, output and continue
-            if not sample_info.labels:
-                sys.stdout.write("%s\t-\t[]\n" % name)
-                # sys.stderr.write('\nNo AV labels for %s\n' % name)
-                # sys.stderr.flush()
-                continue
+        # Output vendor info
+        if self.av_tags:
+            self.out_avtags(out_prefix)
 
-            # Compute VT_Count
-            vt_count = len(sample_info.labels)
+        # If alias detection, print map
+        if self.av_labels.alias_detect:
+            self.alias_detection(out_prefix, path_export)
 
-            # Get the distinct tokens from all the av labels in the report and print them.
-            try:
-                av_tmp = av_labels.get_sample_tags(sample_info)
-                tags = av_labels.rank_tags(av_tmp)
-
-                # AV VENDORS PER TOKEN
-                if args.avtags:
-                    for t in av_tmp:
-                        tmap = avtags_dict.get(t, {})
-                        for av in av_tmp[t]:
-                            ctr = tmap.get(av, 0)
-                            tmap[av] = ctr + 1
-                        avtags_dict[t] = tmap
-
-                if args.aliasdetect:
-                    prev_tokens = set()
-                    for entry in tags:
-                        curr_tok = entry[0]
-                        curr_count = token_count_map.get(curr_tok, 0)
-                        token_count_map[curr_tok] = curr_count + 1
-                        for prev_tok in prev_tokens:
-                            if prev_tok < curr_tok:
-                                pair = prev_tok, curr_tok
-                            else:
-                                pair = curr_tok, prev_tok
-                            pair_count = pair_count_map.get(pair, 0)
-                            pair_count_map[pair] = pair_count + 1
-                        prev_tokens.add(curr_tok)
-
-                # Collect stats
-                # TODO - should iterate once over tags for both stats and aliasdetect
-                if tags:
-                    stats["tagged"] += 1
-                    if args.stats:
-                        if vt_count > 3:
-                            stats["maltagged"] += 1
-                            cat_map = {
-                                "FAM": False,
-                                "CLASS": False,
-                                "BEH": False,
-                                "FILE": False,
-                                "UNK": False,
-                            }
-                            for t in tags:
-                                path, cat = av_labels.taxonomy.get_info(t[0])
-                                cat_map[cat] = True
-                            for c in cat_map:
-                                if cat_map[c]:
-                                    stats[c] += 1
-
-                # Check if sample is PUP, if requested
-                if args.pup:
-                    if av_labels.is_pup(tags, av_labels.taxonomy):
-                        is_pup_str = "\t1"
-                    else:
-                        is_pup_str = "\t0"
-                else:
-                    is_pup_str = ""
-
-                # Select family for sample if needed,
-                # i.e., for compatibility mode or for ground truth
-                fam = "SINGLETON:" + name
-                if args.c or args.gt:
-                    for t, s in tags:
-                        cat = av_labels.taxonomy.get_category(t)
-                        if cat in ["UNK", "FAM"]:
-                            fam = t
-                            break
-
-                    first_token_dict[name] = fam
-                    gt_family = "\t" + gt_dict.get(name, "")
-                else:
-                    gt_family = ""
+        return self.output
 
-                # Get VT tags as string
-                if args.vtt:
-                    vtt = list_str(sample_info.vt_tags, prefix="\t")
-                else:
-                    vtt = ""
-
-                # Print family (and ground truth if available) to stdout
-                if not args.c:
-                    if args.path:
-                        tag_str = format_tag_pairs(tags, av_labels.taxonomy)
-                    else:
-                        tag_str = format_tag_pairs(tags)
-                    sys.stdout.write(
-                        "%s\t%d\t%s%s%s%s\n"
-                        % (name, vt_count, tag_str, gt_family, is_pup_str, vtt)
-                    )
-                else:
-                    sys.stdout.write("%s\t%s%s%s\n" % name, fam, gt_family, is_pup_str)
-            except:
-                traceback.print_exc(file=sys.stderr)
-                continue
+    def process_line(self, line: Union[AnyStr, Dict]):
+        if isinstance(line, str):
+            # If blank line, skip
+            if line == "\n":
+                return
 
-        sys.stderr.write("\r[-] %d JSON read" % vt_all)
-        sys.stderr.flush()
-        sys.stderr.write("\n")
+            # Debug info
+            if self.vt_all % 100 == 0:
+                self.print_error("\r[-] %d JSON read\n" % self.vt_all, flush=True)
+            self.vt_all += 1
 
-        fd.close()
+            # Read JSON line
+            vt_rep = json.loads(line)
+        else:
+            vt_rep = line
 
-    # Print statistics
-    sys.stderr.write(
-        "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n"
-        % (vt_all, stats["noscans"], vt_all - stats["tagged"], len(gt_dict))
-    )
+        # Extract sample info
+        sample_info = self.get_sample_info(vt_rep)
 
-    # If ground truth, print precision, recall, and F1-measure
-    if args.gt:
-        precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(
-            gt_dict, first_token_dict
+        # If no sample info, log error and continue
+        if sample_info is None:
+            try:
+                name = vt_rep["md5"]
+                self.print_error("\nNo scans for %s\n" % name, flush=True)
+            except KeyError:
+                self.print_error("\nCould not process: %s\n" % line, flush=True)
+            self.stats["noscans"] += 1
+            return
+
+        # Get the distinct tokens from all the av labels in the report
+        # And print them.
+        try:
+            self.get_tokens(sample_info)
+        except Exception:
+            traceback.print_exc(file=sys.stderr)
+            return
+
+    def get_tokens(self, sample_info: NamedTuple):
+        # Sample's name is selected hash type (md5 by default)
+        name = getattr(sample_info, self.hash_type)
+
+        # If the VT report has no AV labels, output and continue
+        if not sample_info.labels:
+            self.print_output("%s\t-\t[]\n" % (name))
+            # self.print_error('\nNo AV labels for %s\n' % name, flush=True)
+            return
+
+        # AV VENDORS PER TOKEN
+        av_tmp = self.av_labels.get_sample_tags(sample_info)
+        if self.av_tags:
+            self.av_vender_tags(av_tmp)
+
+        tags = self.av_labels.rank_tags(av_tmp)
+        if self.av_labels.alias_detect:
+            self.av_vender_tokens(tags)
+
+        # Compute VT_Count
+        vt_count = len(sample_info.labels)
+
+        # Collect stats
+        # TODO: should iterate once over tags,
+        # for both stats and aliasdetect
+        if tags:
+            self.collect_stats(tags, vt_count)
+
+        # Select family for sample if needed,
+        # i.e., for compatibility mode or for ground truth
+        fam, gt_family = self.get_family(name, tags)
+
+        # Check if sample is PUP, if requested
+        pup_val = self.is_pup(self.pup_classify, tags)
+
+        # Print family (and ground truth if available)
+        if self.compatibility_v1:
+            class_entry = self.avclass1_output(
+                name=name,
+                family=fam,
+                ground_truth=gt_family,
+                pup_val=pup_val,
+                vt_count=vt_count,
+            )
+            self.output.append(class_entry)
+        else:
+            class_entry = self.avclass2_output(
+                name=name,
+                tags=tags,
+                sample_info=sample_info,
+                ground_truth=gt_family,
+                pup_val=pup_val,
+                vt_count=vt_count,
+            )
+            self.output.append(class_entry)
+
+    def avclass1_output(
+        self,
+        name: AnyStr,
+        family: AnyStr,
+        ground_truth: AnyStr,
+        pup_val: Optional[bool],
+        vt_count: int,
+    ) -> Dict:
+        """
+        Build the v1 classification entry
+
+        :param name: Hash
+        :param family: family classification
+        :param ground_truth:
+        :param pup_val: is a pup
+        :param vt_count:
+        :return: Dict of classification
+        """
+        self.print_output(
+            "%s\t%s%s%s\n" % (name, family, ground_truth, self.get_pup_str(pup_val))
         )
-        sys.stderr.write(
-            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n"
-            % (precision, recall, fmeasure)
+        # Build json output
+        values = {"hash": name, "av_count": vt_count, "family": family}
+        if self.ground_truth:
+            values["ground_truth"] = ground_truth
+        if self.pup_classify:
+            values["pup"] = pup_val
+        return values
+
+    def avclass2_output(
+        self,
+        name: AnyStr,
+        ground_truth: AnyStr,
+        pup_val: Optional[bool],
+        vt_count: int,
+        tags: List[Tuple],
+        sample_info: NamedTuple,
+    ) -> Dict:
+        """
+        Build the v2 classification entry
+
+        :param name: Hash
+        :param ground_truth:
+        :param pup_val: is a pup
+        :param vt_count:
+        :param tags: List of tags and their count
+        :param sample_info:
+        :return: Dict of classification
+        """
+        # Build string output
+        if self.vt_tags:
+            vtt = self.list_str(sample_info.vt_tags, prefix="\t")
+        else:
+            vtt = ""
+        tag_str = self.format_tag_pairs_str(
+            tags, self.av_labels.taxonomy, self.path_export
+        )
+        self.print_output(
+            "%s\t%d\t%s%s%s%s\n"
+            % (name, vt_count, tag_str, ground_truth, self.get_pup_str(pup_val), vtt)
+        )
+        # Build json output
+        tag_dict = self.format_tag_pairs_list(
+            tags, self.av_labels.taxonomy, self.path_export
         )
+        values = {"hash": name, "av_count": vt_count, "tags": tag_dict}
+        if self.ground_truth:
+            values["ground_truth"] = self.gt_dict.get(name, "")
+        if self.pup_classify:
+            values["pup"] = pup_val
+        if self.vt_tags:
+            values["vt_tags"] = sample_info.vt_tags
+        return values
+
+    def get_family(self, name: AnyStr, tags: List[Tuple]) -> Tuple:
+        if self.compatibility_v1 or self.ground_truth:
+            fam = "SINGLETON:" + name
+            # fam = ''
+            for (t, s) in tags:
+                cat = self.av_labels.taxonomy.get_category(t)
+                if (cat == "UNK") or (cat == "FAM"):
+                    fam = t
+                    break
+        else:
+            fam = ""
 
-    # Output stats
-    if args.stats:
-        stats_fd = open("%s.stats" % out_prefix, "w")
-        num_samples = vt_all
-        stats_fd.write("Samples: %d\n" % num_samples)
-        num_tagged = stats["tagged"]
-        frac = float(num_tagged) / float(num_samples) * 100
-        stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac))
-        num_maltagged = stats["maltagged"]
-        frac = float(num_maltagged) / float(num_samples) * 100
-        stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac))
-        for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]:
-            count = stats[c]
-            frac = float(count) / float(num_maltagged) * 100
-            stats_fd.write("%s: %d (%.01f%%)\n" % (c, stats[c], frac))
-        stats_fd.close()
+        # Get ground truth family, if available
+        if self.ground_truth:
+            self.first_token_dict[name] = fam
+            gt_family = "\t" + self.gt_dict.get(name, "")
+        else:
+            gt_family = ""
+        return (fam, gt_family)
+
+    def collect_stats(self, tags: List[Tuple], vt_count: int):
+        self.stats["tagged"] += 1
+        if self.stats_export and vt_count > 3:
+            self.stats["maltagged"] += 1
+            cat_map = {
+                "FAM": False,
+                "CLASS": False,
+                "BEH": False,
+                "FILE": False,
+                "UNK": False,
+            }
+            for t in tags:
+                cat = self.av_labels.taxonomy.get_info(t[0])[1]
+                cat_map[cat] = True
+            for c in cat_map:
+                if cat_map[c]:
+                    self.stats[c] += 1
+
+    def av_vender_tags(self, av_tmp: Dict):
+        for t in av_tmp:
+            tmap = self.avtags_dict.get(t, {})
+            for av in av_tmp[t]:
+                ctr = tmap.get(av, 0)
+                tmap[av] = ctr + 1
+            self.avtags_dict[t] = tmap
+
+    def av_vender_tokens(self, tags: List[Tuple]):
+        prev_tokens = set()
+        for entry in tags:
+            curr_tok = entry[0]
+            curr_count = self.token_count_map.get(curr_tok, 0)
+            self.token_count_map[curr_tok] = curr_count + 1
+            for prev_tok in prev_tokens:
+                if prev_tok < curr_tok:
+                    pair = (prev_tok, curr_tok)
+                else:
+                    pair = (curr_tok, prev_tok)
+                pair_count = self.pair_count_map.get(pair, 0)
+                self.pair_count_map[pair] = pair_count + 1
+            prev_tokens.add(curr_tok)
+
+    def get_pup_str(self, is_pup: Optional[bool] = None) -> AnyStr:
+        if is_pup is True:
+            return "\t1"
+        elif is_pup is False:
+            return "\t0"
+        else:
+            return ""
 
-    # Output vendor info
-    if args.avtags:
-        avtags_fd = open("%s.avtags" % out_prefix, "w")
-        for t in sorted(avtags_dict.keys()):
-            avtags_fd.write("%s\t" % t)
-            pairs = sorted(
-                avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True
+    def is_pup(self, pup_classify: bool, tags: List[Tuple]) -> Optional[bool]:
+        if pup_classify:
+            if self.av_labels.is_pup(tags, self.av_labels.taxonomy):
+                is_pup = True
+            else:
+                is_pup = False
+        else:
+            is_pup = None
+        return is_pup
+
+    def get_hash_type(self, hash_type: Optional[AnyStr] = None) -> AnyStr:
+        if self.ground_truth:
+            with open(self.ground_truth, "r") as gt_fd:
+                for line in gt_fd:
+                    gt_hash, family = map(str, line.strip().split("\t", 1))
+                    self.gt_dict[gt_hash] = family
+            # Guess type of hash in ground truth file
+            return self.guess_hash(list(self.gt_dict.keys())[0])
+        else:
+            return hash_type if hash_type else "md5"
+
+    def print_statistics(self):
+        self.print_error(
+            "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n"
+            % (
+                self.vt_all,
+                self.stats["noscans"],
+                self.vt_all - self.stats["tagged"],
+                len(self.gt_dict),
             )
+        )
 
-            for pair in pairs:
-                avtags_fd.write("%s|%d," % (pair[0], pair[1]))
-            avtags_fd.write("\n")
-        avtags_fd.close()
+    def ground_truth_print(self):
+        # If ground truth, print precision, recall, and F1-measure
+        precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(
+            self.gt_dict, self.first_token_dict
+        )
+        self.print_error(
+            "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n"
+            % (precision, recall, fmeasure)
+        )
 
-    # If alias detection, print map
-    if args.aliasdetect:
+    def alias_detection(self, out_prefix: AnyStr, path_export: bool = False):
+        # Open alias file
         alias_filename = out_prefix + ".alias"
         alias_fd = open(alias_filename, "w+")
         # Sort token pairs by number of times they appear together
-        sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1))
-        # sorted_pairs = sorted(
-        #     pair_count_map.items())
+        sorted_pairs = sorted(self.pair_count_map.items(), key=itemgetter(1))
+        # sorted_pairs = sorted(self.pair_count_map.items())
 
         # Output header line
         alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
         # Compute token pair statistic and output to alias file
-        for t1, t2, c in sorted_pairs:
-            n1 = token_count_map[t1]
-            n2 = token_count_map[t2]
+        for (t1, t2), c in sorted_pairs:
+            n1 = self.token_count_map[t1]
+            n2 = self.token_count_map[t2]
             if n1 < n2:
                 x = t1
                 y = t2
@@ -367,12 +448,215 @@ def main():
                 yn = n1
             f = float(c) / float(xn)
             finv = float(c) / float(yn)
+            if path_export:
+                x = self.av_labels.taxonomy.get_path(x)
+                y = self.av_labels.taxonomy.get_path(y)
             alias_fd.write(
                 "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv)
             )
         # Close alias file
         alias_fd.close()
-        sys.stderr.write("[-] Alias data in %s\n" % alias_filename)
+        self.print_error("[-] Alias data in %s\n" % (alias_filename))
+
+    def out_avtags(self, out_prefix: AnyStr):
+        avtags_fd = open("%s.avtags" % out_prefix, "w")
+        for t in sorted(self.avtags_dict.keys()):
+            avtags_fd.write("%s\t" % t)
+            pairs = sorted(
+                self.avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True
+            )
+            for pair in pairs:
+                avtags_fd.write("%s|%d," % (pair[0], pair[1]))
+            avtags_fd.write("\n")
+        avtags_fd.close()
+
+    def out_stats(self, out_prefix: AnyStr):
+        # Output stats
+        stats_fd = open("%s.stats" % out_prefix, "w")
+        num_samples = self.vt_all
+        stats_fd.write("Samples: %d\n" % num_samples)
+        num_tagged = self.stats["tagged"]
+        frac = float(num_tagged) / float(num_samples) * 100
+        stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac))
+        num_maltagged = self.stats["maltagged"]
+        frac = float(num_maltagged) / float(num_samples) * 100
+        stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac))
+        for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]:
+            count = self.stats[c]
+            frac = float(count) / float(num_maltagged) * 100
+            stats_fd.write("%s: %d (%.01f%%)\n" % (c, self.stats[c], frac))
+        stats_fd.close()
+
+    def guess_hash(self, h: AnyStr) -> Optional[AnyStr]:
+        """
+        Guess hash type based on ``len(h)``
+
+        :param h: The hash
+        :return: The hash type (str)
+        """
+        hlen = len(h)
+        if hlen == 32:
+            return "md5"
+        elif hlen == 40:
+            return "sha1"
+        elif hlen == 64:
+            return "sha256"
+        return None
+
+    def format_tag_pairs_str(
+        self, tags: List[Tuple], taxonomy: Taxonomy = None, path_export: bool = False
+    ) -> AnyStr:
+        """
+        Get ranked tags as a string.
+
+        :param tags:
+        :param taxonomy:
+        :return: List of tags
+        """
+        if not tags:
+            return ""
+        if path_export and taxonomy is not None:
+            p = taxonomy.get_path(tags[0][0])
+        else:
+            p = tags[0][0]
+        out = "%s|%d" % (p, tags[0][1])
+        for (t, s) in tags[1:]:
+            if path_export and taxonomy is not None:
+                p = taxonomy.get_path(t)
+            else:
+                p = t
+            out += ",%s|%d" % (p, s)
+        return out
+
+    def format_tag_pairs_list(
+        self, tags: List[Tuple], taxonomy: Taxonomy = None, path_export: bool = False
+    ) -> List[Dict]:
+        """
+        Get ranked tags as a list dictionary.
+
+        :param tags:
+        :param taxonomy:
+        :return: List of tags
+        """
+        out = []
+        for (tag, count) in tags:
+            values = {"tag": tag, "count": count}
+            if path_export and taxonomy:
+                values["category"] = taxonomy.get_category(tag)
+                values["path"] = taxonomy.get_path(tag)
+            out.append(values)
+        return out
+
+    def list_str(
+        self, vt_tags: Optional[Dict], sep: AnyStr = ", ", prefix: AnyStr = ""
+    ) -> AnyStr:
+        """
+        Return list as a string
+
+        :param vt_tags: The list of virus total tags
+        :param sep: The separator
+        :param prefix: The prefix
+        :return: A string representation of the list
+        """
+        if not vt_tags or len(vt_tags) == 0:
+            return ""
+        out = prefix + vt_tags[0]
+        for s in vt_tags[1:]:
+            out = out + sep + s
+        return out
+
+    def print_error(self, output: str = "", flush=False):
+        if self.console:
+            # TODO - would this be better? print(output, file=sys.stderr, flush=flush, end="")
+            sys.stderr.write(output)
+            if flush:
+                sys.stderr.flush()
+
+    def print_output(self, output: str = ""):
+        if self.console:
+            sys.stdout.write(output)
+
+
+def main():
+    args = parse_args()
+    # Create AvLabels object
+    av_labels = AvLabels(
+        tag_file=args.tag,
+        tax_file=args.tax,
+        exp_file=args.exp,
+        av_file=args.av,
+        alias_detect=args.aliasdetect,
+    )
+    # Build list of input files
+    # TODO: File selection should be rewritten as it is difficult to add new types.
+    # Would be nice to just have '-i or --input', detect if its a directory or file,
+    # then use a new arg string to specify the data type ["vt2", "vt3", "lb"]
+    files, data_type = get_files(
+        vt=args.vt,
+        lb=args.lb,
+        vtdir=args.vtdir,
+        lbdir=args.lbdir,
+        vt3=args.vt3,
+    )
+    av_class = AVClass2(av_labels=av_labels)
+    result = av_class.run(
+        files=files,
+        data_type=data_type,
+        hash_type=args.hash,
+        stats_export=args.stats,
+        vt_tags=args.vtt,
+        av_tags=args.avtags,
+        ground_truth=args.gt,
+        pup_classify=args.pup,
+        path_export=args.path,
+        compatibility_v1=args.c,
+        console=not args.json,
+    )
+    if args.json:
+        print(json.dumps(result))
+
+
+def get_files(
+    vt: Optional[str] = None,
+    lb: Optional[str] = None,
+    vtdir: Optional[str] = None,
+    lbdir: Optional[str] = None,
+    vt3: Optional[bool] = False,
+) -> Tuple:
+    """
+    Return list as a string
+
+    :param vt: vt file
+    :param lb: lb file
+    :param vtdir: vt directory
+    :param lbdir: lb directory
+    :param vt3: vt3 json format
+    :return: A Tuple of files and type
+    """
+    # NOTE: duplicate input files are not removed
+    ifile_l = []
+    ifile_are_vt = None
+    if vt:
+        ifile_l += vt
+        ifile_are_vt = True
+    if lb:
+        ifile_l += lb
+        ifile_are_vt = False
+    if vtdir:
+        ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)]
+        ifile_are_vt = True
+    if lbdir:
+        ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)]
+        ifile_are_vt = False
+
+    # Select correct sample info extraction function
+    if not ifile_are_vt:
+        data_type = "lb"
+    elif vt3:
+        data_type = "vt3"
+    else:
+        data_type = "vt2"
+    return ifile_l, data_type
 
 
 def parse_args():
@@ -458,6 +742,10 @@ def parse_args():
         "-aliasdetect", action="store_true", help="if used produce aliases file at end"
     )
 
+    argparser.add_argument(
+        "-json", "--json", action="store_true", help="output console to json"
+    )
+
     argparser.add_argument(
         "-stats",
         action="store_true",
@@ -482,9 +770,10 @@ def parse_args():
         )
         exit(1)
 
+    devnull = "/dev/null"
     # TODO - consider letting argparse handle this?
     if args.tag:
-        if args.tag == "/dev/null":
+        if args.tag == devnull:
             sys.stderr.write("[-] Using no tagging rules\n")
         else:
             sys.stderr.write("[-] Using tagging rules in %s\n" % args.tag)
@@ -495,7 +784,7 @@ def parse_args():
 
     # TODO - consider letting argparse handle this?
     if args.tax:
-        if args.tax == "/dev/null":
+        if args.tax == devnull:
             sys.stderr.write("[-] Using no taxonomy\n")
         else:
             sys.stderr.write("[-] Using taxonomy in %s\n" % args.tax)
@@ -504,7 +793,7 @@ def parse_args():
 
     # TODO - consider letting argparse handle this?
     if args.exp:
-        if args.exp == "/dev/null":
+        if args.exp == devnull:
             sys.stderr.write("[-] Using no expansion tags\n")
         else:
             sys.stderr.write("[-] Using expansion tags in %s\n" % args.exp)

From 45af907b0580a15637a5561ea5eaad76c8348095 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 1 Feb 2021 17:33:30 -0500
Subject: [PATCH 16/36] Changed Class Name, Deprecated lb, vt, lbdir, vtdir,
 vt3

Reduced arguments to --input and --type.  Handles multiple files or directories.
This makes it easier to add additional inputs, such as metadefender.
I left the old arguments in there for backward compatibility, but we may just want to remove them.
---
 avclass/labeler.py | 99 +++++++++++++++++++++++++++++-----------------
 1 file changed, 63 insertions(+), 36 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index dbf2202..7f0c85d 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -20,7 +20,7 @@
     from avclass import clustering as ec, util
 
 
-class AVClass2:
+class AVClassLabeler:
     output = []
     av_labels = None
     hash_type = None
@@ -113,8 +113,13 @@ def run(
                 self.print_error("[-] Processing input file %s\n" % ifile)
 
             # Process all lines in file
-            for line in fd:
-                self.process_line(line)
+            try:
+                for line in fd:
+                    self.process_line(line)
+            except json.decoder.JSONDecodeError:
+                if isinstance(ifile, str):
+                    self.print_error("Error parsing %s (possible incorrect file type\n" % ifile)
+                continue
 
             # Debug info
             self.print_error("\r[-] %d JSON read" % self.vt_all, flush=True)
@@ -588,17 +593,16 @@ def main():
         alias_detect=args.aliasdetect,
     )
     # Build list of input files
-    # TODO: File selection should be rewritten as it is difficult to add new types.
-    # Would be nice to just have '-i or --input', detect if its a directory or file,
-    # then use a new arg string to specify the data type ["vt2", "vt3", "lb"]
     files, data_type = get_files(
+        file_input=args.input,
+        data_type=args.type,
         vt=args.vt,
         lb=args.lb,
         vtdir=args.vtdir,
         lbdir=args.lbdir,
         vt3=args.vt3,
     )
-    av_class = AVClass2(av_labels=av_labels)
+    av_class = AVClassLabeler(av_labels=av_labels)
     result = av_class.run(
         files=files,
         data_type=data_type,
@@ -617,10 +621,12 @@ def main():
 
 
 def get_files(
-    vt: Optional[str] = None,
-    lb: Optional[str] = None,
-    vtdir: Optional[str] = None,
-    lbdir: Optional[str] = None,
+    file_input: Optional[AnyStr]=None,
+    data_type: Optional[AnyStr]=None,
+    vt: Optional[AnyStr]=None,
+    lb: Optional[AnyStr]=None,
+    vtdir: Optional[AnyStr]=None,
+    lbdir: Optional[AnyStr]=None,
     vt3: Optional[bool] = False,
 ) -> Tuple:
     """
@@ -633,29 +639,40 @@ def get_files(
     :param vt3: vt3 json format
     :return: A Tuple of files and type
     """
-    # NOTE: duplicate input files are not removed
     ifile_l = []
     ifile_are_vt = None
-    if vt:
-        ifile_l += vt
-        ifile_are_vt = True
-    if lb:
-        ifile_l += lb
-        ifile_are_vt = False
-    if vtdir:
-        ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)]
-        ifile_are_vt = True
-    if lbdir:
-        ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)]
-        ifile_are_vt = False
+    if file_input:
+        for fi in file_input:
+            if os.path.isdir(fi):
+                for f in os.listdir(fi):
+                    dir_file = os.path.join(fi, f)
+                    if dir_file not in ifile_l:
+                        ifile_l.append(dir_file)
+            elif fi not in ifile_l:
+                ifile_l.append(fi)
+    else:
+        # NOTE: duplicate input files are not removed
+        if vt:
+            ifile_l += vt
+            ifile_are_vt = True
+        if lb:
+            ifile_l += lb
+            ifile_are_vt = False
+        if vtdir:
+            ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)]
+            ifile_are_vt = True
+        if lbdir:
+            ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)]
+            ifile_are_vt = False
 
     # Select correct sample info extraction function
-    if not ifile_are_vt:
-        data_type = "lb"
-    elif vt3:
-        data_type = "vt3"
-    else:
-        data_type = "vt2"
+    if not data_type:
+        if not ifile_are_vt:
+            data_type = "lb"
+        elif vt3:
+            data_type = "vt3"
+        else:
+            data_type = "vt2"
     return ifile_l, data_type
 
 
@@ -669,24 +686,34 @@ def parse_args():
     argparser.add_argument(
         "-vt",
         action="append",
-        help="file with VT reports (Can be provided multiple times)",
+        help="DEPRECATED (use -i & -type): file with VT reports (Can be provided multiple times)",
     )
 
     argparser.add_argument(
         "-lb",
         action="append",
-        help="file with simplified JSON reports "
+        help="DEPRECATED (use -i & -type): file with simplified JSON reports "
         "{md5,sha1,sha256,scan_date,av_labels} (Can be provided "
         "multiple times)",
     )
 
-    argparser.add_argument("-vtdir", help="existing directory with VT reports")
+    argparser.add_argument("-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports")
 
     argparser.add_argument(
-        "-lbdir", help="existing directory with simplified JSON reports"
+        "-lbdir", help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports"
     )
 
-    argparser.add_argument("-vt3", action="store_true", help="input are VT v3 files")
+    argparser.add_argument("-vt3", action="store_true", help="DEPRECATED (use -type): input are VT v3 files")
+
+    argparser.add_argument(
+        "-i", "--input", 
+        action="append",
+        help="input report file or directory (Can be provided multiple times)"
+    )
+    
+    argparser.add_argument(
+        "-type", "--type", help="the type of report (vt2, vt3, lb)"
+    )
 
     argparser.add_argument(
         "-gt",
@@ -756,7 +783,7 @@ def parse_args():
     args = argparser.parse_args()
 
     # TODO - use non-exclusive group to ensure at least one is selected instead of this
-    if not args.vt and not args.lb and not args.vtdir and not args.lbdir:
+    if not args.input and not args.vt and not args.lb and not args.vtdir and not args.lbdir:
         sys.stderr.write(
             "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n"
         )

From a1bcb255f7c97b041465a27c8fa1fe5133c54cf3 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 1 Feb 2021 17:42:12 -0500
Subject: [PATCH 17/36] black reformatting

---
 avclass/labeler.py | 48 ++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 17 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index 7f0c85d..b09e3cf 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -118,7 +118,9 @@ def run(
                     self.process_line(line)
             except json.decoder.JSONDecodeError:
                 if isinstance(ifile, str):
-                    self.print_error("Error parsing %s (possible incorrect file type\n" % ifile)
+                    self.print_error(
+                        "Error parsing %s (possible incorrect file type\n" % ifile
+                    )
                 continue
 
             # Debug info
@@ -621,12 +623,12 @@ def main():
 
 
 def get_files(
-    file_input: Optional[AnyStr]=None,
-    data_type: Optional[AnyStr]=None,
-    vt: Optional[AnyStr]=None,
-    lb: Optional[AnyStr]=None,
-    vtdir: Optional[AnyStr]=None,
-    lbdir: Optional[AnyStr]=None,
+    file_input: Optional[AnyStr] = None,
+    data_type: Optional[AnyStr] = None,
+    vt: Optional[AnyStr] = None,
+    lb: Optional[AnyStr] = None,
+    vtdir: Optional[AnyStr] = None,
+    lbdir: Optional[AnyStr] = None,
     vt3: Optional[bool] = False,
 ) -> Tuple:
     """
@@ -697,24 +699,30 @@ def parse_args():
         "multiple times)",
     )
 
-    argparser.add_argument("-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports")
-
     argparser.add_argument(
-        "-lbdir", help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports"
+        "-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports"
     )
 
-    argparser.add_argument("-vt3", action="store_true", help="DEPRECATED (use -type): input are VT v3 files")
+    argparser.add_argument(
+        "-lbdir",
+        help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports",
+    )
 
     argparser.add_argument(
-        "-i", "--input", 
-        action="append",
-        help="input report file or directory (Can be provided multiple times)"
+        "-vt3",
+        action="store_true",
+        help="DEPRECATED (use -type): input are VT v3 files",
     )
-    
+
     argparser.add_argument(
-        "-type", "--type", help="the type of report (vt2, vt3, lb)"
+        "-i",
+        "--input",
+        action="append",
+        help="input report file or directory (Can be provided multiple times)",
     )
 
+    argparser.add_argument("-type", "--type", help="the type of report (vt2, vt3, lb)")
+
     argparser.add_argument(
         "-gt",
         help="file with ground truth. If provided it evaluates clustering accuracy. "
@@ -783,7 +791,13 @@ def parse_args():
     args = argparser.parse_args()
 
     # TODO - use non-exclusive group to ensure at least one is selected instead of this
-    if not args.input and not args.vt and not args.lb and not args.vtdir and not args.lbdir:
+    if (
+        not args.input
+        and not args.vt
+        and not args.lb
+        and not args.vtdir
+        and not args.lbdir
+    ):
         sys.stderr.write(
             "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n"
         )

From 5cbe0408cb694fa2a898ada7f115aece3e687c30 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Tue, 2 Feb 2021 17:13:27 -0500
Subject: [PATCH 18/36] Just removed -vt -lb -vtdir -lbdir -vt3, use -i & -t

---
 avclass/labeler.py | 116 ++++++++-------------------------------------
 1 file changed, 20 insertions(+), 96 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index b09e3cf..a9cc928 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -66,7 +66,7 @@ def run(
             Dict,
             List[Dict],
         ],
-        data_type: str = "vt3",
+        data_type: Optional[AnyStr] = "vt3",
         hash_type: Optional[AnyStr] = "md5",
         ground_truth: Optional[AnyStr] = None,
         stats_export: bool = False,
@@ -91,6 +91,7 @@ def run(
         self.hash_type = self.get_hash_type(hash_type)
 
         # Select file type used for sampling
+        data_type = data_type if data_type else "vt3"
         self.get_sample_info = self.av_labels.get_sample_call(data_type)
 
         # Select output prefix
@@ -572,14 +573,14 @@ def list_str(
             out = out + sep + s
         return out
 
-    def print_error(self, output: str = "", flush=False):
+    def print_error(self, output: AnyStr = "", flush=False):
         if self.console:
             # TODO - would this be better? print(output, file=sys.stderr, flush=flush, end="")
             sys.stderr.write(output)
             if flush:
                 sys.stderr.flush()
 
-    def print_output(self, output: str = ""):
+    def print_output(self, output: AnyStr = ""):
         if self.console:
             sys.stdout.write(output)
 
@@ -595,19 +596,13 @@ def main():
         alias_detect=args.aliasdetect,
     )
     # Build list of input files
-    files, data_type = get_files(
+    files = get_files(
         file_input=args.input,
-        data_type=args.type,
-        vt=args.vt,
-        lb=args.lb,
-        vtdir=args.vtdir,
-        lbdir=args.lbdir,
-        vt3=args.vt3,
     )
     av_class = AVClassLabeler(av_labels=av_labels)
     result = av_class.run(
         files=files,
-        data_type=data_type,
+        data_type=args.type,
         hash_type=args.hash,
         stats_export=args.stats,
         vt_tags=args.vtt,
@@ -624,25 +619,14 @@ def main():
 
 def get_files(
     file_input: Optional[AnyStr] = None,
-    data_type: Optional[AnyStr] = None,
-    vt: Optional[AnyStr] = None,
-    lb: Optional[AnyStr] = None,
-    vtdir: Optional[AnyStr] = None,
-    lbdir: Optional[AnyStr] = None,
-    vt3: Optional[bool] = False,
-) -> Tuple:
+) -> List[AnyStr]:
     """
-    Return list as a string
-
-    :param vt: vt file
-    :param lb: lb file
-    :param vtdir: vt directory
-    :param lbdir: lb directory
-    :param vt3: vt3 json format
-    :return: A Tuple of files and type
+    Return List of the files to process
+
+    :param file_input: file or directory to process
+    :return: List of type str
     """
     ifile_l = []
-    ifile_are_vt = None
     if file_input:
         for fi in file_input:
             if os.path.isdir(fi):
@@ -652,30 +636,7 @@ def get_files(
                         ifile_l.append(dir_file)
             elif fi not in ifile_l:
                 ifile_l.append(fi)
-    else:
-        # NOTE: duplicate input files are not removed
-        if vt:
-            ifile_l += vt
-            ifile_are_vt = True
-        if lb:
-            ifile_l += lb
-            ifile_are_vt = False
-        if vtdir:
-            ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)]
-            ifile_are_vt = True
-        if lbdir:
-            ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)]
-            ifile_are_vt = False
-
-    # Select correct sample info extraction function
-    if not data_type:
-        if not ifile_are_vt:
-            data_type = "lb"
-        elif vt3:
-            data_type = "vt3"
-        else:
-            data_type = "vt2"
-    return ifile_l, data_type
+    return ifile_l
 
 
 def parse_args():
@@ -685,35 +646,6 @@ def parse_args():
         " recall if ground truth available",
     )
 
-    argparser.add_argument(
-        "-vt",
-        action="append",
-        help="DEPRECATED (use -i & -type): file with VT reports (Can be provided multiple times)",
-    )
-
-    argparser.add_argument(
-        "-lb",
-        action="append",
-        help="DEPRECATED (use -i & -type): file with simplified JSON reports "
-        "{md5,sha1,sha256,scan_date,av_labels} (Can be provided "
-        "multiple times)",
-    )
-
-    argparser.add_argument(
-        "-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports"
-    )
-
-    argparser.add_argument(
-        "-lbdir",
-        help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports",
-    )
-
-    argparser.add_argument(
-        "-vt3",
-        action="store_true",
-        help="DEPRECATED (use -type): input are VT v3 files",
-    )
-
     argparser.add_argument(
         "-i",
         "--input",
@@ -721,7 +653,9 @@ def parse_args():
         help="input report file or directory (Can be provided multiple times)",
     )
 
-    argparser.add_argument("-type", "--type", help="the type of report (vt2, vt3, lb)")
+    argparser.add_argument(
+        "-t", "--type", help="the type of report file (vt2, vt3, lb)"
+    )
 
     argparser.add_argument(
         "-gt",
@@ -791,25 +725,15 @@ def parse_args():
     args = argparser.parse_args()
 
     # TODO - use non-exclusive group to ensure at least one is selected instead of this
-    if (
-        not args.input
-        and not args.vt
-        and not args.lb
-        and not args.vtdir
-        and not args.lbdir
-    ):
-        sys.stderr.write(
-            "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n"
-        )
+    if not args.input:
+        sys.stderr.write("Input file / directory is required: " "-i\n")
         exit(1)
 
-    # TODO - use mutex group for this instead of manual check
-    if (args.vt or args.vtdir) and (args.lb or args.lbdir):
+    if not args.type:
+
         sys.stderr.write(
-            "Use either -vt/-vtdir or -lb/-lbdir. "
-            "Both types of input files cannot be combined.\n"
+            "[-] No type defined, using file type of VirusTotal v3: '-t vt3'\n"
         )
-        exit(1)
 
     devnull = "/dev/null"
     # TODO - consider letting argparse handle this?

From 09f2e5eb37f55e4e62ec0f5c2dfb5cc36a24dd53 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Thu, 4 Feb 2021 08:51:18 -0500
Subject: [PATCH 19/36] Tweaks to --input

---
 avclass/labeler.py | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index a9cc928..799e889 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -596,7 +596,7 @@ def main():
         alias_detect=args.aliasdetect,
     )
     # Build list of input files
-    files = get_files(
+    files = get_arg_files(
         file_input=args.input,
     )
     av_class = AVClassLabeler(av_labels=av_labels)
@@ -617,25 +617,24 @@ def main():
         print(json.dumps(result))
 
 
-def get_files(
-    file_input: Optional[AnyStr] = None,
+def get_arg_files(
+    file_input: List[AnyStr],
 ) -> List[AnyStr]:
     """
     Return List of the files to process
 
-    :param file_input: file or directory to process
+    :param file_input: file(s) or directory to process
     :return: List of type str
     """
     ifile_l = []
-    if file_input:
-        for fi in file_input:
-            if os.path.isdir(fi):
-                for f in os.listdir(fi):
-                    dir_file = os.path.join(fi, f)
-                    if dir_file not in ifile_l:
-                        ifile_l.append(dir_file)
-            elif fi not in ifile_l:
-                ifile_l.append(fi)
+    for fi in file_input:
+        if os.path.isdir(fi):
+            for f in os.listdir(fi):
+                dir_file = os.path.join(fi, f)
+                if dir_file not in ifile_l:
+                    ifile_l.append(dir_file)
+        elif fi not in ifile_l:
+            ifile_l.append(fi)
     return ifile_l
 
 
@@ -724,13 +723,11 @@ def parse_args():
 
     args = argparser.parse_args()
 
-    # TODO - use non-exclusive group to ensure at least one is selected instead of this
     if not args.input:
         sys.stderr.write("Input file / directory is required: " "-i\n")
         exit(1)
 
     if not args.type:
-
         sys.stderr.write(
             "[-] No type defined, using file type of VirusTotal v3: '-t vt3'\n"
         )

From f4efd26e75d56742b30a5eebb4455d8fc3d30430 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Thu, 4 Feb 2021 13:54:34 -0500
Subject: [PATCH 20/36] MetaDefender support

---
 avclass/common.py  | 44 +++++++++++++++++++++++++++++++++++++-------
 avclass/labeler.py |  2 +-
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index 5533b43..05e24dd 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -501,7 +501,7 @@ def get_sample_call(self, data_type: AnyStr) -> Callable:
         """
         Return the correct parser for the report type
         
-        :param data_type: the type of file vt2, vt3, lb
+        :param data_type: the type of file vt2, vt3, lb, md
         :return: Callable function that returns SampleInfo
         """
         if data_type == "lb":
@@ -510,9 +510,11 @@ def get_sample_call(self, data_type: AnyStr) -> Callable:
             return self.get_sample_info_vt_v2
         elif data_type == "vt3":
             return self.get_sample_info_vt_v3
+        elif data_type == "md":
+            return self.get_sample_info_md
         else:
             sys.stderr.write(
-                "Invalid data type for sample: %s (should be vt, vt2, vt3, lb)"
+                "Invalid data type for sample: %s (should be vt, vt2, vt3, lb, md)"
                 % data_type
             )
             return self.get_sample_info_vt_v3
@@ -542,9 +544,9 @@ def get_sample_info_lb(record: Dict) -> SampleInfo:
         )
 
     @staticmethod
-    def get_sample_info_vt_v2(record):
+    def get_sample_info_vt_v2(record: Dict) -> SampleInfo:
         """
-        Convert VT (v2) JSON to a SampleInfo object
+        Convert VirusTotal (v2) JSON to a SampleInfo object
 
         :param record: The JSON record
         :return: An instance of SampleInfo
@@ -572,9 +574,9 @@ def get_sample_info_vt_v2(record):
         return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 
     @staticmethod
-    def get_sample_info_vt_v3(record):
+    def get_sample_info_vt_v3(record: Dict) -> SampleInfo:
         """
-        Convert VT (v3) JSON to a SampleInfo object
+        Convert VirusTotal (v3) JSON to a SampleInfo object
 
         :param record: The JSON record
         :return: An instance of SampleInfo
@@ -602,7 +604,35 @@ def get_sample_info_vt_v3(record):
         return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 
     @staticmethod
-    def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]:
+    def get_sample_info_md(record: Dict) -> SampleInfo:
+        """
+        Convert OPSWAT MetaDefender JSON to a SampleInfo object
+
+        :param record: The JSON record
+        :return: An instance of SampleInfo
+        """
+        try:
+            scans = record["scan_results"]["scan_details"]
+            md5 = record["file_info"]["md5"]
+            sha1 = record["file_info"]["sha1"]
+            sha256 = record["file_info"]["sha256"]
+        except KeyError:
+            return None
+
+        # Obtain labels from scan results
+        label_pairs = []
+        for av, res in scans.items():
+            label = res["threat_found"]
+            if label is not None and res["scan_result_i"] == 1:
+                clean_label = "".join(
+                    filter(lambda x: x in string.printable, label)
+                ).strip()
+                label_pairs.append((av, clean_label))
+
+        return SampleInfo(md5, sha1, sha256, label_pairs, [])
+
+    @staticmethod
+    def is_pup(tag_pairs: List[Tuple], taxonomy: Taxonomy) -> Optional[bool]:
         """
         Attempts to classify a sample (represented by ``tag_pairs``) as a PUP.  We accomplish this by checking for the
         "grayware" label in the highest ranked CLASS.
diff --git a/avclass/labeler.py b/avclass/labeler.py
index 799e889..fbee834 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -653,7 +653,7 @@ def parse_args():
     )
 
     argparser.add_argument(
-        "-t", "--type", help="the type of report file (vt2, vt3, lb)"
+        "-t", "--type", help="the type of report file (vt2, vt3, lb, md)"
     )
 
     argparser.add_argument(

From dd3948f7a6127908e06ce15fbe1d118f0a0cec4d Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Thu, 4 Feb 2021 18:12:39 -0500
Subject: [PATCH 21/36] Copying over some changes from malicialab master (merge
 prep)

---
 .gitignore         |  3 ++
 avclass/common.py  | 46 +++++++++++++-------------
 avclass/labeler.py | 15 ++++++++-
 avclass/update.py  | 82 +++++++++++++++++++++++-----------------------
 4 files changed, 81 insertions(+), 65 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4b38bd9..1438929 100644
--- a/.gitignore
+++ b/.gitignore
@@ -139,3 +139,6 @@ cython_debug/
 
 # PyCharm
 .idea/
+
+# Apple
+.DS_Store
diff --git a/avclass/common.py b/avclass/common.py
index 05e24dd..a799070 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -75,7 +75,7 @@ def __init__(self, filepath: Optional[AnyStr]):
 
         :param filepath: Path to taxonomy data
         """
-        self.__tag_map = {}
+        self._tag_map = {}
         if filepath:
             self.read_taxonomy(filepath)
 
@@ -86,7 +86,7 @@ def __len__(self) -> int:
         :return: The length (int) of the Taxonomy
         """
         return (
-            len(self.__tag_map) // 2
+            len(self._tag_map) // 2
         )  # TODO - perhaps there should be two dicts, one for names, one for paths?
 
     def is_generic(self, tag: AnyStr) -> bool:
@@ -96,7 +96,7 @@ def is_generic(self, tag: AnyStr) -> bool:
         :param tag: The tag
         :return: Boolean
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         return getattr(t, "cat", None) == "GEN"
 
     def is_tag(self, tag: AnyStr) -> bool:
@@ -106,7 +106,7 @@ def is_tag(self, tag: AnyStr) -> bool:
         :param tag: The tag
         :return: Boolean
         """
-        return tag in self.__tag_map
+        return tag in self._tag_map
 
     def add_tag(self, s: AnyStr, override: bool = False):
         """
@@ -117,18 +117,18 @@ def add_tag(self, s: AnyStr, override: bool = False):
         :return: None
         """
         tag = create_tag(s)
-        t = self.__tag_map.get(tag.name, None)
+        t = self._tag_map.get(tag.name, None)
 
         if t and (t.path != tag.path):
             if override:
                 logger.warning("[Taxonomy] Replacing %s with %s\n" % t.path, tag.path)
-                del self.__tag_map[t.path]
+                del self._tag_map[t.path]
             else:
                 return
 
         logger.debug("[Taxonomy] Adding tag %s" % s)
-        self.__tag_map[tag.name] = tag
-        self.__tag_map[tag.path] = tag
+        self._tag_map[tag.name] = tag
+        self._tag_map[tag.path] = tag
 
     def remove_tag(self, tag: AnyStr) -> bool:
         """
@@ -137,11 +137,11 @@ def remove_tag(self, tag: AnyStr) -> bool:
         :param tag: The tag to remove
         :return: Whether or not the tag was present
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         if tag:
             logger.debug("[Taxonomy] Removing tag: %s" % t.path)
-            del self.__tag_map[t.name]
-            del self.__tag_map[t.path]
+            del self._tag_map[t.name]
+            del self._tag_map[t.path]
         return t is not None
 
     def get_category(self, tag: AnyStr) -> AnyStr:
@@ -151,7 +151,7 @@ def get_category(self, tag: AnyStr) -> AnyStr:
         :param tag: The tag
         :return: The category
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         return getattr(t, "cat", "UNK")
 
     def get_path(self, tag: AnyStr) -> AnyStr:
@@ -161,7 +161,7 @@ def get_path(self, tag: AnyStr) -> AnyStr:
         :param tag: The tag
         :return: The tag's path
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         return getattr(t, "path", f"UNK:{tag}")
 
     def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]:
@@ -171,7 +171,7 @@ def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]:
         :param tag: The tag
         :return: The tag's prefix list
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         return getattr(t, "prefix_l", [])
 
     def get_prefix(self, tag: AnyStr) -> List[AnyStr]:
@@ -181,7 +181,7 @@ def get_prefix(self, tag: AnyStr) -> List[AnyStr]:
         :param tag: The tag
         :return: String representation of the tag's full prefix
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         tag_pfx = tag.path.split(":")[:-1]
         return t.prefix_l if t else tag_pfx
 
@@ -192,7 +192,7 @@ def get_depth(self, tag: AnyStr) -> int:
         :param tag: The tag
         :return: The depth (int) of the tag
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         if t:
             return len(tag.prefix_l) + 2
         return 0
@@ -204,7 +204,7 @@ def get_info(self, tag: AnyStr) -> Tuple[AnyStr, AnyStr]:
         :param tag: The tag
         :return: Tuple containing tag.path and tag.cat
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         if t:
             return t.path, t.cat
         return f"UNK:{tag}", "UNK"
@@ -216,9 +216,9 @@ def expand(self, tag: AnyStr) -> List[AnyStr]:
         :param tag: The tag
         :return: A list of prefixes
         """
-        t = self.__tag_map.get(tag, None)
+        t = self._tag_map.get(tag, None)
         if t:
-            return [x for x in t.prefix_l if x in self.__tag_map]
+            return [x for x in t.prefix_l if x in self._tag_map]
         return []
 
     def platform_tags(self) -> Set[AnyStr]:
@@ -229,7 +229,7 @@ def platform_tags(self) -> Set[AnyStr]:
         """
         return {
             tag.name
-            for _, tag in self.__tag_map.items()
+            for _, tag in self._tag_map.items()
             if tag.path.startswith(platform_prefix)
         }
 
@@ -286,7 +286,7 @@ def to_file(self, filepath: AnyStr):
         :return: None
         """
         with open(filepath, "w") as fd:
-            tag_l = sorted(self.__tag_map.items(), key=lambda item: item[1].path)
+            tag_l = sorted(self._tag_map.items(), key=lambda item: item[1].path)
             idx = 0
             for name, tag in tag_l:
                 if (idx % 2) == 0:
@@ -784,12 +784,12 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]
 
             duplicates.add(label)
 
-            label = self.__remove_suffixes(av_name, label)
+            label = self._remove_suffixes(av_name, label)
             hashes = [sample_info.md5, sample_info.sha1, sample_info.sha256]
             tags = self.get_label_tags(label, hashes)
 
             # NOTE: Avoid expansion when aliases are set
-            expanded_tags = tags if self.alias_detect else self.__expand(tags)
+            expanded_tags = tags if self.alias_detect else self._expand(tags)
 
             # store av vendors for each tag
             for t in expanded_tags:
diff --git a/avclass/labeler.py b/avclass/labeler.py
index fbee834..957c1f8 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -1,4 +1,5 @@
 import argparse
+import gzip
 import os
 import json
 import sys
@@ -75,6 +76,7 @@ def run(
         pup_classify: bool = False,
         path_export: bool = False,
         compatibility_v1: bool = False,
+        gzipped: bool = False,
         console: bool = False,
     ) -> List[Dict]:
         # Set class arguments
@@ -108,7 +110,10 @@ def run(
             elif isinstance(ifile, StringIO):
                 fd = ifile
             else:
-                fd = open(ifile, "r")
+                if gzipped:
+                    fd = gzip.open(ifile, "rt")
+                else:
+                    fd = open(ifile, "r")
 
                 # Debug info, file processed
                 self.print_error("[-] Processing input file %s\n" % ifile)
@@ -611,6 +616,7 @@ def main():
         pup_classify=args.pup,
         path_export=args.path,
         compatibility_v1=args.c,
+        gzipped=args.gzip,
         console=not args.json,
     )
     if args.json:
@@ -662,6 +668,13 @@ def parse_args():
         "Prints precision, recall, F1-measure.",
     )
 
+    argparser.add_argument(
+        "-gz",
+        "--gzip",
+        help="file with JSON reports is gzipped",
+        action="store_true",
+    )
+
     argparser.add_argument(
         "-vtt", help="Include VT tags in the output.", action="store_true"
     )
diff --git a/avclass/update.py b/avclass/update.py
index a2bc73b..4e9d5ea 100644
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -30,11 +30,11 @@
 class Update:
     def __init__(self, rel_filepath: AnyStr, in_taxonomy: Taxonomy, in_translation: Translation,
                  in_expansion: Expansion, n, t):
-        self.__out_taxonomy = in_taxonomy
-        self.__out_translation = in_translation
-        self.__out_expansion = in_expansion
-        self.__n = n
-        self.__t = t
+        self._out_taxonomy = in_taxonomy
+        self._out_translation = in_translation
+        self._out_expansion = in_expansion
+        self._n = n
+        self._t = t
         # Initialize blacklist
         self.blist = in_taxonomy.platform_tags()
         # Maps src -> cnt
@@ -55,8 +55,8 @@ def is_weak_rel(self, rel: Relation) -> bool:
         :param rel: The relationship
         :return: Boolean
         """
-        return ((int(rel.nalias_num) < self.__n) or
-                (float(rel.talias_num) < self.__t))
+        return ((int(rel.nalias_num) < self._n) or
+                (float(rel.talias_num) < self._t))
 
     def is_blacklisted_rel(self, rel: Relation) -> bool:
         """
@@ -77,16 +77,16 @@ def is_known_rel(self, rel: Relation) -> bool:
         t1 = rel.t1
         t2 = rel.t2
         # Known taxonomy relation
-        if self.__out_taxonomy.overlaps(t1, t2):
+        if self._out_taxonomy.overlaps(t1, t2):
             return True
         # Known expansion rule
-        t1_dst = self.__out_expansion.get_dst(t1)
-        t2_dst = self.__out_expansion.get_dst(t2)
+        t1_dst = self._out_expansion.get_dst(t1)
+        t2_dst = self._out_expansion.get_dst(t2)
         if t2 in t1_dst or t1 in t2_dst:
             return True
         # Known tagging rule
-        t1_dst = sorted(self.__out_translation.get_dst(t1))
-        t2_dst = sorted(self.__out_translation.get_dst(t2))
+        t1_dst = sorted(self._out_translation.get_dst(t1))
+        t2_dst = sorted(self._out_translation.get_dst(t2))
         if t2 in t1_dst or t1 in t2_dst:
             return True
         # Known alias in tagging
@@ -102,9 +102,9 @@ def add_tag(self, name: AnyStr, path: AnyStr):
         :param path: The full path
         :return: None
         """
-        dst = self.__out_translation.get_dst(name)
+        dst = self._out_translation.get_dst(name)
         if not dst:
-            self.__out_taxonomy.add_tag(path)
+            self._out_taxonomy.add_tag(path)
 
     def add_expansion(self, src: AnyStr, dst_l: Collection[AnyStr]):
         """
@@ -116,19 +116,19 @@ def add_expansion(self, src: AnyStr, dst_l: Collection[AnyStr]):
         """
         ''' Add expansion rule fixing destination if src in tagging '''
         # Select source handling aliases
-        dst = self.__out_translation.get_dst(src)
+        dst = self._out_translation.get_dst(src)
         if dst:
             new_src = dst[0]
         else:
             new_src = src
         # Select destinations removing overlaps with existing rule
-        dst = self.__out_expansion.get_dst(src)
+        dst = self._out_expansion.get_dst(src)
         if dst:
             dst.extend(dst_l)
-            target_l = self.__out_taxonomy.remove_overlaps(dst)
-            self.__out_expansion.add_rule(new_src, target_l, True)
+            target_l = self._out_taxonomy.remove_overlaps(dst)
+            self._out_expansion.add_rule(new_src, target_l, True)
         else:
-            self.__out_expansion.add_rule(new_src, dst_l, True)
+            self._out_expansion.add_rule(new_src, dst_l, True)
 
     def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr):
         """
@@ -140,7 +140,7 @@ def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr):
         :return: None
         """
         # If src in tagging, use most popular target
-        tr_dst = self.__out_translation.get_dst(src)
+        tr_dst = self._out_translation.get_dst(src)
         target = dst
         if tr_dst:
             cnt_max = self.src_map[dst]
@@ -149,17 +149,17 @@ def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr):
                 if cnt > cnt_max:
                     target = e
         # If dst is in tagging, update tagging rule destination,
-        tr_dst = self.__out_translation.get_dst(dst)
+        tr_dst = self._out_translation.get_dst(dst)
         if tr_dst:
             target_l = tr_dst
         # else add dst to taxonomy
         else:
             target_l = [target]
-            self.__out_taxonomy.add_tag('%s:%s' % (dst_prefix, dst))
+            self._out_taxonomy.add_tag('%s:%s' % (dst_prefix, dst))
         # Remove src from taxonomy
-        self.__out_taxonomy.remove_tag(src)
+        self._out_taxonomy.remove_tag(src)
         # Replace tagging rule
-        self.__out_translation.add_rule(src, target_l, True)
+        self._out_translation.add_rule(src, target_l, True)
 
     def is_expansion_rel(self, rel: Relation) -> bool:
         """
@@ -168,8 +168,8 @@ def is_expansion_rel(self, rel: Relation) -> bool:
         :param rel: The relation
         :return: Boolean
         """
-        c1 = self.__out_taxonomy.get_category(rel.t1)
-        c2 = self.__out_taxonomy.get_category(rel.t2)
+        c1 = self._out_taxonomy.get_category(rel.t1)
+        c2 = self._out_taxonomy.get_category(rel.t2)
         return (((c1 == "FAM") and (c2 != c1) and (c2 != "UNK")) or
                 ((c1 == "CLASS") and ((c2 == "FILE") or (c2 == "BEH"))) or
                 ((c1 == "UNK") and ((c2 == "BEH") or (c2 == "CLASS"))))
@@ -182,11 +182,11 @@ def find_expansions(self):
         """
         acc = []
         for rel in self.rel_set:
-            p1 = self.__out_taxonomy.get_path(rel.t1)
-            p2 = self.__out_taxonomy.get_path(rel.t2)
+            p1 = self._out_taxonomy.get_path(rel.t1)
+            p2 = self._out_taxonomy.get_path(rel.t2)
             logger.debug("Processing %s\t%s" % (p1, p2))
             # Ignore relations where t1 is an alias
-            dst = self.__out_translation.get_dst(rel.t1)
+            dst = self._out_translation.get_dst(rel.t1)
             if dst:
                 logger.debug("Ignoring relation for alias %s" % p1)
                 continue
@@ -198,16 +198,16 @@ def find_expansions(self):
 
     # def is_alias_rel(self, rel):
     #    ''' Return true if relation implies alias rule '''
-    #    c1 = self.__out_taxonomy.get_category(rel.t1)
-    #    c2 = self.__out_taxonomy.get_category(rel.t2)
+    #    c1 = self._out_taxonomy.get_category(rel.t1)
+    #    c2 = self._out_taxonomy.get_category(rel.t2)
     #    return (((c1 == "UNK") and (c2 == "FAM")) or
     #            ((c1 == "UNK") and (c2 == "UNK")))
 
     # def find_aliases(self):
     #    ''' Find aliases among relations '''
     #    for rel in self.rel_set:
-    #        c1 = self.__out_taxonomy.get_category(rel.t1)
-    #        c2 = self.__out_taxonomy.get_category(rel.t2)
+    #        c1 = self._out_taxonomy.get_category(rel.t1)
+    #        c2 = self._out_taxonomy.get_category(rel.t2)
     #        if self.is_alias_rel(rel):
     #            self.G.add_node(rel.t1)
     #            self.G.add_node(rel.t2)
@@ -223,8 +223,8 @@ def process_relation(self, rel: Relation):
         """
         t1 = rel.t1
         t2 = rel.t2
-        p1, c1 = self.__out_taxonomy.get_info(rel.t1)
-        p2, c2 = self.__out_taxonomy.get_info(rel.t2)
+        p1, c1 = self._out_taxonomy.get_info(rel.t1)
+        p2, c2 = self._out_taxonomy.get_info(rel.t2)
 
         logger.debug("Processing %s\t%s" % (p1, p2))
 
@@ -388,19 +388,19 @@ def output_relations(self, filepath: AnyStr):
         with open(filepath, 'w') as fd:
             fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
             sorted_rules = sorted(self.rel_set,
-                                  key=lambda r: (self.__out_taxonomy.get_category(r.t1),
-                                                 self.__out_taxonomy.get_category(r.t2)))
+                                  key=lambda r: (self._out_taxonomy.get_category(r.t1),
+                                                 self._out_taxonomy.get_category(r.t2)))
             for rel in sorted_rules:
-                p1, c1 = self.__out_taxonomy.get_info(rel.t1)
-                p2, c2 = self.__out_taxonomy.get_info(rel.t2)
+                p1, c1 = self._out_taxonomy.get_info(rel.t1)
+                p2, c2 = self._out_taxonomy.get_info(rel.t2)
                 fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num,
                                                            rel.talias_num, rel.tinv_alias_num))
 
     def output_rule_stats(self, fd: TextIO):
         # Compute rule statistics
         for rel in self.rel_set:
-            c1 = self.__out_taxonomy.get_category(rel.t1)
-            c2 = self.__out_taxonomy.get_category(rel.t2)
+            c1 = self._out_taxonomy.get_category(rel.t1)
+            c2 = self._out_taxonomy.get_category(rel.t2)
             self.cat_pairs_map[(c1, c2)] = self.cat_pairs_map.get((c1, c2), 0) + 1
             self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1
         # Output statistics

From fc2d7def568fecefaac787c602d4367874aadd79 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 11:28:05 -0500
Subject: [PATCH 22/36] AVLabels default

---
 avclass/cli.py    | 63 ++++++++++++++++++++++++++++++++++++++++++++---
 avclass/common.py |  7 +++---
 avclass/util.py   | 55 -----------------------------------------
 3 files changed, 64 insertions(+), 61 deletions(-)

diff --git a/avclass/cli.py b/avclass/cli.py
index 76e2ad3..0586c3e 100644
--- a/avclass/cli.py
+++ b/avclass/cli.py
@@ -1,6 +1,63 @@
 import argparse
+import logging
 
 from avclass import util
+from avclass.common import Taxonomy, Translation, Expansion
+from typing import AnyStr
+
+
+logger = logging.getLogger(__name__)
+
+__all__ = (
+    'validate_expansion',
+    'validate_tagging',
+    'validate_taxonomy',
+)
+
+def validate_taxonomy(path: AnyStr):
+    """
+    Validate and normalize a Taxonomy created from ``path``
+
+    :param path: Location on disk of a Taxonomy file
+    :return: Taxonomy object
+    """
+    taxonomy = Taxonomy(path)
+    taxonomy.to_file(path)
+
+    logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path))
+
+    return taxonomy
+
+
+def validate_tagging(path: AnyStr, taxonomy: Taxonomy):
+    """
+    Validate and normalize Tagging created from ``path`` and verified against ``taxonomy``
+
+    :param path: Location on disk of a Tagging file
+    :param taxonomy: Valid Taxonomy object
+    :return: None
+    """
+    tagging = Translation(path)
+    tagging.validate(taxonomy)
+    # tagging.expand_all_destinations()
+    tagging.to_file(path)
+
+    logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path))
+
+
+def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
+    """
+    Validate and normalize Expansion created from ``path`` and verified against ``taxonomy``
+
+    :param path: Location on disk of an Expansion file
+    :param taxonomy: Valid Taxonomy object
+    :return: None
+    """
+    expansion = Expansion(path)
+    expansion.validate(taxonomy)
+    expansion.to_file(path)
+
+    logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path))
 
 
 def validate_files():
@@ -17,6 +74,6 @@ def validate_files():
 
     args = parser.parse_args()
 
-    taxonomy = util.validate_taxonomy(args.tax)
-    util.validate_tagging(args.tag, taxonomy)
-    util.validate_expansion(args.exp, taxonomy)
+    taxonomy = validate_taxonomy(args.tax)
+    validate_tagging(args.tag, taxonomy)
+    validate_expansion(args.exp, taxonomy)
diff --git a/avclass/common.py b/avclass/common.py
index afb6b76..a480747 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -4,6 +4,7 @@
 import string
 import sys
 
+from avclass import util
 from collections import defaultdict, namedtuple
 from typing import AnyStr, Callable, Collection, Dict, List, Optional, Set, Tuple, Union
 
@@ -505,9 +506,9 @@ class AvLabels:
 
     def __init__(
         self,
-        tag_file: AnyStr,
-        exp_file: AnyStr = None,
-        tax_file: AnyStr = None,
+        tag_file: AnyStr = util.DEFAULT_TAG_PATH,
+        exp_file: AnyStr = util.DEFAULT_EXP_PATH,
+        tax_file: AnyStr = util.DEFAULT_TAX_PATH,
         av_file: AnyStr = None,
         alias_detect: bool = False,
     ):
diff --git a/avclass/util.py b/avclass/util.py
index 028bc36..e5d8bab 100755
--- a/avclass/util.py
+++ b/avclass/util.py
@@ -1,25 +1,15 @@
 import atexit
-import logging
 import pkg_resources
 
 from avclass import data
-from avclass.common import Taxonomy, Translation, Expansion
-
-from typing import AnyStr
 
 
 __all__ = (
     'DEFAULT_EXP_PATH',
     'DEFAULT_TAG_PATH',
     'DEFAULT_TAX_PATH',
-    'validate_expansion',
-    'validate_tagging',
-    'validate_taxonomy',
 )
 
-
-logger = logging.getLogger(__name__)
-
 RESOURCE_EXP = "default.expansion"
 RESOURCE_TAG = "default.tagging"
 RESOURCE_TAX = "default.taxonomy"
@@ -39,48 +29,3 @@
 
 atexit.register(pkg_resources.cleanup_resources)
 
-
-def validate_taxonomy(path: AnyStr):
-    """
-    Validate and normalize a Taxonomy created from ``path``
-
-    :param path: Location on disk of a Taxonomy file
-    :return: Taxonomy object
-    """
-    taxonomy = Taxonomy(path)
-    taxonomy.to_file(path)
-
-    logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path))
-
-    return taxonomy
-
-
-def validate_tagging(path: AnyStr, taxonomy: Taxonomy):
-    """
-    Validate and normalize Tagging created from ``path`` and verified against ``taxonomy``
-
-    :param path: Location on disk of a Tagging file
-    :param taxonomy: Valid Taxonomy object
-    :return: None
-    """
-    tagging = Translation(path)
-    tagging.validate(taxonomy)
-    # tagging.expand_all_destinations()
-    tagging.to_file(path)
-
-    logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path))
-
-
-def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
-    """
-    Validate and normalize Expansion created from ``path`` and verified against ``taxonomy``
-
-    :param path: Location on disk of an Expansion file
-    :param taxonomy: Valid Taxonomy object
-    :return: None
-    """
-    expansion = Expansion(path)
-    expansion.validate(taxonomy)
-    expansion.to_file(path)
-
-    logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path))

From 6b64ed6ba38efecb66e05d0f7afe57624875d3f8 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 11:44:52 -0500
Subject: [PATCH 23/36] AVClassLabeler default, formatting

---
 avclass/cli.py     | 29 +++++++++++++----------------
 avclass/common.py  | 21 +++++++++++----------
 avclass/labeler.py |  2 +-
 avclass/util.py    |  7 +++----
 4 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/avclass/cli.py b/avclass/cli.py
index 0586c3e..ad528ff 100644
--- a/avclass/cli.py
+++ b/avclass/cli.py
@@ -9,11 +9,12 @@
 logger = logging.getLogger(__name__)
 
 __all__ = (
-    'validate_expansion',
-    'validate_tagging',
-    'validate_taxonomy',
+    "validate_expansion",
+    "validate_tagging",
+    "validate_taxonomy",
 )
 
+
 def validate_taxonomy(path: AnyStr):
     """
     Validate and normalize a Taxonomy created from ``path``
@@ -24,7 +25,7 @@ def validate_taxonomy(path: AnyStr):
     taxonomy = Taxonomy(path)
     taxonomy.to_file(path)
 
-    logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path))
+    logger.info("[-] Normalized %d tags in taxonomy %s\n" % (len(taxonomy), path))
 
     return taxonomy
 
@@ -42,7 +43,7 @@ def validate_tagging(path: AnyStr, taxonomy: Taxonomy):
     # tagging.expand_all_destinations()
     tagging.to_file(path)
 
-    logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path))
+    logger.info("[-] Normalized %d tagging rules in %s\n" % (len(tagging), path))
 
 
 def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
@@ -57,20 +58,16 @@ def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
     expansion.validate(taxonomy)
     expansion.to_file(path)
 
-    logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path))
+    logger.info("[-] Normalized %d expansion rules in %s\n" % (len(expansion), path))
 
 
 def validate_files():
-    parser = argparse.ArgumentParser(description='Checks format of files Tagging, Expansion and Taxonomy.')
-    parser.add_argument('-exp',
-                        help='expansion file',
-                        default=util.DEFAULT_EXP_PATH)
-    parser.add_argument('-tag',
-                        help='tagging file',
-                        default=util.DEFAULT_TAG_PATH)
-    parser.add_argument('-tax',
-                        help='taxonomy file',
-                        default=util.DEFAULT_TAX_PATH)
+    parser = argparse.ArgumentParser(
+        description="Checks format of files Tagging, Expansion and Taxonomy."
+    )
+    parser.add_argument("-exp", help="expansion file", default=util.DEFAULT_EXP_PATH)
+    parser.add_argument("-tag", help="tagging file", default=util.DEFAULT_TAG_PATH)
+    parser.add_argument("-tax", help="taxonomy file", default=util.DEFAULT_TAX_PATH)
 
     args = parser.parse_args()
 
diff --git a/avclass/common.py b/avclass/common.py
index a480747..d812523 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -40,7 +40,8 @@
 
 
 class Tag:
-    ''' A Tag in the taxonomy '''
+    """ A Tag in the taxonomy """
+
     def __init__(self, s):
         word_list = s.strip().split(":")
         if len(word_list) > 1:
@@ -49,8 +50,8 @@ def __init__(self, s):
             self._prefix_l = [x.lower() for x in word_list[1:-1]]
             path = self._cat
             for x in self._prefix_l:
-                path = path + ':' + x
-            self._path = path + ':' + self._name
+                path = path + ":" + x
+            self._path = path + ":" + self._name
         else:
             self._name = word_list[0].lower()
             self._cat = uncategorized_cat
@@ -58,27 +59,27 @@ def __init__(self, s):
             self._path = self._name
 
     def __hash__(self):
-        ''' Return hash '''
+        """ Return hash """
         return hash((self._path))
 
     @property
     def name(self):
-        ''' Return tag name '''
+        """ Return tag name """
         return self._name
 
     @property
     def cat(self):
-        ''' Return tag category '''
+        """ Return tag category """
         return self._cat
 
     @property
     def path(self):
-        ''' Return tag path '''
+        """ Return tag path """
         return self._path
 
     @property
     def prefix_l(self):
-        ''' Return tag prefix list '''
+        """ Return tag prefix list """
         return self._prefix_l
 
 
@@ -107,7 +108,7 @@ def __len__(self) -> int:
         return len(self._tags)
 
     def __iter__(self):
-        ''' Iterator over the alphabetically sorted tags in the taxonomy '''
+        """ Iterator over the alphabetically sorted tags in the taxonomy """
         return (t for t in sorted(self._tags))
 
     def is_generic(self, tag: AnyStr) -> bool:
@@ -522,7 +523,7 @@ def __init__(
     def get_sample_call(self, data_type: AnyStr) -> Callable:
         """
         Return the correct parser for the report type
-        
+
         :param data_type: the type of file vt2, vt3, lb, md
         :return: Callable function that returns SampleInfo
         """
diff --git a/avclass/labeler.py b/avclass/labeler.py
index 957c1f8..c1d7d67 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -52,7 +52,7 @@ class AVClassLabeler:
         "UNK": 0,
     }
 
-    def __init__(self, av_labels: AvLabels):
+    def __init__(self, av_labels: AvLabels = AvLabels()):
         self.av_labels = av_labels
 
     def run(
diff --git a/avclass/util.py b/avclass/util.py
index e5d8bab..8a21db2 100755
--- a/avclass/util.py
+++ b/avclass/util.py
@@ -5,9 +5,9 @@
 
 
 __all__ = (
-    'DEFAULT_EXP_PATH',
-    'DEFAULT_TAG_PATH',
-    'DEFAULT_TAX_PATH',
+    "DEFAULT_EXP_PATH",
+    "DEFAULT_TAG_PATH",
+    "DEFAULT_TAX_PATH",
 )
 
 RESOURCE_EXP = "default.expansion"
@@ -28,4 +28,3 @@
     DEFAULT_TAX_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_TAX)
 
 atexit.register(pkg_resources.cleanup_resources)
-

From c9e9c6741ab8f7b2b2c9313cf46b773c49efa7d1 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 13:42:01 -0500
Subject: [PATCH 24/36] Add stats, avtags, alias to json output

---
 .gitignore         |   5 ++
 avclass/labeler.py | 116 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/.gitignore b/.gitignore
index 1438929..bd84ced 100644
--- a/.gitignore
+++ b/.gitignore
@@ -142,3 +142,8 @@ cython_debug/
 
 # Apple
 .DS_Store
+
+# Output
+*.avtags
+*.stats
+*.alias
diff --git a/avclass/labeler.py b/avclass/labeler.py
index c1d7d67..0f0259b 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -22,7 +22,7 @@
 
 
 class AVClassLabeler:
-    output = []
+    output = {"labels": []}
     av_labels = None
     hash_type = None
     ground_truth = None
@@ -97,7 +97,10 @@ def run(
         self.get_sample_info = self.av_labels.get_sample_call(data_type)
 
         # Select output prefix
-        out_prefix = os.path.basename(os.path.splitext(files[0])[0])
+        if isinstance(files, list) and isinstance(files[0], str):
+            out_prefix = os.path.basename(os.path.splitext(files[0])[0])
+        else:
+            out_prefix = None
 
         # Process each input file
         if not isinstance(files, list):
@@ -147,7 +150,7 @@ def run(
         if self.stats_export:
             self.out_stats(out_prefix)
 
-        # Output vendor info
+        # Output av vendor info
         if self.av_tags:
             self.out_avtags(out_prefix)
 
@@ -238,7 +241,7 @@ def get_tokens(self, sample_info: NamedTuple):
                 pup_val=pup_val,
                 vt_count=vt_count,
             )
-            self.output.append(class_entry)
+            self.output["labels"].append(class_entry)
         else:
             class_entry = self.avclass2_output(
                 name=name,
@@ -248,7 +251,7 @@ def get_tokens(self, sample_info: NamedTuple):
                 pup_val=pup_val,
                 vt_count=vt_count,
             )
-            self.output.append(class_entry)
+            self.output["labels"].append(class_entry)
 
     def avclass1_output(
         self,
@@ -434,17 +437,28 @@ def ground_truth_print(self):
             "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n"
             % (precision, recall, fmeasure)
         )
+        self.output["ground_truth"] = {
+            "precision": "%.2f" % precision,
+            "recall": "%.2f" % recall,
+            "f1-measure": "%.2f" % fmeasure,
+        }
 
     def alias_detection(self, out_prefix: AnyStr, path_export: bool = False):
-        # Open alias file
-        alias_filename = out_prefix + ".alias"
-        alias_fd = open(alias_filename, "w+")
+        self.output["alias"] = []
+        alias_fd = None
+        alias_filename = None
         # Sort token pairs by number of times they appear together
         sorted_pairs = sorted(self.pair_count_map.items(), key=itemgetter(1))
         # sorted_pairs = sorted(self.pair_count_map.items())
 
-        # Output header line
-        alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n")
+        # Open alias file
+        if out_prefix:
+            alias_filename = out_prefix + ".alias"
+            alias_fd = open(alias_filename, "w+")
+            # Output header line
+            alias_fd.write(
+                "# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n"
+            )
         # Compute token pair statistic and output to alias file
         for (t1, t2), c in sorted_pairs:
             n1 = self.token_count_map[t1]
@@ -464,41 +478,79 @@ def alias_detection(self, out_prefix: AnyStr, path_export: bool = False):
             if path_export:
                 x = self.av_labels.taxonomy.get_path(x)
                 y = self.av_labels.taxonomy.get_path(y)
-            alias_fd.write(
-                "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv)
+            self.output["alias"].append(
+                {
+                    "tag1_label": x,
+                    "tag2_label": y,
+                    "tag1": xn,
+                    "tag2": yn,
+                    "tag1^tag2": c,
+                    "tag1^tag2/tag1": f,
+                    "tag1^tag2/tag2": finv,
+                }
             )
-        # Close alias file
-        alias_fd.close()
-        self.print_error("[-] Alias data in %s\n" % (alias_filename))
+            if out_prefix:
+                alias_fd.write(
+                    "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv)
+                )
+        if out_prefix:
+            # Close alias file
+            alias_fd.close()
+            self.print_error("[-] Alias data in %s\n" % (alias_filename))
 
     def out_avtags(self, out_prefix: AnyStr):
-        avtags_fd = open("%s.avtags" % out_prefix, "w")
-        for t in sorted(self.avtags_dict.keys()):
-            avtags_fd.write("%s\t" % t)
+        if out_prefix:
+            avtags_fd = open("%s.avtags" % out_prefix, "w")
+            for t in sorted(self.avtags_dict.keys()):
+                avtags_fd.write("%s\t" % t)
+                pairs = sorted(
+                    self.avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True
+                )
+                for pair in pairs:
+                    avtags_fd.write("%s|%d," % (pair[0], pair[1]))
+                avtags_fd.write("\n")
+            avtags_fd.close()
+        self.output["av_tags"] = {}
+        for tag in sorted(self.avtags_dict.keys()):
+            self.output["av_tags"][tag] = []
             pairs = sorted(
-                self.avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True
+                self.avtags_dict[tag].items(), key=lambda pair: pair[1], reverse=True
             )
             for pair in pairs:
-                avtags_fd.write("%s|%d," % (pair[0], pair[1]))
-            avtags_fd.write("\n")
-        avtags_fd.close()
+                self.output["av_tags"][tag].append({"name": pair[0], "count": pair[1]})
 
     def out_stats(self, out_prefix: AnyStr):
         # Output stats
-        stats_fd = open("%s.stats" % out_prefix, "w")
         num_samples = self.vt_all
-        stats_fd.write("Samples: %d\n" % num_samples)
         num_tagged = self.stats["tagged"]
-        frac = float(num_tagged) / float(num_samples) * 100
-        stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac))
+        tag_frac = float(num_tagged) / float(num_samples) * 100
+
         num_maltagged = self.stats["maltagged"]
-        frac = float(num_maltagged) / float(num_samples) * 100
-        stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac))
-        for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]:
-            count = self.stats[c]
+        maltag_frac = float(num_maltagged) / float(num_samples) * 100
+        if out_prefix:
+            stats_fd = open("%s.stats" % out_prefix, "w")
+            stats_fd.write("Samples: %d\n" % num_samples)
+            stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, tag_frac))
+            stats_fd.write(
+                "Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, maltag_frac)
+            )
+            for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]:
+                count = self.stats[c]
+                frac = float(count) / float(num_maltagged) * 100
+                stats_fd.write("%s: %d (%.01f%%)\n" % (c, self.stats[c], frac))
+            stats_fd.close()
+        self.output["stats"] = {
+            "samples": num_samples,
+            "tagged_all": {"count": num_tagged, "ratio": "%.01f%%" % tag_frac},
+            "tagged_vt3": {"count": num_maltagged, "ratio": "%.01f%%" % maltag_frac},
+            "category": [],
+        }
+        for cat in ["FILE", "CLASS", "BEH", "FAM", "UNK"]:
+            count = self.stats[cat]
             frac = float(count) / float(num_maltagged) * 100
-            stats_fd.write("%s: %d (%.01f%%)\n" % (c, self.stats[c], frac))
-        stats_fd.close()
+            self.output["stats"]["category"].append(
+                {cat: {"count": count, "ratio": "%.01f%%" % frac}}
+            )
 
     def guess_hash(self, h: AnyStr) -> Optional[AnyStr]:
         """

From 72c42a0fb5ce741483ec0cd3ae63112e3eb16c2f Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 13:59:42 -0500
Subject: [PATCH 25/36] Just detect gz by magic bytes

---
 avclass/labeler.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index 0f0259b..2343fe4 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -76,7 +76,6 @@ def run(
         pup_classify: bool = False,
         path_export: bool = False,
         compatibility_v1: bool = False,
-        gzipped: bool = False,
         console: bool = False,
     ) -> List[Dict]:
         # Set class arguments
@@ -113,7 +112,7 @@ def run(
             elif isinstance(ifile, StringIO):
                 fd = ifile
             else:
-                if gzipped:
+                if self.is_gz_file(ifile):
                     fd = gzip.open(ifile, "rt")
                 else:
                     fd = open(ifile, "r")
@@ -630,6 +629,10 @@ def list_str(
             out = out + sep + s
         return out
 
+    def is_gz_file(self, filepath):
+        with open(filepath, "rb") as test_f:
+            return test_f.read(2) == b"\x1f\x8b"
+
     def print_error(self, output: AnyStr = "", flush=False):
         if self.console:
             # TODO - would this be better? print(output, file=sys.stderr, flush=flush, end="")
@@ -668,7 +671,6 @@ def main():
         pup_classify=args.pup,
         path_export=args.path,
         compatibility_v1=args.c,
-        gzipped=args.gzip,
         console=not args.json,
     )
     if args.json:
@@ -707,7 +709,7 @@ def parse_args():
         "-i",
         "--input",
         action="append",
-        help="input report file or directory (Can be provided multiple times)",
+        help="input report file (plain or gzip) or directory. (Can be provided multiple times)",
     )
 
     argparser.add_argument(
@@ -720,13 +722,6 @@ def parse_args():
         "Prints precision, recall, F1-measure.",
     )
 
-    argparser.add_argument(
-        "-gz",
-        "--gzip",
-        help="file with JSON reports is gzipped",
-        action="store_true",
-    )
-
     argparser.add_argument(
         "-vtt", help="Include VT tags in the output.", action="store_true"
     )

From 7fb6bd1b3ced5c5c6bfe8c922584fff0513aa222 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 15:30:26 -0500
Subject: [PATCH 26/36] Readme update, path_export fix, renamed class argument

---
 README.md          | 216 ++++++++++++++++++++++++++++++++++-----------
 avclass/labeler.py |  18 ++--
 setup.py           |   2 +-
 3 files changed, 176 insertions(+), 60 deletions(-)

diff --git a/README.md b/README.md
index c6eae7a..6d92cf4 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,17 @@
-# AVClass and AVClass2
+# AVClass
 
-AVClass and AVClass2 are Python tools to tag / label malware samples. 
-You give them as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) 
-and they output tags extracted from the AV labels of each sample. 
-The original AVClass only outputs family names (i.e., family tags). 
-By default, it outputs the most likely family for each sample (e.g., *zbot*, *virut*). 
+AVClass is a Python package / command line tool to tag / label malware samples. 
+You input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) 
+and it outputs tags extracted from the AV labels of each sample. 
+AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
+It can also be run in compatibility mode `-c` (AVClass 1.x) to only output the family names (i.e., family tags). 
 It can also output a ranking of all alternative family names it found for each sample.
-The newer AVClass2, in addition to family names, also outputs other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
 
-A quick example helps illustrating the differences. If you run AVClass2 on our example input file:
+
+A quick example helps illustrating the differences of compatibility mode. If you run AVClass on our example input file:
 
 ```shell
-$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p
+$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p
 ```
 
 the output on stdout is:
@@ -27,40 +27,150 @@ was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is
 Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
 consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on.
 
-If you instead run AVClass on the same input file:
+If you instead run AVClass on the same input file in compatibility mode:
 
 ```shell
-$./avclass/avclass_labeler.py -lb examples/malheurReference_lb.json
+$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -c
 ```
 
-the output looks like this:
+the output looks like this, which simply reports the most common family name for each sample.
 
 ```
 aca2d12934935b070df8f50e06a20539 adrotator
 67d15459e1f85898851148511c86d88d adultbrowser
 ``` 
 
-which simply reports the most common family name for each sample.
-
-In a nutshell, that is the main difference between both tools. 
-Of course, there are more options for both tools, 
-which you can read about in their corresponding README files. 
 
+The output can also be formatted as **JSON**.
+```shell
+$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p -json
+```
+the output on stdout is:
 
-## Which one should I use?
+```json
+{
+  "labels": [
+    {
+      "hash": "aca2d12934935b070df8f50e06a20539",
+      "av_count": 33,
+      "tags": [
+        {
+          "tag": "grayware",
+          "count": 9,
+          "category": "CLASS",
+          "path": "CLASS:grayware"
+        },
+        {
+          "tag": "adware",
+          "count": 9,
+          "category": "CLASS",
+          "path": "CLASS:grayware:adware"
+        },
+        {
+          "tag": "windows",
+          "count": 8,
+          "category": "FILE",
+          "path": "FILE:os:windows"
+        },
+        {
+          "tag": "adrotator",
+          "count": 8,
+          "category": "FAM",
+          "path": "FAM:adrotator"
+        },
+        {
+          "tag": "execdownload",
+          "count": 3,
+          "category": "BEH",
+          "path": "BEH:execdownload"
+        },
+        {
+          "tag": "downloader",
+          "count": 3,
+          "category": "CLASS",
+          "path": "CLASS:downloader"
+        },
+        {
+          "tag": "zlob",
+          "count": 2,
+          "category": "FAM",
+          "path": "FAM:zlob"
+        }
+      ]
+    }
+  ]
+}
+```
 
-AVClass2 is the newer tool and it extracts more information 
-from the input AV labels.
-So, if you are new to AVClass and AVClass2, we recommend trying it out first.
+Or it can be used as a Python package:
+```py
+import json
+from avclass.labeler import AVClassLabeler
+
+av_class = AVClassLabeler()
+result = av_class.run(
+    files="./examples/malheurReference_lb.json",
+    data_type="lb",
+    path_export=True,
+)
+print(json.dumps(result))
+```
+the output on stdout is:
 
-However, there are several reasons to keep AVClass around. 
-First, it is more mature and used by many analysts, 
-so we want to preserve backwards compatibility.
-Second, for some applications only family names are needed and 
-for that AVClass is enough.
-Third, AVClass is faster than AVClass2 since it extracts less info. 
-The lower runtime is nice when processing millions of samples and 
-not requiring the extra tags AVClass2 provides. 
+```json
+{
+  "labels": [
+    {
+      "hash": "aca2d12934935b070df8f50e06a20539",
+      "av_count": 33,
+      "tags": [
+        {
+          "tag": "grayware",
+          "count": 9,
+          "category": "CLASS",
+          "path": "CLASS:grayware"
+        },
+        {
+          "tag": "adware",
+          "count": 9,
+          "category": "CLASS",
+          "path": "CLASS:grayware:adware"
+        },
+        {
+          "tag": "windows",
+          "count": 8,
+          "category": "FILE",
+          "path": "FILE:os:windows"
+        },
+        {
+          "tag": "adrotator",
+          "count": 8,
+          "category": "FAM",
+          "path": "FAM:adrotator"
+        },
+        {
+          "tag": "execdownload",
+          "count": 3,
+          "category": "BEH",
+          "path": "BEH:execdownload"
+        },
+        {
+          "tag": "downloader",
+          "count": 3,
+          "category": "CLASS",
+          "path": "CLASS:downloader"
+        },
+        {
+          "tag": "zlob",
+          "count": 2,
+          "category": "FAM",
+          "path": "FAM:zlob"
+        }
+      ]
+    }
+  ]
+}
+```
 
 ## References
 
@@ -80,16 +190,15 @@ The design and evaluation of AVClass2 is detailed in our
 AVClass2: Massive Malware Tag Extraction from AV Labels. 
 In proceedings of the Annual Computer Security Applications Conference, December 2020.
 
-## Why are AVClass and AVClass2 useful?
+## Why is AVClass useful?
 
 Because a lot of times security researchers want to extract family and other 
 information from AV labels, but this process is not as simple as it looks, 
 especially if you need to do it for large numbers (e.g., millions) of samples. 
-Some advantages of AVClass and AVClass2 are:
+Some advantages of AVClass are:
 
 1. *Automatic.* They remove manual analysis limitations on the size of the 
-input 
-dataset.
+input dataset.
 
 2. *Vendor-agnostic.* They operate on the labels of any available set of AV 
 engines, which can vary from sample to sample.
@@ -100,7 +209,7 @@ engines, e.g., Windows or Android malware.
 4. *Does not require executables.* AV labels can be obtained from online services
  like VirusTotal using a sample's hash, even when the executable is not available.
 
-5. *Quantified accuracy.* We have evaluated AVClass and AVClass2 on millions of 
+5. *Quantified accuracy.* We have evaluated AVClass 2.x on millions of 
 samples and publicly available malware datasets with ground truth. 
 Evaluation details are in the RAID 2016 and ACSAC 2020 papers.
 
@@ -110,21 +219,21 @@ these tools.
 
 ## Limitations
 
-The main limitations of AVClass and AVClass2 are that its output depends 
+The main limitation of AVClass is that the output depends 
 on the input AV labels. 
-Both tools try to compensate for the noise on the AV labels, 
+The tool tries to compensate for the noise on the AV labels, 
 but cannot identify tags if AV engines do not provide non-generic tokens 
 in the labels of a sample. 
-In particular, they cannot tag samples if at least 2 AV engines 
+In particular, it cannot tag samples if at least 2 AV engines 
 do not agree on a tag. 
 
-Still, there are many samples that both tools can tag
-and thus we believe you will find them useful.
+Still, there are many samples that it can tag
+and thus we believe you will find it useful.
 We recommend you to read the RAID 2016 and ACSAC 2020 papers for more details.
 
 ## Input JSON format
 
-AVClass and AVClass2 support three input JSON formats: 
+AVClass supports four input JSON formats: 
 
 1. VirusTotal v2 API JSON reports (*-vt file*), 
 where each line in the input *file* should be the full JSON of a 
@@ -133,7 +242,7 @@ e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apike
 There is an example VirusTotal v2 input file in examples/vtv2_sample.json
 
 ```shell
-$./avclass2/avclass2_labeler.py -vt examples/vtv2_sample.json -p > output.txt
+$./avclass/labeler.py -i examples/vtv2_sample.json -t vt2 -p > output.txt
 ```
 
 2. VirusTotal v3 API JSON reports (*-vt file -vt3*), 
@@ -142,7 +251,7 @@ e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash}
 There is an example VirusTotal v3 input file in examples/vtv3_sample.json
 
 ```shell
-$./avclass2/avclass2_labeler.py -vt examples/vtv3_sample.json -p -vt3 > output.txt
+$./avclass/labeler.py -i examples/vtv3_sample.json -p -t vt3 > output.txt
 ```
 
 3. Simplified JSON (*-lb file*),
@@ -152,16 +261,23 @@ with (at least) these fields:
 There is an example of such input file in *examples/malheurReference_lb.json*
 
 ```shell
-$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p > output.txt
+$./avclass/labeler.py -i examples/malheurReference_lb.json -t lb -p > output.txt
+```
+
+4. Metadefender JSON (*-md file*),
+where each line in *file* should be a JSON document
+
+```shell
+$./avclass/labeler.py -i examples/malheurReference_lb.json -t md -p > output.txt
 ```
 
 **Why have a simplified JSON format?**
 
 We believe most users will get the AV labels using VirusTotal. 
-However, AVClass and AVClass2 are IO-bound and a VirusTotal report 
+However, AVClass is IO-bound and a VirusTotal report 
 in addition to the AV labels and hashes includes 
-much other data that the tools do not need. 
-Thus, when applying AVClass or AVClass2 to millions of samples,
+a lot of other data that the tools do not need. 
+Thus, when applying AVClass to millions of samples,
 reducing the input file size by removing unnnecessary data 
 significantly improves efficiency. 
 Furthermore, users could obtain AV labels from other sources and 
@@ -170,8 +286,8 @@ the easier to convert those AV labels into an input file.
 
 ## Dependencies
 
-AVClass and AVClass2 are both written in Python. 
-They should both run on Python versions above 2.7 and 3.0.
+AVClass is written in Python. 
+It should be run on Python versions >= 3.6.
 
 They do not require installing any dependencies.
 
@@ -182,11 +298,11 @@ pull request through GitHub.
 
 ## License
 
-AVClass and AVClass2 are both released under the MIT license
+AVClass is released under the MIT license
 
 ## Contributors
 
 Several members of the MaliciaLab at the [IMDEA Software Institute](http://software.imdea.org) 
-have contributed code to AVClasss and AVClass2: 
+have contributed code to AVClass: 
 Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, Silvia Sebastián, and Juan Caballero.
 
diff --git a/avclass/labeler.py b/avclass/labeler.py
index 2343fe4..481b984 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -30,7 +30,7 @@ class AVClassLabeler:
     console = False
     av_tags = False
     stats_export = False
-    compatibility_v1 = False
+    family_only = False
     pup_classify = False
     path_export = False
     vt_tags = False
@@ -75,7 +75,7 @@ def run(
         av_tags: bool = False,
         pup_classify: bool = False,
         path_export: bool = False,
-        compatibility_v1: bool = False,
+        family_only: bool = False,
         console: bool = False,
     ) -> List[Dict]:
         # Set class arguments
@@ -83,7 +83,7 @@ def run(
         self.ground_truth = ground_truth
         self.av_tags = av_tags
         self.stats_export = stats_export
-        self.compatibility_v1 = compatibility_v1
+        self.family_only = family_only
         self.pup_classify = pup_classify
         self.path_export = path_export
         self.vt_tags = vt_tags
@@ -232,7 +232,7 @@ def get_tokens(self, sample_info: NamedTuple):
         pup_val = self.is_pup(self.pup_classify, tags)
 
         # Print family (and ground truth if available)
-        if self.compatibility_v1:
+        if self.family_only:
             class_entry = self.avclass1_output(
                 name=name,
                 family=fam,
@@ -307,7 +307,7 @@ def avclass2_output(
         else:
             vtt = ""
         tag_str = self.format_tag_pairs_str(
-            tags, self.av_labels.taxonomy, self.path_export
+            tags=tags, taxonomy=self.av_labels.taxonomy, path_export=self.path_export
         )
         self.print_output(
             "%s\t%d\t%s%s%s%s\n"
@@ -315,7 +315,7 @@ def avclass2_output(
         )
         # Build json output
         tag_dict = self.format_tag_pairs_list(
-            tags, self.av_labels.taxonomy, self.path_export
+            tags=tags, taxonomy=self.av_labels.taxonomy, path_export=self.path_export
         )
         values = {"hash": name, "av_count": vt_count, "tags": tag_dict}
         if self.ground_truth:
@@ -327,7 +327,7 @@ def avclass2_output(
         return values
 
     def get_family(self, name: AnyStr, tags: List[Tuple]) -> Tuple:
-        if self.compatibility_v1 or self.ground_truth:
+        if self.family_only or self.ground_truth:
             fam = "SINGLETON:" + name
             # fam = ''
             for (t, s) in tags:
@@ -605,7 +605,7 @@ def format_tag_pairs_list(
         out = []
         for (tag, count) in tags:
             values = {"tag": tag, "count": count}
-            if path_export and taxonomy:
+            if path_export and taxonomy is not None:
                 values["category"] = taxonomy.get_category(tag)
                 values["path"] = taxonomy.get_path(tag)
             out.append(values)
@@ -670,7 +670,7 @@ def main():
         ground_truth=args.gt,
         pup_classify=args.pup,
         path_export=args.path,
-        compatibility_v1=args.c,
+        family_only=args.c,
         console=not args.json,
     )
     if args.json:
diff --git a/setup.py b/setup.py
index 6bcc101..fa4ba97 100644
--- a/setup.py
+++ b/setup.py
@@ -20,6 +20,6 @@
     entry_points={
         'console_scripts': [
             'avclass = avclass.labeler:main',
-            'avclass-validate = avclass.util:validate_files',
+            'avclass-validate = avclass.cli:validate_files',
         ],
     })

From 127c96df6d2e723d3e72522f4cb7b5bea135e03a Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 17:33:32 -0500
Subject: [PATCH 27/36] Readme, setup

---
 README.md          | 100 ++++++++++++++++++++++++++++++++++++++-------
 avclass/cli.py     |  13 ++++--
 avclass/labeler.py |  14 +++----
 avclass/update.py  |   5 ++-
 setup.py           |   1 +
 5 files changed, 108 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 6d92cf4..3b15139 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,23 @@
+# Install
+```shell
+$ git clone http://.../avclass
+$ cd avclass
+$ pip3 install .
+```
+
 # AVClass
 
 AVClass is a Python package / command line tool to tag / label malware samples. 
 You input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) 
 and it outputs tags extracted from the AV labels of each sample. 
-AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
-It can also be run in compatibility mode `-c` (AVClass 1.x) to only output the family names (i.e., family tags). 
-It can also output a ranking of all alternative family names it found for each sample.
+AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*).  It can also output a ranking of all alternative family names it found for each sample.
+There is also a compatibility mode `-c` (AVClass 1.x) that will only output the family names (i.e., family tags). 
 
 
-A quick example helps illustrating the differences of compatibility mode. If you run AVClass on our example input file:
+A quick example helps illustrating the differences of 1.x compatibility mode. If you run AVClass on our example input file:
 
 ```shell
-$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p
+$ avclass -i ./examples/malheurReference_lb.json -t lb -p
 ```
 
 the output on stdout is:
@@ -27,10 +33,10 @@ was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is
 Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
 consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on.
 
-If you instead run AVClass on the same input file in compatibility mode:
+If you instead run AVClass on the same input file in compatibility mode `-c`:
 
 ```shell
-$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -c
+$ avclass -i ./examples/malheurReference_lb.json -t lb -c
 ```
 
 the output looks like this, which simply reports the most common family name for each sample.
@@ -43,11 +49,11 @@ aca2d12934935b070df8f50e06a20539 adrotator
 
 The output can also be formatted as **JSON**.
 ```shell
-$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p -json
+$ avclass -i ./examples/malheurReference_lb.json -t lb -p -json
 ```
 the output on stdout is:
 
-```json
+```yaml
 {
   "labels": [
     {
@@ -117,7 +123,7 @@ print(json.dumps(result))
 ```
 the output on stdout is:
 
-```json
+```yaml
 {
   "labels": [
     {
@@ -172,6 +178,72 @@ the output on stdout is:
 }
 ```
 
+## Update Module
+
+The update module can be used to suggest additions and changes to the input 
+taxonomy, tagging rules, and expansion rules. 
+Using the update module comprises two steps.
+The first step is obtaining an alias file from the labeler:
+
+```shell
+$ avclass -i ./examples/malheurReference_lb.json -t lb -aliasdetect
+```
+
+The above command will create a file named \<file\>.alias, 
+malheurReference_lb.alias in our example. This file has 7 columns:
+
+1. t1: token that is an alias
+2. t2: tag for which t1 is an alias
+3. |t1|: number of input samples where t1 was observed
+4. |t2|: number of input samples where t2 was observed
+5. |t1^t2|: number of input samples where both t1 and t2 were observed
+6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed.
+7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed.
+
+
+The Update Module takes the above file as input with the -alias option, 
+as well as the default taxonomy, tagging, and expansion files in the data directory. 
+It outputs updated taxonomy, tagging, and expansion files that include the 
+suggested additions and changes. 
+
+```shell
+$ avclass-update -alias malheurReference_lb.alias -o output_prefix
+```
+
+This will produce three files: 
+output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. 
+You can diff the output and input files to analyze the proposed changes.
+
+You can also modify the input taxonomy, tagging, and expansion rules in place, 
+rather than producing new files:
+
+
+```shell
+$ avclass-update -alias malheurReference_lb.alias -update
+```
+
+
+## Customizing AVClass
+
+AVClass is fully customizable: 
+Tagging, Expansion and Taxonomy files can be easily modified by the analyst 
+either manually or by running the update module. 
+
+If you change those files manually, we recommend running 
+afterwards the input checker script to keep them tidy. 
+It sorts the tags in the taxonomy and performs some basic cleaning like 
+removing redundant entries:
+
+```shell
+$ avclass-validate -tax taxonomy_file -tag tagging_file -exp expansion_file
+```
+
+If the modifications are in the default files in the data directory you can simply run: 
+
+```shell
+$ avclass-validate
+```
+
 ## References
 
 The design and evaluation of AVClass is detailed in our 
@@ -242,7 +314,7 @@ e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apike
 There is an example VirusTotal v2 input file in examples/vtv2_sample.json
 
 ```shell
-$./avclass/labeler.py -i examples/vtv2_sample.json -t vt2 -p > output.txt
+$ avclass -i examples/vtv2_sample.json -t vt2 -p > output.txt
 ```
 
 2. VirusTotal v3 API JSON reports (*-vt file -vt3*), 
@@ -251,7 +323,7 @@ e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash}
 There is an example VirusTotal v3 input file in examples/vtv3_sample.json
 
 ```shell
-$./avclass/labeler.py -i examples/vtv3_sample.json -p -t vt3 > output.txt
+$ avclass -i examples/vtv3_sample.json -p -t vt3 > output.txt
 ```
 
 3. Simplified JSON (*-lb file*),
@@ -261,14 +333,14 @@ with (at least) these fields:
 There is an example of such input file in *examples/malheurReference_lb.json*
 
 ```shell
-$./avclass/labeler.py -i examples/malheurReference_lb.json -t lb -p > output.txt
+$ avclass -i examples/malheurReference_lb.json -t lb -p > output.txt
 ```
 
 4. Metadefender JSON (*-md file*),
 where each line in *file* should be a JSON
 
 ```shell
-$./avclass/labeler.py -i examples/malheurReference_lb.json -t md -p > output.txt
+$ avclass -i examples/malheurReference_lb.json -t md -p > output.txt
 ```
 
 **Why have a simplified JSON format?**
diff --git a/avclass/cli.py b/avclass/cli.py
index ad528ff..1fa35ac 100644
--- a/avclass/cli.py
+++ b/avclass/cli.py
@@ -25,7 +25,8 @@ def validate_taxonomy(path: AnyStr):
     taxonomy = Taxonomy(path)
     taxonomy.to_file(path)
 
-    logger.info("[-] Normalized %d tags in taxonomy %s\n" % (len(taxonomy), path))
+    print("[-] Normalized %d tags in taxonomy %s" % (len(taxonomy), path))
+    logger.info("[-] Normalized %d tags in taxonomy %s" % (len(taxonomy), path))
 
     return taxonomy
 
@@ -43,7 +44,8 @@ def validate_tagging(path: AnyStr, taxonomy: Taxonomy):
     # tagging.expand_all_destinations()
     tagging.to_file(path)
 
-    logger.info("[-] Normalized %d tagging rules in %s\n" % (len(tagging), path))
+    print("[-] Normalized %d tagging rules in %s" % (len(tagging), path))
+    logger.info("[-] Normalized %d tagging rules in %s" % (len(tagging), path))
 
 
 def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
@@ -58,7 +60,8 @@ def validate_expansion(path: AnyStr, taxonomy: Taxonomy):
     expansion.validate(taxonomy)
     expansion.to_file(path)
 
-    logger.info("[-] Normalized %d expansion rules in %s\n" % (len(expansion), path))
+    print("[-] Normalized %d expansion rules in %s" % (len(expansion), path))
+    logger.info("[-] Normalized %d expansion rules in %s" % (len(expansion), path))
 
 
 def validate_files():
@@ -74,3 +77,7 @@ def validate_files():
     taxonomy = validate_taxonomy(args.tax)
     validate_tagging(args.tag, taxonomy)
     validate_expansion(args.exp, taxonomy)
+
+
+if __name__ == "__main__":
+    validate_files()
diff --git a/avclass/labeler.py b/avclass/labeler.py
index 481b984..d3885be 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -479,13 +479,13 @@ def alias_detection(self, out_prefix: AnyStr, path_export: bool = False):
                 y = self.av_labels.taxonomy.get_path(y)
             self.output["alias"].append(
                 {
-                    "tag1_label": x,
-                    "tag2_label": y,
-                    "tag1": xn,
-                    "tag2": yn,
-                    "tag1^tag2": c,
-                    "tag1^tag2/tag1": f,
-                    "tag1^tag2/tag2": finv,
+                    "alias_token": x,
+                    "alias_tag": y,
+                    "count_token": xn,
+                    "count_tag": yn,
+                    "ratio": c,
+                    "ratio_token": f,
+                    "raiio_tag": finv,
                 }
             )
             if out_prefix:
diff --git a/avclass/update.py b/avclass/update.py
index 4e9d5ea..5adaf54 100644
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -433,7 +433,7 @@ def output(prefix: Optional[AnyStr] = None):
         logger.info('[-] Output %d expansion rules to %s' % (len(expansion), exp_filepath))
 
 
-if __name__ == '__main__':
+def main():
     parser = argparse.ArgumentParser(description='Given a .alias file from the labeler, generates updates for the '
                                                  'taxonomy, tagging, and expansion files.')
 
@@ -508,3 +508,6 @@ def output(prefix: Optional[AnyStr] = None):
         update.output(out_prefix)
 
     update.output_relations(out_prefix + ".final.rules")
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
index fa4ba97..10f35fc 100644
--- a/setup.py
+++ b/setup.py
@@ -21,5 +21,6 @@
         'console_scripts': [
             'avclass = avclass.labeler:main',
             'avclass-validate = avclass.cli:validate_files',
+            'avclass-update = avclass.update:main',
         ],
     })

From fb6d28dbab9573524e5c6791a04268bb964979b0 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 18:02:30 -0500
Subject: [PATCH 28/36] Readme updates

---
 .gitignore        |   1 +
 README.md         |  93 +++++++----------
 avclass/README.md | 261 ----------------------------------------------
 3 files changed, 40 insertions(+), 315 deletions(-)
 delete mode 100644 avclass/README.md

diff --git a/.gitignore b/.gitignore
index bd84ced..9558345 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,3 +147,4 @@ cython_debug/
 *.avtags
 *.stats
 *.alias
+*.labels
diff --git a/README.md b/README.md
index 3b15139..075e64d 100644
--- a/README.md
+++ b/README.md
@@ -121,62 +121,47 @@ result = av_class.run(
 )
 print(json.dumps(result))
 ```
-the output on stdout is:
+## Labeling: Ground Truth Evaluation
+
+If you have family ground truth for some malware samples, i.e., 
+you know the true family for those samples, you can evaluate the accuracy 
+of the family tags output by AVClass2 on those samples with respect to that ground truth. 
+The evaluation metrics used are precision, recall, and F1 measure. 
+See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition.
+Note that the ground truth evaluation does not apply to non-family tags, 
+i.e., it only evaluates the output of the compatibility mode.
+
+```shell
+$ avclass -i ./examples/malheurReference_lb.json -t lb -gt ./examples/malheurReference_gt.tsv > malheurReference.labels
+```
+
+The output includes these lines:
+
+```
+Calculating precision and recall
+3131 out of 3131
+Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
+```
+
+Each line in the *./examples/malheurReference_gt.tsv* file has two **tab-separated** columns:
 
-```yaml
-{
-  "labels": [
-    {
-      "hash": "aca2d12934935b070df8f50e06a20539",
-      "av_count": 33,
-      "tags": [
-        {
-          "tag": "grayware",
-          "count": 9,
-          "category": "CLASS",
-          "path": "CLASS:grayware"
-        },
-        {
-          "tag": "adware",
-          "count": 9,
-          "category": "CLASS",
-          "path": "CLASS:grayware:adware"
-        },
-        {
-          "tag": "windows",
-          "count": 8,
-          "category": "FILE",
-          "path": "FILE:os:windows"
-        },
-        {
-          "tag": "adrotator",
-          "count": 8,
-          "category": "FAM",
-          "path": "FAM:adrotator"
-        },
-        {
-          "tag": "execdownload",
-          "count": 3,
-          "category": "BEH",
-          "path": "BEH:execdownload"
-        },
-        {
-          "tag": "downloader",
-          "count": 3,
-          "category": "CLASS",
-          "path": "CLASS:downloader"
-        },
-        {
-          "tag": "zlob",
-          "count": 2,
-          "category": "FAM",
-          "path": "FAM:zlob"
-        }
-      ]
-    }
-  ]
-}
 ```
+aca2d12934935b070df8f50e06a20539 ADROTATOR
+```
+
+which indicates that sample aca2d12934935b070df8f50e06a20539 is known 
+to be of the *ADROTATOR* family. 
+Each sample in the input file should also appear in the ground truth file. 
+Note that the particular label assigned to each family does not matter. 
+What matters is that all samples in the same family are assigned 
+the same family name (i.e., the same string in the second column)
+
+The ground truth can be obtained from publicly available malware datasets. 
+The one in *./examples/malheurReference_gt.tsv* comes from the 
+[Malheur](http://www.mlsec.org/malheur/) dataset. 
+There are other public datasets with ground truth such as 
+[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
+[Malicia](http://malicia-project.com/dataset.html).
 
 ## Update Module
 
diff --git a/avclass/README.md b/avclass/README.md
deleted file mode 100644
index 83dfaad..0000000
--- a/avclass/README.md
+++ /dev/null
@@ -1,261 +0,0 @@
-# AVClass2
-
-AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). 
-
-You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports)
-and it outputs tags observed in the AV labels, ranked by decreasing popularity. 
-
-The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper.
-
-> Silvia Sebastián, Juan Caballero. 
-AVClass2: Massive Malware Tag Extraction from AV Labels. 
-In proceedings of the Annual Computer Security Applications Conference, December 2020.
-
-In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module.
-
-
-## Labeling
-
-The labeler takes as input a JSON file with the AV labels of malware samples 
-(-vt or -lb options), 
-a file with the taxonomy (-tax option), 
-a file with tagging rules (-tag option), and
-a file with expansion rules (-exp option). 
-It outputs a set of ranked tags. 
-If you do not provide taxonomy, expansion or tagging files, 
-the default ones in the data folder are used.
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json
-```
-
-The above command labels the samples whose AV labels are in 
-the ../examples/malheurReference_lb.json file. 
-It prints the results to stdout. 
-The output looks like this: 
-
-```
-aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2
-67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2
-```
-
-which means sample *aca2d12934935b070df8f50e06a20539* 
-was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, 
-8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, 
-3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family.
-Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them 
-consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
-
-The -p option outputs the full path of each tag in the taxonomy: 
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p
-```
-
-The above command line outputs:
-
-```
-aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2
-67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2
-```
-
-where each tag has been replaced by its taxonomy path, which starts with the category in capitals, 
-followed by the path in the category (if any), and the tag itself, all separated by colons. 
-For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, 
-*CLASS:grayware* that *grayware* is a malware class, and 
-*CLASS:grayware:adware* that *adware* is a subclass of *grayware*.
-
-**Compatibility mode**
-
-The compatibility -c option makes AVClass2 output the same format as AVClass. 
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c
-```
-
-outputs:
-
-```
-bb23e1d296cf01bbaf32ed3938f9b0b8 allaple
-cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349
-```
-
-As in AVClass, the output contains only the family name, 
-which corresponds to the highest ranked family tag, all other tags are ignored.
-Samples for which a family cannot be obtained are labeled as singletons with their hash.
- 
-It is important to note that AVClass2 compatibility mode results can differ from AVClass results
-on the same input file.
-The differences in family names are due to differences between the generics and aliases files 
-used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. 
-In the future, we may change AVClass to use the taxonomy and rules from AVClass2 
-as input (instead of the generics and aliases files) 
-to minimize such differences and avoid maintaining different data files.
-
-
-## Input JSON format
-
-AVClass2 supports three input JSON formats:
-
-1. VirusTotal v2 API JSON reports (*-vt file*), 
-where each line in the input *file* should be the full JSON of a 
-VirusTotal v2 API response to the */file/report* endpoint,
-e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash}
-There is an example VirusTotal v2 input file in examples/vtv2_sample.json
-
-2. VirusTotal v3 API JSON reports (*-vt file -vt3*), 
-where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, 
-e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash}
-There is an example VirusTotal v3 input file in examples/vtv3_sample.json
-
-3. Simplified JSON (*-lb file*),
-where each line in *file* should be a JSON 
-with (at least) these fields:
-{md5, sha1, sha256, av_labels}. 
-There is an example of such input file in *examples/malheurReference_lb.json*
-
-
-**Multiple input files**
-
-AVClass2 can handle multiple input files putting the results in the same output files 
-(if you want results in separate files, process each input file separately).
-
-It is possible to provide the -vt and -lb input options multiple times.
-
-```shell
-$./avclass2_labeler.py -vt <file1> -vt <file2>
-```
-```shell
-$./avclass2_labeler.py -lb <file1> -lb <file2>
-```
-
-There are also -vtdir and -lbdir options that can be used to provide 
-an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports:
-
-```shell
-$./avclass2_labeler.py -vtdir <directory>
-```
-
-It is also possible to combine -vt with -vtdir and -lb with -lbdir, 
-but you cannot combine input files of different format. Thus, this command works:
-
-```shell
-$./avclass2_labeler.py -vt <file> -vtdir <directory>
-```
-
-But, this one throws an error:
-
-```shell
-$./avclass2_labeler.py -vt <file1> -lb <file2>
-```
-
-At this point you have read the most important information on how to use AVClass2. 
-The following sections describe steps that most users will not need.
-
-## Labeling: Ground Truth Evaluation
-
-If you have family ground truth for some malware samples, i.e., 
-you know the true family for those samples, you can evaluate the accuracy 
-of the family tags output by AVClass2 on those samples with respect to that ground truth. 
-The evaluation metrics used are precision, recall, and F1 measure. 
-See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition.
-Note that the ground truth evaluation does not apply to non-family tags, 
-i.e., it only evaluates the output of the compatibility mode.
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels
-```
-
-The output includes these lines:
-
-```
-Calculating precision and recall
-3131 out of 3131
-Precision: 90.81  Recall: 94.05 F1-Measure: 92.40
-```
-
-Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns:
-
-```
-aca2d12934935b070df8f50e06a20539 ADROTATOR
-```
-
-which indicates that sample aca2d12934935b070df8f50e06a20539 is known 
-to be of the *ADROTATOR* family. 
-Each sample in the input file should also appear in the ground truth file. 
-Note that the particular label assigned to each family does not matter. 
-What matters is that all samples in the same family are assigned 
-the same family name (i.e., the same string in the second column)
-
-The ground truth can be obtained from publicly available malware datasets. 
-The one in *../examples/malheurReference_gt.tsv* comes from the 
-[Malheur](http://www.mlsec.org/malheur/) dataset. 
-There are other public datasets with ground truth such as 
-[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or 
-[Malicia](http://malicia-project.com/dataset.html).
-
-## Update Module
-
-The update module can be used to suggest additions and changes to the input 
-taxonomy, tagging rules, and expansion rules. 
-Using the update module comprises of two steps.
-The first step is obtaining an alias file from the labeler:
-
-```shell
-$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect
-```
-
-The above command will create a file named \<file\>.alias, 
-malheurReference_lb.alias in our example. This file has 7 columns:
-
-1. t1: token that is an alias
-2. t2: tag for which t1 is an alias
-3. |t1|: number of input samples where t1 was observed
-4. |t2|: number of input samples where t2 was observed
-5. |t1^t2|: number of input samples where both t1 and t2 were observed
-6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed.
-7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed.
-
-
-The Update Module takes the above file as input with the -alias option, 
-as well as the default taxonomy, tagging, and expansion files in the data directory. 
-It outputs updated taxonomy, tagging, and expansion files that include the 
-suggested additions and changes. 
-
-```shell
-$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix
-```
-
-This will produce three files: 
-output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. 
-You can diff the output and input files to analyze the proposed changes.
-
-You can also modify the input taxonomy, tagging, and expansion rules in place, 
-rather than producing new files:
-
-
-```shell
-$./avclass2_update_module.py -alias malheurReference_lb.alias -update
-```
-
-
-## Customizing AVClass2
-
-AVClass2 is fully customizable: 
-Tagging, Expansion and Taxonomy files can be easily modified by the analyst 
-either manually or by running the update module. 
-
-If you change those files manually, we recommend running 
-afterwards the input checker script to keep them tidy. 
-It sorts the tags in the taxonomy and performs some basic cleaning like 
-removing redundant entries:
-
-```shell
-$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file
-```
-
-If the modifications are in the default files in the data directory you can simply run: 
-
-```shell
-$./avclass2_input_checker.py 
-```

From 2e87fb696990cc38f17ee3812eefe44aeec657a7 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Mon, 8 Feb 2021 18:05:56 -0500
Subject: [PATCH 29/36] Readme formatting

---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 075e64d..684fb7a 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,18 @@
-# Install
-```shell
-$ git clone http://.../avclass
-$ cd avclass
-$ pip3 install .
-```
-
 # AVClass
-
 AVClass is a Python package / command line tool to tag / label malware samples. 
 You input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) 
 and it outputs tags extracted from the AV labels of each sample. 
 AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*).  It can also output a ranking of all alternative family names it found for each sample.
 There is also a compatibility mode `-c` (AVClass 1.x) that will only output the family names (i.e., family tags). 
 
+## Install
+```shell
+$ git clone http://.../avclass
+$ cd avclass
+$ pip3 install .
+```
 
+## Examples
 A quick example helps illustrating the differences of 1.x compatibility mode. If you run AVClass on our example input file:
 
 ```shell

From 343aa5556811a0810171c096671070354af835ce Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Tue, 9 Feb 2021 16:59:00 -0500
Subject: [PATCH 30/36] Moved class variables into __init__ so they are reset

---
 avclass/labeler.py | 58 ++++++++++++++++++++++------------------------
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/avclass/labeler.py b/avclass/labeler.py
index d3885be..b0e362f 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -22,38 +22,36 @@
 
 
 class AVClassLabeler:
-    output = {"labels": []}
-    av_labels = None
-    hash_type = None
-    ground_truth = None
-    get_sample_info = None
-    console = False
-    av_tags = False
-    stats_export = False
-    family_only = False
-    pup_classify = False
-    path_export = False
-    vt_tags = False
-    vt_all = 0
-    first_token_dict = {}
-    token_count_map = {}
-    pair_count_map = {}
-    avtags_dict = {}
-    gt_dict = {}
-    stats = {
-        "samples": 0,
-        "noscans": 0,
-        "tagged": 0,
-        "maltagged": 0,
-        "FAM": 0,
-        "CLASS": 0,
-        "BEH": 0,
-        "FILE": 0,
-        "UNK": 0,
-    }
-
     def __init__(self, av_labels: AvLabels = AvLabels()):
         self.av_labels = av_labels
+        self.output = {"labels": []}
+        self.hash_type = None
+        self.ground_truth = None
+        self.get_sample_info = None
+        self.console = False
+        self.av_tags = False
+        self.stats_export = False
+        self.family_only = False
+        self.pup_classify = False
+        self.path_export = False
+        self.vt_tags = False
+        self.vt_all = 0
+        self.first_token_dict = {}
+        self.token_count_map = {}
+        self.pair_count_map = {}
+        self.avtags_dict = {}
+        self.gt_dict = {}
+        self.stats = {
+            "samples": 0,
+            "noscans": 0,
+            "tagged": 0,
+            "maltagged": 0,
+            "FAM": 0,
+            "CLASS": 0,
+            "BEH": 0,
+            "FILE": 0,
+            "UNK": 0,
+        }
 
     def run(
         self,

From 2924a66e4358a6dcfe396f14b7bbc943d1820be1 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Wed, 17 Feb 2021 13:03:44 -0500
Subject: [PATCH 31/36] Copied updates from common

---
 avclass/common.py | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index d812523..d03c43a 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -328,7 +328,7 @@ def __init__(self, filepath: Optional[AnyStr]):
 
         :param filepath: The file to read from
         """
-        self._rmap = {}
+        self._src_map = {}
         if filepath:
             self.read_rules(filepath)
 
@@ -338,7 +338,7 @@ def __len__(self):
 
         :return: Number of rules
         """
-        return len(self._rmap)
+        return len(self._src_map)
 
     def add_rule(
         self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False
@@ -360,19 +360,19 @@ def add_rule(
         src_tag = Tag(src)
         if overwrite:
             target_l = [Tag(dst).name for dst in dst_l]
-            self._rmap[src_tag.name] = set(target_l)
+            self._src_map[src_tag.name] = set(target_l)
         else:
-            curr_dst = self._rmap.get(src_tag.name, set())
+            curr_dst = self._src_map.get(src_tag.name, set())
             for dst in dst_l:
                 dst_tag = Tag(dst)
                 curr_dst.add(dst_tag.name)
-            self._rmap[src_tag.name] = curr_dst
+            self._src_map[src_tag.name] = curr_dst
 
     def remove_rule(self, src: AnyStr) -> bool:
-        dst = self._rmap.get(src, [])
+        dst = self._src_map.get(src, [])
         if dst:
             logger.debug("[Rules] Removing rule: %s -> %s" % (src, dst))
-            del self._rmap[src]
+            del self._src_map[src]
             return True
         return False
 
@@ -383,7 +383,7 @@ def get_dst(self, src: AnyStr) -> List[AnyStr]:
         :param src: The source rule
         :return: List of dst
         """
-        return list(self._rmap.get(src, []))
+        return list(self._src_map.get(src, []))
 
     def read_rules(self, filepath: AnyStr):
         """
@@ -409,7 +409,7 @@ def to_file(self, filepath: AnyStr, taxonomy: Taxonomy = None):
         :return: None
         """
         with open(filepath, "w") as fd:
-            for src, dst_set in sorted(self._rmap.items()):
+            for src, dst_set in sorted(self._src_map.items()):
                 dst_l = sorted(dst_set)
                 if taxonomy:
                     src_path = taxonomy.get_path(src)
@@ -428,11 +428,11 @@ def expand_src_destinations(self, src: AnyStr) -> Set[AnyStr]:
         :return: List of expanded destinations
         """
         # TODO - this only goes one layer deep it seems.  Not actually recursive
-        dst_set = self._rmap.get(src, set())
+        dst_set = self._src_map.get(src, set())
         out = set()
         while dst_set:
             dst = dst_set.pop()
-            dst_l = self._rmap.get(dst, [])
+            dst_l = self._src_map.get(dst, [])
             if dst_l:
                 for d in dst_l:
                     if d not in out and d != dst:
@@ -447,10 +447,10 @@ def expand_all_destinations(self):
 
         :return: None
         """
-        src_l = self._rmap.keys()
+        src_l = self._src_map.keys()
         for src in src_l:
             dst_l = self.expand_src_destinations(src)
-            self._rmap[src] = dst_l
+            self._src_map[src] = dst_l
 
 
 class Translation(Rules):
@@ -468,7 +468,9 @@ def validate(self, taxonomy: Taxonomy):
         :param taxonomy: The Taxonomy to use for checking
         :return: None
         """
-        for tok, tag_l in self._rmap.items():
+        for tok, tag_l in self._src_map.items():
+            if taxonomy.is_tag(tok):
+                sys.stdout.write("[Tagging] SRC %s in taxonomy\n" % tok)
             for t in tag_l:
                 if not taxonomy.is_tag(t):
                     sys.stdout.write("[Tagging] %s not in taxonomy\n" % t)
@@ -490,7 +492,7 @@ def validate(self, taxonomy: Taxonomy):
         :param taxonomy: The Taxonomy to use for checking
         :return: None
         """
-        for src, dst_set in self._rmap.items():
+        for src, dst_set in self._src_map.items():
             if not taxonomy.is_tag(src):
                 sys.stdout.write("[Expansion] %s not in taxonomy\n" % src)
                 # TODO - raise or return False?

From b2ce1afe7cee35145aeafb09d374a6d605fb9d1f Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Thu, 18 Feb 2021 10:34:50 -0500
Subject: [PATCH 32/36] platonK fix for parsing of VT file reports from VT file
 feed APIv3

---
 avclass/common.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index d03c43a..7e0eea4 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -606,11 +606,13 @@ def get_sample_info_vt_v3(record: Dict) -> SampleInfo:
         :param record: The JSON record
         :return: An instance of SampleInfo
         """
+        if 'data' in record:
+            record = record['data']
         try:
-            scans = record["data"]["attributes"]["last_analysis_results"]
-            md5 = record["data"]["attributes"]["md5"]
-            sha1 = record["data"]["attributes"]["sha1"]
-            sha256 = record["data"]["attributes"]["sha256"]
+            scans = record["attributes"]["last_analysis_results"]
+            md5 = record["attributes"]["md5"]
+            sha1 = record["attributes"]["sha1"]
+            sha256 = record["attributes"]["sha256"]
         except KeyError:
             return None
 
@@ -624,7 +626,7 @@ def get_sample_info_vt_v3(record: Dict) -> SampleInfo:
                 ).strip()
                 label_pairs.append((av, clean_label))
 
-        vt_tags = record["data"]["attributes"].get("tags", [])
+        vt_tags = record["attributes"].get("tags", [])
 
         return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)
 

From b113a69ce56f0edb19935dbe6ecad8c4823215d6 Mon Sep 17 00:00:00 2001
From: ElJeffe <jeffgemail@gmail.com>
Date: Tue, 9 Mar 2021 10:23:04 -0500
Subject: [PATCH 33/36] MISP

---
 avclass/common.py                             |    9 +-
 avclass/data/default.taxonomy                 |   44 +
 .../cluster/avclass.json                      | 1288 ++++++++++++++++-
 .../avclass2.json => misp/galaxy/avclass.json |    2 +-
 4 files changed, 1324 insertions(+), 19 deletions(-)
 rename avclass/data/misp/cluster/avclass2.json => misp/cluster/avclass.json (85%)
 rename avclass/data/misp/galaxy/avclass2.json => misp/galaxy/avclass.json (87%)

diff --git a/avclass/common.py b/avclass/common.py
index 7e0eea4..946f7ad 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -32,6 +32,11 @@
     "GData",
     "Avast",
     "Sophos",
+    "BitDefenderTheta",
+    "Alibaba",
+    "Tencent",
+    "Cyren",
+    "Arcabit",
     "TrendMicro-HouseCall",
     "TrendMicro",
     "NANO-Antivirus",
@@ -606,8 +611,8 @@ def get_sample_info_vt_v3(record: Dict) -> SampleInfo:
         :param record: The JSON record
         :return: An instance of SampleInfo
         """
-        if 'data' in record:
-            record = record['data']
+        if "data" in record:
+            record = record["data"]
         try:
             scans = record["attributes"]["last_analysis_results"]
             md5 = record["attributes"]["md5"]
diff --git a/avclass/data/default.taxonomy b/avclass/data/default.taxonomy
index 68a416a..963b8da 100644
--- a/avclass/data/default.taxonomy
+++ b/avclass/data/default.taxonomy
@@ -15,6 +15,7 @@ BEH:filemodify
 BEH:files
 BEH:hostsmodify
 BEH:infosteal
+BEH:infosteal:coinstealer
 BEH:infosteal:gamethief
 BEH:inject
 BEH:irc
@@ -98,9 +99,11 @@ FAM:adrotator
 FAM:adrotoob
 FAM:adultbrowser
 FAM:adviator
+FAM:adwind
 FAM:adwk
 FAM:adwo
 FAM:aesads
+FAM:agenttesla
 FAM:agobot
 FAM:agvd
 FAM:ahmyth
@@ -136,7 +139,9 @@ FAM:asacub
 FAM:asprox
 FAM:autoins
 FAM:autosus
+FAM:avemaria
 FAM:axespy
+FAM:azorult
 FAM:badda
 FAM:badnews
 FAM:badpac
@@ -150,6 +155,7 @@ FAM:basbanke
 FAM:basebridge
 FAM:basepay
 FAM:bauts
+FAM:bazar
 FAM:bebeg
 FAM:becou
 FAM:beebone
@@ -198,6 +204,7 @@ FAM:cardserv
 FAM:cashon
 FAM:cellshark
 FAM:centim
+FAM:cerberus
 FAM:cerekv
 FAM:cheica
 FAM:chir
@@ -207,15 +214,18 @@ FAM:cleaman
 FAM:clevernet
 FAM:clientor
 FAM:clinator
+FAM:clipbanker
 FAM:cmccwm
 FAM:cnbtech
 FAM:cnzz
+FAM:cobaltstrike
 FAM:coinhive
 FAM:coldfusion
 FAM:commplat
 FAM:conduit
 FAM:conficker
 FAM:contactscollector
+FAM:conti
 FAM:cooee
 FAM:coogos
 FAM:coolmirage
@@ -230,6 +240,7 @@ FAM:crusewind
 FAM:cryptodef
 FAM:cryptolocker
 FAM:cryptowall
+FAM:crysis
 FAM:crytex
 FAM:cryxos
 FAM:ctchm
@@ -246,6 +257,7 @@ FAM:dasu
 FAM:datacollector
 FAM:daws
 FAM:dbtes
+FAM:deathransom
 FAM:deblio
 FAM:defmid
 FAM:delbar
@@ -268,6 +280,7 @@ FAM:dorifel
 FAM:dorkbot
 FAM:dougalek
 FAM:dowgin
+FAM:downeks
 FAM:downloadadmin
 FAM:downloadassistant
 FAM:downloadguide
@@ -289,9 +302,11 @@ FAM:dusvext
 FAM:dynamer
 FAM:easyroot
 FAM:egame
+FAM:egregor
 FAM:egroupdial
 FAM:ejik
 FAM:elite
+FAM:emotet
 FAM:emudbot
 FAM:eorezo
 FAM:equationdrug
@@ -384,6 +399,7 @@ FAM:ginamster
 FAM:ginko
 FAM:ginmaster
 FAM:gizmo
+FAM:glupteba
 FAM:gobot
 FAM:golddream
 FAM:goldentouch
@@ -391,7 +407,10 @@ FAM:gomanag
 FAM:gomunc
 FAM:gonesixty
 FAM:goodnews
+FAM:goodor
+FAM:gootkit
 FAM:gorillaprice
+FAM:gozi
 FAM:gpspy
 FAM:grabos
 FAM:graybird
@@ -417,6 +436,7 @@ FAM:hipposms
 FAM:honli
 FAM:hotbar
 FAM:hotclip
+FAM:houdini
 FAM:hoverwatch
 FAM:hqowdo
 FAM:hqwar
@@ -497,6 +517,7 @@ FAM:kyview
 FAM:laconic
 FAM:lardlond
 FAM:laroux
+FAM:lazagne
 FAM:ldpinch
 FAM:leadbolt
 FAM:leapp
@@ -505,6 +526,7 @@ FAM:lemon
 FAM:lethic
 FAM:letv
 FAM:lien
+FAM:limerat
 FAM:linkular
 FAM:lirose
 FAM:lmir
@@ -599,10 +621,12 @@ FAM:myteam
 FAM:mytrackp
 FAM:mywebsearch
 FAM:nandrobox
+FAM:nanocore
 FAM:navbar
 FAM:nawiaiad
 FAM:necro
 FAM:necurs
+FAM:neoreklami
 FAM:neospy
 FAM:neshta
 FAM:netbox
@@ -630,6 +654,7 @@ FAM:opencandy
 FAM:openinstall
 FAM:opfake
 FAM:optix
+FAM:orcusrat
 FAM:outbrowse
 FAM:oveead
 FAM:paccy
@@ -644,6 +669,7 @@ FAM:penguin
 FAM:perflogger
 FAM:perkel
 FAM:petrolin
+FAM:petya
 FAM:phonespy
 FAM:picsys
 FAM:piom
@@ -674,6 +700,7 @@ FAM:pushad
 FAM:pushe
 FAM:puxis
 FAM:pykspa
+FAM:quasar
 FAM:qbot
 FAM:qexma
 FAM:qplus
@@ -681,11 +708,13 @@ FAM:qqrob
 FAM:qumi
 FAM:quozha
 FAM:qushu
+FAM:raccoon
 FAM:raden
 FAM:ramnit
 FAM:ranky
 FAM:rasteal
 FAM:razam
+FAM:razy
 FAM:rbot
 FAM:rebhip
 FAM:recmads
@@ -697,12 +726,14 @@ FAM:reflod
 FAM:refog
 FAM:regon
 FAM:relevantknowledge
+FAM:remcos
 FAM:renocide
 FAM:renos
 FAM:reporo
 FAM:reptilicus
 FAM:resharer
 FAM:reveton
+FAM:revetrat
 FAM:revmob
 FAM:riltok
 FAM:rimod
@@ -715,11 +746,13 @@ FAM:rootmaster
 FAM:rootnik
 FAM:rootsmart
 FAM:rotexy
+FAM:rozena
 FAM:rufraud
 FAM:rukometa
 FAM:rungbu
 FAM:ruskill
 FAM:rusms
+FAM:ryuk
 FAM:sacti
 FAM:sacto
 FAM:sadenav
@@ -727,10 +760,12 @@ FAM:sadpor
 FAM:sahat
 FAM:sakezon
 FAM:sality
+FAM:samsam
 FAM:sanctionedmedia
 FAM:sandr
 FAM:savemy
 FAM:scam
+FAM:schwarzesonne
 FAM:sckeylog
 FAM:sdbot
 FAM:seaweth
@@ -741,6 +776,7 @@ FAM:severs
 FAM:sfone
 FAM:shastrosms
 FAM:shedun
+FAM:shelma
 FAM:sheridroid
 FAM:shixot
 FAM:shiz
@@ -808,6 +844,7 @@ FAM:spyapp
 FAM:spybubble
 FAM:spydealer
 FAM:spyeye
+FAM:spygate
 FAM:spyhasb
 FAM:spynote
 FAM:spyoo
@@ -826,6 +863,7 @@ FAM:suaban
 FAM:suggestor
 FAM:supking
 FAM:svpeng
+FAM:swrort
 FAM:swisyn
 FAM:swizzor
 FAM:systemmonitor
@@ -872,6 +910,7 @@ FAM:tracer
 FAM:tracker
 FAM:trackerfree
 FAM:trackplus
+FAM:trickbot
 FAM:trclick
 FAM:tridrongo
 FAM:troom
@@ -895,6 +934,7 @@ FAM:usteal
 FAM:utchi
 FAM:uupay
 FAM:uuserv
+FAM:valyria
 FAM:vapsup
 FAM:vdloader
 FAM:verti
@@ -907,6 +947,7 @@ FAM:virusdoctor
 FAM:virut
 FAM:viser
 FAM:vittalia
+FAM:vjworm
 FAM:vkemag
 FAM:vktihs
 FAM:vmvol
@@ -1022,6 +1063,7 @@ FILE:packed:aspack
 FILE:packed:asprotect
 FILE:packed:beroexepacker
 FILE:packed:bobsoft
+FILE:packed:confuser
 FILE:packed:decrypter
 FILE:packed:encryptpe
 FILE:packed:enigma
@@ -1033,6 +1075,7 @@ FILE:packed:krunchy
 FILE:packed:maskpe
 FILE:packed:molebox
 FILE:packed:morphine
+FILE:packed:multipacked
 FILE:packed:nakedpack
 FILE:packed:niceprotect
 FILE:packed:npack
@@ -1057,6 +1100,7 @@ FILE:packed:upack
 FILE:packed:upx
 FILE:packed:vmprotect
 FILE:packed:yoda
+FILE:proglang:autohk
 FILE:proglang:autoit
 FILE:proglang:delphi
 FILE:proglang:java
diff --git a/avclass/data/misp/cluster/avclass2.json b/misp/cluster/avclass.json
similarity index 85%
rename from avclass/data/misp/cluster/avclass2.json
rename to misp/cluster/avclass.json
index a8a7d33..933e133 100644
--- a/avclass/data/misp/cluster/avclass2.json
+++ b/misp/cluster/avclass.json
@@ -134,6 +134,19 @@
       "uuid": "e56915a8-a345-316a-9f69-d9e62a68c753", 
       "value": "spyeye"
     }, 
+    {
+      "description": "FAM:spygate",
+      "meta": {
+        "refs": [
+          "https://www.fortiguard.com/encyclopedia/virus/8225407",
+          "https://www.rekings.com/spygate-rat-3-2/",
+          "https://www.symantec.com/security_response/attacksignatures/detail.jsp%3Fasid%3D27950",
+          "http://spygate-rat.blogspot.lu/"
+        ]
+      },
+      "uuid": "793d27f3-f060-49f2-b572-8bc6fcdbbdef",
+      "value": "spygate"
+    },
     {
       "description": "FAM:spyhasb", 
       "meta": {
@@ -341,6 +354,19 @@
       "uuid": "b94f39e6-7997-373b-8d67-ae62d889e110", 
       "value": "svpeng"
     }, 
+    {
+      "description": "FAM:swrort",
+      "meta": {
+        "refs": [
+          "https://blog.malwarebytes.com/detections/trojan-swrort/",
+          "https://malpedia.caad.fkie.fraunhofer.de/details/ps1.swrort"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "048b948f-5e4d-4e6f-a0b5-54157cf03c86",
+      "value": "swrort"
+    },
     {
       "description": "FAM:swisyn", 
       "meta": {
@@ -871,6 +897,69 @@
       "uuid": "42603f75-a6b9-3091-bf23-c2fb545fad56", 
       "value": "trclick"
     }, 
+    {
+      "description": "FAM:trickbot",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.trickbot",
+          "https://www.cybereason.com/blog/triple-threat-emotet-deploys-trickbot-to-steal-data-spread-ryuk-ransomware",
+          "https://blog.malwarebytes.com/threat-analysis/2017/08/trickbot-comes-with-new-tricks-attacking-outlook-and-browsing-data/",
+          "http://www.vkremez.com/2017/11/lets-learn-trickbot-socks5-backconnect.html",
+          "https://blog.trendmicro.com/trendlabs-security-intelligence/trickbot-adds-remote-application-credential-grabbing-capabilities-to-its-repertoire/",
+          "http://www.vkremez.com/2017/12/lets-learn-introducing-new-trickbot.html",
+          "https://blog.trendmicro.com/trendlabs-security-intelligence/trickbot-shows-off-new-trick-password-grabber-module",
+          "https://www.fidelissecurity.com/threatgeek/2016/10/trickbot-we-missed-you-dyre",
+          "https://www.flashpoint-intel.com/blog/trickbot-account-checking-hybrid-attack-model/",
+          "http://www.peppermalware.com/2019/03/quick-analysis-of-trickbot-sample-with.html",
+          "https://blog.malwarebytes.com/threat-analysis/2016/10/trick-bot-dyrezas-successor/",
+          "https://www.youtube.com/watch?v=KMcSAlS9zGE",
+          "https://www.crowdstrike.com/blog/sin-ful-spiders-wizard-spider-and-lunar-spider-sharing-the-same-web/",
+          "https://www.arbornetworks.com/blog/asert/trickbot-banker-insights/",
+          "https://blog.malwarebytes.com/threat-analysis/malware-threat-analysis/2018/11/whats-new-trickbot-deobfuscating-elements/",
+          "https://www.trustwave.com/Resources/SpiderLabs-Blog/Tale-of-the-Two-Payloads-%E2%80%93-TrickBot-and-Nitol/",
+          "http://www.vkremez.com/2018/04/lets-learn-trickbot-implements-network.html",
+          "https://securityintelligence.com/trickbot-takes-to-latin-america-continues-to-expand-its-global-reach/",
+          "https://qmemcpy.io/post/reverse-engineering-malware-trickbot-part-2-loader",
+          "https://www.fireeye.com/blog/threat-research/2019/01/a-nasty-trick-from-credential-theft-malware-to-business-disruption.html",
+          "https://securityintelligence.com/trickbots-cryptocurrency-hunger-tricking-the-bitcoin-out-of-wallets/",
+          "https://blog.fraudwatchinternational.com/malware/trickbot-malware-works",
+          "https://www.blueliv.com/research/trickbot-banking-trojan-using-eflags-as-an-anti-hook-technique/",
+          "https://f5.com/labs/articles/threat-intelligence/malware/trickbot-expands-global-targets-beyond-banks-and-payment-processors-to-crms",
+          "https://f5.com/labs/articles/threat-intelligence/malware/little-trickbot-growing-up-new-campaign-24412",
+          "https://github.com/JR0driguezB/malware_configs/tree/master/TrickBot",
+          "https://escinsecurity.blogspot.de/2018/01/weekly-trickbot-analysis-end-of-wc-22.html",
+          "https://www.webroot.com/blog/2018/03/21/trickbot-banking-trojan-adapts-new-module/",
+          "https://www.fortinet.com/blog/threat-research/deep-analysis-of-trickbot-new-module-pwgrab.html",
+          "https://www.securityartwork.es/wp-content/uploads/2017/06/Informe_Evoluci%C3%B3n_Trickbot.pdf",
+          "https://blogs.forcepoint.com/security-labs/trickbot-spread-necurs-botnet-adds-nordic-countries-its-targets",
+          "http://blog.fortinet.com/2016/12/06/deep-analysis-of-the-online-banking-botnet-trickbot",
+          "https://www.cyberbit.com/blog/endpoint-security/latest-trickbot-variant-has-new-tricks-up-its-sleeve/",
+          "http://www.malware-traffic-analysis.net/2018/02/01/",
+          "https://www.cert.pl/en/news/single/detricking-trickbot-loader/",
+          "https://www.trendmicro.com/vinfo/us/security/news/cybercrime-and-digital-threats/evolving-trickbot-adds-detection-evasion-and-screen-locking-features",
+          "https://securityintelligence.com/tricks-of-the-trade-a-deeper-look-into-trickbots-machinations/",
+          "http://www.pwc.co.uk/issues/cyber-security-data-privacy/research/trickbots-bag-of-tricks.html",
+          "https://qmemcpy.io/post/reverse-engineering-malware-trickbot-part-3-core",
+          "https://www.ringzerolabs.com/2017/07/trickbot-banking-trojan-doc00039217doc.html",
+          "https://www.youtube.com/watch?v=EdchPEHnohw",
+          "https://sysopfb.github.io/malware/2018/04/16/trickbot-uacme.html",
+          "https://blog.talosintelligence.com/2018/07/smoking-guns-smoke-loader-learned-new.html",
+          "https://www.vkremez.com/2018/11/lets-learn-introducing-latest-trickbot.html",
+          "https://www.youtube.com/watch?v=lTywPmZEU1A",
+          "https://qmemcpy.github.io/post/reverse-engineering-malware-trickbot-part-1-packer",
+          "https://www.botconf.eu/wp-content/uploads/2016/11/2016-LT09-TrickBot-Adams.pdf",
+          "https://www.flashpoint-intel.com/blog/new-version-trickbot-adds-worm-propagation-module/"
+        ],
+        "synonyms": [
+          "thetrick",
+          "trickLoader",
+          "trickster"
+        ],
+        "type": []
+      },
+      "uuid": "b104ec95-e1bd-44c1-a193-d979bedc0a98",
+      "value": "trickbot"
+    },
     {
       "description": "FAM:tridrongo", 
       "meta": {
@@ -1130,6 +1219,27 @@
       "uuid": "ceea3c58-4d4a-34c5-9bf7-a7b621a0157b", 
       "value": "uuserv"
     }, 
+    {
+      "description": "FAM:valyria",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/ps1.powerstats",
+          "https://www.clearskysec.com/muddywater-operations-in-lebanon-and-oman/",
+          "https://unit42.paloaltonetworks.com/unit42-muddying-the-water-targeted-attacks-in-the-middle-east/",
+          "https://www.fireeye.com/blog/threat-research/2018/03/iranian-threat-group-updates-ttps-in-spear-phishing-campaign.html",
+          "https://blog.malwarebytes.com/threat-analysis/2017/09/elaborate-scripting-fu-used-in-espionage-attack-against-saudi-arabia-government_entity/",
+          "https://reaqta.com/2017/11/muddywater-apt-targeting-middle-east/",
+          "https://blog.trendmicro.com/trendlabs-security-intelligence/campaign-possibly-connected-muddywater-surfaces-middle-east-central-asia/",
+          "https://www.clearskysec.com/muddywater-targets-kurdish-groups-turkish-orgs/"
+        ],
+        "synonyms": [
+          "powerstats"
+        ],
+        "type": []
+      },
+      "uuid": "9f958d8b-0489-40e5-91b4-aa780fc90393",
+      "value": "valyria"
+    },
     {
       "description": "FAM:vapsup", 
       "meta": {
@@ -1284,6 +1394,21 @@
       "uuid": "e6645d41-384f-3030-a76f-e12c94d6a39d", 
       "value": "vittalia"
     }, 
+    {
+      "description": "FAM:vjworm", 
+      "meta": {
+        "refs": [
+          "https://cofense.com/vjw0rm-malware-heres-watch/",
+          "https://www.trendmicro.com/vinfo/us/threat-encyclopedia/malware/js_vjworm.i"
+        ], 
+        "synonyms": [
+          "vjw0rm"
+        ], 
+        "type": []
+      }, 
+      "uuid": "d2d82d11-f174-4804-abf5-2b81740b5993", 
+      "value": "vjworm"
+    }, 
     {
       "description": "FAM:vkemag", 
       "meta": {
@@ -2587,17 +2712,20 @@
       "meta": {
         "refs": [], 
         "synonyms": [
+          "binder",
           "cryp", 
           "crypt", 
           "crypted", 
           "crypter", 
           "cryptic", 
+          "cryptoobfuscator",
           "cryptor", 
           "encpk", 
           "genpack", 
           "krypt", 
           "kryptik", 
-          "kryptk", 
+          "kryptk",
+          "genkryptik",
           "malcrypt", 
           "malob", 
           "malpack", 
@@ -2611,6 +2739,7 @@
           "pakes", 
           "suspiciouspacker", 
           "susppack", 
+          "vbinder",
           "vbcrypt", 
           "vbkrypt", 
           "vbpack", 
@@ -2629,6 +2758,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "b1c0c358-3a33-37db-974d-d4b8c31e45d3", 
       "value": "armadillo"
     }, 
@@ -2639,6 +2774,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "48eb867b-53ea-344f-8006-7756cdca4be9", 
       "value": "aspack"
     }, 
@@ -2649,6 +2790,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "cd8af59b-808b-3692-8068-2df3fd6a3ab6", 
       "value": "asprotect"
     }, 
@@ -2659,6 +2806,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "acebc64c-4419-3034-aeb7-62657e281101", 
       "value": "beroexepacker"
     }, 
@@ -2669,9 +2822,31 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "32b30e14-264e-3482-aa8f-e741f3f9ea29", 
       "value": "bobsoft"
     }, 
+    {
+      "description": "FILE:packed:confuser", 
+      "meta": {
+        "refs": [], 
+        "synonyms": ["confuserex"], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
+      "uuid": "f0c8b58d-040b-4956-9160-d17b6a6064e9", 
+      "value": "confuser"
+    }, 
     {
       "description": "FILE:packed:decrypter", 
       "meta": {
@@ -2679,6 +2854,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "11c5c00e-d453-39b9-b127-4c01be2227fb", 
       "value": "decrypter"
     }, 
@@ -2689,16 +2870,30 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "b94cd8d8-48cf-38c0-b296-52036803a04f", 
       "value": "encryptpe"
     }, 
     {
       "description": "FILE:packed:enigma", 
       "meta": {
-        "refs": [], 
-        "synonyms": [], 
+        "refs": ["https://enigmaprotector.com/"], 
+        "synonyms": [
+          "enigmaprotector"
+        ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "2f5b61f3-5b5f-3eac-bbe6-32eb728cba6e", 
       "value": "enigma"
     }, 
@@ -2709,6 +2904,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "19aa389c-ff65-36c7-8d79-e109e356866f", 
       "value": "execryptor"
     }, 
@@ -2719,6 +2920,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "01f1aec1-33c8-3cbe-aa9b-5c78b8db18bf", 
       "value": "exestealth"
     }, 
@@ -2729,6 +2936,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "138e2027-887b-3d4a-8baf-f265cda803ec", 
       "value": "expressor"
     }, 
@@ -2739,6 +2952,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "62775d3e-9738-37eb-8521-b6086b278380", 
       "value": "jiagu"
     }, 
@@ -2751,6 +2970,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "0599aec6-6542-3014-b36a-85eb7b1acc50", 
       "value": "krunchy"
     }, 
@@ -2763,6 +2988,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "ac7fb78f-1cfb-3f53-a654-ebe0c02c61c1", 
       "value": "maskpe"
     }, 
@@ -2773,6 +3004,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "10398ebf-c9c9-3620-b718-40934daa60f3", 
       "value": "molebox"
     }, 
@@ -2783,9 +3020,31 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "cae3f868-9601-30b9-9e88-f5bf724c1fd3", 
       "value": "morphine"
     }, 
+    {
+      "description": "FILE:packed:multipacked", 
+      "meta": {
+        "refs": ["https://encyclopedia.kaspersky.com/knowledge/multipacked/"], 
+        "synonyms": [], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
+      "uuid": "59f26518-641f-4c07-9aed-445d60158789", 
+      "value": "multipacked"
+    }, 
     {
       "description": "FILE:packed:nakedpack", 
       "meta": {
@@ -2793,6 +3052,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "030e9fd5-99c8-3c33-9896-d5711b54219a", 
       "value": "nakedpack"
     }, 
@@ -2803,6 +3068,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "9ebe63ac-3a1f-3a64-afed-8adcec5f2fff", 
       "value": "niceprotect"
     }, 
@@ -2813,6 +3084,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "fe292fb8-4e92-3def-8506-ee3278f457af", 
       "value": "npack"
     }, 
@@ -2823,6 +3100,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "3fa37052-f024-3c63-89be-d8c366d6f83e", 
       "value": "nspack"
     }, 
@@ -2833,6 +3116,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "5cea2ead-364f-3ff7-8dbf-9d0c0952c70c", 
       "value": "obsidium"
     }, 
@@ -2843,6 +3132,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "1bf29bd3-1e76-330c-87ee-c186adf9c646", 
       "value": "packman"
     }, 
@@ -2853,6 +3148,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "fe384ea4-109d-35ec-9f00-fea5f69914fd", 
       "value": "pearmor"
     }, 
@@ -2863,6 +3164,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "634815d7-0cd8-37da-b20f-b126a744b825", 
       "value": "pecompact"
     }, 
@@ -2873,6 +3180,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "70e0e627-e1a0-344a-a4cd-825e9de1b86f", 
       "value": "pecrypt"
     }, 
@@ -2883,6 +3196,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "36224c37-5a49-336f-9fa9-42db1db78cff", 
       "value": "pespin"
     }, 
@@ -2895,6 +3214,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "a515ed68-5ab7-3a7a-ba4d-63ff5edba892", 
       "value": "polycrypt"
     }, 
@@ -2905,6 +3230,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "26ee092f-b61c-3774-90ab-632d902a9690", 
       "value": "punisher"
     }, 
@@ -2915,6 +3246,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "d9221f76-bbe9-3936-a6f4-cc2582ed8ae8", 
       "value": "rcryptor"
     }, 
@@ -2925,6 +3262,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "a95dcb8b-fe56-31a7-80df-6764ed4b4759", 
       "value": "rlpack"
     }, 
@@ -2935,6 +3278,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "73ab92ef-dbf4-38d3-8687-e9df33211bd1", 
       "value": "sdprotector"
     }, 
@@ -2945,6 +3294,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "0e88fad9-544f-37fa-b4fd-13c91ea8afdd", 
       "value": "secapk"
     }, 
@@ -2955,6 +3310,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "e10e4847-cad5-32e3-ae8d-3f08d2c4baac", 
       "value": "secneo"
     }, 
@@ -2965,6 +3326,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "8cb96214-6910-3124-89fa-301eccee65ea", 
       "value": "simplepack"
     }, 
@@ -2975,6 +3342,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "6cf84378-cc5e-3e92-8a13-87697f25808f", 
       "value": "telock"
     }, 
@@ -2985,6 +3358,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "b247e2db-6c54-318f-95a4-8d3c0a0b384b", 
       "value": "themida"
     }, 
@@ -2995,6 +3374,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "a550b64f-0f43-3c19-b372-fff40ddf8002", 
       "value": "upack"
     }, 
@@ -3005,6 +3390,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "8a0a984c-44b7-38ed-8a8c-6e55a4c8d888", 
       "value": "upx"
     }, 
@@ -3017,6 +3408,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "b077a697-29f1-3780-9e37-88791a9e70c8", 
       "value": "vmprotect"
     }, 
@@ -3031,9 +3428,27 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "199805da-3a4c-3927-b32c-e6eb4b61f903", 
       "value": "yoda"
     }, 
+    {
+      "description": "FILE:proglang:autohk", 
+      "meta": {
+        "refs": [], 
+        "synonyms": [
+          "autohotkey"
+        ], 
+        "type": []
+      }, 
+      "uuid": "5391799c-5a56-4e63-8438-eba2b331e65d", 
+      "value": "autohk"
+    }, 
     {
       "description": "FILE:proglang:autoit", 
       "meta": {
@@ -3329,6 +3744,23 @@
       "uuid": "46e0b44b-cf35-3472-9941-1bc1ea14943e", 
       "value": "gizmo"
     }, 
+    {
+      "description": "FAM:glupteba",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.glupteba",
+          "http://resources.infosecinstitute.com/tdss4-part-1/",
+          "http://malwarefor.me/2015-04-13-nuclear-ek-glupteba-and-operation-windigo/",
+          "https://www.welivesecurity.com/2014/03/18/operation-windigo-the-vivisection-of-a-large-linux-server-side-credential-stealing-malware-campaign/",
+          "https://www.welivesecurity.com/2011/03/02/tdl4-and-glubteba-piggyback-piggybugs/",
+          "https://www.welivesecurity.com/2018/03/22/glupteba-no-longer-windigo/"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "0409c6ab-133a-448e-a2cf-c2c7f55d4100",
+      "value": "glupteba"
+    },
     {
       "description": "FAM:gobot", 
       "meta": {
@@ -3410,6 +3842,50 @@
       "uuid": "c9431d80-ca53-3176-877b-e283aa3f9f11", 
       "value": "goodnews"
     }, 
+    {
+      "description": "FAM:goodor", 
+      "meta": {
+        "refs": ["https://malpedia.caad.fkie.fraunhofer.de/details/win.goodor"],
+        "synonyms": [
+          "fuerboos"
+        ], 
+        "type": []
+      }, 
+      "uuid": "2115d439-31e9-45bd-ae00-dcb2fa5cde9c", 
+      "value": "goodor"
+    }, 
+    {
+      "description": "FAM:gootkit",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.gootkit",
+          "https://www.lexsi.com/securityhub/homer-simpson-brian-krebs-rencontrent-zeus-gootkit/",
+          "http://blog.cert.societegenerale.com/2015/04/analyzing-gootkits-persistence-mechanism.html",
+          "https://securityintelligence.com/gootkit-developers-dress-it-up-with-web-traffic-proxy/",
+          "https://forums.juniper.net/t5/Security-Now/New-Gootkit-Banking-Trojan-variant-pushes-the-limits-on-evasive/ba-p/319055",
+          "https://www.f5.com/labs/articles/threat-intelligence/tackling-gootkit-s-traps",
+          "https://securelist.com/blog/research/76433/inside-the-gootkit-cc-server/",
+          "https://www.us-cert.gov/ncas/alerts/TA16-336A",
+          "http://www.vkremez.com/2018/04/lets-learn-in-depth-dive-into-gootkit.html",
+          "https://securityintelligence.com/gootkit-bobbing-and-weaving-to-avoid-prying-eyes/",
+          "https://www.youtube.com/watch?v=242Tn0IL2jE",
+          "http://www.kernelmode.info/forum/viewtopic.php?f=16&t=3669",
+          "https://www.s21sec.com/en/blog/2016/05/reverse-engineering-gootkit/",
+          "http://blog.trendmicro.com/trendlabs-security-intelligence/fake-judicial-spam-leads-to-backdoor-with-fake-certificate-authority/",
+          "https://news.drweb.com/show/?i=4338&lng=en",
+          "https://www.youtube.com/watch?v=QgUlPvEE4aw",
+          "https://www.cyphort.com/angler-ek-leads-to-fileless-gootkit/"
+        ],
+        "synonyms": [
+          "xswkit",
+          "talalpek",
+          "waldek"
+        ],
+        "type": []
+      },
+      "uuid": "beb37b7f-d2f9-47c5-a53b-4b5bb7db9cdf",
+      "value": "gootkit"
+    },
     {
       "description": "FAM:gorillaprice", 
       "meta": {
@@ -3422,6 +3898,27 @@
       "uuid": "83db9052-450f-30d9-90f0-e9ccffb3b348", 
       "value": "gorillaprice"
     }, 
+    {
+      "description": "FAM:gozi",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.gozi",
+          "http://blog.malwaremustdie.org/2013/02/the-infection-of-styx-exploit-kit.html",
+          "https://www.secureworks.com/research/gozi",
+          "https://lokalhost.pl/gozi_tree.txt",
+          "https://blog.gdatasoftware.com/2016/11/29325-analysis-ursnif-spying-on-your-data-since-2007",
+          "http://researchcenter.paloaltonetworks.com/2017/02/unit42-banking-trojans-ursnif-global-distribution-networks-identified/"
+        ],
+        "synonyms": [
+          "papras",
+          "snifula",
+          "ursnif"
+        ],
+        "type": []
+      },
+      "uuid": "5fac06c6-010a-4af3-99c9-cb0052057bdf",
+      "value": "gozi"
+    },
     {
       "description": "FAM:gpspy", 
       "meta": {
@@ -3616,6 +4113,21 @@
       "uuid": "f801c366-5b0c-39da-9afb-ca515528bd99", 
       "value": "hiddenapp"
     }, 
+    {
+      "description": "FAM:hiddentear",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.hiddentear",
+          "https://www.tripwire.com/state-of-security/security-data-protection/cyber-security/hidden-tear-project-forbidden-fruit-is-the-sweetest/",
+          "https://twitter.com/struppigel/status/950787783353884672",
+          "https://github.com/goliate/hidden-tear"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "06e7d142-3ad1-40b6-b231-41dd47465ac3",
+      "value": "hiddentear"
+    },
     {
       "description": "FAM:hiddnad", 
       "meta": {
@@ -3718,6 +4230,25 @@
       "uuid": "373d306c-aaa3-38e3-b839-7dd39b51e89a", 
       "value": "hotclip"
     }, 
+    {
+      "description": "FAM:houdini",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.houdini",
+          "http://researchcenter.paloaltonetworks.com/2016/10/unit42-houdinis-magic-reappearance/?adbsc=social67221546&adbid=790972447373668352&adbpl=tw&adbpr=4487645412",
+          "http://blogs.360.cn/post/analysis-of-apt-c-37.html"
+        ],
+        "synonyms": [
+          "dunihi",
+          "dinihou",
+          "hworm",
+          "jenxcus"
+        ],
+        "type": []
+      },
+      "uuid": "1f268f26-ad8b-4e4d-9efb-661904171c2a",
+      "value": "houdini"
+    },
     {
       "description": "FAM:hoverwatch", 
       "meta": {
@@ -4637,6 +5168,19 @@
       "uuid": "84da1a17-5013-3775-a58f-f913b62180ad", 
       "value": "laroux"
     }, 
+    {
+      "description": "FAM:lazagne",
+      "meta": {
+        "refs": [
+          "https://attack.mitre.org/software/S0349",
+          "https://github.com/AlessandroZ/LaZagne"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "a8b9546f-1a91-468d-9304-3f6654d39352",
+      "value": "lazagne"
+    },
     {
       "description": "FAM:ldpinch", 
       "meta": {
@@ -4725,6 +5269,21 @@
       "uuid": "3388273e-3d1e-32a3-afb6-a0ade35d91b2", 
       "value": "lien"
     }, 
+    {
+      "description": "FAM:limerat",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.limerat",
+          "https://www.youtube.com/watch?v=x-g-ZLeX8GM",
+          "https://blog.yoroi.company/research/limerat-spreads-in-the-wild/",
+          "https://github.com/NYAN-x-CAT/Lime-RAT/"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "1ea6f1b4-cf3d-40aa-981f-31a1efbd819f",
+      "value": "limerat"
+    },
     {
       "description": "FAM:linkular", 
       "meta": {
@@ -4803,6 +5362,22 @@
       "uuid": "bc6ae7c3-eff8-3b11-999b-4248e5d93073", 
       "value": "loapi"
     }, 
+    {
+      "description": "FAM:loda",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.loda",
+          "https://www.proofpoint.com/us/threat-insight/post/introducing-loda-malware",
+          "https://zerophagemalware.com/2018/01/23/maldoc-rtf-drop-loda-logger/"
+        ],
+        "synonyms": [
+          "nymeria"
+        ],
+        "type": []
+      },
+      "uuid": "f6203215-d07e-4108-bb75-ee5ad7e9dbfc",
+      "value": "loda"
+    },
     {
       "description": "FAM:lockactivity", 
       "meta": {
@@ -5787,6 +6362,24 @@
       "uuid": "da9dcdba-19b8-34b7-9647-94983e0f04ed", 
       "value": "nandrobox"
     }, 
+    {
+      "description": "FAM:nanocore",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.nanocore",
+          "https://www.fireeye.com/blog/threat-research/2017/09/apt33-insights-into-iranian-cyber-espionage.html",
+          "https://www.symantec.com/blogs/threat-intelligence/elfin-apt33-espionage",
+          "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/",
+          "https://www.bleepingcomputer.com/news/security/nanocore-rat-author-gets-33-months-in-prison/"
+        ],
+        "synonyms": [
+          "nancrat"
+        ],
+        "type": []
+      },
+      "uuid": "048b948f-5e4d-4e6f-a0b5-54157cf03c86",
+      "value": "nanocore"
+    },
     {
       "description": "FAM:navbar", 
       "meta": {
@@ -5827,6 +6420,25 @@
       "uuid": "ddf07c01-91d3-35ab-b393-3afabe39dff7", 
       "value": "necurs"
     }, 
+    {
+      "description": "FAM:neoreklami", 
+      "meta": {
+        "refs": [
+          "https://blog.malwarebytes.com/detections/adware-neoreklami/",
+          "https://www.microsoft.com/en-us/wdsi/threats/malware-encyclopedia-description?Name=Adware:Win32/Neoreklami"
+        ], 
+        "synonyms": [], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "590eeb1a-2742-3301-94bf-9fa0856e959c",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "665ecfd3-b713-465f-879b-5182203b4a8b", 
+      "value": "neoreklami"
+    }, 
     {
       "description": "FAM:neospy", 
       "meta": {
@@ -5880,7 +6492,8 @@
         "synonyms": [
           "netweird", 
           "weecnaw", 
-          "wirenet"
+          "wirenet",
+          "netwire"
         ], 
         "type": []
       }, 
@@ -6140,6 +6753,26 @@
       "uuid": "bd3777dc-6822-36d9-b57b-fe623ddb0170", 
       "value": "optix"
     }, 
+    {
+      "description": "FAM:orcusrat",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.orcus_rat",
+          "https://orcustechnologies.com/",
+          "https://blog.fortinet.com/2017/12/07/a-peculiar-case-of-orcus-rat-targeting-bitcoin-investors",
+          "https://www.canada.ca/en/radio-television-telecommunications/news/2019/03/crtc-and-rcmp-national-division-execute-warrants-in-malware-investigation.html",
+          "https://krebsonsecurity.com/2016/07/canadian-man-is-author-of-popular-orcus-rat/",
+          "https://krebsonsecurity.com/2019/04/canadian-police-raid-orcus-rat-author/",
+          "http://researchcenter.paloaltonetworks.com/2016/08/unit42-orcus-birth-of-an-unusual-plugin-builder-rat/"
+        ],
+        "synonyms": [
+          "orcus"
+        ],
+        "type": []
+      },
+      "uuid": "0c57b2b4-b545-4b5d-bd27-b102b635e432",
+      "value": "orcusrat"
+    },
     {
       "description": "FAM:outbrowse", 
       "meta": {
@@ -6303,6 +6936,25 @@
       "uuid": "457c9036-a4bf-355e-844a-e74dd69c80e7", 
       "value": "petrolin"
     }, 
+    {
+      "description": "FAM:petya",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.petya",
+          "https://blog.malwarebytes.com/threat-analysis/2016/05/petya-and-mischa-ransomware-duet-p1/",
+          "https://blog.malwarebytes.com/threat-analysis/2016/07/third-time-unlucky-improved-petya-is-out/",
+          "https://blog.malwarebytes.com/cybercrime/2017/07/keeping-up-with-the-petyas-demystifying-the-malware-family/",
+          "https://blog.malwarebytes.com/malwarebytes-news/2017/07/bye-bye-petya-decryptor-old-versions-released/",
+          "https://blog.malwarebytes.com/threat-analysis/2016/04/petya-ransomware/"
+        ],
+        "synonyms": [
+          "petr"
+        ],
+        "type": []
+      },
+      "uuid": "c4324143-3921-4771-a9b2-f15ae2b3777f",
+      "value": "petya"
+    },
     {
       "description": "FAM:phonespy", 
       "meta": {
@@ -6706,6 +7358,33 @@
       "uuid": "e49cc0f4-649c-344a-b75a-c6187d57e721", 
       "value": "qqrob"
     }, 
+    {
+      "description": "FAM:quasar",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.quasar_rat",
+          "https://researchcenter.paloaltonetworks.com/2018/01/unit42-vermin-quasar-rat-custom-malware-used-ukraine/",
+          "https://www.fireeye.com/blog/threat-research/2019/04/spear-phishing-campaign-targets-ukraine-government.html",
+          "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/",
+          "https://github.com/quasar/QuasarRAT/tree/master/Client",
+          "https://www.volexity.com/blog/2018/06/07/patchwork-apt-group-targets-us-think-tanks/",
+          "https://www.pwc.co.uk/cyber-security/pdf/cloud-hopper-annex-b-final.pdf",
+          "http://researchcenter.paloaltonetworks.com/2017/01/unit42-downeks-and-quasar-rat-used-in-recent-targeted-attacks-against-governments",
+          "https://documents.trendmicro.com/assets/tech-brief-untangling-the-patchwork-cyberespionage-group.pdf?platform=hootsuite",
+          "https://ti.360.net/blog/articles/analysis-of-apt-c-09-target-china/",
+          "https://www.symantec.com/blogs/threat-intelligence/elfin-apt33-espionage",
+          "https://twitter.com/malwrhunterteam/status/789153556255342596",
+          "https://www.welivesecurity.com/2018/07/17/deep-dive-vermin-rathole/"
+        ],
+        "synonyms": [
+          "quasar_rat",
+          "quasarrat"
+        ],
+        "type": []
+      },
+      "uuid": "620903d7-42ed-4a16-b3df-4ca6076d9f31",
+      "value": "quasar"
+    },
     {
       "description": "FAM:qumi", 
       "meta": {
@@ -6738,6 +7417,26 @@
       "uuid": "0761227e-1a79-3a7b-bda6-b5962458e4ec", 
       "value": "qushu"
     }, 
+    {
+      "description": "FAM:raccoon",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.raccoon",
+          "https://www.secfreaks.gr/2019/12/in-depth-analysis-of-an-infostealer-raccoon.html",
+          "https://www.bitdefender.com/files/News/CaseStudies/study/289/Bitdefender-WhitePaper-Fallout.pdf",
+          "https://www.cybereason.com/blog/hunting-raccoon-stealer-the-new-masked-bandit-on-the-block",
+          "https://www.cynexlink.com/2020/12/26/raccoon-malware-a-threat-to-cybersecurity/"
+        ],
+        "synonyms": [
+          "mohazo",
+          "racealer",
+          "racoon"
+        ],
+        "type": []
+      },
+      "uuid": "b934637a-5c8d-43bc-b595-61e8acd9af78",
+      "value": "raccoon"
+    },
     {
       "description": "FAM:raden", 
       "meta": {
@@ -6800,6 +7499,25 @@
       "uuid": "ac2392d0-c38e-3909-aabf-5e632062f24d", 
       "value": "razam"
     }, 
+    {
+      "description": "FAM:razy", 
+      "meta": {
+        "refs": [
+          "https://www.microsoft.com/en-us/wdsi/threats/malware-encyclopedia-description?Name=Trojan:Win32/Razy.A",
+          "https://threatpost.com/razy-browser-extensions-theft/141181/"
+        ], 
+        "synonyms": [], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "636db272-61ce-3e0f-ad64-d77048b05066",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "aad8ce8e-0c82-42f7-a63c-bbfe85c015b6", 
+      "value": "razy"
+    }, 
     {
       "description": "FAM:rbot", 
       "meta": {
@@ -6923,6 +7641,31 @@
       "uuid": "4cbb8478-d6ca-3efc-bc72-9be4ffde7073", 
       "value": "relevantknowledge"
     }, 
+    {
+      "description": "FAM:remcos",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.remcos",
+          "https://www.riskiq.com/blog/labs/spear-phishing-turkish-defense-contractors/",
+          "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/",
+          "http://malware-traffic-analysis.net/2017/12/22/index.html",
+          "https://www.symantec.com/blogs/threat-intelligence/elfin-apt33-espionage",
+          "https://blog.fortinet.com/2017/02/14/remcos-a-new-rat-in-the-wild-2",
+          "https://krabsonsecurity.com/2018/03/02/analysing-remcos-rats-executable/",
+          "https://myonlinesecurity.co.uk/fake-order-spoofed-from-finchers-ltd-sankyo-rubber-delivers-remcos-rat-via-ace-attachments/",
+          "https://blog.talosintelligence.com/2018/08/picking-apart-remcos.html",
+          "https://secrary.com/ReversingMalware/RemcosRAT/"
+        ],
+        "synonyms": [
+          "remcosrat", 
+          "remvio", 
+          "socmer"
+        ],
+        "type": []
+      },
+      "uuid": "1b2a647e-35a2-418d-95e2-e77e0423060b",
+      "value": "remcos"
+    },
     {
       "description": "FAM:renocide", 
       "meta": {
@@ -6993,6 +7736,24 @@
       "uuid": "94805334-1d2e-3621-aa6d-0b3dc8e0405d", 
       "value": "reveton"
     }, 
+    {
+      "description": "FAM:revetrat",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.revenge_rat",
+          "https://isc.sans.edu/diary/rss/22590",
+          "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/",
+          "http://blog.deniable.org/blog/2016/08/26/lurking-around-revenge-rat/"
+        ],
+        "synonyms": [
+          "revenge",
+          "revet"
+        ],
+        "type": []
+      },
+      "uuid": "2326ae09-ab18-41cd-8f87-187853a8623f",
+      "value": "revetrat"
+    },
     {
       "description": "FAM:revmob", 
       "meta": {
@@ -7122,6 +7883,19 @@
       "uuid": "1eb2ee9d-7dff-3816-8641-ab772d90cb54", 
       "value": "rotexy"
     }, 
+    {
+      "description": "FAM:rozena",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.rozena",
+          "https://www.gdatasoftware.com/blog/2018/06/30862-fileless-malware-rozena"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "106d00a7-2044-46b6-9c08-35eb775764df",
+      "value": "rozena"
+    },
     {
       "description": "FAM:rufraud", 
       "meta": {
@@ -7176,6 +7950,31 @@
       "uuid": "1b7e36bf-e9dd-33c8-a9af-ad56c3c07f2b", 
       "value": "rusms"
     }, 
+    {
+      "description": "FAM:ryuk",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.ryuk",
+          "https://www.cybereason.com/blog/triple-threat-emotet-deploys-trickbot-to-steal-data-spread-ryuk-ransomware",
+          "https://research.checkpoint.com/ryuk-ransomware-targeted-campaign-break/",
+          "https://www.latimes.com/local/lanow/la-me-ln-times-delivery-disruption-20181229-story.html",
+          "https://www.crowdstrike.com/blog/big-game-hunting-with-ryuk-another-lucrative-targeted-ransomware/",
+          "https://www.fireeye.com/blog/threat-research/2019/01/a-nasty-trick-from-credential-theft-malware-to-business-disruption.html",
+          "https://www.fireeye.com/blog/threat-research/2019/04/pick-six-intercepting-a-fin6-intrusion.html",
+          "https://securingtomorrow.mcafee.com/other-blogs/mcafee-labs/ryuk-ransomware-attack-rush-to-attribution-misses-the-point/"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "abf8f3dc-4dbf-47b3-95fd-b35ac2ed3f46",
+      "value": "ryuk"
+    },
     {
       "description": "FAM:sacti", 
       "meta": {
@@ -7265,6 +8064,26 @@
       "uuid": "5ff7793d-c1c4-380d-900e-d9aa6a409915", 
       "value": "sality"
     }, 
+    {
+      "description": "FAM:samsam",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.samsam",
+          "http://blog.talosintel.com/2016/03/samsam-ransomware.html",
+          "https://www.sophos.com/en-us/medialibrary/pdfs/technical-papers/samsam-ransomware-chooses-its-targets-carefully-wpna.aspx",
+          "https://www.crowdstrike.com/blog/an-in-depth-analysis-of-samsam-ransomware-and-boss-spider/",
+          "https://www.justice.gov/opa/pr/two-iranian-men-indicted-deploying-ransomware-extort-hospitals-municipalities-and-public",
+          "https://nakedsecurity.sophos.com/2018/05/01/samsam-ransomware-a-mean-old-dog-with-a-nasty-new-trick-report/",
+          "http://blog.talosintelligence.com/2018/01/samsam-evolution-continues-netting-over.html"
+        ],
+        "synonyms": [
+          "samas"
+        ],
+        "type": []
+      },
+      "uuid": "d00e9064-f1e9-4696-87dc-13031aa4553d",
+      "value": "samsam"
+    },
     {
       "description": "FAM:sanctionedmedia", 
       "meta": {
@@ -7310,6 +8129,20 @@
       "uuid": "b568a5b2-1008-33cb-85ba-c461018fc2c8", 
       "value": "scam"
     }, 
+    {
+      "description": "FAM:schwarzesonne", 
+      "meta": {
+        "refs": [
+          "https://www.microsoft.com/en-us/wdsi/threats/malware-encyclopedia-description?Name=Trojan:Win32/SchwarzeSonne!MSR",
+          "https://www.fortiguard.com/encyclopedia/virus/7488534",
+          "https://www.virusradar.com/en/Win32_SchwarzeSonne.BL/description"
+        ], 
+        "synonyms": [], 
+        "type": []
+      }, 
+      "uuid": "a4c164cc-a4a1-4b7f-a9a2-c664f6b461d4", 
+      "value": "schwarzesonne"
+    }, 
     {
       "description": "FAM:sckeylog", 
       "meta": {
@@ -7431,6 +8264,27 @@
       "uuid": "ad21874f-d8c7-33d7-9527-c9d666171aa8", 
       "value": "shedun"
     }, 
+    {
+      "description": "FAM:shelma",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.doghousepower",
+          "http://www1.paladion.net/hubfs/Newsletter/DogHousePower-%20Newly%20Identified%20Python-Based%20Ransomware.pdf"
+        ],
+        "synonyms": [
+          "doghousepower"
+        ],
+        "type": []
+      },
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "47ff7101-61d2-464f-8210-6fe26cac2772",
+      "value": "shelma"
+    },
     {
       "description": "FAM:sheridroid", 
       "meta": {
@@ -8239,6 +9093,7 @@
         "refs": [], 
         "synonyms": [
           "encoder", 
+          "diskcoder",
           "filecoder", 
           "ransomcrypt", 
           "trojanransom"
@@ -8309,11 +9164,14 @@
         "refs": [], 
         "synonyms": [
           "banker", 
-          "datasetaler", 
+          "bitstealer",
+          "datastealer", 
+          "discostealer",
           "delfsnif", 
           "delpbanc", 
           "infostealer", 
           "monitor", 
+          "passwordstealera",
           "pswtool", 
           "pwsteal", 
           "pwstealer", 
@@ -8326,6 +9184,22 @@
       "uuid": "c65071d8-2bad-302b-8646-d309f7705fdb", 
       "value": "infosteal"
     }, 
+    {
+      "description": "BEH:infosteal:coinstealer", 
+      "meta": {
+        "refs": [], 
+        "synonyms": [], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "c65071d8-2bad-302b-8646-d309f7705fdb",
+          "type": "subtechnique-of"
+        }
+      ],
+      "uuid": "e8e60d44-4950-4671-b56e-707d6ce0b2f6", 
+      "value": "coinstealer"
+    }, 
     {
       "description": "BEH:infosteal:gamethief", 
       "meta": {
@@ -8335,6 +9209,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "c65071d8-2bad-302b-8646-d309f7705fdb",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "b87b252e-b364-3cbb-92cf-939b2343b0bc", 
       "value": "gamethief"
     }, 
@@ -8416,6 +9296,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "aecd212a-8701-3527-bbde-8cd36b405f93",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "1b4d1d8e-9cbf-3f9b-8308-23e6de3456fd", 
       "value": "killsectool"
     }, 
@@ -8852,6 +9738,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "0fa687bd-caa9-32b1-a77e-13b98dc83685",
+          "type": "variant-of"
+        }
+      ],
       "uuid": "590eeb1a-2742-3301-94bf-9fa0856e959c", 
       "value": "adware"
     }, 
@@ -8864,6 +9756,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "0fa687bd-caa9-32b1-a77e-13b98dc83685",
+          "type": "variant-of"
+        }
+      ],
       "uuid": "3ce6bd72-2133-35f8-b5a9-3d22c5e55a93", 
       "value": "casino"
     }, 
@@ -8883,6 +9781,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "590eeb1a-2742-3301-94bf-9fa0856e959c",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "636db272-61ce-3e0f-ad64-d77048b05066", 
       "value": "multiplug"
     }, 
@@ -8904,30 +9808,42 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "0fa687bd-caa9-32b1-a77e-13b98dc83685",
+          "type": "variant-of"
+        }
+      ],
       "uuid": "b6d3ea56-83b3-3524-a2f5-c87ce2ed0aab", 
       "value": "tool"
     }, 
     {
-      "description": "CLASS:hoax", 
+      "description": "CLASS:grayware:tool:remoteadmin", 
       "meta": {
         "refs": [], 
-        "synonyms": [
-          "joke"
-        ], 
+        "synonyms": [], 
         "type": []
       }, 
-      "uuid": "e7bd337d-700c-376b-ac75-61c85dd8a246", 
-      "value": "hoax"
+      "related": [
+        {
+          "dest-uuid": "b6d3ea56-83b3-3524-a2f5-c87ce2ed0aab",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "e43ecd9a-2734-34d6-b24f-77be13f4b9cd", 
+      "value": "remoteadmin"
     }, 
     {
-      "description": "CLASS:grayware:tool:remoteadmin", 
+      "description": "CLASS:hoax", 
       "meta": {
         "refs": [], 
-        "synonyms": [], 
+        "synonyms": [
+          "joke"
+        ], 
         "type": []
       }, 
-      "uuid": "e43ecd9a-2734-34d6-b24f-77be13f4b9cd", 
-      "value": "remoteadmin"
+      "uuid": "e7bd337d-700c-376b-ac75-61c85dd8a246", 
+      "value": "hoax"
     }, 
     {
       "description": "CLASS:hoax:smshoax", 
@@ -8993,6 +9909,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "64376dc5-0640-33a5-ba0e-1a4b71922c06",
+          "type": "variant-of"
+        }
+      ],
       "uuid": "3265ee34-384e-3dc8-9652-19d88d4374cb", 
       "value": "bitcoinminer"
     }, 
@@ -9096,6 +10018,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "973cc9e5-32ab-3403-9ead-eb941690fc23",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "b7be1d66-ac27-3b2c-8361-a652564ec2e3", 
       "value": "prepender"
     }, 
@@ -9109,6 +10037,12 @@
         ], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "90bb8141-d467-3376-8c85-4e0ec9a2be05",
+          "type": "uses"
+        }
+      ],
       "uuid": "f0b15f66-0eae-37d8-bf08-eeca70557795", 
       "value": "worm"
     }, 
@@ -9119,6 +10053,12 @@
         "synonyms": [], 
         "type": []
       }, 
+      "related": [
+        {
+          "dest-uuid": "f0b15f66-0eae-37d8-bf08-eeca70557795",
+          "type": "subtechnique-of"
+        }
+      ],
       "uuid": "980f8421-cccd-3c17-b998-1ab1b7c7bdb9", 
       "value": "emailworm"
     }, 
@@ -9377,6 +10317,32 @@
       "uuid": "805e91bc-aaed-380d-97f7-7d9ae2ab4703", 
       "value": "adviator"
     }, 
+    {
+      "description": "FAM:adwind",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/jar.adwind",
+          "https://blogs.seqrite.com/evolution-of-jrat-java-malware/",
+          "https://www.fortinet.com/blog/threat-research/new-jrat-adwind-variant-being-spread-with-package-delivery-scam.html",
+          "http://blog.trendmicro.com/trendlabs-security-intelligence/spam-remote-access-trojan-adwind-jrat",
+          "http://malware-traffic-analysis.net/2017/07/04/index.html",
+          "https://codemetrix.net/decrypting-adwind-jrat-jbifrost-trojan/",
+          "https://gist.github.com/herrcore/8336975475e88f9bc539d94000412885",
+          "https://blog.talosintelligence.com/2018/09/adwind-dodgesav-dde.html"
+        ],
+        "synonyms": [
+          "alienspy",
+          "frutas",
+          "jbifrost",
+          "jsocket",
+          "sockrat",
+          "unrecom"
+        ],
+        "type": []
+      },
+      "uuid": "04e324c1-a981-4bf7-aab4-d64d0dacae51",
+      "value": "adwind"
+    },
     {
       "description": "FAM:adwk", 
       "meta": {
@@ -9409,6 +10375,25 @@
       "uuid": "1bcf8191-2d6d-3f3e-a114-7df87b8aafcd", 
       "value": "aesads"
     }, 
+    {
+      "description": "FAM:agenttesla",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.agent_tesla",
+          "https://researchcenter.paloaltonetworks.com/2017/09/unit42-analyzing-various-layers-agentteslas-packing/",
+          "https://malwarebreakdown.com/2018/01/11/malspam-entitled-invoice-attched-for-your-reference-delivers-agent-tesla-keylogger/",
+          "https://www.zscaler.com/blogs/research/agent-tesla-keylogger-delivered-using-cybersquatting",
+          "https://blog.fortinet.com/2017/06/28/in-depth-analysis-of-net-malware-javaupdtr",
+          "https://www.fortinet.com/blog/threat-research/analysis-of-new-agent-tesla-spyware-variant.html",
+          "https://thisissecurity.stormshield.com/2018/01/12/agent-tesla-campaign/",
+          "https://blogs.forcepoint.com/security-labs/part-two-camouflage-netting"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "bd4238f7-fbfc-4ad8-b42e-247013c6df3d",
+      "value": "agenttesla"
+    },
     {
       "description": "FAM:agobot", 
       "meta": {
@@ -9803,6 +10788,21 @@
       "uuid": "4b0a463a-269d-3d58-ae8c-7935c51aa9bc", 
       "value": "autosus"
     }, 
+    {
+      "description": "FAM:avemaria",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.ave_maria",
+          "https://blog.yoroi.company/research/the-ave_maria-malware/"
+        ],
+        "synonyms": [
+          "ave_maria"
+        ],
+        "type": []
+      },
+      "uuid": "a006993a-6e83-4fb5-a6a6-d67a7dc71c23",
+      "value": "avemaria"
+    },
     {
       "description": "FAM:axespy", 
       "meta": {
@@ -9813,6 +10813,31 @@
       "uuid": "7bce3d09-df0c-3593-8353-7812dd205844", 
       "value": "axespy"
     }, 
+    {
+      "description": "FAM:azorult",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.azorult",
+          "https://www.bleepingcomputer.com/news/security/azorult-trojan-serving-aurora-ransomware-by-malactor-oktropys/",
+          "https://blog.minerva-labs.com/puffstealer-evasion-in-a-cloak-of-multiple-layers",
+          "https://malwarebreakdown.com/2017/07/24/the-seamless-campaign-drops-ramnit-follow-up-malware-azorult-stealer-smoke-loader-etc/",
+          "https://www.proofpoint.com/us/threat-insight/post/threat-actors-using-legitimate-paypal-accounts-to-distribute-chthonic-banking-trojan",
+          "http://www.vkremez.com/2017/07/lets-learn-reversing-credential-and.html",
+          "https://blog.minerva-labs.com/azorult-now-as-a-signed-google-update",
+          "https://www.proofpoint.com/us/threat-insight/post/new-version-azorult-stealer-improves-loading-features-spreads-alongside",
+          "https://malwarebreakdown.com/2017/11/12/seamless-campaign-delivers-ramnit-via-rig-ek-at-188-225-82-158-follow-up-malware-is-azorult-stealer/",
+          "https://www.blueliv.com/blog-news/research/azorult-crydbrox-stops-sells-malware-credential-stealer/",
+          "https://research.checkpoint.com/the-emergence-of-the-new-azorult-3-3/"
+        ],
+        "synonyms": [
+          "puffstealer",
+          "rultazo"
+        ],
+        "type": []
+      },
+      "uuid": "fc17c756-528b-416e-907d-9d1ef7403df1",
+      "value": "azorult"
+    },
     {
       "description": "FAM:badda", 
       "meta": {
@@ -9967,6 +10992,27 @@
       "uuid": "40f45119-a4ce-335c-be07-c46ff67f3dcf", 
       "value": "bauts"
     }, 
+    {
+      "description": "FAM:bazar",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.bazarbackdoor",
+          "https://www.advanced-intel.com/post/anatomy-of-attack-inside-bazarbackdoor-to-ryuk-ransomware-one-group-via-cobalt-strike",
+          "https://www.pandasecurity.com/en/mediacenter/business/bazarbackdoor-trickbot-backdoor/"
+        ],
+        "synonyms": [
+          "bazarbackdoor",
+          "beerbot",
+          "bazarcall",
+          "kegtap",
+          "team9backdoor",
+          "bazaloader"
+        ],
+        "type": []
+      },
+      "uuid": "2f6e812e-16a6-4fbc-9273-1aebc12b7d3d",
+      "value": "bazar"
+    },
     {
       "description": "FAM:bebeg", 
       "meta": {
@@ -10185,7 +11231,11 @@
       "description": "FAM:bladabindi", 
       "meta": {
         "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.njrat", 
-        "synonyms": [], 
+        "synonyms": [
+          "bladabi",
+          "bladabindinet",
+          "njrat"
+        ], 
         "type": []
       }, 
       "uuid": "470bf5fe-81e2-3da1-a4da-6a1680119a0f", 
@@ -10519,6 +11569,20 @@
       "uuid": "a4e78673-2014-3dbd-bf93-628bc644a872", 
       "value": "centim"
     }, 
+    {
+      "description": "FAM:cerberus", 
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/apk.cerberus",
+          "https://attack.mitre.org/software/S0480",
+          "https://www.threatfabric.com/blogs/cerberus-a-new-banking-trojan-from-the-underworld.html"
+        ],
+        "synonyms": [], 
+        "type": []
+      }, 
+      "uuid": "bfd0098a-822d-436b-b751-1c61ff661cfe", 
+      "value": "cerberus"
+    }, 
     {
       "description": "FAM:cerekv", 
       "meta": {
@@ -10620,6 +11684,22 @@
       "uuid": "d46db949-1fd3-303c-9bf1-b56f86d9077b", 
       "value": "clinator"
     }, 
+    {
+      "description": "FAM:clipbanker", 
+      "meta": {
+        "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.clipbanker", 
+        "synonyms": [], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "c65071d8-2bad-302b-8646-d309f7705fdb",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "21529e81-1aea-4435-a407-c5016653d63d", 
+      "value": "clipbanker"
+    }, 
     {
       "description": "FAM:cmccwm", 
       "meta": {
@@ -10650,6 +11730,31 @@
       "uuid": "5c5aa6ae-b94a-31df-bea0-4e672b746664", 
       "value": "cnzz"
     }, 
+    {
+      "description": "FAM:cobaltstrike",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.cobalt_strike",
+          "https://www.fireeye.com/blog/threat-research/2017/06/phished-at-the-request-of-counsel.html",
+          "https://www.symantec.com/connect/blogs/odinaff-new-trojan-used-high-level-financial-attacks",
+          "https://github.com/JPCERTCC/aa-tools/blob/master/cobaltstrikescan.py",
+          "https://blogs.jpcert.or.jp/en/2018/08/volatility-plugin-for-detecting-cobalt-strike-beacon.html",
+          "https://blog.cobaltstrike.com/",
+          "https://www.cobaltstrike.com/support",
+          "https://www.fireeye.com/blog/threat-research/2018/11/not-so-cozy-an-uncomfortable-examination-of-a-suspected-apt29-phishing-campaign.html",
+          "http://blog.morphisec.com/new-global-attack-on-point-of-sale-systems",
+          "https://www.lac.co.jp/lacwatch/people/20180521_001638.html",
+          "https://401trg.com/burning-umbrella/",
+          "https://www.pentestpartners.com/security-blog/cobalt-strike-walkthrough-for-red-teamers/",
+          "https://pylos.co/2018/11/18/cozybear-in-from-the-cold/",
+          "http://cyberforensicator.com/2018/12/23/dissecting-cozy-bears-malicious-lnk-file/"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "008947a7-9634-4f83-851b-f65e1a0f2f0c",
+      "value": "cobaltstrike"
+    },
     {
       "description": "FAM:coinhive", 
       "meta": {
@@ -10712,6 +11817,25 @@
       "uuid": "875a27b7-cc81-3b09-8c23-d2c7b1bd6ac4", 
       "value": "contactscollector"
     }, 
+    {
+      "description": "FAM:conti",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.conti",
+          "https://www.carbonblack.com/blog/tau-threat-discovery-conti-ransomware/"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "7d7da922-9df0-4184-944c-215b74c8095b",
+      "value": "conti"
+    },
     {
       "description": "FAM:cooee", 
       "meta": {
@@ -11065,6 +12189,24 @@
       "uuid": "96986f73-ee4f-330c-92f9-805d05e6f44b", 
       "value": "dbtes"
     }, 
+    {
+      "description": "FAM:deathransom", 
+      "meta": {
+        "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.deathransom", 
+        "synonyms": [
+          "wacatac"
+        ], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "bc154e88-a6ae-4f5d-b029-bbd3b8acf587", 
+      "value": "deathransom"
+    }, 
     {
       "description": "FAM:deblio", 
       "meta": {
@@ -11124,6 +12266,33 @@
       "uuid": "9b9eaf63-3447-3349-b955-6b62a9809d85", 
       "value": "detroie"
     }, 
+    {
+      "description": "FAM:crysis", 
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.dharma",
+          "https://www.carbonblack.com/2018/07/10/carbon-black-tau-threat-analysis-recent-dharma-ransomware-highlights-attackers-continued-use-open-source-tools/",
+          "https://www.bleepingcomputer.com/news/security/new-arena-crysis-ransomware-variant-released/"
+        ],
+        "synonyms": [
+          "crusis", 
+          "dharma",
+          "phobos",
+          "arena",
+          "wadhrama",
+          "ncov"
+        ], 
+        "type": []
+      }, 
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "689e26c6-8cf6-4ce0-ba87-cad1377996ae", 
+      "value": "crysis"
+    }, 
     {
       "description": "FAM:dianjin", 
       "meta": {
@@ -11329,6 +12498,19 @@
       "uuid": "3eb5f701-637e-3b03-ac32-47f59641c718", 
       "value": "dowgin"
     }, 
+    {
+      "description": "FAM:downeks",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.downeks",
+          "http://researchcenter.paloaltonetworks.com/2017/01/unit42-downeks-and-quasar-rat-used-in-recent-targeted-attacks-against-governments/?adbsc=social69739136&adbid=826218465723756545&adbpl=tw&adbpr=4487645412"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "c8149b45-7d28-421e-bc6f-25c4b8698b92",
+      "value": "downeks"
+    },
     {
       "description": "FAM:downloadadmin", 
       "meta": {
@@ -11582,6 +12764,32 @@
       "uuid": "88fec24c-acb5-3403-b8bf-2da120708b5c", 
       "value": "egame"
     }, 
+    {
+      "description": "FAM:egregor",
+      "meta": {
+        "ransomnotes-filenames": [
+          "RECOVER-FILES.txt"
+        ],
+        "ransomnotes-refs": [
+          "https://www.bleepstatic.com/images/news/columns/week-in-ransomware/2020/september/25/egregor.jpg"
+        ],
+        "refs": [
+          "https://www.appgate.com/news-press/appgate-labs-analyzes-new-family-of-ransomware-egregor",
+          "https://www.bleepingcomputer.com/news/security/crytek-hit-by-egregor-ransomware-ubisoft-data-leaked/",
+          "https://cybersecuritynews.com/egregor-ransomware/"
+        ],
+        "synonyms": [], 
+        "type": []
+      },
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "75c5da38-3097-4e81-9c34-188cfbec1596",
+      "value": "egregor"
+    },
     {
       "description": "FAM:egroupdial", 
       "meta": {
@@ -11622,6 +12830,54 @@
       "uuid": "d43481d8-9186-33cc-8974-75fb3f7a357d", 
       "value": "elite"
     }, 
+    {
+      "description": "FAM:emotet",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.emotet",
+          "https://blog.trendmicro.com/trendlabs-security-intelligence/ursnif-emotet-dridex-and-bitpaymer-gangs-linked-by-a-similar-loader/",
+          "http://blog.trendmicro.com/trendlabs-security-intelligence/emotet-returns-starts-spreading-via-spam-botnet/",
+          "https://www.fortinet.com/blog/threat-research/deep-analysis-of-new-emotet-variant-part-2.html",
+          "https://www.spamhaus.org/news/article/783/emotet-adds-a-further-layer-of-camouflage",
+          "https://isc.sans.edu/forums/diary/Emotet+infections+and+followup+malware/24532/",
+          "https://www.welivesecurity.com/2018/11/09/emotet-launches-major-new-spam-campaign/",
+          "https://github.com/d00rt/emotet_research",
+          "https://blog.kryptoslogic.com/malware/2018/08/01/emotet.html",
+          "https://www.us-cert.gov/ncas/alerts/TA18-201A",
+          "https://portswigger.net/daily-swig/emotet-trojan-implicated-in-wolverine-solutions-ransomware-attack",
+          "https://blog.trendmicro.com/trendlabs-security-intelligence/new-emotet-hijacks-windows-api-evades-sandbox-analysis/",
+          "https://blog.kryptoslogic.com/malware/2018/10/31/emotet-email-theft.html",
+          "http://blog.fortinet.com/2017/05/03/deep-analysis-of-new-emotet-variant-part-1",
+          "https://www.intezer.com/mitigating-emotet-the-most-common-banking-trojan/",
+          "https://maxkersten.nl/binary-analysis-course/malware-analysis/emotet-droppers/",
+          "https://research.checkpoint.com/emotet-tricky-trojan-git-clones/",
+          "https://www.cert.pl/en/news/single/analysis-of-emotet-v4/",
+          "https://www.symantec.com/blogs/threat-intelligence/evolution-emotet-trojan-distributor",
+          "https://www.crowdstrike.com/blog/meet-crowdstrikes-adversary-of-the-month-for-february-mummy-spider/",
+          "https://www.melani.admin.ch/melani/de/home/dokumentation/newsletter/Trojaner_Emotet_greift_Unternehmensnetzwerke_an.html",
+          "https://persianov.net/emotet-malware-analysis-part-1",
+          "https://persianov.net/emotet-malware-analysis-part-2",
+          "https://int0xcc.svbtle.com/dissecting-emotet-s-network-communication-protocol",
+          "https://blog.trendmicro.com/trendlabs-security-intelligence/exploring-emotet-examining-emotets-activities-infrastructure/",
+          "https://paste.cryptolaemus.com",
+          "https://cloudblogs.microsoft.com/microsoftsecure/2017/11/06/mitigating-and-eliminating-info-stealing-qakbot-and-emotet-in-corporate-networks/?source=mmpc",
+          "https://www.spamtitan.com/blog/emotet-malware-revives-old-email-conversations-threads-to-increase-infection-rates/",
+          "https://www.fidelissecurity.com/threatgeek/2017/07/emotet-takes-wing-spreader",
+          "https://securelist.com/analysis/publications/69560/the-banking-trojan-emotet-detailed-analysis/",
+          "https://feodotracker.abuse.ch/?filter=version_e",
+          "https://www.gdata.de/blog/2017/10/30110-emotet-beutet-outlook-aus",
+          "https://malfind.com/index.php/2018/07/23/deobfuscating-emotets-powershell-payload/",
+          "https://medium.com/@0xd0cf11e/analyzing-emotet-with-ghidra-part-1-4da71a5c8d69"
+        ],
+        "synonyms": [
+          "geodo",
+          "heodo"
+        ],
+        "type": []
+      },
+      "uuid": "054e50ca-aeec-428e-91a5-f45e4029a073",
+      "value": "emotet"
+    },
     {
       "description": "FAM:emudbot", 
       "meta": {
diff --git a/avclass/data/misp/galaxy/avclass2.json b/misp/galaxy/avclass.json
similarity index 87%
rename from avclass/data/misp/galaxy/avclass2.json
rename to misp/galaxy/avclass.json
index 656826f..8a95d0b 100644
--- a/avclass/data/misp/galaxy/avclass2.json
+++ b/misp/galaxy/avclass.json
@@ -1,6 +1,6 @@
 {
   "description": "A malware galaxy based on AvClass", 
-  "icon": "", 
+  "icon": "optin-monster", 
   "name": "AvClass", 
   "namespace": "misp", 
   "type": "avclass", 

From 5e6dd1d8f2c45cb083229b9f6326c25fbb26e831 Mon Sep 17 00:00:00 2001
From: Jeffrey Gentes <jeffgemail@gmail.com>
Date: Wed, 15 Feb 2023 20:44:26 -0500
Subject: [PATCH 34/36] Update based on malicialab master

---
 avclass/common.py                 |  23 ++++++-
 avclass/data/default.tagging      |  91 ++++++++++++++++++++++++++-
 avclass/data/default.taxonomy     |  27 ++++++--
 avclass/labeler.py                |   4 +-
 avclass/update.py                 |   6 +-
 examples/metadefender_sample.json |   1 +
 misp/cluster/avclass.json         | 101 +++++++++++++++++++++++++++++-
 7 files changed, 239 insertions(+), 14 deletions(-)
 create mode 100644 examples/metadefender_sample.json

diff --git a/avclass/common.py b/avclass/common.py
index 946f7ad..a6a439c 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -30,7 +30,6 @@
     "Jiangmin",
     "Comodo",
     "GData",
-    "Avast",
     "Sophos",
     "BitDefenderTheta",
     "Alibaba",
@@ -116,6 +115,12 @@ def __iter__(self):
         """ Iterator over the alphabetically sorted tags in the taxonomy """
         return (t for t in sorted(self._tags))
 
+    def is_hex(self, tag: AnyStr) -> bool:
+        """ True for generic hex tags like 004bc24a: only hex chars, at least one digit """
+        return bool(re.search(r"\d", tag)) and bool(
+                re.fullmatch(r"[0-9a-fA-F]+", tag)
+            )
+
     def is_generic(self, tag: AnyStr) -> bool:
         """
         Whether or not the input ``tag`` is generic
@@ -748,6 +753,10 @@ def get_label_tags(self, label: AnyStr, hashes: Collection[AnyStr]) -> Set[AnySt
             if self.taxonomy.is_generic(token):
                 continue
 
+            # Ignore hex tokens
+            if self.taxonomy.is_hex(token):
+                continue
+
             # Apply tagging rule
             dst_l = self.translations.get_dst(token)
             if dst_l:
@@ -829,6 +838,18 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]
 
         return av_dict
 
+    def get_sample_vt_count(self, sample_info: SampleInfo) -> int:
+        """ Return the number of labels of the sample, counting only those
+            whose AV name is in ``self.avs`` when that list is not None """
+        if self.avs is None:
+            return len(sample_info.labels)
+        else:
+            cnt = 0
+            for (av_name, label) in sample_info.labels:
+                if av_name in self.avs:
+                    cnt += 1
+            return cnt
+
     @staticmethod
     def rank_tags(
         av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1
diff --git a/avclass/data/default.tagging b/avclass/data/default.tagging
index bbdaa98..7de3e8f 100644
--- a/avclass/data/default.tagging
+++ b/avclass/data/default.tagging
@@ -13,6 +13,7 @@ addisplay	adware
 addrop	adware
 adfltnet	amonetize
 adgazele	adgazelle
+adhubllka	deathransom
 adiwky	airpush
 adknowledge	adware
 adload	adware
@@ -26,14 +27,18 @@ adtrafficanalysis	winkad
 adwareeorezo	eorezo
 afoynq	ksapp
 agemt	domob
+agensla	agenttesla
+agentesla	agenttesla
 agewap	opfake
 agile	biige
 agilebinary	biige
 agnsmit	infectionads
+ainslot	blackshades
 airad	airinstaller
 airadinstaller	airinstaller
 airinstall	airinstaller
 akan	winwebsec
+alienspy	adwind
 allad	airpush
 almanahe	alman
 alureon	tdss
@@ -96,8 +101,14 @@ banloader	rimod
 basebrid	basebridge
 batteryd	fakedoc
 batterydoctor	fakedoc
+bazaloader	bazar
+bazarldr	bazar
+bazarloader	bazar
+bazdor	bazar
+bazzarldr	bazar
 bbridge	basebridge
 bckdr	backdoor
+beacon	cobaltstrike
 bean	nandrobox
 bearshare	bandoo
 beita	beitaad
@@ -113,6 +124,8 @@ bitminer	bitcoinminer
 bjlog	zegost
 bkdr	backdoor
 blackice	whiteice
+bladabi	bladabindi
+blanajog	spygate
 blic	whiteice
 blocal	vmvol
 blocker	killsectool
@@ -121,9 +134,11 @@ botnet	gidix
 bototer	wapomi
 boxer	fakeinst
 boxersms	fakeinst
+bozok	bezigate
 braininst	installbrain
 brantall	installbrain
 brappware	multiplug
+breu	darkkomet
 browsepulse	browsefox
 browsermodifier	multiplug
 browserplugin	multiplug
@@ -144,7 +159,9 @@ c2lop	swizzor
 cabby	dalexis
 caphaw	shylock
 casonline	casino
+cassiopeia	blackshades
 cawitt	smsbot
+cebruser	cerberus
 ceeinject	inject
 cellphonetrack	mytrackp
 cellspy	mobilespy
@@ -158,6 +175,7 @@ chinky	vobfus
 chydo	pykspa
 cidox	vundo
 cimag	hiloti
+cinarat	quasar
 cinmeng	cinmus
 citirevo	vundo
 clemag	cleaman
@@ -170,7 +188,10 @@ clickspring	purityscan
 clientconnect	opencandy
 climap	androrat
 clkpotato	hotbar
+cloudatlas	neoreklami
 clspring	purityscan
+cobalt	cobaltstrike
+cobaltstr	cobaltstrike
 cobbler	focobers
 cobblerone	focobers
 cobbleronea	focobers
@@ -183,6 +204,7 @@ coinminer	miner
 coldfuson	coldfusion
 collector	autoins
 comet	darkkomet
+cometer	cobaltstrike
 cometsys	darkkomet
 cometsystems	darkkomet
 condestil	firseria
@@ -198,6 +220,7 @@ cracktool	tool
 crisis	morcut
 crori	crossrider
 crosate	svpeng
+crusis	crysis
 crwind	crusewind
 cryp	packed
 crypt	packed
@@ -209,16 +232,19 @@ cryptinno	installcore
 cryptodefense	cryptodef
 cryptominer	miner
 cryptor	packed
+cryptz	rozena
 cson	simbot
 ctblocker	dalexis
 cudos	fosniw
 cupi	smssend
+cybergate	rebhip
 cybota	cycbot
 cycler	unruy
 dadmin	downloadadmin
 dailer	dialer
 dalamodo	cossta
 damaged	corrupted
+darkcomet	darkkomet
 darksnow	whiteice
 datasetaler	infosteal
 daytre	upatre
@@ -236,11 +262,14 @@ derdroi	simbad
 desktoplightning	cashon
 detroi	detroie
 detroia	detroie
+dexcrypt	mbrlock
+dharma	crysis
 dial	dialer
 dialers	dialer
 dialpass	egroupdial
 dialplatform	dialer
 didat	dabom
+dinihou	jenxcus
 diple	vobfus
 directdown	directdownloader
 dizhi	lecna
@@ -309,13 +338,16 @@ droppr	downloader
 dropr	downloader
 duel	loveletter
 dumobove	hiddad
+dunihi	jenxcus
 duptwux	lolbot
+dwnld	downloader
 dwnldr	downloader
 dwonk	pykspa
 easydl	amonetize
 echiui	invis
 ecsys	mailcab
 egbii	biige
+egregorransom	egregor
 egroup	egroupdial
 eicar	testvirus
 electron	sytro
@@ -327,6 +359,7 @@ emagsoftware	smsreg
 email	spam
 emailspy	maistealer
 emerleox	fujacks
+emotetcrypt	emotet
 emud	emudbot
 encoder	filecrypt	ransomware
 encpk	packed
@@ -335,6 +368,7 @@ epicgames	gamevance
 epicplay	gamevance
 eqdrug	equationdrug
 equation	equationdrug
+eregorcrypt	egregor
 erop	smssend
 escape	laroux
 escop	laroux
@@ -382,6 +416,7 @@ fakeupdates	gamex
 fakmod	fakeapp
 fakromup	soft32downloader
 faktvx	fakeangry
+fareitvb	fareit
 farex	fearso
 fastsave	megasearch
 fastsaveapp	megasearch
@@ -427,6 +462,7 @@ freeandspy	freespy
 freepds	hotclip
 frogonal	ginmaster
 fujack	fujacks
+fullscreen	lockscreen
 funclub	smssend
 funweb	mywebsearch
 fynloski	darkkomet
@@ -478,6 +514,7 @@ gmeil	gamex
 gnurbulf	rungbu
 goidu	oveead
 goldclick	hiddad
+goldeneye	petya
 gonca	gonesixty
 gone	gonesixty
 gonfu	droidkungfu
@@ -503,6 +540,7 @@ hacyayu	winwebsec
 hamob	fakeflash
 hdusafe	wapron
 helldoor	hilldoor
+hellokitty	deathransom
 hellospy	spyoo
 hiddenad	hiddad
 hiddeninstall	jsmshider
@@ -518,8 +556,10 @@ homepage	browsermodify
 hongtoutou	adrd
 horse	trojan
 hosts-modifier	hostsmodify
+houdini	jenxcus
 hublo	crytex
 huigezi	hupigon
+hworm	jenxcus
 hype	loadmoney
 hyteod	kovter
 iadpush	dowgin
@@ -579,6 +619,7 @@ jedan	kuguo
 jelbrus	techsnab
 joke	hoax
 joleee	tedroo
+jrat	adwind
 juched	griptolo
 kaka	telman
 kanav	alyak
@@ -596,6 +637,8 @@ kibi	ksapp
 kichhoat	smsreg
 killav	killsectool
 killfiles	files
+kitty	deathransom
+kittycrypt	deathransom
 kituri	placms
 kkrunchy	krunchy
 klevate	webprefix
@@ -607,6 +650,8 @@ kometa	rukometa
 kongfu	droidkungfu
 kouto	koutodoor
 koyotelab	bandoo
+kpotsteal	kpot
+kpotstealer	kpot
 krademok	darkkomet
 kranxpay	mmarketpay
 krypt	packed
@@ -644,6 +689,8 @@ llond	lardlond
 loadmoneyent	loadmoney
 locker	lockscreen
 locm	locmg
+loda	nymeria
+lodarat	nymeria
 lohmys	midia
 looked	viking
 loorp	wapomi
@@ -673,6 +720,7 @@ malpe	corrupted
 manalo	laroux
 mandaph	socks
 marketpay	mmarketpay
+maskit	khalesi
 massmailer	spam
 master	masterkey
 maxplus	zeroaccess
@@ -724,7 +772,9 @@ morstar	firseria
 morstars	firseria
 mosky	skymobi
 mostofate	softomate
+mozaakai	bazar
 mplug	multiplug
+mrophine	morphine
 msilobfuscator	msil	packed
 mspyonline	mspy
 msteal	maistealer
@@ -735,6 +785,7 @@ muldrop	downloader
 multibardown	multibar
 multibardownloader	multibar
 multiinstall	vilsel
+multipacked	packed
 multipluggen	multiplug
 musictoolbar	bandoo
 mutibar	multibar
@@ -744,13 +795,19 @@ mw97	macro
 mytrack	mytrackp
 nabucur	virlock
 najin	feejar
+nancrat	nanocore
 nandrob	nandrobox
+nanobot	nanocore
+negasteal	agenttesla
 nemucod	smsreg
+neobar	neoreklami
 neshuta	neshta
 netboxserver	netbox
 neteyes	ipamor
 netfilter	network
 netweird	netwiredrc
+netwire	netwiredrc
+netwired	netwiredrc
 networm	worm
 newyearl	plankton
 nextup	verti
@@ -764,6 +821,8 @@ nimnul	wapomi
 ninebox	kuguo
 nioserv	nocoma
 nisev	nocoma
+njrat	bladabindi
+noancooe	nanocore
 nofear	fearso
 nofer	fearso
 noico	zdtad
@@ -808,6 +867,8 @@ optinstall	ibryte
 optiuminstaller	ibryte
 optixp	optix
 optixpro	optix
+orcusrat	orcus
+orcusrot	orcus
 osx	mac
 osx32	mac
 otran	vobfus
@@ -843,7 +904,9 @@ perfectkeylogger	perflogger
 perfkey	perflogger
 perfloger	perflogger
 perkele	perkel
+petr	petya
 petrolan	petrolin
+petrwrap	petya
 philis	viking
 pigeon	hupigon
 pigetrl	lockscreen
@@ -871,6 +934,7 @@ polipos	cardserv
 polycryptt	polycrypt
 polyransom	virlock
 pony	fareit
+ponystealer	fareit
 popeler	firseria
 popov	fakeinst
 popuppers	soft32downloader
@@ -915,6 +979,7 @@ qakbot	qbot
 qhost	hostsmodify
 qhosts	hostsmodify
 qqrobber	qqrob
+quasarrat	quasar
 qukart	berbew
 qvod	wapomi
 rabbhome	fjcon
@@ -932,6 +997,7 @@ ratab	mamianune
 razel	rasteal
 raziel	rasteal
 recal	mogap
+recam	netwiredrc
 recordpage	browsefox
 redirector	network
 reefwal	kalfere
@@ -941,7 +1007,12 @@ relevant	relevantknowledge
 relik	updtkiller
 remtasu	xtrat
 renamer	files
+reposfxg	trickbot
 reptilic	reptilicus
+rescoms	remcos
+revenge	revetrat
+revengerat	revetrat
+revet	revetrat
 revtcp	metasploit
 rimecud	palevo
 risk	grayware
@@ -967,6 +1038,7 @@ rugo	hotbar
 runitslf	looper
 runonce	chir
 runouce	chir
+ruyk	ryuk
 safekidzone	sakezon
 sahagent	sahat
 saho	wroba
@@ -978,9 +1050,11 @@ salitystub	sality
 salload	sality
 salpack	sality
 salrenmetie	sality
+samas	samsam
 sambamedia	softpulse
 sancmed	sanctionedmedia
 sandrorat	sandr
+sasfis	oficla
 saveshare	megasearch
 scareware	rogueware
 scavir	fakeinst
@@ -999,6 +1073,7 @@ securitydefender	defmid
 securitytool	tool
 secxplod	securityxploded
 secxploded	securityxploded
+sekhmet	egregor
 selfdel	beebone
 sendpay	shastrosms
 sensode	zxshell
@@ -1007,6 +1082,8 @@ serpip	morto
 sethom	hiddad
 sexxoo	redmobile
 sexyclip	smssend
+shadebot	blackshades
+shakblades	blackshades
 sharestar	gappusin
 shell	shellcode
 shellkode	shellcode
@@ -1023,6 +1100,7 @@ sinodo	sinowal
 sintal	plankton
 sirefef	zeroaccess
 skanik	smssend
+skeeeyah	avemaria
 skywiper	flame
 slybdb	blohi
 smabo	adialer
@@ -1055,6 +1133,9 @@ sndapps	typstu
 sneakytrail	installerex
 sniffer	network
 sobot	clientor
+sodin	revil
+sodinokib	revil
+sodinokibi	revil
 soft32down	soft32downloader
 soft32download	soft32downloader
 softbase	softobase
@@ -1113,6 +1194,7 @@ suspiciouspacker	packed
 susppack	packed
 sventore	firseria
 swiftbrowse	browsefox
+swrort	rozena
 system	droidkungfu
 systemfix	fakesysdef
 systemsecurity	winwebsec
@@ -1146,12 +1228,16 @@ tinbelog	nandrobox
 tiny	small
 tklocker	lockscreen
 tonclank	plankton
+toobpug	neoreklami
 toorch	rootnik
 tophos	stegvob
 torchmedia	bandoo
 torpump	winpump
 tovkater	installmonster
 towelexploit	towel
+trick	trickbot
+trickbotcrypt	trickbot
+trickpak	trickbot
 trj	trojan
 trjdown	downloader	trojan
 trjndwnlder	downloader	trojan
@@ -1229,6 +1315,7 @@ w2km	macro
 w32	windows
 w64	windows
 w97m	macro
+wadhrama	crysis
 wakeful	cardserv
 wali	wapomi
 walkfree	kalfere
@@ -1253,6 +1340,7 @@ websearch	search
 webtoolbar	toolbar
 wedownload	soft32downloader
 weecnaw	netwiredrc
+weenloc	lockscreen
 weiyi	smforw
 whboy	fujacks
 whistle	whistlesoftware
@@ -1262,6 +1350,7 @@ win	windows
 win32	windows
 win64	windows
 winge	cardserv
+winlock	lockscreen
 winnt	windows
 winsoft	fosniw
 winsxsbot	sfone
@@ -1313,4 +1402,4 @@ zona	zvuzona
 zpack	packed
 zsone	raden
 zwunzi	zwangi
-zybut	shiz
+zybut	shiz
\ No newline at end of file
diff --git a/avclass/data/default.taxonomy b/avclass/data/default.taxonomy
index 963b8da..2bb344e 100644
--- a/avclass/data/default.taxonomy
+++ b/avclass/data/default.taxonomy
@@ -43,6 +43,7 @@ BEH:spam
 BEH:tor
 BEH:vmdetect
 BEH:whatsapp
+BEH:windef
 CLASS:apt
 CLASS:backdoor
 CLASS:bot
@@ -116,6 +117,7 @@ FAM:allaple
 FAM:alman
 FAM:alyak
 FAM:amonetize
+FAM:amphitryon
 FAM:androidarmour
 FAM:androidlost
 FAM:androrat
@@ -166,15 +168,18 @@ FAM:berbew
 FAM:bertle
 FAM:betterad
 FAM:bettersurf
+FAM:bezigate
 FAM:bgserv
 FAM:bicololo
 FAM:bifrose
+FAM:bifrost
 FAM:biige
 FAM:binka
 FAM:bips
 FAM:birele
 FAM:bitrep
 FAM:blacklister
+FAM:blackshades
 FAM:bladabindi
 FAM:blohi
 FAM:blueguard
@@ -190,6 +195,7 @@ FAM:brontok
 FAM:browsefox
 FAM:bruad
 FAM:bublik
+FAM:buhtrap
 FAM:bundlore
 FAM:buterat
 FAM:buzus
@@ -275,6 +281,7 @@ FAM:dofoil
 FAM:dogowar
 FAM:domaiq
 FAM:domob
+FAM:donoff
 FAM:dorfdo
 FAM:dorifel
 FAM:dorkbot
@@ -373,6 +380,7 @@ FAM:fujacks
 FAM:gabas
 FAM:gabpath
 FAM:gamania
+FAM:gamaredon
 FAM:gamarue
 FAM:gambler
 FAM:gamclk
@@ -409,6 +417,7 @@ FAM:gonesixty
 FAM:goodnews
 FAM:goodor
 FAM:gootkit
+FAM:gorgon
 FAM:gorillaprice
 FAM:gozi
 FAM:gpspy
@@ -476,6 +485,7 @@ FAM:irtard
 FAM:itracker
 FAM:jayqa
 FAM:jeefo
+FAM:jenxcus
 FAM:jfpush
 FAM:jiead
 FAM:jifake
@@ -495,6 +505,7 @@ FAM:kasidet
 FAM:katrep
 FAM:kelihos
 FAM:kgbspy
+FAM:khalesi
 FAM:kidlogger
 FAM:kimia
 FAM:kingroot
@@ -507,6 +518,7 @@ FAM:koobface
 FAM:korgo
 FAM:koutodoor
 FAM:kovter
+FAM:kpot
 FAM:krefel
 FAM:kronos
 FAM:ksapp
@@ -536,6 +548,7 @@ FAM:loapi
 FAM:lockactivity
 FAM:locmg
 FAM:loic
+FAM:lokibot
 FAM:lolbot
 FAM:lollipop
 FAM:loodos
@@ -550,6 +563,7 @@ FAM:lucky
 FAM:lxasj
 FAM:lynep
 FAM:mabezat
+FAM:macrobe
 FAM:magiccasino
 FAM:mailcab
 FAM:maistealer
@@ -564,6 +578,7 @@ FAM:masplot
 FAM:masspr
 FAM:maxapp
 FAM:mazarbot
+FAM:mbrlock
 FAM:mecor
 FAM:medfos
 FAM:mediafinder
@@ -626,6 +641,7 @@ FAM:navbar
 FAM:nawiaiad
 FAM:necro
 FAM:necurs
+FAM:nemim
 FAM:neoreklami
 FAM:neospy
 FAM:neshta
@@ -640,9 +656,11 @@ FAM:nocoma
 FAM:notifyer
 FAM:nqshield
 FAM:nymaim
+FAM:nymeria
 FAM:obtes
 FAM:ocikq
 FAM:odpa
+FAM:oficla
 FAM:oimobi
 FAM:oivim
 FAM:oixal
@@ -654,7 +672,7 @@ FAM:opencandy
 FAM:openinstall
 FAM:opfake
 FAM:optix
-FAM:orcusrat
+FAM:orcus
 FAM:outbrowse
 FAM:oveead
 FAM:paccy
@@ -700,11 +718,11 @@ FAM:pushad
 FAM:pushe
 FAM:puxis
 FAM:pykspa
-FAM:quasar
 FAM:qbot
 FAM:qexma
 FAM:qplus
 FAM:qqrob
+FAM:quasar
 FAM:qumi
 FAM:quozha
 FAM:qushu
@@ -734,6 +752,7 @@ FAM:reptilicus
 FAM:resharer
 FAM:reveton
 FAM:revetrat
+FAM:revil
 FAM:revmob
 FAM:riltok
 FAM:rimod
@@ -910,8 +929,8 @@ FAM:tracer
 FAM:tracker
 FAM:trackerfree
 FAM:trackplus
-FAM:trickbot
 FAM:trclick
+FAM:trickbot
 FAM:tridrongo
 FAM:troom
 FAM:truedownloader
@@ -1205,4 +1224,4 @@ GEN:undef
 GEN:undefined
 GEN:unknown
 GEN:variant
-GEN:website
+GEN:website
\ No newline at end of file
diff --git a/avclass/labeler.py b/avclass/labeler.py
index b0e362f..ae22a0d 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -213,8 +213,8 @@ def get_tokens(self, sample_info: NamedTuple):
         if self.av_labels.alias_detect:
             self.av_vender_tokens(tags)
 
-        # Compute VT_Count
-        vt_count = len(sample_info.labels)
+        # Compute VT_Count (using list of AV engines if provided)
+        vt_count = self.av_labels.get_sample_vt_count(sample_info)
 
         # Collect stats
         # TODO: should iterate once over tags,
diff --git a/avclass/update.py b/avclass/update.py
index 5adaf54..9615207 100644
--- a/avclass/update.py
+++ b/avclass/update.py
@@ -437,12 +437,12 @@ def main():
     parser = argparse.ArgumentParser(description='Given a .alias file from the labeler, generates updates for the '
                                                  'taxonomy, tagging, and expansion files.')
 
-    parser.add_argument('-alias', help='file to parse with alias from labeler which runs if -alias not present')
+    parser.add_argument('-alias', help='input file with alias from labeler. Mandatory.')
 
-    parser.add_argument('-n', help='Minimum number of times that a pair of tokes have been seen. Default: 20',
+    parser.add_argument('-n', help='Minimum number of times that a pair of tokens have been seen. Default: 20',
                         type=int, default=20)
 
-    parser.add_argument('-t', help='Minimum percentage of times two tokens appear together. Default: 1.94',
+    parser.add_argument('-t', help='Minimum percentage of times two tokens appear together. Default: 0.94',
                         type=float, default=0.94)
 
     parser.add_argument('-o', help='output prefix for files')
diff --git a/examples/metadefender_sample.json b/examples/metadefender_sample.json
new file mode 100644
index 0000000..0577345
--- /dev/null
+++ b/examples/metadefender_sample.json
@@ -0,0 +1 @@
+{"data_id": "49f8ca95f24a45ce9b7feb41b484e165", "dlp_info": {}, "extracted_files": {"files_extracted_count": 4, "files_in_archive": [{"data_id": "0dba93e893a64e42b2aad42996d52fb2", "detected_by": 0, "display_name": "reedmi.cvl", "file_size": 251124, "file_type": "application/vnd.rar", "file_type_description": "WinRAR Compressed Archive", "process_info": {"blocked_reason": "Encrypted Archive", "progress_percentage": 100, "result": "Blocked", "verdicts": ["Encrypted Archive"]}, "progress_percentage": 100, "scan_all_result_a": "Encrypted Archive", "scan_all_result_i": 12, "scanned_with": 29}, {"data_id": "220373b076e74ab09ae49b9879617b9b", "detected_by": 2, "display_name": "elp.bat", "file_size": 670, "file_type": "text/plain", "file_type_description": "ASCII Text", "process_info": {"blocked_reason": "Infected", "progress_percentage": 100, "result": "Blocked", "verdicts": ["Infected"]}, "progress_percentage": 100, "scan_all_result_a": "Infected", "scan_all_result_i": 1, "scanned_with": 29}, {"data_id": "b43c3a5ba47249a6b99632d9fb0563c5", "detected_by": 1, "display_name": "extraPFZ.exe", "file_size": 564896, "file_type": "application/x-dosexec", "file_type_description": "Executable File", "process_info": {"blocked_reason": "Infected", "progress_percentage": 100, "result": "Blocked", "verdicts": ["Infected"]}, "progress_percentage": 100, "scan_all_result_a": "Infected", "scan_all_result_i": 1, "scanned_with": 29}, {"data_id": "662a175c7783459b9afa28b1d33d1379", "detected_by": 0, "display_name": "svideo.vbs", "file_size": 81, "file_type": "text/plain", "file_type_description": "ASCII Text", "process_info": {"blocked_reason": "", "progress_percentage": 100, "result": "Allowed", "verdicts": ["No Threat Detected"]}, "progress_percentage": 100, "scan_all_result_a": "No Threat Detected", "scan_all_result_i": 0, "scanned_with": 29}], "first_index": 0, "page_size": 50, "total_extracted_files": 4, "worst_data_id": "49f8ca95f24a45ce9b7feb41b484e165"}, "file_info": 
{"display_name": "2c6110a76dda8da49195052fa561ab8b8278c02df400124e46d26d2df228b70b", "file_size": 988643, "file_type": "application/vnd.microsoft.portable-executable", "file_type_description": "Self-extracting Executable File", "md5": "33ca3e86d783234092e52369e1b6bb83", "sha1": "653ab54e15b01473943cd897ded24f742b0193c5", "sha256": "2c6110a76dda8da49195052fa561ab8b8278c02df400124e46d26d2df228b70b", "upload_timestamp": "2021-01-29T22:53:45.604Z"}, "process_info": {"blocked_reason": "Infected", "file_type_skipped_scan": false, "post_processing": {"actions_failed": "", "actions_ran": "", "converted_destination": "", "converted_to": "", "copy_move_destination": ""}, "processing_time": 20516, "profile": "File process", "progress_percentage": 100, "queue_time": 1219, "result": "Blocked", "user_agent": "", "username": "", "verdicts": ["Infected"]}, "scan_results": {"data_id": "49f8ca95f24a45ce9b7feb41b484e165", "last_file_scanned": "reedmi.cvl", "progress_percentage": 100, "scan_all_result_a": "Infected", "scan_all_result_i": 1, "scan_details": {"AegisLab": {"def_time": "2021-01-29T12:48:00.000Z", "eng_id": "aegislab_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 9, "threat_found": "", "wait_time": 1366}, "Ahnlab": {"def_time": "2021-01-30T00:00:00.000Z", "eng_id": "ahnlab_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 24, "threat_found": "Malware/Win32.Generic", "wait_time": 1351}, "Antiy": {"def_time": "2021-01-29T15:48:00.000Z", "eng_id": "antiy_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 20, "threat_found": "", "wait_time": 1355}, "Avira": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "avira_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 1, "threat_found": "TR/Drop.Agent.xlojg", "wait_time": 1374}, "BitDefender": {"def_time": "2021-01-29T13:19:00.000Z", "eng_id": "bitdefender_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 140, "threat_found": "Trojan.Dropper.ZME", 
"wait_time": 1501}, "ByteHero": {"def_time": "2021-01-27T00:00:00.000Z", "eng_id": "bytehero_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 680, "threat_found": "", "wait_time": 1352}, "ClamAV": {"def_time": "2021-01-28T07:28:06.000Z", "eng_id": "clamav_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 1125, "threat_found": "", "wait_time": 1438}, "Comodo": {"def_time": "2021-01-29T05:05:50.000Z", "eng_id": "comodo_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 26, "threat_found": "Malware", "wait_time": 1349}, "Cyren": {"def_time": "2021-01-29T14:35:00.000Z", "eng_id": "cyren_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 94, "threat_found": "", "wait_time": 1547}, "ESET": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "eset_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 50, "threat_found": "", "wait_time": 1544}, "Emsisoft": {"def_time": "2021-01-29T12:07:00.000Z", "eng_id": "emsisoft_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 1202, "threat_found": "Trojan.Dropper.ZME (B)", "wait_time": 1502}, "Filseclab": {"def_time": "2021-01-27T23:08:00.000Z", "eng_id": "filseclab_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 411, "threat_found": "", "wait_time": 1527}, "Huorong": {"def_time": "2021-01-29T09:24:00.000Z", "eng_id": "huorong_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 260, "threat_found": "", "wait_time": 1584}, "Ikarus": {"def_time": "2021-01-29T13:13:30.000Z", "eng_id": "ikarus_1_windows", "location": "local", "scan_result_i": 3, "scan_time": 235, "threat_found": "The archive is password protected or the given password is invalid.", "wait_time": 1594}, "K7": {"def_time": "2021-01-29T11:16:00.000Z", "eng_id": "k7_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 12, "threat_found": "Trojan ( 005631561 )", "wait_time": 1363}, "McAfee": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": 
"mcafee_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 61, "threat_found": "RDN/Dridex", "wait_time": 1549}, "NANOAV": {"def_time": "2021-01-29T11:38:00.000Z", "eng_id": "nano_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 28, "threat_found": "Trojan.Win32.Dridex.icipbk", "wait_time": 1519}, "NetGate": {"def_time": "2021-01-24T04:10:00.000Z", "eng_id": "netgate_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 64, "threat_found": "", "wait_time": 1561}, "Quick Heal": {"def_time": "2021-01-29T06:52:00.000Z", "eng_id": "quickheal_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 51, "threat_found": "Backdoor.Dridex", "wait_time": 1559}, "Sophos": {"def_time": "2021-01-29T00:12:00.000Z", "eng_id": "sophos_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 238, "threat_found": "", "wait_time": 1591}, "Symantec": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "symantec_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 21, "threat_found": "", "wait_time": 1464}, "TACHYON": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "nprotect_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 76, "threat_found": "", "wait_time": 1549}, "TrendMicro": {"def_time": "2021-01-27T20:22:00.000Z", "eng_id": "trendmicro_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 1388, "threat_found": "", "wait_time": 1441}, "TrendMicro House Call": {"def_time": "2021-01-28T22:14:00.000Z", "eng_id": "trendmicrohousecall_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 1281, "threat_found": "", "wait_time": 1454}, "Vir.IT eXplorer": {"def_time": "2021-01-29T12:10:00.000Z", "eng_id": "viritexplorer_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 72, "threat_found": "", "wait_time": 1569}, "VirusBlokAda": {"def_time": "2021-01-29T08:04:00.000Z", "eng_id": "virusblokada_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 
492, "threat_found": "", "wait_time": 1493}, "Windows Defender": {"def_time": "2021-01-29T07:07:36.000Z", "eng_id": "windowsdefender_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 760, "threat_found": "", "wait_time": 1334}, "Xvirus Personal Guard": {"def_time": "2021-01-28T05:47:00.000Z", "eng_id": "xviruspersonalguard_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 825, "threat_found": "Suspicious:NewThreat.179", "wait_time": 1363}, "Zillya!": {"def_time": "2021-01-28T07:07:00.000Z", "eng_id": "zillya_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 10, "threat_found": "", "wait_time": 1475}}, "start_time": "2021-01-29T22:53:46.823Z", "total_avs": 29, "total_time": 20516}, "vulnerability_info": {}, "yara_info": {}}
diff --git a/misp/cluster/avclass.json b/misp/cluster/avclass.json
index 933e133..7bf7521 100644
--- a/misp/cluster/avclass.json
+++ b/misp/cluster/avclass.json
@@ -2730,6 +2730,7 @@
           "malob", 
           "malpack", 
           "msilobfuscator", 
+          "msilkrypt",
           "nsanti", 
           "obfus", 
           "obfusc", 
@@ -2737,6 +2738,7 @@
           "obfuscated", 
           "obfuscator", 
           "pakes", 
+          "packer",
           "suspiciouspacker", 
           "susppack", 
           "vbinder",
@@ -3453,7 +3455,9 @@
       "description": "FILE:proglang:autoit", 
       "meta": {
         "refs": [], 
-        "synonyms": [], 
+        "synonyms": [
+          "autoitscript"
+        ], 
         "type": []
       }, 
       "uuid": "e16e2760-e497-3e39-9ca2-68a23ccd2b4f", 
@@ -5412,6 +5416,19 @@
       "uuid": "0c6ba93f-a1bc-3e31-bf0e-ffab207c80f8", 
       "value": "loic"
     }, 
+    {
+      "description": "FAM:lokibot",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/apk.lokibot",
+          "https://www.threatfabric.com/blogs/lokibot_the_first_hybrid_android_malware.html"
+        ],
+        "synonyms": [],
+        "type": []
+      },
+      "uuid": "14b91559-69a4-4f1c-aeac-346be227d08d",
+      "value": "lokibot"
+    }, 
     {
       "description": "FAM:lolbot", 
       "meta": {
@@ -8793,6 +8810,23 @@
       "uuid": "0e0ea1ba-65d6-3132-b36c-48cd50ca03cd", 
       "value": "smszombie"
     }, 
+    {
+      "description": "FAM:snatch",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.snatch"
+        ],
+        "synonyms": []
+      },
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "5a5cdf0a-ead8-4399-848a-44a9a48cb237",
+      "value": "snatch"
+    },
     {
       "description": "FAM:snowfox", 
       "meta": {
@@ -8817,6 +8851,28 @@
       "uuid": "ab23cc7e-2a93-36e8-a0b7-91bf87e0a142", 
       "value": "socks"
     }, 
+    {
+      "description": "FAM:revil",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.revil",
+          "https://blog.talosintelligence.com/2019/04/sodinokibi-ransomware-exploits-weblogic.html"
+        ],
+        "synonyms": [
+          "sodinokibi",
+          "sodin",
+          "sodinoransom"
+        ]
+      },
+      "related": [
+        {
+          "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22",
+          "type": "variant-of"
+        }
+      ],
+      "uuid": "a3d6c162-a51c-4b25-a0cd-7d89037140a3",
+      "value": "revil"
+    },
     {
       "description": "FAM:soft32downloader", 
       "meta": {
@@ -8968,6 +9024,7 @@
       "meta": {
         "refs": [], 
         "synonyms": [
+          "autoruns",
           "autoruner", 
           "autorunerent"
         ], 
@@ -9165,6 +9222,7 @@
         "synonyms": [
           "banker", 
           "bitstealer",
+          "cookiesstealer",
           "datastealer", 
           "discostealer",
           "delfsnif", 
@@ -9172,6 +9230,7 @@
           "infostealer", 
           "monitor", 
           "passwordstealera",
+          "poscardstealer",
           "pswtool", 
           "pwsteal", 
           "pwstealer", 
@@ -10542,6 +10601,19 @@
       "uuid": "5e9086ac-ea73-306b-8faf-fd96bd20e8a2", 
       "value": "amonetize"
     }, 
+    {
+      "description": "FAM:ammyy", 
+      "meta": {
+        "refs": [], 
+        "synonyms": [
+          "fakeammyy", 
+          "ammyyadmin"
+        ], 
+        "type": []
+      }, 
+      "uuid": "4dc7b45d-cff8-447a-97a6-2181f7b7773d", 
+      "value": "ammyy"
+    }, 
     {
       "description": "FAM:androidarmour", 
       "meta": {
@@ -11006,7 +11078,9 @@
           "bazarcall",
           "kegtap",
           "team9backdoor",
-          "bazaloader"
+          "bazaloader",
+          "bazarloader",
+          "bazloader"
         ],
         "type": []
       },
@@ -11412,6 +11486,26 @@
       "uuid": "cd7498b7-68c7-34ba-bdfd-9381aa879adb", 
       "value": "bublik"
     }, 
+    {
+      "description": "FAM:buhtrap",
+      "meta": {
+        "refs": [
+          "https://malpedia.caad.fkie.fraunhofer.de/details/win.buhtrap",
+          "https://malware-research.org/carbanak-source-code-leaked/",
+          "https://www.symantec.com/connect/blogs/russian-bank-employees-received-fake-job-offers-targeted-email-attack",
+          "https://www.welivesecurity.com/2015/04/09/operation-buhtrap/",
+          "https://www.group-ib.com/brochures/gib-buhtrap-report.pdf",
+          "https://www.arbornetworks.com/blog/asert/diving-buhtrap-banking-trojan-activity/",
+          "https://blog.dcso.de/pegasus-buhtrap-analysis-of-the-malware-stage-based-on-the-leaked-source-code/"
+        ],
+        "synonyms": [
+          "ratopak"
+        ],
+        "type": []
+      },
+      "uuid": "71e031ee-50e8-46a8-bda8-c4ae9c0012de",
+      "value": "buhtrap"
+    },
     {
       "description": "FAM:bundlore", 
       "meta": {
@@ -13127,6 +13221,7 @@
           "fakebrows", 
           "fakeicq", 
           "fakeinstall", 
+          "fakeinstaller",
           "fakeinsthw", 
           "fakeinstsms", 
           "fodeg", 
@@ -13854,5 +13949,5 @@
       "value": "geinimi"
     }
   ], 
-  "version": 0.1
+  "version": 0.2
 }
\ No newline at end of file

From 2039d304d8026f97d10b54d6e41645a608806215 Mon Sep 17 00:00:00 2001
From: Jeffrey Gentes <jeffgemail@gmail.com>
Date: Fri, 17 Feb 2023 22:20:54 -0500
Subject: [PATCH 35/36] tagging and taxonomy updates

---
 avclass/data/default.tagging  | 166 ++++++++++++++++++++++++----------
 avclass/data/default.taxonomy |  13 ++-
 2 files changed, 130 insertions(+), 49 deletions(-)

diff --git a/avclass/data/default.tagging b/avclass/data/default.tagging
index 7de3e8f..e983863 100644
--- a/avclass/data/default.tagging
+++ b/avclass/data/default.tagging
@@ -43,6 +43,7 @@ allad	airpush
 almanahe	alman
 alureon	tdss
 amab	mobidash
+ammyyadmin	ammyy
 amorba	ipamor
 andef	fkdefend
 andr	android
@@ -77,14 +78,19 @@ arcadeparlor	gamevance
 arcadeweb	gamevance
 archsms	smshoax
 arcparlor	gamevance
+arena	crysis
 armour	androidarmour
 arto	renos
 artro	renos
 aservicea	kuguo
+autohotkey	autohk
+autoitscript	autoit
 autokms	winactivator
-autoruner	autorun	vobfus
-autorunerent	autorun	palevo
+autoruner	autorun
+autorunerent	autorun
+autoruns	autorun
 avalod	sinowal
+ave_maria	avemaria
 aveasms	smskey
 avkill	killsectool
 bacteraloh	sality
@@ -102,15 +108,19 @@ basebrid	basebridge
 batteryd	fakedoc
 batterydoctor	fakedoc
 bazaloader	bazar
+bazarbackdoor	bazar
+bazarcall	bazar
 bazarldr	bazar
 bazarloader	bazar
 bazdor	bazar
+bazloader	bazar
 bazzarldr	bazar
 bbridge	basebridge
 bckdr	backdoor
 beacon	cobaltstrike
 bean	nandrobox
 bearshare	bandoo
+beerbot	bazar
 beita	beitaad
 bergat	xtrat
 bertlea	bertle
@@ -119,12 +129,15 @@ betterinstaller	somoto
 bflient	palevo
 bibean	faketimer
 biez	loadmoney
+binder	packed
 bitcoin	bitcoinminer
 bitminer	bitcoinminer
+bitstealer	infosteal
 bjlog	zegost
 bkdr	backdoor
 blackice	whiteice
 bladabi	bladabindi
+bladabindinet	bladabindi
 blanajog	spygate
 blic	whiteice
 blocal	vmvol
@@ -190,6 +203,7 @@ climap	androrat
 clkpotato	hotbar
 cloudatlas	neoreklami
 clspring	purityscan
+cmkfkw	5wfo
 cobalt	cobaltstrike
 cobaltstr	cobaltstrike
 cobbler	focobers
@@ -208,14 +222,17 @@ cometer	cobaltstrike
 cometsys	darkkomet
 cometsystems	darkkomet
 condestil	firseria
+confuserex	confuser
 contrand	sckeylog
 controlrandom	sckeylog
+cookiesstealer	infosteal
 coolpaperleak	coolwall
 copycat	airpush
 corrupt	corrupted
 cosha	lovetrap
 counterclank	plankton
 crack	tool
+cracks	wzteam
 cracktool	tool
 crisis	morcut
 crori	crossrider
@@ -231,6 +248,7 @@ cryptic	packed
 cryptinno	installcore
 cryptodefense	cryptodef
 cryptominer	miner
+cryptoobfuscator	packed
 cryptor	packed
 cryptz	rozena
 cson	simbot
@@ -247,17 +265,18 @@ damaged	corrupted
 darkcomet	darkkomet
 darksnow	whiteice
 datasetaler	infosteal
+datastealer	infosteal
 daytre	upatre
 ddlight	droiddreamlight
 dealcabby	adpeak
 debris	gamarue
 delf	delphi
 delfiles	filedelete
-delfinject	delphi	inject
-delfloader	delphi	downloader
-delfsnif	delphi	infosteal
-delpbanc	delphi	infosteal
-delpdldr	delphi	downloader
+delfinject	delphi
+delfloader	delphi
+delfsnif	delphi
+delpbanc	delphi
+delpdldr	delphi
 derdroi	simbad
 desktoplightning	cashon
 detroi	detroie
@@ -269,9 +288,11 @@ dialers	dialer
 dialpass	egroupdial
 dialplatform	dialer
 didat	dabom
-dinihou	jenxcus
+dinihou	houdini
 diple	vobfus
 directdown	directdownloader
+discostealer	infosteal
+diskcoder	filecrypt
 dizhi	lecna
 dldr	downloader
 dldrop	downloader
@@ -281,10 +302,11 @@ dloader	downloader
 dloadr	downloader
 dloadware	adware
 dnschanger	dnsmodify
-docdl	downloader	msoffice
-docdrop	downloader	msoffice
-docdrp	downloader	msoffice
+docdl	downloader
+docdrop	downloader
+docdrp	downloader
 dogbite	dogowar
+doghousepower	shelma
 dogwar	dogowar
 doidroot	rooter
 domainiq	domaiq
@@ -338,7 +360,7 @@ droppr	downloader
 dropr	downloader
 duel	loveletter
 dumobove	hiddad
-dunihi	jenxcus
+dunihi	houdini
 duptwux	lolbot
 dwnld	downloader
 dwnldr	downloader
@@ -361,9 +383,10 @@ emailspy	maistealer
 emerleox	fujacks
 emotetcrypt	emotet
 emud	emudbot
-encoder	filecrypt	ransomware
+encoder	filecrypt
 encpk	packed
 engwings	cardserv
+enigmaprotector	enigma
 epicgames	gamevance
 epicplay	gamevance
 eqdrug	equationdrug
@@ -386,6 +409,7 @@ extrat	xtrat
 eydrop	dinwod
 fakapp	styricka
 fakealert	rogueware
+fakeammyy	ammyy
 fakeav	rogueware
 fakebattscar	fakedoc
 fakebrows	fakeinst
@@ -395,6 +419,7 @@ fakedefender	fkdefend
 fakefldr	fakefolder
 fakeicq	fakeinst
 fakeinstall	fakeinst
+fakeinstaller	fakeinst
 fakeinsthw	fakeinst
 fakeinstsms	fakeinst
 fakejoboffer	fakejob
@@ -402,7 +427,7 @@ fakelogosms	fakelogo
 fakelt	elite
 fakemini	opfake
 fakemms	fakeplayer
-fakems	fakepublisher	signed
+fakems	fakepublisher
 fakengry	fakeangry
 fakenotify	opfake
 fakeplay	fakeplayer
@@ -426,7 +451,7 @@ fenomen	fenomengame
 fenomengamet	fenomengame
 fenservice	fengvi
 fidgo	opfake
-filecoder	filecrypt	ransomware
+filecoder	filecrypt
 filehunter	winpump
 fileinfector	infector
 filesearch	amonetize
@@ -455,12 +480,14 @@ fokonge	droidkungfu
 foncysms	foncy
 foran	anforen
 fraud	rogueware
-fraudload	downloader	rogueware
+fraudload	downloader
 fraudtool	tool
 freeandroidspy	freespy
 freeandspy	freespy
 freepds	hotclip
 frogonal	ginmaster
+frutas	adwind
+fuerboos	goodor
 fujack	fujacks
 fullscreen	lockscreen
 funclub	smssend
@@ -476,6 +503,7 @@ gamevancecs	gamevance
 gampass	gamethief
 ganelp	griptolo
 gaobot	agobot
+gaslome	loosemaque
 gasms	gambler
 gastab	gabas
 gavir	viking
@@ -489,8 +517,10 @@ geksone	crytex
 gemest	smishing
 genericab	wroba
 genericgb	basebridge
+genkryptik	packed
 genpack	packed
 gentroj	trojan
+geodo	emotet
 gepat	airpush
 getextension	eorezo
 getfaster	4shared
@@ -542,6 +572,7 @@ hdusafe	wapron
 helldoor	hilldoor
 hellokitty	deathransom
 hellospy	spyoo
+heodo	emotet
 hiddenad	hiddad
 hiddeninstall	jsmshider
 hidrag	jeefo
@@ -556,10 +587,9 @@ homepage	browsermodify
 hongtoutou	adrd
 horse	trojan
 hosts-modifier	hostsmodify
-houdini	jenxcus
 hublo	crytex
 huigezi	hupigon
-hworm	jenxcus
+hworm	houdini
 hype	loadmoney
 hyteod	kovter
 iadpush	dowgin
@@ -609,17 +639,20 @@ intex	intexdial
 intexus	intexdial
 invader	daws
 ipatre	upatre
-ircbot	bot	irc
+ircbot	bot
 ispyoo	spyoo
 j2me	java
 jackpos	jinupd
 jadtre	wapomi
 javak	suggestor
+jbifrost	adwind
 jedan	kuguo
 jelbrus	techsnab
+jenxcus	houdini
 joke	hoax
 joleee	tedroo
 jrat	adwind
+jsocket	adwind
 juched	griptolo
 kaka	telman
 kanav	alyak
@@ -627,6 +660,7 @@ kasandra	sandr
 kashu	sality
 kazaa	benjamin
 keepmusic	hiddad
+kegtap	bazar
 keji	basebridge
 kelvin	smssend
 kernelpatch	geral
@@ -689,8 +723,7 @@ llond	lardlond
 loadmoneyent	loadmoney
 locker	lockscreen
 locm	locmg
-loda	nymeria
-lodarat	nymeria
+lodarat	loda
 lohmys	midia
 looked	viking
 loorp	wapomi
@@ -700,7 +733,7 @@ lotuseed	lotusid
 lower	airpush
 lozfoon	loozfon
 macosx	mac
-macrodown	downloader	macro
+macrodown	downloader
 madanf	virut
 madang	virut
 madangel	virut
@@ -760,6 +793,7 @@ mobkong	smssend
 mobspy	trackplus
 mobsqueeze	fakedoc
 mofksys	swisyn
+mohazo	raccoon
 monad	damon
 monderb	vundo
 monitor	infosteal
@@ -775,7 +809,8 @@ mostofate	softomate
 mozaakai	bazar
 mplug	multiplug
 mrophine	morphine
-msilobfuscator	msil	packed
+msilkrypt	packed
+msilobfuscator	msil
 mspyonline	mspy
 msteal	maistealer
 mswdm	ipamor
@@ -798,6 +833,7 @@ najin	feejar
 nancrat	nanocore
 nandrob	nandrobox
 nanobot	nanocore
+ncov	crysis
 negasteal	agenttesla
 nemucod	smsreg
 neobar	neoreklami
@@ -836,6 +872,7 @@ nsanti	packed
 nuwar	tibs
 nyearleaker	airpush
 nyleaker	airpush
+nymeria	loda
 o97m	macro
 obfus	packed
 obfusc	packed
@@ -867,8 +904,8 @@ optinstall	ibryte
 optiuminstaller	ibryte
 optixp	optix
 optixpro	optix
-orcusrat	orcus
-orcusrot	orcus
+orcus	orcusrat
+orcusrot	orcusrat
 osx	mac
 osx32	mac
 otran	vobfus
@@ -878,13 +915,16 @@ overt	sadenav
 overtls	sadenav
 ozotshielder	kmin
 pace	socks
+packer	packed
 padobot	korgo
 padodor	berbew
 pakes	packed
 panda	zbot
 pandaent	zbot
 pandora	nandrobox
+papras	gozi
 parnian	smssend
+passwordstealera	infosteal
 patch	filemodify
 patched	filemodify
 patcher	filemodify
@@ -908,6 +948,7 @@ petr	petya
 petrolan	petrolin
 petrwrap	petya
 philis	viking
+phobos	crysis
 pigeon	hupigon
 pigetrl	lockscreen
 pikor	wapomi
@@ -942,12 +983,14 @@ porn	porndialer
 porndial	porndialer
 pornlocker	lockscreen
 portscan	network
+poscardstealer	infosteal
 positivefinds	browsefox
 positmob	fakeinst
 potentially	grayware
 poweliks	wowlik
 powerliks	wowlik
 powerpack	linkular
+powerstats	valyria
 powessere	wowlik
 pp97m	macro
 preloader	megasearch
@@ -962,6 +1005,7 @@ protil	wapomi
 provar	fakeinst
 pswtool	infosteal
 pua	grayware
+puffstealer	azorult
 pup	grayware
 pupil	plemood
 purity	purityscan
@@ -979,23 +1023,29 @@ qakbot	qbot
 qhost	hostsmodify
 qhosts	hostsmodify
 qqrobber	qqrob
+quasar_rat	quasar
 quasarrat	quasar
 qukart	berbew
 qvod	wapomi
 rabbhome	fjcon
 rabidog	dogowar
+racealer	raccoon
+racoon	raccoon
 rahack	allaple
 rahiwi	brontok
 raideloz	vobfus
 ramdo	redyms
 ranck	ranky
+randaev	exrand
 ransom	ransomware
-ransomcrypt	filecrypt	ransomware
-ransomlock	lockscreen	ransomware
+ransomcrypt	filecrypt
+ransomlock	lockscreen
 rapiddown	firseria
 ratab	mamianune
+ratopak	buhtrap
 razel	rasteal
 raziel	rasteal
+rdpdos	rdpkill
 recal	mogap
 recam	netwiredrc
 recordpage	browsefox
@@ -1005,7 +1055,9 @@ refogkeylogger	refog
 regie	fosniw
 relevant	relevantknowledge
 relik	updtkiller
+remcosrat	remcos
 remtasu	xtrat
+remvio	remcos
 renamer	files
 reposfxg	trickbot
 reptilic	reptilicus
@@ -1016,7 +1068,7 @@ revet	revetrat
 revtcp	metasploit
 rimecud	palevo
 risk	grayware
-risktool	grayware	tool
+risktool	grayware
 riskware	grayware
 rivalgame	gamevance
 rkdoor	koutodoor
@@ -1035,6 +1087,7 @@ ropin	leadbolt
 rorpian	zeroaccess
 ruftar	usteal
 rugo	hotbar
+rultazo	azorult
 runitslf	looper
 runonce	chir
 runouce	chir
@@ -1132,10 +1185,14 @@ snadapps	typstu
 sndapps	typstu
 sneakytrail	installerex
 sniffer	network
+snifula	gozi
 sobot	clientor
+sockrat	adwind
+socmer	remcos
 sodin	revil
 sodinokib	revil
 sodinokibi	revil
+sodinoransom	revil
 soft32down	soft32downloader
 soft32download	soft32downloader
 softbase	softobase
@@ -1155,7 +1212,7 @@ spacer	unruy
 spakrab	vidro
 spambot	spam
 spammer	spam
-spamtool	spam	tool
+spamtool	spam
 spatet	rebhip
 spdupmypc	speedingupmypc
 speedupmypc	uniblue
@@ -1201,6 +1258,7 @@ systemsecurity	winwebsec
 systex	daws
 systro	sytro
 sysvenfak	loadmoney
+talalpek	gootkit
 talklog	talkw
 taojin	taojinstar
 tapsnake	gpspy
@@ -1209,11 +1267,13 @@ tatus	tetus
 tazebama	mabezat
 tdownloader	installerex
 tdssrt	tdss
+team9backdoor	bazar
 tedro	tedroo
 temai	ksapp
 tepfer	fareit
 test	testvirus
 testfile	testvirus
+thetrick	trickbot
 tibspak	tibs
 tibspk	tibs
 tibsys	tibser
@@ -1237,26 +1297,29 @@ tovkater	installmonster
 towelexploit	towel
 trick	trickbot
 trickbotcrypt	trickbot
+trickloader	trickbot
 trickpak	trickbot
+trickster	trickbot
 trj	trojan
-trjdown	downloader	trojan
-trjndwnlder	downloader	trojan
+trjdown	downloader
+trjndwnlder	downloader
 troj	trojan
-trojanapt	apt	trojan
-trojanbanker	infosteal	trojan
-trojanclicker	adware	clicker	trojan
-trojandldr	downloader	trojan
-trojandownloader	downloader	trojan
-trojandropper	downloader	trojan
-trojandwnldr	downloader	trojan
-trojanfakeav	alertuser	rogueware	trojan
+trojanapt	apt
+trojanbanker	infosteal
+trojanclicker	adware
+trojandldr	downloader
+trojandownloader	downloader
+trojandropper	downloader
+trojandwnldr	downloader
+trojanfakeav	alertuser
 trojanhorse	trojan
-trojanproxy	proxy	trojan
-trojanpsw	infosteal	trojan
-trojanransom	filecrypt	ransomware	trojan
-trojansms	sms	trojan
-trojanspy	spyware	trojan
+trojanproxy	proxy
+trojanpsw	infosteal
+trojanransom	filecrypt
+trojansms	sms
+trojanspy	spyware
 trojware	trojan
+trollster	kefamad
 truedown	truedownloader
 tsuploader	installerex
 tufei	tufik
@@ -1274,24 +1337,27 @@ ultradownload	vilsel
 ultradownloads	vilsel
 umeng	gumen
 unix	linux
+unrecom	adwind
 unsafe	grayware
 unwanted	grayware
 unwnt	grayware
 updatekiller	updtkiller
 updtkill	updtkiller
 uracto	maistealer
+ursnif	gozi
 uuser	uuserv
 uxipp	yzhc
 valhalla	xorala
 valla	xorala
 vbccrypt	vobfus
-vbcrypt	packed	visualbasic
-vbinject	inject	visualbasic
-vbkrypt	packed	visualbasic
+vbcrypt	packed
+vbinder	packed
+vbinject	inject
+vbkrypt	packed
 vbna	vobfus
 vbobf	vobfus
 vbobfus	vobfus
-vbpack	packed	visualbasic
+vbpack	packed
 vernet	dusvext
 vertex	dusvext
 vertexb	dusvext
@@ -1305,6 +1371,7 @@ virtob	virut
 virtool	tool
 vitallia	vittalia
 vjadtre	wapomi
+vjw0rm	vjworm
 vmdetector	vmdetect
 vmpbad	vmprotect
 vnfraye	dusvext
@@ -1315,8 +1382,10 @@ w2km	macro
 w32	windows
 w64	windows
 w97m	macro
+wacatac	deathransom
 wadhrama	crysis
 wakeful	cardserv
+waldek	gootkit
 wali	wapomi
 walkfree	kalfere
 walksteal	walkinwat
@@ -1371,6 +1440,7 @@ xloader	wroba
 xpack	packed
 xpiro	expiro
 xsider	jsmshider
+xswkit	gootkit
 xtoober	karagany
 xtreme	xtrat
 xworm	loveletter
diff --git a/avclass/data/default.taxonomy b/avclass/data/default.taxonomy
index 2bb344e..8404f50 100644
--- a/avclass/data/default.taxonomy
+++ b/avclass/data/default.taxonomy
@@ -78,6 +78,7 @@ CLASS:worm
 CLASS:worm:emailworm
 FAM:1clickdownload
 FAM:4shared
+FAM:5wfo
 FAM:abeciv
 FAM:accutrack
 FAM:acecard
@@ -116,6 +117,7 @@ FAM:aliyuncs
 FAM:allaple
 FAM:alman
 FAM:alyak
+FAM:ammyy
 FAM:amonetize
 FAM:amphitryon
 FAM:androidarmour
@@ -323,6 +325,7 @@ FAM:etumbot
 FAM:ewind
 FAM:expiro
 FAM:expressdownloader
+FAM:exrand
 FAM:faceniff
 FAM:fakeangry
 FAM:fakeapp
@@ -437,6 +440,7 @@ FAM:haynu
 FAM:hero
 FAM:hiddad
 FAM:hiddenapp
+FAM:hiddentear
 FAM:hiddnad
 FAM:highster
 FAM:hilldoor
@@ -503,6 +507,7 @@ FAM:kapratect
 FAM:karagany
 FAM:kasidet
 FAM:katrep
+FAM:kefamad
 FAM:kelihos
 FAM:kgbspy
 FAM:khalesi
@@ -547,12 +552,14 @@ FAM:loadmoney
 FAM:loapi
 FAM:lockactivity
 FAM:locmg
+FAM:loda
 FAM:loic
 FAM:lokibot
 FAM:lolbot
 FAM:lollipop
 FAM:loodos
 FAM:looper
+FAM:loosemaque
 FAM:loozfon
 FAM:lotusid
 FAM:lovefraud
@@ -673,6 +680,7 @@ FAM:openinstall
 FAM:opfake
 FAM:optix
 FAM:orcus
+FAM:orcusrat
 FAM:outbrowse
 FAM:oveead
 FAM:paccy
@@ -734,6 +742,7 @@ FAM:rasteal
 FAM:razam
 FAM:razy
 FAM:rbot
+FAM:rdpkill
 FAM:rebhip
 FAM:recmads
 FAM:redalert
@@ -838,6 +847,7 @@ FAM:smssend
 FAM:smsspy
 FAM:smsthief
 FAM:smszombie
+FAM:snatch
 FAM:snowfox
 FAM:socks
 FAM:soft32downloader
@@ -882,9 +892,9 @@ FAM:suaban
 FAM:suggestor
 FAM:supking
 FAM:svpeng
-FAM:swrort
 FAM:swisyn
 FAM:swizzor
+FAM:swrort
 FAM:systemmonitor
 FAM:systush
 FAM:sytro
@@ -1006,6 +1016,7 @@ FAM:wowlik
 FAM:wqmobile
 FAM:wroba
 FAM:wtaspin
+FAM:wzteam
 FAM:xavierad
 FAM:xinhua
 FAM:xolco

From 2f013f89da93011f96e926fc3840ff91bf85697a Mon Sep 17 00:00:00 2001
From: Jeffrey Gentes <jeffgemail@gmail.com>
Date: Tue, 21 Feb 2023 22:10:40 -0500
Subject: [PATCH 36/36] Allow AVLabels to be init with Classes

---
 avclass/common.py  | 54 ++++++++++++++++++++++++++++------------------
 avclass/labeler.py | 10 ++++-----
 2 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/avclass/common.py b/avclass/common.py
index a6a439c..0ae4436 100755
--- a/avclass/common.py
+++ b/avclass/common.py
@@ -4,7 +4,6 @@
 import string
 import sys
 
-from avclass import util
 from collections import defaultdict, namedtuple
 from typing import AnyStr, Callable, Collection, Dict, List, Optional, Set, Tuple, Union
 
@@ -44,7 +43,7 @@
 
 
 class Tag:
-    """ A Tag in the taxonomy """
+    """A Tag in the taxonomy"""
 
     def __init__(self, s):
         word_list = s.strip().split(":")
@@ -63,27 +62,27 @@ def __init__(self, s):
             self._path = self._name
 
     def __hash__(self):
-        """ Return hash """
+        """Return hash"""
         return hash((self._path))
 
     @property
     def name(self):
-        """ Return tag name """
+        """Return tag name"""
         return self._name
 
     @property
     def cat(self):
-        """ Return tag category """
+        """Return tag category"""
         return self._cat
 
     @property
     def path(self):
-        """ Return tag path """
+        """Return tag path"""
         return self._path
 
     @property
     def prefix_l(self):
-        """ Return tag prefix list """
+        """Return tag prefix list"""
         return self._prefix_l
 
 
@@ -112,14 +111,18 @@ def __len__(self) -> int:
         return len(self._tags)
 
     def __iter__(self):
-        """ Iterator over the alphabetically sorted tags in the taxonomy """
+        """Iterator over the alphabetically sorted tags in the taxonomy"""
         return (t for t in sorted(self._tags))
 
     def is_hex(self, tag: AnyStr) -> bool:
-        # exclude generic hex tags like 004bc24a
-        return bool(re.search(r"\d", tag)) and bool(
-                re.fullmatch(r"[0-9a-fA-F]+", tag)
-            )
+        """
+        Whether ``tag`` is all hex characters and contains at least one digit
+        Used to identify (and so exclude) generic hex tags like 004bc24a
+
+        :param tag: The tag
+        :return: Boolean
+        """
+        return bool(re.search(r"\d", tag)) and bool(re.fullmatch(r"[0-9a-fA-F]+", tag))
 
     def is_generic(self, tag: AnyStr) -> bool:
         """
@@ -512,22 +515,31 @@ def validate(self, taxonomy: Taxonomy):
                     # TODO - raise or return False?
 
 
-class AvLabels:
+class AVLabels:
     """
     Primary class used to interpret AV Labels
     """
 
     def __init__(
         self,
-        tag_file: AnyStr = util.DEFAULT_TAG_PATH,
-        exp_file: AnyStr = util.DEFAULT_EXP_PATH,
-        tax_file: AnyStr = util.DEFAULT_TAX_PATH,
+        translations: Union[AnyStr, Translation] = None,
+        expansions: Union[AnyStr, Expansion] = None,
+        taxonomy: Union[AnyStr, Taxonomy] = None,
         av_file: AnyStr = None,
         alias_detect: bool = False,
     ):
-        self.taxonomy = Taxonomy(tax_file)
-        self.translations = Translation(tag_file)
-        self.expansions = Expansion(exp_file)
+        if isinstance(taxonomy, Taxonomy):
+            self.taxonomy = taxonomy
+        else:
+            self.taxonomy = Taxonomy(taxonomy)
+        if isinstance(translations, Translation):
+            self.translations = translations
+        else:
+            self.translations = Translation(translations)
+        if isinstance(expansions, Expansion):
+            self.expansions = expansions
+        else:
+            self.expansions = Expansion(expansions)
         self.avs = self.read_avs(av_file) if av_file else None
         # Alias statistics initialization
         self.alias_detect = alias_detect
@@ -839,8 +851,8 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]
         return av_dict
 
     def get_sample_vt_count(self, sample_info):
-        ''' Return number of detections for sample
-            in the provided AV whitelist (if any) '''
+        """Return number of detections for sample
+        in the provided AV whitelist (if any)"""
         if self.avs is None:
             return len(sample_info.labels)
         else:
diff --git a/avclass/labeler.py b/avclass/labeler.py
index ae22a0d..784e220 100755
--- a/avclass/labeler.py
+++ b/avclass/labeler.py
@@ -12,17 +12,17 @@
 from typing import AnyStr, Dict, List, NamedTuple, Optional, Tuple, Union
 
 try:
-    from avclass.common import AvLabels, Taxonomy
+    from avclass.common import AVLabels, Taxonomy
     from avclass import clustering as ec, util
 except ModuleNotFoundError:
     # Helps find the avclasses when run from console
     sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-    from avclass.common import AvLabels, Taxonomy
+    from avclass.common import AVLabels, Taxonomy
     from avclass import clustering as ec, util
 
 
 class AVClassLabeler:
-    def __init__(self, av_labels: AvLabels = AvLabels()):
+    def __init__(self, av_labels: AVLabels = AVLabels()):
         self.av_labels = av_labels
         self.output = {"labels": []}
         self.hash_type = None
@@ -645,8 +645,8 @@ def print_output(self, output: AnyStr = ""):
 
 def main():
     args = parse_args()
-    # Create AvLabels object
-    av_labels = AvLabels(
+    # Create AVLabels object
+    av_labels = AVLabels(
         tag_file=args.tag,
         tax_file=args.tax,
         exp_file=args.exp,