From 652ef7bc0c8e0fe9d27d4d690dade23335d836dd Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 11:51:00 -0600 Subject: [PATCH 01/36] add setup.py and a better .gitignore --- .gitignore | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++- setup.py | 17 +++++++ 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 0d20b64..4b38bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,141 @@ -*.pyc +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +.idea/ diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d87b69e --- /dev/null +++ b/setup.py @@ -0,0 +1,17 @@ +from setuptools import find_packages, setup + + +setup( + name='AVClass', + version='0.0.1', + description='Tag and label malware samples', + license='LICENSE', + packages=find_packages(), + install_requires=[], + setup_requires=[ + 'pytest-runner', + ], + tests_require=[ + 'pytest', + ], +) From 107bb8d1c2f4b3fb8a32c7bcb81612ae749c5534 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 12:20:10 -0600 Subject: [PATCH 02/36] implement package structure --- avclass/README.md | 450 +++++-------- avclass/__init__.py | 0 avclass/avclass_alias_detect.py | 89 --- avclass/avclass_generic_detect.py | 83 --- avclass/avclass_labeler.py | 459 ------------- avclass/data/default.aliases | 559 --------------- avclass/data/default.generics | 418 ------------ .../input_checker.py | 12 +- .../avclass2_labeler.py => avclass/labeler.py | 22 +- avclass/lib/avclass_common.py | 337 ---------- .../update.py | 28 +- avclass2/README.md | 261 ------- avclass2/lib/avclass2_common.py | 636 ------------------ {avclass2/data => data}/andropup.expansion | 0 {avclass2/data => 
data}/default.expansion | 0 {avclass2/data => data}/default.tagging | 0 {avclass2/data => data}/default.taxonomy | 0 setup.py | 2 +- shared/evaluate_clustering.py | 140 ---- 19 files changed, 178 insertions(+), 3318 deletions(-) create mode 100644 avclass/__init__.py delete mode 100755 avclass/avclass_alias_detect.py delete mode 100755 avclass/avclass_generic_detect.py delete mode 100755 avclass/avclass_labeler.py delete mode 100644 avclass/data/default.aliases delete mode 100644 avclass/data/default.generics rename avclass2/avclass2_input_checker.py => avclass/input_checker.py (86%) rename avclass2/avclass2_labeler.py => avclass/labeler.py (98%) delete mode 100755 avclass/lib/avclass_common.py rename avclass2/avclass2_update_module.py => avclass/update.py (97%) delete mode 100644 avclass2/README.md delete mode 100755 avclass2/lib/avclass2_common.py rename {avclass2/data => data}/andropup.expansion (100%) rename {avclass2/data => data}/default.expansion (100%) rename {avclass2/data => data}/default.tagging (100%) rename {avclass2/data => data}/default.taxonomy (100%) delete mode 100755 shared/evaluate_clustering.py diff --git a/avclass/README.md b/avclass/README.md index 07fb2ec..83dfaad 100644 --- a/avclass/README.md +++ b/avclass/README.md @@ -1,93 +1,101 @@ -# AVClass - -AVClass is a malware labeling tool. - -You give it as input the AV labels for a large number of -malware samples (e.g., VirusTotal JSON reports) and it outputs the most -likely family name for each sample that it can extract from the AV labels. -It can also output a ranking of all alternative names it found for each sample. - -The design and evaluation of AVClass is detailed in our -[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): - -> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. -AVClass: A Tool for Massive Malware Labeling. -In Proceedings of the International Symposium on Research in -Attacks, Intrusions and Defenses, -September 2016. 
- -In a nutshell, AVClass comprises two phases: -preparation (optional) and labeling. -Code for both is included, -but most users will be only interested in the labeling, which outputs the -family name for the samples. -The preparation produces a list of aliases and generic tokens -used by the labeling. -If you use our default aliases and generic tokens lists, -you do not need to run the preparation. - - -## Labeling - -The labeler takes as input -a JSON file with the AV labels of malware samples (-vt or -lb options), -a file with generic tokens (-gen option), -and a file with aliases (-alias option). -It outputs the most likely family name for each sample. -If you do not provide alias or generic tokens files, -the default ones in the *data* folder are used. +# AVClass2 + +AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). + +You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) +and it outputs tags observed in the AV labels, ranked by decreasing popularity. + +The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper. + +> Silvia Sebastián, Juan Caballero. +AVClass2: Massive Malware Tag Extraction from AV Labels. +In proceedings of the Annual Computer Security Applications Conference, December 2020. + +In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module. 
+ + +## Labeling + +The labeler takes as input a JSON file with the AV labels of malware samples +(-vt or -lb options), +a file with the taxonomy (-tax option), +a file with tagging rules (-tag option), and +a file with expansion rules (-exp option). +It outputs a set of ranked tags. +If you do not provide taxonomy, expansion or tagging files, +the default ones in the data folder are used. ```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v > malheurReference.labels +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json ``` - -The above command labels the samples whose AV labels are in the -*../examples/malheurReference_lb.json* file. -It prints the results to stdout, -which we redirect to the *malheurReference.labels* file. -The output looks like this: + +The above command labels the samples whose AV labels are in +the ../examples/malheurReference_lb.json file. +It prints the results to stdout. +The output looks like this: ``` -aca2d12934935b070df8f50e06a20539 adrotator -67d15459e1f85898851148511c86d88d adultbrowser +aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2 +67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2 ``` -which means sample aca2d12934935b070df8f50e06a20539 is most likely -from the *adrotator* family and -67d15459e1f85898851148511c86d88d from the *adultbrowser* family. +which means sample *aca2d12934935b070df8f50e06a20539* +was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, +8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, +3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. +Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them +consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
-The verbose (-v) option makes it output an extra -*malheurReference_lb.verbose* file -with all families extracted for each sample ranked by the number of AV -engines that use that family. -The file looks like this: +The -p option outputs the full path of each tag in the taxonomy: +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p ``` -aca2d12934935b070df8f50e06a20539 [(u'adrotator', 8), (u'zlob', 2)] -ee90a64fcfaa54a314a7b5bfe9b57357 [(u'swizzor', 19)] -f465a2c1b852373c72a1ccd161fbe94c SINGLETON:f465a2c1b852373c72a1ccd161fbe94c + +The above command line outputs: + +``` +aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 +67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 +``` + +where each tag has been replaced by its taxonomy path, which starts with the category in capitals, +followed by the path in the category (if any), and the tag itself, all separated by colons. +For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, +*CLASS:grayware* that *grayware* is a malware class, and +*CLASS:grayware:adware* that *adware* is a subclass of *grayware*. + +**Compatibility mode** + +The compatibility -c option makes AVClass2 output the same format as AVClass. + +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c ``` -which means that for sample aca2d12934935b070df8f50e06a20539 -there are 8 AV engines assigning *adrotator* as the family and -another 2 assigning *zlob*. -Thus, *adrotator* is the most likely family. -On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV -engines assigning *swizzor* as family, -and no other family was found. -The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c -no family name was found in the AV labels. 
-Thus, the sample is placed by himself in a singleton cluster -with the name of the cluster being the sample's hash. - -Note that the sum of the number of AV engines may not equal the number -of AV engines with a label for that sample in the input file -because the labels of some AV engines may only include generic tokens -that are removed by AVClass. +outputs: + +``` +bb23e1d296cf01bbaf32ed3938f9b0b8 allaple +cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349 +``` + +As in AVClass, the output contains only the family name, +which corresponds to the highest ranked family tag, all other tags are ignored. +Samples for which a family cannot be obtained are labeled as singletons with their hash. + +It is important to note that AVClass2 compatibility mode results can differ from AVClass results +on the same input file. +The differences in family names are due to differences between the generics and aliases files +used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. +In the future, we may change AVClass to use the taxonomy and rules from AVClass2 +as input (instead of the generics and aliases files) +to minimize such differences and avoid maintaining different data files. + ## Input JSON format -AVClass supports three input JSON formats: +AVClass2 supports three input JSON formats: 1. VirusTotal v2 API JSON reports (*-vt file*), where each line in the input *file* should be the full JSON of a @@ -109,141 +117,53 @@ There is an example of such input file in *examples/malheurReference_lb.json* **Multiple input files** -AVClass can handle multiple input files putting the results in the same output files +AVClass2 can handle multiple input files putting the results in the same output files (if you want results in separate files, process each input file separately). It is possible to provide the -vt and -lb input options multiple times. 
```shell -$./avclass_labeler.py -vt -vt +$./avclass2_labeler.py -vt -vt ``` ```shell -$./avclass_labeler.py -lb -lb +$./avclass2_labeler.py -lb -lb ``` There are also -vtdir and -lbdir options that can be used to provide an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: ```shell -$./avclass_labeler.py -vtdir +$./avclass2_labeler.py -vtdir ``` It is also possible to combine -vt with -vtdir and -lb with -lbdir, but you cannot combine input files of different format. Thus, this command works: ```shell -$./avclass_labeler.py -vt -vtdir +$./avclass2_labeler.py -vt -vtdir ``` But, this one throws an error: ```shell -$./avclass_labeler.py -vt -lb -``` - -## Labeling: Family Ranking - -AVClass has a -fam option to output a file with a ranking of the -families assigned to the input samples. - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -fam > malheurReference.labels -``` - -will produce a file called *malheurReference_lb.families* with two columns: - -``` -virut 441 -allaple 301 -podnuha 300 -``` - -indicating that 441 samples were classified in the virut family, -301 as allaple, and 300 as podnuha. - -This option is very similar to using the following shell command: - -```shell -$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr +$./avclass2_labeler.py -vt -lb ``` -The main difference is that using the -fam option all SINGLETON samples, -i.e., those for which no label was found, -are grouped into a fake *SINGLETONS* family, -while the shell command would leave each singleton as a separate family. - - -## Labeling: PUP Classification - -AVClass also has a -pup option to classify a sample as -Potentially Unwanted Program (PUP) or malware. 
-This classification looks for PUP-related keywords -(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our -[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf): - -> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero. -Certified PUP: Abuse in Authenticode Code Signing. -In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015 - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup > malheurReference.labels -``` - -With the -pup option the output of the *malheurReference.labels* file -looks like this: - -``` -aca2d12934935b070df8f50e06a20539 adrotator 1 -67d15459e1f85898851148511c86d88d adultbrowser 0 -``` - -The digit at the end is a Boolean flag that -indicates sample aca2d12934935b070df8f50e06a20539 is -(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not. - -In our experience the PUP classification is conservative, -i.e., if it says the sample is PUP, it most likely is. -But, if it says that it is not PUP, it could still be PUP if the AV labels -do not contain PUP-related keywords. -Note that it is possible that some samples from a family get -the PUP flag while other samples from the same family do not -because the PUP-related keywords may not appear in the labels of -all samples from the same family. -To address this issue, you can combine the -pup option with the -fam option. -This combination will add into the families file the classification of the -family as malware or PUP, based on a majority vote among the samples in a -family. 
- -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup -fam > malheurReference.labels -``` - -will produce a file called *malheurReference_lb.families* with five columns: - -``` -# Family Total Malware PUP FamType -virut 441 441 0 malware -magiccasino 173 0 173 pup -ejik 168 124 44 malware -``` - -For virut, the numbers indicate all the 441 virut samples are classified -as malware, and thus the last column states that virut is a malware family. -For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP. -For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, -so the family is classified as malware. - +At this point you have read the most important information on how to use AVClass2. +The following sections describe steps that most users will not need. ## Labeling: Ground Truth Evaluation -If you have ground truth for some malware samples, -i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that -ground truth. -The evaluation metrics used are precision, recall, and F1 measure. -See our RAID 2016 paper above for their definition. +If you have family ground truth for some malware samples, i.e., +you know the true family for those samples, you can evaluate the accuracy +of the family tags output by AVClass2 on those samples with respect to that ground truth. +The evaluation metrics used are precision, recall, and F1 measure. +See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. +Note that the ground truth evaluation does not apply to non-family tags, +i.e., it only evaluates the output of the compatibility mode. 
```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -gt ../examples/malheurReference_gt.tsv -eval > malheurReference.labels +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels ``` The output includes these lines: @@ -254,148 +174,88 @@ Calculating precision and recall Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 ``` -The last line corresponds to the accuracy metrics obtained by -comparing AVClass results with the provided ground truth. - -Each line in the *../examples/malheurReference_gt.tsv* file has -two **tab-separated** columns: +Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns: ``` -0058780b175c3ce5e244f595951f611b8a24bee2 CASINO +aca2d12934935b070df8f50e06a20539 ADROTATOR ``` -which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 -is known to be of the *CASINO* family. -Each sample in the input file should also appear in the ground truth file. +which indicates that sample aca2d12934935b070df8f50e06a20539 is known +to be of the *ADROTATOR* family. +Each sample in the input file should also appear in the ground truth file. Note that the particular label assigned to each family does not matter. -What matters is that all samples in the same family are assigned the -same family name (i.e., the same string in the second column) +What matters is that all samples in the same family are assigned +the same family name (i.e., the same string in the second column) -The ground truth can be obtained from publicly available malware -datasets. +The ground truth can be obtained from publicly available malware datasets. The one in *../examples/malheurReference_gt.tsv* comes from the [Malheur](http://www.mlsec.org/malheur/) dataset. There are other public datasets with ground truth such as [Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or [Malicia](http://malicia-project.com/dataset.html). 
+## Update Module + +The update module can be used to suggest additions and changes to the input +taxonomy, tagging rules, and expansion rules. +Using the update module comprises of two steps. +The first step is obtaining an alias file from the labeler: -## Preparation: Generic Token Detection +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect +``` + +The above command will create a file named \.alias, +malheurReference_lb.alias in our example. This file has 7 columns: -The labeling takes as input a file with generic tokens that should be -ignored in the AV labels, e.g., trojan, virus, generic, linux. -By default, the labeling uses the *data/default.generics* -generic tokens file. -You can edit that file to add additional generic tokens you feel -we are missing. +1. t1: token that is an alias +2. t2: tag for which t1 is an alias +3. |t1|: number of input samples where t1 was observed +4. |t2|: number of input samples where t2 was observed +5. |t1^t2|: number of input samples where both t1 and t2 were observed +6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. +7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. -In our RAID 2016 paper we describe an automatic approach to -identify generic tokens, which **requires ground truth**, -i.e., it requires knowing the true family for each input sample. -Not only that, but **the ground truth should be large**, -i.e., contain at least one hundred thousand samples. -In our work we identified generic tokens using as ground truth -the concatenation of all datasets for which we had ground truth. -This requirement of a large ground truth dataset is why we expect most users -will skip this step and simply use our provided default file. 
-If you want to test generic token detection you can do: +The Update Module takes the above file as input with the -alias option, +as well as the default taxonomy, tagging, and expansion files in the data directory. +It outputs updated taxonomy, tagging, and expansion files that include the +suggested additions and changes. ```shell - $./avclass_generic_detect.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv -tgen 10 > malheurReference.gen +$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix ``` -Each line in the *../examples/malheurReference_gt.tsv* file has -two **tab-separated** columns: +This will produce three files: +output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. +You can diff the output and input files to analyze the proposed changes. -``` -0058780b175c3ce5e244f595951f611b8a24bee2 CASINO -``` +You can also modify the input taxonomy, tagging, and expansion rules in place, +rather than producing new files: -which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 -is known to be of the *CASINO* family. - -The *-tgen 10* option is a threshold for the minimum number of families -where a token has to be observed to be considered generic. -If the option is ommitted, the default threshold of 8 is used. - -The above command outputs two files: -*malheurReference.gen* and *malheurReference_lb.gen*. -Each of them has 2 columns: token and number of families where the token -was observed. -File *malheurReference.gen* is the final output with the detected -generic tokens for which the number of families is above -the given threshold. -The file *malheurReference_lb.gen* has this information for all tokens. -Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. - -However, note that in the above command you are trying to identify generic -tokens from a small dataset since Drebin only contains 3K labeled samples. 
-Thus, *malheurReference.gen* only contains 25 identified generic tokens. -Using those 25 generic tokens will produce significantly worse results -than using the generic tokens in *data/default.generics*. -For more details you can refer to our RAID 2016 paper. - - -## Preparation: Alias Detection - -Different vendors may assign different names (i.e., aliases) for the same -family. For example, some vendors may use *zeus* and others *zbot* -as aliases for the same malware family. -The labeling takes as input a file with aliases that should be merged. -By default, the labeling uses the *data/default.aliases* aliases file. -You can edit that file to add additional aliases you feel we are missing. - -In our RAID 2016 paper we describe an automatic approach -to identify aliases. -Our alias detection approach -**requires as input the AV labels for large set of samples**, -e.g., several million samples. -In contrast with the generic token detection, the input samples for -alias detection **do not need to be labeled**, -i.e., no need to know their family. -In our work we identified aliases using as input the largest of our -unlabeled datasets, which contained nearly 8M samples. -This requirement of a large input dataset is why we expect most users -will skip this step and simply use our provided default file. - -If you want to test alias detection you can do: ```shell -$./avclass_alias_detect.py -lb ../examples/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases +$./avclass2_update_module.py -alias malheurReference_lb.alias -update ``` -The -nalias threshold provides the minimum number of samples two tokens -need to be observed in to be considered aliases. -If the option is not provided the default is 20. -The -talias threshold provides the minimum fraction of times that -the samples appear together. -If the is not provided the default is 0.94 (94%). 
+## Customizing AVClass2 -The above command outputs two files: -*malheurReference.aliases* and *malheurReference_lb.alias*. -Each of them has 6 columns: -1. t1: token that is an alias -2. t2: family for which t1 is an alias -3. |t1|: number of input samples where t1 was observed -4. |t2|: number of input samples where t2 was observed -5. |t1^t2|: number of input samples where both t1 and t2 were observed -6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 -were observed over the number of input samples where t1 was observed. - -File *malheurReference.aliases* is the final output with the -detected aliases that satisfy the -nalias and -talias thresholds. -The file *malheurReference_lb.alias* has this information for all tokens. -Thus, *malheurReference.aliases* is a subset -of *malheurReference_lb.alias*. - -However, note that in the above command you are trying to identify aliases -from a small dataset since Drebin only contains 3K samples. -Thus, *malheurReference.aliases* only contains 6 identified aliases. -Using those 6 aliases will produce significantly worse results than using -the aliases in *data/default.aliases*. -As mentioned, to improve the identified aliases you should provide as -input several million samples. -For more details you can refer to our RAID 2016 paper. +AVClass2 is fully customizable: +Tagging, Expansion and Taxonomy files can be easily modified by the analyst +either manually or by running the update module. + +If you change those files manually, we recommend running +afterwards the input checker script to keep them tidy. 
+It sorts the tags in the taxonomy and performs some basic cleaning like +removing redundant entries: + +```shell +$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file +``` + +If the modifications are in the default files in the data directory you can simply run: +```shell +$./avclass2_input_checker.py +``` diff --git a/avclass/__init__.py b/avclass/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/avclass/avclass_alias_detect.py b/avclass/avclass_alias_detect.py deleted file mode 100755 index 6624d97..0000000 --- a/avclass/avclass_alias_detect.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -''' -AVClass Alias detect -''' -import sys -import argparse -import subprocess -import os - - -def main(args): - # Set input switch - itype = '-vt' if args.vt else '-lb' - ifile = args.vt if args.vt else args.lb - - # Set generic tokens file if provided - gen_switch = "-gen " + args.gen if args.gen else "" - sys.stderr.write('Switch: %s\n' % (gen_switch)) - - # Run avclass_labeler - sys.stderr.write('[-] Running avclass_labeler on %s\n' % (ifile)) - FNULL = open(os.devnull, 'w') - labeler = subprocess.Popen(\ - "python avclass_labeler.py %s %s %s -alias /dev/null -aliasdetect" % - (itype, ifile, gen_switch), shell=True, stdout=FNULL) - labeler.wait() - - # Process alias file - sys.stderr.write('[-] Processing token pairs.\n') - alias_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.alias' - with open(alias_fname, 'r') as fr: - for pos, line in enumerate(fr): - cline = line.strip('\n') - # Print headers - if not pos: - sys.stdout.write("%s\n" % cline) - continue - t1, t2, t1_num, t2_num, nalias_num, talias_num = cline.split('\t') - if int(nalias_num) > args.nalias and\ - float(talias_num) > args.talias: - sys.stdout.write("%s\n" % cline) - - # Done - sys.stderr.write('[-] Done.\n') - - -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass_alias_detect', - 
description='''Given a collection of VT reports it detects aliases - used by AVs. It runs the avclass_labeler with specific arguments - and processes the output.''') - - argparser.add_argument('-vt', - help='file to parse with full VT reports ' - '(REQUIRED if -lb argument not present)') - - argparser.add_argument('-lb', - help='file to parse with subset of VT reports' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(REQUIRED if -vt not present)') - - argparser.add_argument('-gen', - help='file with generic tokens.') - - argparser.add_argument('-nalias', - help='Minimum number of times that a pair of tokes have been seen.' - 'Default: 20', - type=int, - default = 20) - - argparser.add_argument('-talias', - help='Minimum percentage of times two tokens appear together.' - 'Default: 0.94', - type=float, - default = 0.94) - - args = argparser.parse_args() - - if not args.vt and not args.lb: - sys.stderr.write('Argument -vt or -lb is required\n') - exit(1) - - if args.vt and args.lb: - sys.stderr.write('Use either -vt or -lb argument, not both.\n') - exit(1) - - main(args) - diff --git a/avclass/avclass_generic_detect.py b/avclass/avclass_generic_detect.py deleted file mode 100755 index cfdcaa8..0000000 --- a/avclass/avclass_generic_detect.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -''' -AVClass Generic detect -''' -import sys -import argparse -import subprocess -import os - - -def main(args): - # Set input switch - itype = '-vt' if args.vt else '-lb' - ifile = args.vt if args.vt else args.lb - - # Run avclass_labeler - sys.stderr.write('[-] Running avclass_labeler on %s\n' % (ifile)) - FNULL = open(os.devnull, 'w') - labeler = subprocess.Popen(\ - "python avclass_labeler.py %s %s -alias /dev/null"\ - " -gen /dev/null -gendetect -gt %s" % - (itype, ifile, args.gt), shell=True, stdout=FNULL) - labeler.wait() - - # Process generic tokens file - sys.stderr.write('[-] Processing results.\n') - gen_fname = 
os.path.basename(os.path.splitext(ifile)[0]) + '.gen' - with open(gen_fname, 'r') as fr: - for pos, line in enumerate(fr): - cline = line.strip('\n') - # Print headers - if not pos: - sys.stdout.write("%s\n" % cline) - continue - token, fam_num = cline.split('\t') - if int(fam_num) > args.tgen: - sys.stdout.write("%s\n" % cline) - - # Done - sys.stderr.write('[-] Done.\n') - - -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass_alias_detect', - description='''Given a collection of VT reports and the family - names of these samples (i.e., groundtruth) it generates a list - of generic tokens to be excluded from labeling.''') - - argparser.add_argument('-vt', - help='file to parse with full VT reports ' - '(REQUIRED if -lb argument not present)') - - argparser.add_argument('-lb', - help='file to parse with subset of VT reports' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(REQUIRED if -vt not present)') - - argparser.add_argument('-tgen', - help='Minimum number of families that a token appears. 
' - 'Default: 8', - type=int, - default = 8) - - argparser.add_argument('-gt', - help='file with ground truth') - - args = argparser.parse_args() - - if not args.vt and not args.lb: - sys.stderr.write('Argument -vt or -lb is required\n') - exit(1) - - if args.vt and args.lb: - sys.stderr.write('Use either -vt or -lb argument, not both.\n') - exit(1) - - if not args.gt: - sys.stderr.write('Generic token detection needs groundtruth (-gt)\n') - exit(1) - - main(args) - diff --git a/avclass/avclass_labeler.py b/avclass/avclass_labeler.py deleted file mode 100755 index 21ff9b5..0000000 --- a/avclass/avclass_labeler.py +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/env python -''' -AVClass labeler -''' - -import os -import sys -path = os.path.dirname(os.path.abspath(__file__)) -libpath = os.path.join(path, 'lib/') -sharedpath = os.path.join(path, '../shared/') -sys.path.insert(1, libpath) -sys.path.insert(1, sharedpath) -import argparse -from avclass_common import AvLabels -from operator import itemgetter -import evaluate_clustering as ec -import json -import traceback - -# Default alias file -default_alias_file = os.path.join(path, "data/default.aliases") -# Default generic tokens file -default_gen_file = os.path.join(path, "data/default.generics") - -def guess_hash(h): - '''Given a hash string, guess the hash type based on the string length''' - hlen = len(h) - if hlen == 32: - return 'md5' - elif hlen == 40: - return 'sha1' - elif hlen == 64: - return 'sha256' - else: - return None - -def main(args): - # Select hash used to identify sample, by default MD5 - hash_type = args.hash if args.hash else 'md5' - - # If ground truth provided, read it from file - gt_dict = {} - if args.gt: - with open(args.gt, 'r') as gt_fd: - for line in gt_fd: - gt_hash, family = map(str.lower, line.strip().split('\t', 1)) - gt_dict[gt_hash] = family - - # Guess type of hash in ground truth file - hash_type = guess_hash(list(gt_dict.keys())[0]) - - # Create AvLabels object - av_labels = 
AvLabels(args.gen, args.alias, args.av) - - # Build list of input files - # NOTE: duplicate input files are not removed - ifile_l = [] - if (args.vt): - ifile_l += args.vt - ifile_are_vt = True - if (args.lb): - ifile_l += args.lb - ifile_are_vt = False - if (args.vtdir): - ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)] - ifile_are_vt = True - if (args.lbdir): - ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)] - ifile_are_vt = False - - # Select correct sample info extraction function - if not ifile_are_vt: - get_sample_info = av_labels.get_sample_info_lb - elif args.vt3: - get_sample_info = av_labels.get_sample_info_vt_v3 - else: - get_sample_info = av_labels.get_sample_info_vt_v2 - - # Select output prefix - out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0]) - - # If verbose, open log file - if args.verbose: - log_filename = out_prefix + '.verbose' - verb_fd = open(log_filename, 'w+') - - # Initialize state - first_token_dict = {} - token_count_map = {} - pair_count_map = {} - token_family_map = {} - fam_stats = {} - vt_all = 0 - vt_empty = 0 - singletons = 0 - - # Process each input file - for ifile in ifile_l: - # Open file - fd = open(ifile, 'r') - - # Debug info, file processed - sys.stderr.write('[-] Processing input file %s\n' % ifile) - - # Process all lines in file - for line in fd: - - # If blank line, skip - if line == '\n': - continue - - # Debug info - if vt_all % 100 == 0: - sys.stderr.write('\r[-] %d JSON read' % vt_all) - sys.stderr.flush() - vt_all += 1 - - # Read JSON line and extract sample info (i.e., hashes and labels) - vt_rep = json.loads(line) - sample_info = get_sample_info(vt_rep) - if sample_info is None: - try: - name = vt_rep['md5'] - sys.stderr.write('\nNo AV labels for %s\n' % name) - except KeyError: - sys.stderr.write('\nCould not process: %s\n' % line) - sys.stderr.flush() - vt_empty += 1 - continue - - # Sample's name is selected hash type (md5 by default) - name = 
getattr(sample_info, hash_type) - - # If the VT report has no AV labels, continue - if not sample_info[3]: - vt_empty += 1 - sys.stderr.write('\nNo AV labels for %s\n' % name) - sys.stderr.flush() - continue - - # Get the distinct tokens from all the av labels in the report - # And print them. If not verbose, print the first token. - # If verbose, print the whole list - try: - # Get distinct tokens from AV labels - tokens = list(av_labels.get_family_ranking(sample_info).items()) - - # If alias detection, populate maps - if args.aliasdetect: - prev_tokens = set() - for entry in tokens: - curr_tok = entry[0] - curr_count = token_count_map.get(curr_tok) - if curr_count: - token_count_map[curr_tok] = curr_count + 1 - else: - token_count_map[curr_tok] = 1 - for prev_tok in prev_tokens: - if prev_tok < curr_tok: - pair = (prev_tok,curr_tok) - else: - pair = (curr_tok,prev_tok) - pair_count = pair_count_map.get(pair) - if pair_count: - pair_count_map[pair] = pair_count + 1 - else: - pair_count_map[pair] = 1 - prev_tokens.add(curr_tok) - - # If generic token detection, populate map - if args.gendetect and args.gt: - for entry in tokens: - curr_tok = entry[0] - curr_fam_set = token_family_map.get(curr_tok) - family = gt_dict[name] if name in gt_dict else None - if curr_fam_set and family: - curr_fam_set.add(family) - elif family: - token_family_map[curr_tok] = set(family) - - # Top candidate is most likely family name - if tokens: - family = tokens[0][0] - is_singleton = False - else: - family = "SINGLETON:" + name - is_singleton = True - singletons += 1 - - # Check if sample is PUP, if requested - if args.pup: - is_pup = av_labels.is_pup(sample_info[3]) - if is_pup: - is_pup_str = "\t1" - else: - is_pup_str = "\t0" - else: - is_pup = None - is_pup_str = "" - - # Build family map for precision, recall, computation - first_token_dict[name] = family - - # Get ground truth family, if available - if args.gt: - gt_family = '\t' + gt_dict[name] if name in gt_dict else "" - else: 
- gt_family = "" - - # Print family (and ground truth if available) to stdout - sys.stdout.write('%s\t%s%s%s\n' % (name, family, gt_family, - is_pup_str)) - - # If verbose, print tokens (and ground truth if available) - # to log file - if args.verbose: - verb_fd.write('%s\t%s%s%s\n' % ( - name, tokens, gt_family, is_pup_str)) - - # Store family stats (if required) - if args.fam: - if is_singleton: - ff = 'SINGLETONS' - else: - ff = family - try: - numAll, numMal, numPup = fam_stats[ff] - except KeyError: - numAll = 0 - numMal = 0 - numPup = 0 - - numAll += 1 - if args.pup: - if is_pup: - numPup += 1 - else: - numMal += 1 - fam_stats[ff] = (numAll, numMal, numPup) - - except: - traceback.print_exc(file=sys.stderr) - continue - - # Debug info - sys.stderr.write('\r[-] %d JSON read' % vt_all) - sys.stderr.flush() - sys.stderr.write('\n') - - # Close file - fd.close() - - # Print statistics - sys.stderr.write( - "[-] Samples: %d NoLabels: %d Singletons: %d " - "GroundTruth: %d\n" % ( - vt_all, vt_empty, singletons, len(gt_dict))) - - # If ground truth, print precision, recall, and F1-measure - if args.gt and args.eval: - precision, recall, fmeasure = \ - ec.eval_precision_recall_fmeasure(gt_dict, - first_token_dict) - sys.stderr.write( \ - "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \ - (precision, recall, fmeasure)) - - # If generic token detection, print map - if args.gendetect: - # Open generic tokens file - gen_filename = out_prefix + '.gen' - gen_fd = open(gen_filename, 'w+') - # Output header line - gen_fd.write("Token\t#Families\n") - sorted_pairs = sorted(token_family_map.items(), - key=lambda x: len(x[1]) if x[1] else 0, - reverse=True) - for (t,fset) in sorted_pairs: - gen_fd.write("%s\t%d\n" % (t, len(fset))) - - # Close generic tokens file - gen_fd.close() - sys.stderr.write('[-] Generic token data in %s\n' % (gen_filename)) - - # If alias detection, print map - if args.aliasdetect: - # Open alias file - alias_filename = out_prefix + '.alias' - 
alias_fd = open(alias_filename, 'w+') - # Sort token pairs by number of times they appear together - sorted_pairs = sorted( - pair_count_map.items(), key=itemgetter(1)) - # Output header line - alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n") - # Compute token pair statistic and output to alias file - for (t1,t2),c in sorted_pairs: - n1 = token_count_map[t1] - n2 = token_count_map[t2] - if (n1 < n2): - x = t1 - y = t2 - xn = n1 - yn = n2 - else: - x = t2 - y = t1 - xn = n2 - yn = n1 - f = float(c) / float(xn) - alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" % ( - x,y,xn,yn,c,f)) - # Close alias file - alias_fd.close() - sys.stderr.write('[-] Alias data in %s\n' % (alias_filename)) - - # If family statistics, output to file - if args.fam: - # Open family file - fam_filename = out_prefix + '.families' - fam_fd = open(fam_filename, 'w+') - # Output header line - if args.pup: - fam_fd.write("# Family\tTotal\tMalware\tPUP\tFamType\n") - else: - fam_fd.write("# Family\tTotal\n") - # Sort map - sorted_pairs = sorted(fam_stats.items(), key=itemgetter(1), - reverse=True) - # Print map contents - for (f,fstat) in sorted_pairs: - if args.pup: - if fstat[1] > fstat[2]: - famType = "malware" - else: - famType = "pup" - fam_fd.write("%s\t%d\t%d\t%d\t%s\n" % (f, fstat[0], fstat[1], - fstat[2], famType)) - else: - fam_fd.write("%s\t%d\n" % (f, fstat[0])) - # Close file - fam_fd.close() - sys.stderr.write('[-] Family data in %s\n' % (fam_filename)) - - # Close log file - if args.verbose: - sys.stderr.write('[-] Verbose output in %s\n' % (log_filename)) - verb_fd.close() - - - -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass_labeler', - description='''Extracts the family of a set of samples. 
- Also calculates precision and recall if ground truth available''') - - argparser.add_argument('-vt', action='append', - help='file with VT reports ' - '(Can be provided multiple times)') - - argparser.add_argument('-lb', action='append', - help='file with simplified JSON reports ' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(Can be provided multiple times)') - - argparser.add_argument('-vtdir', - help='existing directory with VT reports') - - argparser.add_argument('-lbdir', - help='existing directory with simplified JSON reports') - - argparser.add_argument('-gt', - help='file with ground truth') - - argparser.add_argument('-eval', - action='store_true', - help='if used it evaluates clustering accuracy.' - ' Prints precision, recall, F1-measure. Requires -gt parameter') - - argparser.add_argument('-alias', - help='file with aliases.', - default = default_alias_file) - - argparser.add_argument('-gen', - help='file with generic tokens.', - default = default_gen_file) - - argparser.add_argument('-av', - help='file with list of AVs to use') - - argparser.add_argument('-pup', - action='store_true', - help='if used each sample is classified as PUP or not') - - argparser.add_argument('-gendetect', - action='store_true', - help='if used produce generics file at end. Requires -gt parameter') - - argparser.add_argument('-aliasdetect', - action='store_true', - help='if used produce aliases file at end') - - argparser.add_argument('-v', '--verbose', - action='store_true', - help='output .verbose file with distinct tokens') - - argparser.add_argument('-hash', - help='hash used to name samples. 
Should match ground truth', - choices=['md5', 'sha1', 'sha256']) - - argparser.add_argument('-fam', - action='store_true', - help='if used produce families file with PUP/malware counts per family') - - argparser.add_argument('-vt3', action='store_true', - help='input are VT v3 files') - - args = argparser.parse_args() - - if not args.vt and not args.lb and not args.vtdir and not args.lbdir: - sys.stderr.write('One of the following 4 arguments is required: ' - '-vt,-lb,-vtdir,-lbdir\n') - exit(1) - - if (args.vt or args.vtdir) and (args.lb or args.lbdir): - sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. ' - 'Both types of input files cannot be combined.\n') - exit(1) - - if args.gendetect and not args.gt: - sys.stderr.write('Generic token detection requires -gt param\n') - exit(1) - - if args.eval and not args.gt: - sys.stderr.write('Evaluating clustering accuracy needs -gt param\n') - exit(1) - - if args.alias: - if args.alias == '/dev/null': - sys.stderr.write('[-] Using no aliases\n') - args.alias = None - else: - sys.stderr.write('[-] Using aliases in %s\n' % ( - args.alias)) - else: - sys.stderr.write('[-] Using generic aliases in %s\n' % ( - default_alias_file)) - - if args.gen: - if args.gen == '/dev/null': - sys.stderr.write('[-] Using no generic tokens\n') - args.gen = None - else: - sys.stderr.write('[-] Using generic tokens in %s\n' % ( - args.gen)) - else: - sys.stderr.write('[-] Using default generic tokens in %s\n' % ( - default_gen_file)) - - main(args) diff --git a/avclass/data/default.aliases b/avclass/data/default.aliases deleted file mode 100644 index d9ed41c..0000000 --- a/avclass/data/default.aliases +++ /dev/null @@ -1,559 +0,0 @@ -oneclickdownload 1clickdownload -4share 4shared -getfaster 4shared -activshop activshopper -adgazele adgazelle -smabo adialer -dealcabby adpeak -adswo adwo -gaobot agobot -airad airinstaller -airadinstaller airinstaller -airinstall airinstaller -rahack allaple -starman allaple -almanahe alman -kanav alyak 
-adfltnet amonetize -easydl amonetize -filesearch amonetize -imonetize amonetize -armour androidarmour -climap androrat -arcparlor arcadeparlor -badday badda -bearshare bandoo -ilivid bandoo -koyotelab bandoo -musictoolbar bandoo -searchsuite bandoo -seasuite bandoo -torchmedia bandoo -basebrid basebridge -batteryd batterydoctor -fakebattscar batterydoctor -klezer beebone -selfdel beebone -kazaa benjamin -qukart berbew -padodor berbew -bertle bertle -bertlea bertle -serbg bgserv -midgare bifrose -egbii biige -widoman bmmedia -bobic bobax -boxersms boxer -smsboxer boxer -browsepulse browsefox -dragonbranch browsefox -expressfind browsefox -glassbottle browsefox -greatfind browsefox -liteweb browsefox -positivefinds browsefox -recordpage browsefox -rollaround browsefox -salecharger browsefox -strongsignal browsefox -swiftbrowse browsefox -wanderburst browsefox -yontoo browsefox -yotoon browsefox -bundl bundlore -installvibe bundlore -buzb bzub -desktoplightning cashon -dowcen centim -chinesehacker chir -runonce chir -runouce chir -cinmeng cinmus -clemag cleaman -clientconnect conduit -searchprotect conduit -kucirc cosmu -overdoom cosmu -dalamodo cossta -putalol couponmarvel -crori crossrider -geksone crytex -hublo crytex -cybota cycbot -gbot cycbot -goolbot cycbot -cabby dalexis -ctblocker dalexis -elenoocka dalexis -comet darkkomet -cometsys darkkomet -cometsystems darkkomet -finloski darkkomet -fynloski darkkomet -krademok darkkomet -montiera delbar -cheval detroie -detroi detroie -detroia detroie -eydrop dinwod -directdown directdownloader -indirect directdownloader -zadved dlhelper -dogbite dogowar -dogwar dogowar -rabidog dogowar -domainiq domaiq -domalq domaiq -domlq domaiq -payint domaiq -tugspay domaiq -downloadmin downloadadmin -downloadasist downloadassistant -downloaderguide downloadguide -drdelux droiddeluxe -ddlight droiddreamlight -lightdd droiddreamlight -fokonge droidkungfu -kongfu droidkungfu -kungfu droidkungfu -ibashade drolnux -dialpass egroupdial 
-egroup egroupdial -exedial egroupdial -instantaccess egroupdial -emud emudbot -adwareeorezo eorezo -getextension eorezo -tuto4pc eorezo -eqdrug equationdrug -equation equationdrug -xpiro expiro -yourfiledownloader expressdownloader -fakerecovery fakesysdef -prodatect fakesysdef -systemfix fakesysdef -tepfer fareit -farex fearso -nofear fearso -nofer fearso -fenomen fenomengame -fenomengamet fenomengame -condestil firseria -downloadmr firseria -firser firseria -firseriainstaller firseria -fiseria firseria -morstar firseria -morstars firseria -popeler firseria -rapiddown firseria -solimba firseria -sventore firseria -flyagent flystudio -flystud flystudio -cobbler focobers -cobblerone focobers -cudos fosniw -regie fosniw -winsoft fosniw -emerleox fujacks -fujack fujacks -whboy fujacks -gaba gabpath -androm gamarue -andromeda gamarue -bundpil gamarue -debris gamarue -dromedan gamarue -lilu gamarue -wauchos gamarue -arcadeparlor gamevance -arcadeweb gamevance -epicgames gamevance -epicplay gamevance -gamevancecs gamevance -gvance gamevance -rivalgame gamevance -juched ganelp -waps gappusin -wapsx gappusin -geimini geinimi -geinim geinimi -kernelpatch geral -livesoft getnow -livesoftaction getnow -frogonal ginmaster -gingermaster ginmaster -gmaster ginmaster -ghostbot gobot -gdream golddream -glodream golddream -gprice gorillaprice -spysheriff harnig -helldoor hilldoor -hippo hipposms -hipsmser hipposms -hispo hipposms -banach hotbar -clickpotato hotbar -clkpotato hotbar -pinball hotbar -rugo hotbar -screensaver hotbar -zango hotbar -freepds hotclip -huigezi hupigon -pigeon hupigon -optimum ibryte -optimuminstall ibryte -optimuminstaller ibryte -optinstall ibryte -optiuminstaller ibryte -ickboy icekboy -iceboy icekboy -installcube icloader -iconos iconosys -iconosis iconosys -inboxtoolbar inbox -dowins inservice -inservc inservice -braininst installbrain -brantall installbrain -ibrain installbrain -clickrun installcore -clickrunsoftware installcore -cryptinno 
installcore -installco installcore -installrex installerex -sneakytrail installerex -tdownloader installerex -tsuploader installerex -webpick installerex -installq installiq -installmet installmetrix -instmonetizer installmonetizer -installmon installmonster -installmonst installmonster -installmonstr installmonster -monstruos installmonster -tovkater installmonster -intex intexdial -intexus intexdial -neteyes ipamor -mswdm ipamor -amorba ipamor -hidrag jeefo -jackpos jinupd -plosa karagany -xtoober karagany -kgbkeylogger kgbspy -elkern klez -padobot korgo -rkdoor koutodoor -hyteod kovter -lacon laconic -escape laroux -escop laroux -manalo laroux -linkun linkular -powerpack linkular -legendmir lmir -legmir lmir -lemir lmir -biez loadmoney -gldct loadmoney -ldmon loadmoney -loadmoneyent loadmoney -odyssey loadmoney -ogimant loadmoney -plocust loadmoney -duptwux lolbot -duel loveletter -mixor loveletter -xworm loveletter -tazebama mabezat -ratab mamianune -midhos medfos -magmedia mediamagnet -mmag mediamagnet -downloadnsave megasearch -fastsave megasearch -fastsaveapp megasearch -preloader megasearch -saveshare megasearch -morefi memery -lohmys midia -marketpay mmarketpay -mmarket mmarketpay -mmarketp mmarketpay -fipp morto -serpip morto -mspyonline mspy -multibardown multibar -multibardownloader multibar -mutibar multibar -ticno multibar -mplug multiplug -licat murofet -funweb mywebsearch -mindspark mywebsearch -nandrob nandrobox -neshuta neshta -netboxserver netbox -bespal netins -netweird netwiredrc -weecnaw netwiredrc -wirenet netwiredrc -nickispy nickyspy -nickspy nickyspy -conduit opencandy -optixp optix -optixpro optix -bflient palevo -pilleuz palevo -rimecud palevo -pate parite -pinfi parite -perfectkeylogger perflogger -perfkey perflogger -perfloger perflogger -petrolan petrolin -yoof picsys -fixflo pioneer -flofix pioneer -floxif pioneer -floxlib pioneer -apperhand plankton -plangton plankton -pupil plemood -purplemood plemood -purple plemood -gulpix plugx 
-poisonivy poison -polipos polip -screenblaze prosti -acute pullupdate -clickspring purityscan -clspring purityscan -purity purityscan -chydo pykspa -dwonk pykspa -pykse pykspa -qakbot qbot -qqrobber qqrob -zsone raden -protexor ramnit -rmnet ramnit -ranck ranky -dracur rebhip -spatet rebhip -spyrat rebhip -refogkeylogger refog -relevant relevantknowledge -rknowledge relevantknowledge -arto renos -codecpack renos -codepack renos -banloader rimod -mutopy rodecap -ggsmart rootsmart -kometa rukometa -gnurbulf rungbu -overt sadenav -overtls sadenav -sahagent sahat -shopathome sahat -safekidzone sakezon -kashu sality -kuku sality -saldrop sality -salicode sality -salitystub sality -salload sality -salpack sality -salrenmetie sality -stubofsality sality -sancmed sanctionedmedia -contrand sckeylog -controlrandom sckeylog -sckeylogger sckeylog -sclog sckeylog -softcentral sckeylog -secxplod securityxploded -secxploded securityxploded -winsxsbot sfone -ibank shiz -pinny shiz -shifu shiz -zybut shiz -shohdi shodi -caphaw shylock -opclose sillyfdc -cson simbot -rodricter simda -avalod sinowal -sinodo sinowal -wplug slugin -wplugin slugin -koceg socks -mandaph socks -pace socks -fakromup soft32downloader -popuppers soft32downloader -soft32down soft32downloader -soft32download soft32downloader -wedownload soft32downloader -softbase softobase -bxib softonic -softonicdownloader softonic -driverupd softpulse -sambamedia softpulse -softpules softpulse -betterinstaller somoto -mazel somoto -somato somoto -somotobetterinstaller somoto -somotoltd somoto -optimizerpro speedingupmypc -spdupmypc speedingupmypc -superoptimizer speedingupmypc -superpctools speedingupmypc -spyeyes spyeye -spyweep spyeye -square squarenet -javak suggestor -steekt steek -tophos stegvob -mofksys swisyn -c2lop swizzor -electron sytro -soltern sytro -systro sytro -taojin taojinstar -alureon tdss -olmarik tdss -tidserv tdss -tdssrt tdss -jelbrus techsnab -privitize techsnab -joleee tedroo -tedro tedroo -gael 
tenga -gaelicum tenga -licum tenga -nuwar tibs -peacomm tibs -tibspk tibs -zhelatin tibs -tinbakd tinba -pirrit tirrip -pirritsuggestor tirrip -inffinity toggle -inffinityinternet toggle -stufik tufik -tufei tufik -twetty twetti -speedupmypc uniblue -bandito unruy -banito unruy -cycler unruy -spacer unruy -cryptodef upatre -daytre upatre -ipatre upatre -waski upatre -yarwi upatre -gupboot urelas -plite urelas -ruftar usteal -nextup verti -lavandos vidro -spakrab vidro -gavir viking -looked viking -philis viking -multiinstall vilsel -ultradownload vilsel -ultradownloads vilsel -vils vilsel -nabucur virlock -polyransom virlock -virransom virlock -angel virut -angryangel virut -guarder virut -madanf virut -madang virut -madangel virut -vetor virut -virtob virut -vserv viser -vitallia vittalia -changeup vobfus -chinky vobfus -diple vobfus -meredrop vobfus -pronny vobfus -purora vobfus -vbccrypt vobfus -vbna vobfus -vbobfus vobfus -wbna vobfus -vflood vtflooder -vflooder vtflooder -wanna wannacry -wanacry wannacry -wannacrypt wannacry -wannacryptor wannacry -jadtre wapomi -loorp wapomi -mikcer wapomi -nimnul wapomi -otwycal wapomi -pikor wapomi -pikorms wapomi -protil wapomi -qvod wapomi -simfect wapomi -vjadtre wapomi -wali wapomi -stration warezov -webalt webalta -bulknet webprefix -klevate webprefix -blackice whiteice -blic whiteice -darksnow whiteice -autokms winactivator -kmsauto winactivator -hackkms winactivator -statblaster winfetcher -akan winwebsec -livesecurity winwebsec -mbro winwebsec -systemsecurity winwebsec -poweliks wowlik -powerliks wowlik -powessere wowlik -appquanta wkload -valla xorala -valhalla xorala -extrat xtrat -remtasu xtrat -xtreme xtrat -zbomber zombbomber -panda zbot -zbocheman zbot -zeus zbot -bjlog zegost -zeno zenosearch -maxplus zeroaccess -maxplusent zeroaccess -pmax zeroaccess -sirefef zeroaccess -smadow zeroaccess -zaccess zeroaccess -zona zvuzona -onestep zwangi -zwunzi zwangi diff --git a/avclass/data/default.generics 
b/avclass/data/default.generics deleted file mode 100644 index 1fbef42..0000000 --- a/avclass/data/default.generics +++ /dev/null @@ -1,418 +0,0 @@ -# Architecture / OS -win -win32 -w32 -win64 -w64 -winnt -linux -unix -android -androidos -andr -macosx -osx -osx32 - -# Malicious software -malware -malicious -malagent -maldroid -dangerousobject - -# Heuristic detection -generic -generik -gen -agen -genmalicious -generickd -tsgeneric -genericr -heuristic -heur -siggen -genetic -genome -cloud -kcloud -memscan -high -score -attribute -advml -bloodhound -sape -maltrec -symvt -igeneric -eheur -posible -undefined -static - -# Malware classes -trojan -horse -troj -trj -trojanhorse -trojware -trojanransom -trojanspy -trojanapt -trojanclicker -trojanfakeav -trojanpsw -worm -networm -hllw -virus -fileinfector -infector -prepender -hllp -rootkit -spyware -ddos -flooder -dialer -porndialer -porn -backdoor -bkdr -keylog -keylogger -datastealer -stealer -infostealer -pwstealer -banker -monitor -mailer -email -emailworm -massmailer -smtp -stmp -spam -spammer -spambot -ransom -ransomlock -ransomcrypt -ransomware -filecoder -filecryptor -rogue -fakeav -fakealert -clicker -adclicker -click -miner -coinmine -coinminer -bitcoinminer -bitcoin -btcmine -bitminer -trojansms -smssend -searcher -phishing - -# Macro -macro -badmacro -maliciousmacro -w97m -o97m -x97m -pp97m -mw97 -w2km -mo97 -x2km - -# Downloader -downloader -downldr -dloader -dwnldr -dldr -dloadr -dloade -download -dload -downware -downagent -dropper -drop -dropr -dldrop -exedrop -mdropper -muldrop -droppr -trojandropper -trojandownloader -trojandwnldr -trjndwnlder -exedown -downldexe -dropped -docdl -docdrop -docdrp -macrodown -downloadware -dloadware - -# PUP -pup -pua -adware -potentially -unwanted -not-a-virus -riskware -risk -grayware -unwnt -addisplay -adknowledge -adload -applicunwnt -adplugin -plugin -downad -toolbar -webtoolbar -casino -casonline -install -installer -bundle -bundler -bundled -bundleapp 
-bundleinstaller -softwarebundler -nsis -browsermodifier -unsafe -securityrisk - -# Suspicious -suspected -suspect -suspicious -susp -suspic -suspectcrc -reputation -behaveslike -lookslike -variant -based -possible -threat -probably -confidence -highconfidence - -# Unclassified -unknown -unclassifiedmalware -undef - -# Behavior: injection -injector -inject -injecter -vbinject -injcrypt -injected - -# Behavior: homepage modification -homepage -startpage - -# Behavior: kill -avkill -killav -antiav -antifw -blocker - -# Behavior: signed -fakems -signed - -# Behavior: proxy -proxy -trojanproxy - -# Behavior: autorun -autorun -autoruner -starter - -# Behavior: network -netfilter -redirector -sniffer -portscan - -# Behavior: files -killfiles -renamer - -# Behavior: services -servstart -server - -# Behavior: VM detect -vmdetect -vmdetector - -# Packer -packer -cryptor -crypter -obfuscator -msilobfuscator -encoder - -# Packed -packed -malpack -encpk -malob -cryp -crypt -crypted -cryptic -genpack -krypt -kryptk -kryptik -obfuscated -obfus -obfusc -obfuscate -malcrypt -vbcrypt -vbkrypt -vbpack -xpack -zpack -susppack -suspiciouspacker - -# Packed (specific packers) -asprotect -nspack -pecompact -upack -themida -vmprotect - -# Program -program -application -appl -software -file - -# File types -text -html -script -word -msword -excel -msexcel -office -msoffice -shellcode -shellkode -msil -java -j2me -fakedoc -fakepdf -webpage -iframe -powershell -perl -python -flash -jpeg -autoit - -# Patch -pepatch -patchfile -patched -patcher - -# Exploit -exploit -expl - -# Corrupted -damaged -corrupt -pemalform -malpe - -# Tools -tool -risktool -securitytool -fraudtool -virtool -keygen -hack -hacktool -hktl -spamtool -crack -cracktool - -# Small -small -tiny - -# Generic families -agent -eldorado -artemis -krap -kazy -katusha -pornoasset -foreign -symmi -jorik -graftor -strictor - -# Test -test -testvirus - -# Misc -password -website -encodefeature -multi -normal -other -optional -access 
-onion - diff --git a/avclass2/avclass2_input_checker.py b/avclass/input_checker.py similarity index 86% rename from avclass2/avclass2_input_checker.py rename to avclass/input_checker.py index 0b8dc35..ca7c381 100755 --- a/avclass2/avclass2_input_checker.py +++ b/avclass/input_checker.py @@ -1,19 +1,13 @@ -#!/usr/bin/env python -''' -AVClass2 input checker -''' - -import os import sys import argparse -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(1, os.path.join(script_dir, 'lib/')) -from avclass2_common import Taxonomy, Tagging, Expansion +from avclass.lib import Taxonomy, Tagging, Expansion + default_tag_file = "data/default.tagging" default_tax_file = "data/default.taxonomy" default_exp_file = "data/default.expansion" + if __name__ == '__main__': argparser = argparse.ArgumentParser(prog='input_checker', description='Checks format of files Tagging, Expansion and Taxonomy.') diff --git a/avclass2/avclass2_labeler.py b/avclass/labeler.py similarity index 98% rename from avclass2/avclass2_labeler.py rename to avclass/labeler.py index ed4996c..c64a636 100755 --- a/avclass2/avclass2_labeler.py +++ b/avclass/labeler.py @@ -1,20 +1,15 @@ -#!/usr/bin/env python -''' -AVClass2 labeler -''' - -import os -import sys -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(1, os.path.join(script_dir, 'lib/')) -sys.path.insert(1, os.path.join(script_dir, '../shared/')) import argparse -from avclass2_common import AvLabels -from operator import itemgetter -import evaluate_clustering as ec +import os import json +import sys import traceback +from operator import itemgetter + +from avclass.lib import AvLabels, clustering as ec + + +script_dir = os.path.dirname(os.path.abspath(__file__)) # Default tagging file default_tag_file = os.path.join(script_dir, "data/default.tagging") # Default expansion file @@ -22,6 +17,7 @@ # Default taxonomy file default_tax_file = os.path.join(script_dir, "data/default.taxonomy") + def guess_hash(h): ''' 
Given a hash string, guess the hash type based on the string length ''' hlen = len(h) diff --git a/avclass/lib/avclass_common.py b/avclass/lib/avclass_common.py deleted file mode 100755 index 5145d99..0000000 --- a/avclass/lib/avclass_common.py +++ /dev/null @@ -1,337 +0,0 @@ -#!/usr/bin/env python -''' -Main AVClass class -''' - -import re -import string -from collections import OrderedDict as OrdDict -from collections import namedtuple -from operator import itemgetter, attrgetter - -SampleInfo = namedtuple('SampleInfo', - ['md5', 'sha1', 'sha256', 'labels']) - -# AVs to use in is_pup method -pup_av_set = {'Malwarebytes', 'K7AntiVirus', 'Avast', - 'AhnLab-V3', 'Kaspersky', 'K7GW', 'Ikarus', - 'Fortinet', 'Antiy-AVL', 'Agnitum', 'ESET-NOD32'} - -# Tokens that indicate PUP used by is_pup method -pup_tokens = {'PUA', 'Adware', 'PUP', 'Unwanted', 'Riskware', 'grayware', - 'Unwnt', 'Adknowledge', 'toolbar', 'casino', 'casonline', - 'AdLoad', 'not-a-virus'} - -# AVs to use in suffix removal -suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', - 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', - 'GData', 'Avast', 'Sophos', - 'TrendMicro-HouseCall', 'TrendMicro', - 'NANO-Antivirus', 'Microsoft'} - -class AvLabels: - ''' - Class to operate on AV labels, - such as extracting the most likely family name. 
- ''' - def __init__(self, gen_file = None, alias_file = None, av_file = None): - - # Read generic token set from file - self.gen_set = self.read_generics(gen_file) if gen_file else set() - - # Read aliases map from file - self.aliases_map = self.read_aliases(alias_file) if alias_file else {} - - # Read AV engine set from file - self.avs = self.read_avs(av_file) if av_file else None - - @staticmethod - def read_aliases(alfile): - '''Read aliases map from given file''' - if alfile is None: - return {} - almap = {} - with open(alfile, 'r') as fd: - for line in fd: - alias, token = line.strip().split()[0:2] - almap[alias] = token - return almap - - @staticmethod - def read_generics(generics_file): - '''Read generic token set from given file''' - gen_set = set() - with open(generics_file) as gen_fd: - for line in gen_fd: - if line.startswith('#') or line == '\n': - continue - gen_set.add(line.strip()) - return gen_set - - @staticmethod - def read_avs(avs_file): - '''Read AV engine set from given file''' - with open(avs_file) as fd: - avs = set(map(str.strip, fd.readlines())) - return avs - - @staticmethod - def get_sample_info_lb(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'], - vt_rep['av_labels']) - - @staticmethod - def get_sample_info_vt_v2(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available - try: - scans = vt_rep['scans'] - md5 = vt_rep['md5'] - sha1 = vt_rep['sha1'] - sha256 = vt_rep['sha256'] - except KeyError: - return None - # Obtain labels from scan results - for av, res in scans.items(): - if res['detected']: - label = res['result'] - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() - label_pairs.append((av, clean_label)) - - return SampleInfo(md5, sha1, sha256, label_pairs) - - 
@staticmethod - def get_sample_info_vt_v3(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available - try: - scans = vt_rep['data']['attributes']['last_analysis_results'] - md5 = vt_rep['data']['attributes']['md5'] - sha1 = vt_rep['data']['attributes']['sha1'] - sha256 = vt_rep['data']['attributes']['sha256'] - except KeyError: - return None - # Obtain labels from scan results - for av, res in scans.items(): - label = res['result'] - if label is not None: - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() - label_pairs.append((av, clean_label)) - - return SampleInfo(md5, sha1, sha256, label_pairs) - - @staticmethod - def is_pup(av_label_pairs): - '''This function classifies the sample as PUP or not - using the AV labels as explained in the paper: - "Certified PUP: Abuse in Authenticode Code Signing" - (ACM CCS 2015) - It uses the AV labels of 11 specific AVs. - The function checks for 13 keywords used to indicate PUP. - Return: - True/False/None - ''' - # If no AV labels, nothing to do, return - if not av_label_pairs: - return None - # Initialize - pup = False - threshold = 0.5 - # Set with (AV name, Flagged/not flagged as PUP), for AVs in pup_av_set - bool_set = set([(pair[0], t.lower() in pair[1].lower()) - for t in pup_tokens - for pair in av_label_pairs - if pair[0] in pup_av_set]) - - # Number of AVs that had a label for the sample - av_detected = len([p[0] for p in av_label_pairs - if p[0] in pup_av_set]) - - # Number of AVs that flagged the sample as PUP - av_pup = list(map(lambda x: x[1], bool_set)).count(True) - - # Flag as PUP according to a threshold - if (float(av_pup) >= float(av_detected)*threshold) and av_pup != 0: - pup = True - return pup - - - @staticmethod - def __remove_suffixes(av_name, label): - '''Remove AV specific suffixes from given label - Returns updated label''' - - # Truncate after last '.' 
- if av_name in suffix_removal_av_set: - label = label.rsplit('.', 1)[0] - - # Truncate after last '.' - # if suffix only contains digits or uppercase (no lowercase) chars - if av_name == 'AVG': - tokens = label.rsplit('.', 1) - if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]): - label = tokens[0] - - # Truncate after last '!' - if av_name in set(['Agnitum','McAffee','McAffee-GW-Edition']): - label = label.rsplit('!', 1)[0] - - # Truncate after last '(' - if av_name in set(['K7AntiVirus', 'K7GW']): - label = label.rsplit('(', 1)[0] - - # Truncate after last '@' - # GData would belong here, but already trimmed earlier - if av_name in set(['Ad-Aware', 'BitDefender', 'Emsisoft', 'F-Secure', - 'Microworld-eScan']): - label = label.rsplit('(', 1)[0] - - return label - - - def __normalize(self, label, hashes): - '''Tokenize label, filter tokens, and replace aliases''' - - # If empty label, nothing to do - if not label: - return [] - - # Initialize list of tokens to return - ret = [] - - # Split label into tokens and process each token - for token in re.split("[^0-9a-zA-Z]", label): - # Convert to lowercase - token = token.lower() - - # Remove digits at the end - end_len = len(re.findall("\d*$", token)[0]) - if end_len: - token = token[:-end_len] - - # Ignore short token - if len(token) < 4: - continue - - # Remove generic tokens - if token in self.gen_set: - continue - - # Ignore token if prefix of a hash of the sample - # Most AVs use MD5 prefixes in labels, - # but we check SHA1 and SHA256 as well - hash_token = False - for hash_str in hashes: - if hash_str[0:len(token)] == token: - hash_token = True - break - if hash_token: - continue - - # Replace alias - token = self.aliases_map[token] if token in self.aliases_map \ - else token - - # Add token - ret.append(token) - return ret - - def get_family_ranking(self, sample_info): - ''' - Returns sorted dictionary of most likely family names for sample - ''' - # Extract info from named tuple - av_label_pairs = 
sample_info[3] - hashes = [ sample_info[0], sample_info[1], sample_info[2] ] - - # Whitelist the AVs to filter the ones with meaningful labels - av_whitelist = self.avs - - # Initialize auxiliary data structures - labels_seen = set() - token_map = {} - - # Process each AV label - for (av_name, label) in av_label_pairs: - # If empty label, nothing to do - if not label: - continue - - ################ - # AV selection # - ################ - if av_whitelist and av_name not in av_whitelist: - continue - - ##################### - # Duplicate removal # - ##################### - - # Emsisoft uses same label as - # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan, - # but suffixes ' (B)' to their label. Remove the suffix. - if label.endswith(' (B)'): - label = label[:-4] - - # F-Secure uses Avira's engine since Nov. 2018 - # but prefixes 'Malware.' to Avira's label. Remove the prefix. - if label.startswith('Malware.'): - label = label[8:] - - # Other engines often use exactly the same label, e.g., - # AVG/Avast - # K7Antivirus/K7GW - # Kaspersky/ZoneAlarm - - # If we have seen the exact same label before, skip - if label in labels_seen: - continue - # If not, we add it to the set of labels seen - else: - labels_seen.add(label) - - ################## - # Suffix removal # - ################## - label = self.__remove_suffixes(av_name, label) - - ######################################################## - # Tokenization, token filtering, and alias replacement # - ######################################################## - tokens = self.__normalize(label, hashes) - - # Increase token count in map - for t in tokens: - c = token_map[t] if t in token_map else 0 - token_map[t] = c + 1 - - ################################################################## - # Token ranking: sorts tokens by decreasing count and then token # - ################################################################## - sorted_tokens = sorted(token_map.items(), - key=itemgetter(1,0), - reverse=True) - - # 
Delete the tokens appearing only in one AV, add rest to output - sorted_dict = OrdDict() - for t, c in sorted_tokens: - if c > 1: - sorted_dict[t] = c - else: - break - - return sorted_dict - diff --git a/avclass2/avclass2_update_module.py b/avclass/update.py similarity index 97% rename from avclass2/avclass2_update_module.py rename to avclass/update.py index 1ca9e87..14bb1c5 100755 --- a/avclass2/avclass2_update_module.py +++ b/avclass/update.py @@ -1,19 +1,13 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -''' -AVClass2 Update module -''' -import sys -import os import argparse import logging -# Make sure paths are relative to execution path -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(script_dir, 'lib/')) -from operator import itemgetter +import os +import sys + from collections import namedtuple -from avclass2_common import Taxonomy, Expansion, Tagging -# from Levenshtein import ratio as levenshtein_ratio +from operator import itemgetter + +from avclass.lib import Taxonomy, Expansion, Tagging + # Set logging log = logging.getLogger(__name__) @@ -28,6 +22,7 @@ root.addHandler(handler_stderr) +script_dir = os.path.dirname(os.path.abspath(__file__)) # Default tagging file default_tagging_file = os.path.join(script_dir, "data/default.tagging") # Default expansion file @@ -42,6 +37,7 @@ Rel = namedtuple('Rel', ['t1', 't2', 't1_num', 't2_num', 'nalias_num', 'talias_num', 'tinv_alias_num']) + class Update: ''' Update Module ''' def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, @@ -487,11 +483,7 @@ def output(self, out_prefix): len(expansion), args.exp)) # Build update object - if not args.alias: - alias_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.alias' - else: - alias_fname = args.alias - update = Update(alias_fname, taxonomy, tagging, expansion, args.n, args.t) + update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t) log.info('[-] Processing %d relations satisfying 
t>=%.2f n>=%d' % ( update.num_rules(), args.t, args.n)) diff --git a/avclass2/README.md b/avclass2/README.md deleted file mode 100644 index 83dfaad..0000000 --- a/avclass2/README.md +++ /dev/null @@ -1,261 +0,0 @@ -# AVClass2 - -AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). - -You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) -and it outputs tags observed in the AV labels, ranked by decreasing popularity. - -The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper. - -> Silvia Sebastián, Juan Caballero. -AVClass2: Massive Malware Tag Extraction from AV Labels. -In proceedings of the Annual Computer Security Applications Conference, December 2020. - -In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module. - - -## Labeling - -The labeler takes as input a JSON file with the AV labels of malware samples -(-vt or -lb options), -a file with the taxonomy (-tax option), -a file with tagging rules (-tag option), and -a file with expansion rules (-exp option). -It outputs a set of ranked tags. -If you do not provide taxonomy, expansion or tagging files, -the default ones in the data folder are used. - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -``` - -The above command labels the samples whose AV labels are in -the ../examples/malheurReference_lb.json file. -It prints the results to stdout. 
-The output looks like this: - -``` -aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2 -67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2 -``` - -which means sample *aca2d12934935b070df8f50e06a20539* -was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, -8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, -3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. -Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them -consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. - -The -p option outputs the full path of each tag in the taxonomy: - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p -``` - -The above command line outputs: - -``` -aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 -67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 -``` - -where each tag has been replaced by its taxonomy path, which starts with the category in capitals, -followed by the path in the category (if any), and the tag itself, all separated by colons. -For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, -*CLASS:grayware* that *grayware* is a malware class, and -*CLASS:grayware:adware* that *adware* is a subclass of *grayware*. - -**Compatibility mode** - -The compatibility -c option makes AVClass2 output the same format as AVClass. 
- -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c -``` - -outputs: - -``` -bb23e1d296cf01bbaf32ed3938f9b0b8 allaple -cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349 -``` - -As in AVClass, the output contains only the family name, -which corresponds to the highest ranked family tag, all other tags are ignored. -Samples for which a family cannot be obtained are labeled as singletons with their hash. - -It is important to note that AVClass2 compatibility mode results can differ from AVClass results -on the same input file. -The differences in family names are due to differences between the generics and aliases files -used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. -In the future, we may change AVClass to use the taxonomy and rules from AVClass2 -as input (instead of the generics and aliases files) -to minimize such differences and avoid maintaining different data files. - - -## Input JSON format - -AVClass2 supports three input JSON formats: - -1. VirusTotal v2 API JSON reports (*-vt file*), -where each line in the input *file* should be the full JSON of a -VirusTotal v2 API response to the */file/report* endpoint, -e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash} -There is an example VirusTotal v2 input file in examples/vtv2_sample.json - -2. VirusTotal v3 API JSON reports (*-vt file -vt3*), -where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, -e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} -There is an example VirusTotal v3 input file in examples/vtv3_sample.json - -3. Simplified JSON (*-lb file*), -where each line in *file* should be a JSON -with (at least) these fields: -{md5, sha1, sha256, av_labels}. 
-There is an example of such input file in *examples/malheurReference_lb.json* - - -**Multiple input files** - -AVClass2 can handle multiple input files putting the results in the same output files -(if you want results in separate files, process each input file separately). - -It is possible to provide the -vt and -lb input options multiple times. - -```shell -$./avclass2_labeler.py -vt -vt -``` -```shell -$./avclass2_labeler.py -lb -lb -``` - -There are also -vtdir and -lbdir options that can be used to provide -an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: - -```shell -$./avclass2_labeler.py -vtdir -``` - -It is also possible to combine -vt with -vtdir and -lb with -lbdir, -but you cannot combine input files of different format. Thus, this command works: - -```shell -$./avclass2_labeler.py -vt -vtdir -``` - -But, this one throws an error: - -```shell -$./avclass2_labeler.py -vt -lb -``` - -At this point you have read the most important information on how to use AVClass2. -The following sections describe steps that most users will not need. - -## Labeling: Ground Truth Evaluation - -If you have family ground truth for some malware samples, i.e., -you know the true family for those samples, you can evaluate the accuracy -of the family tags output by AVClass2 on those samples with respect to that ground truth. -The evaluation metrics used are precision, recall, and F1 measure. -See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. -Note that the ground truth evaluation does not apply to non-family tags, -i.e., it only evaluates the output of the compatibility mode. 
- -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels -``` - -The output includes these lines: - -``` -Calculating precision and recall -3131 out of 3131 -Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 -``` - -Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns: - -``` -aca2d12934935b070df8f50e06a20539 ADROTATOR -``` - -which indicates that sample aca2d12934935b070df8f50e06a20539 is known -to be of the *ADROTATOR* family. -Each sample in the input file should also appear in the ground truth file. -Note that the particular label assigned to each family does not matter. -What matters is that all samples in the same family are assigned -the same family name (i.e., the same string in the second column) - -The ground truth can be obtained from publicly available malware datasets. -The one in *../examples/malheurReference_gt.tsv* comes from the -[Malheur](http://www.mlsec.org/malheur/) dataset. -There are other public datasets with ground truth such as -[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or -[Malicia](http://malicia-project.com/dataset.html). - -## Update Module - -The update module can be used to suggest additions and changes to the input -taxonomy, tagging rules, and expansion rules. -Using the update module comprises of two steps. -The first step is obtaining an alias file from the labeler: - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect -``` - -The above command will create a file named \.alias, -malheurReference_lb.alias in our example. This file has 7 columns: - -1. t1: token that is an alias -2. t2: tag for which t1 is an alias -3. |t1|: number of input samples where t1 was observed -4. |t2|: number of input samples where t2 was observed -5. |t1^t2|: number of input samples where both t1 and t2 were observed -6. 
|t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. -7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. - - -The Update Module takes the above file as input with the -alias option, -as well as the default taxonomy, tagging, and expansion files in the data directory. -It outputs updated taxonomy, tagging, and expansion files that include the -suggested additions and changes. - -```shell -$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix -``` - -This will produce three files: -output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. -You can diff the output and input files to analyze the proposed changes. - -You can also modify the input taxonomy, tagging, and expansion rules in place, -rather than producing new files: - - -```shell -$./avclass2_update_module.py -alias malheurReference_lb.alias -update -``` - - -## Customizing AVClass2 - -AVClass2 is fully customizable: -Tagging, Expansion and Taxonomy files can be easily modified by the analyst -either manually or by running the update module. - -If you change those files manually, we recommend running -afterwards the input checker script to keep them tidy. 
-It sorts the tags in the taxonomy and performs some basic cleaning like -removing redundant entries: - -```shell -$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file -``` - -If the modifications are in the default files in the data directory you can simply run: - -```shell -$./avclass2_input_checker.py -``` diff --git a/avclass2/lib/avclass2_common.py b/avclass2/lib/avclass2_common.py deleted file mode 100755 index adf74a8..0000000 --- a/avclass2/lib/avclass2_common.py +++ /dev/null @@ -1,636 +0,0 @@ -#!/usr/bin/env python -''' -Main AVClass class -''' - -import sys -import re -import string -import logging -from collections import OrderedDict as OrdDict -from collections import namedtuple -from operator import itemgetter, attrgetter - -# Set logging -log = logging.getLogger(__name__) - -# Prefix to identify platform tags -platform_prefix = "FILE:os:" - -# Default category for tags in taxonomy with no category -uncategorized_cat = "UNC" - -SampleInfo = namedtuple('SampleInfo', - ['md5', 'sha1', 'sha256', 'labels', 'vt_tags']) - -Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l']) - -# AVs to use in suffix removal -suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', - 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', - 'GData', 'Avast', 'Sophos', - 'TrendMicro-HouseCall', 'TrendMicro', - 'NANO-Antivirus', 'Microsoft'} - -def create_tag(s): - ''' Create a Tag from its string representation ''' - word_list = s.strip().split(":") - if len(word_list) > 1: - name = word_list[-1].lower() - cat = word_list[0].upper() - prefix_l = [x.lower() for x in word_list[1:-1]] - path = cat - for x in prefix_l: - path = path + ':' + x - path = path + ':' + name - else: - name = word_list[0].lower() - cat = uncategorized_cat - prefix_l = [] - path = name - return Tag(name, cat, path, prefix_l) - -class Taxonomy: - ''' - A taxonomy of tags and generic tokens read from file - ''' - def __init__(self, filepath): - ''' Map tag.name | 
tag.path -> Tag ''' - self.__tag_map = {} - if filepath: - self.read_taxonomy(filepath) - - def __len__(self): - ''' Taxonomy length is the number of tags it contains ''' - return len(self.__tag_map)//2 - - def is_generic(self, t): - ''' Return true if input is generic, false otherwise ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.cat == "GEN" - else: - return False - - def is_tag(self, t): - ''' Return true if input is tag, false otherwise ''' - return t in self.__tag_map - - def add_tag(self, s, override=False): - ''' Add tag to taxonomy - If tag already exists with different path, - only replaces if override True ''' - tag = create_tag(s) - t = self.__tag_map.get(tag.name, None) - if t and (t.path != tag.path): - if (not override): - return - else: - log.warn("[Taxonomy] Replacing %s with %s\n" % ( - t.path, tag.path)) - del self.__tag_map[t.path] - log.debug("[Taxonomy] Adding tag %s" % s) - self.__tag_map[tag.name] = tag - self.__tag_map[tag.path] = tag - return - - def remove_tag(self, t): - ''' Remove tag from taxonomy. 
Returns 1 if removed, zero if unknown ''' - tag = self.__tag_map.get(t, None) - if tag: - log.debug("[Taxonomy] Removing tag: %s" % tag.path) - del self.__tag_map[tag.name] - del self.__tag_map[tag.path] - return 1 - else: - return 0 - - def get_category(self, t): - ''' Return category of input tag, UNK if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.cat - else: - return "UNK" - - def get_path(self, t): - ''' Return full path for given tag, or empty string if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.path - else: - return ("UNK:" + t) - - def get_prefix_l(self, t): - ''' Return prefix list for given tag, or empty string if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.prefix_l - else: - return [] - - def get_prefix(self, t): - ''' Return prefix string for given tag, - or empty string if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.prefix_l - else: - return t.path[0:t.path.rfind(':')] - - def get_depth(self, t): - ''' Return depth of tag in taxonomy. - Returns zero if tag not in taxonomy. 
- A normal tag CAT:name has depth two ''' - tag = self.__tag_map.get(t, None) - if tag: - return len(tag.prefix_l) + 2 - else: - return 0 - - def get_info(self, t): - ''' Return (path,category) for given tag, or UNK:t if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.path, tag.cat - else: - return "UNK:" + t, "UNK" - - def expand(self, t): - ''' Return list of tags in prefix list that are leaves ''' - tag = self.__tag_map.get(t, None) - if tag: - return [t for t in tag.prefix_l if t in self.__tag_map] - else: - return [] - - def platform_tags(self): - ''' Returns list with platform tags in taxonomy ''' - acc = set() - for idx,tag in self.__tag_map.items(): - if tag.path.startswith(platform_prefix): - acc.add(tag.name) - return acc - - def overlaps(self, t1, t2): - ''' Returns true if the path of the given tags overlaps ''' - m1 = self.get_prefix_l(t1) - m2 = self.get_prefix_l(t2) - return (t1 in m2) or (t2 in m1) - - def remove_overlaps(self, l): - ''' Returns list with overlapping tags removed ''' - if not l: - return l - pair_l = sorted([(self.get_depth(t),t) for t in l]) - out_l = [pair_l.pop()[1]] - while pair_l: - t = pair_l.pop()[1] - if (not any(self.overlaps(t, e) for e in out_l)): - out_l.append(t) - return out_l - - def read_taxonomy(self, filepath): - '''Read taxonomy from given file ''' - with open(filepath, 'r') as fd: - for line in fd: - if line.startswith('#') or line == '\n': - continue - self.add_tag(line.strip()) - return - - def to_file(self, filepath): - ''' Output sorted taxonomy to given file ''' - # Open output file - fd = open(filepath, 'w') - # Write sorted tags - tag_l = sorted(self.__tag_map.items(), - key=lambda item : item[1].path, - reverse=False) - idx = 0 - for name,tag in tag_l: - if (idx % 2) == 0: - fd.write(tag.path+"\n") - idx+=1 - # Close output file - fd.close() - -class Rules: - ''' - Rules are src -> dst1, dst2, ... 
relations - ''' - def __init__(self, filepath): - ''' Map src -> set(dst) ''' - self._rmap = {} - if filepath: - self.read_rules(filepath) - - def __len__(self): - ''' Length is number of rules, i.e., number of src ''' - return len(self._rmap) - - def add_rule(self, src, dst_l, overwrite=False): - ''' Add rule. If rule exists: - if overwrite==True, replace destination list - else append dst_l to current target set ''' - # Remove src from dst_l if it exists - dst_l = filter(lambda x: x != src, dst_l) - # If no destinations, nothing to do - if (not dst_l): - return - log.debug("[Rules] Adding %s -> %s" % (src, dst_l)) - src_tag = create_tag(src) - if overwrite: - target_l = [create_tag(dst).name for dst in dst_l] - self._rmap[src_tag.name] = set(target_l) - else: - curr_dst = self._rmap.get(src_tag.name, set()) - for dst in dst_l: - dst_tag = create_tag(dst) - curr_dst.add(dst_tag.name) - self._rmap[src_tag.name] = curr_dst - return - - def remove_rule(self, src): - l = self._rmap.get(src, []) - if l: - log.debug("[Rules] Removing rule: %s -> %s" % (src, l)) - del self._rmap[src] - return 1 - else: - return 0 - - def get_dst(self, src): - ''' Returns dst list for given src, or empty list if no expansion ''' - return list(self._rmap.get(src, [])) - - def read_rules(self, filepath): - '''Read rules from given file''' - with open(filepath, 'r') as fd: - for line in fd: - if line.startswith('#') or line == '\n': - continue - word_list = line.strip().split() - if len(word_list) > 1: - self.add_rule(word_list[0],word_list[1:]) - return - - def to_file(self, filepath, taxonomy=None): - ''' Output sorted rules to given file - If taxonomy is provided, it outputs full tag path ''' - fd = open(filepath, 'w') - for src,dst_set in sorted(self._rmap.items()): - dst_l = sorted(dst_set, reverse=False) - if taxonomy: - src_path = taxonomy.get_path(src) - path_l = [taxonomy.get_path(t) for t in dst_l] - dst_str = '\t'.join(path_l) - fd.write("%s\t%s\n" % (src_path,dst_str)) - else: - 
dst_str = '\t'.join(dst_l) - fd.write("%s\t%s\n" % (src,dst_str)) - fd.close() - - def expand_src_destinations(self, src): - ''' Return destination list for given src after recursively - following any rules for destinations ''' - dst_set = self._rmap.get(src, set()) - out = set() - while dst_set: - dst = dst_set.pop() - l = self._rmap.get(dst, []) - if l: - for e in l: - if (e not in out) and (e != dst): - dst_set.add(e) - else: - out.add(dst) - return out - - def expand_all_destinations(self): - ''' Return destination list for given src after recursively - following any rules for destinations ''' - src_l = self._rmap.keys() - for src in src_l: - dst_l = self.expand_src_destinations(src) - self._rmap[src] = dst_l - -class Tagging(Rules): - ''' - Tagging rules have src UNK and dst in taxonomy - ''' - def __init__(self, filepath): - Rules.__init__(self, filepath) - - def validate(self, taxonomy): - ''' Check that tags in tagging rules are in given taxonomy ''' - for tok,tag_l in self._rmap.items(): - for t in tag_l: - if (not taxonomy.is_tag(t)): - sys.stdout.write("[Tagging] %s not in taxonomy\n" % t) - -class Expansion(Rules): - ''' - Expansion rules have src and dst in taxonomy and - src.category != dst.category - ''' - def __init__(self, filepath): - Rules.__init__(self, filepath) - - def validate(self, taxonomy): - ''' Check that tags in expansion rules are in given taxonomy ''' - for src,dst_set in self._rmap.items(): - if (not taxonomy.is_tag(src)): - sys.stdout.write("[Expansion] %s not in taxonomy\n" % src) - for dst in dst_set: - if (not taxonomy.is_tag(dst)): - sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst) - -class AvLabels: - ''' - Class to operate on AV labels, - such as extracting the most likely family name. 
- ''' - def __init__(self, tag_file, exp_file = None, tax_file = None, - av_file = None, aliasdetect=False): - # Read taxonomy - self.taxonomy = Taxonomy(tax_file) - # Read tag rules - self.tagging = Tagging(tag_file) - # Read expansion rules - self.expansions = Expansion(exp_file) - # Read AV engines - self.avs = self.read_avs(av_file) if av_file else None - # Alias statistics initialization - self.aliasdetect = aliasdetect - - @staticmethod - def read_avs(avs_file): - '''Read AV engine set from given file''' - with open(avs_file) as fd: - avs = set(map(str.strip, fd.readlines())) - return avs - - @staticmethod - def get_sample_info_lb(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'], - vt_rep['av_labels'], []) - - @staticmethod - def get_sample_info_vt_v2(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available - try: - scans = vt_rep['scans'] - md5 = vt_rep['md5'] - sha1 = vt_rep['sha1'] - sha256 = vt_rep['sha256'] - except KeyError: - return None - # Obtain labels from scan results - for av, res in scans.items(): - if res['detected']: - label = res['result'] - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() - label_pairs.append((av, clean_label)) - # Obtain VT tags, if available - vt_tags = vt_rep.get('tags', []) - - return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) - - @staticmethod - def get_sample_info_vt_v3(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available - try: - scans = vt_rep['data']['attributes']['last_analysis_results'] - md5 = vt_rep['data']['attributes']['md5'] - sha1 = vt_rep['data']['attributes']['sha1'] - sha256 = vt_rep['data']['attributes']['sha256'] - except 
KeyError: - return None - # Obtain labels from scan results - for av, res in scans.items(): - label = res['result'] - if label is not None: - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() - label_pairs.append((av, clean_label)) - # Obtain VT tags, if available - vt_tags = vt_rep['data']['attributes'].get('tags', []) - - return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) - - - @staticmethod - def is_pup(tag_pairs, taxonomy): - '''This function classifies the sample as PUP or not - by checking if highest ranked CLASS tag contains "grayware" - and is above a predefined threshold - Return: - True/False/None - ''' - threshold = 0.5 - # If no tags, return false - if len(tag_pairs) < 1: - return None - max_ctr = tag_pairs[0][1] - for (tag,ctr) in tag_pairs: - (path, cat) = taxonomy.get_info(tag) - if (cat == "CLASS"): - if ("grayware" in path): - return (float(ctr) >= float(max_ctr)*threshold) - else: - return False - return False - - @staticmethod - def __remove_suffixes(av_name, label): - '''Remove AV specific suffixes from given label - Returns updated label''' - - # Truncate after last '.' - if av_name in suffix_removal_av_set: - label = label.rsplit('.', 1)[0] - - # Truncate after last '.' - # if suffix only contains digits or uppercase (no lowercase) chars - if av_name == 'AVG': - tokens = label.rsplit('.', 1) - if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]): - label = tokens[0] - - # Truncate after last '!' 
- if av_name == 'Agnitum': - label = label.rsplit('!', 1)[0] - - return label - - - def get_label_tags(self, label, hashes): - ''' Return list of tags in given label - Tokenizes label, filters unneeded tokens, and - applies tagging rules ''' - - # Initialize set of tags to return - # We use a set to avoid duplicate tokens in the same AV label - # This avoids "potentially unwanted" contributing twice BEH:pup - tags = set() - - # If empty label, nothing to do - if not label: - return tags - - # Split label into tokens and process each token - for token in re.split("[^0-9a-zA-Z]", label): - # Convert token to lowercase - token = token.lower() - - # Remove digits at the end - end_len = len(re.findall("\d*$", token)[0]) - if end_len: - token = token[:-end_len] - - # Ignore token if prefix of a hash of the sample - # Most AVs use MD5 prefixes in labels, - # but we check SHA1 and SHA256 as well - hash_token = False - for hash_str in hashes: - if hash_str[0:len(token)] == token: - hash_token = True - break - if hash_token: - continue - - # Ignore generic tokens - if self.taxonomy.is_generic(token): - continue - - # Apply tagging rule - dst_l = self.tagging.get_dst(token) - if dst_l: - # Ignore generic tokens - for t in dst_l: - if not self.taxonomy.is_generic(t): - tags.add(t) - # Add token if longer than 3 characters and no tagging rule - elif len(token) > 3: - tags.add(token) - - # Return tags - return tags - - - def __expand(self, tag_set): - ''' Return expanded set of tags ''' - ret = set() - for t in tag_set: - # Include tag - ret.add(t) - - # Include target of expansion rule in output - ret.update(self.expansions.get_dst(t)) - - # Include implicit expansions in taxonomy - ret.update(self.taxonomy.expand(t)) - - # Return a list for backwards compatibility - return ret - - def get_sample_tags(self, sample_info): - ''' Returns dictionary tag -> AV list of tags for the given sample ''' - - # Whitelist the AVs to filter the ones with meaningful labels - av_whitelist = 
self.avs - # Initialize auxiliary data structures - duplicates = set() - av_dict = {} - - # Process each AV label - for (av_name, label) in sample_info.labels: - # If empty label, nothing to do - if not label: - continue - - ################ - # AV selection # - ################ - if av_whitelist and av_name not in av_whitelist: - continue - - ##################### - # Duplicate removal # - ##################### - - # Emsisoft uses same label as - # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan, - # but suffixes ' (B)' to their label. Remove the suffix. - if label.endswith(' (B)'): - label = label[:-4] - - # F-Secure uses Avira's engine since Nov. 2018 - # but prefixes 'Malware.' to Avira's label. Remove the prefix. - if label.startswith('Malware.'): - label = label[8:] - - # Other engines often use exactly the same label, e.g., - # AVG/Avast - # K7Antivirus/K7GW - # Kaspersky/ZoneAlarm - - # If we have seen the exact same label before, skip - if label in duplicates: - continue - # If not, we add it to duplicates - else: - duplicates.add(label) - - ################## - # Suffix removal # - ################## - label = self.__remove_suffixes(av_name, label) - - ######################################################## - # Tokenization and tagging # - ######################################################## - hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ] - tags = self.get_label_tags(label, hashes) - - ######################################################## - # Expansions # - ######################################################## - # NOTE: Avoiding to do expansion when aliases - if self.aliasdetect: - expanded_tags = tags - else: - expanded_tags = self.__expand(tags) - - ######################################################## - # Stores information that relates AV vendors with tags # - ######################################################## - for t in expanded_tags: - av_dict.setdefault(t, []).append(av_name) - - - return av_dict - - 
def rank_tags(self, av_dict, threshold=1): - ''' Return list of (tag, confidence) ranked by decreasing confidence - and filter tags with less or equal threshold confidence ''' - - pairs = ((t, len(avs)) for (t,avs) in av_dict.items() - if len(avs) > threshold) - return sorted(pairs, key=itemgetter(1,0), reverse=True) - diff --git a/avclass2/data/andropup.expansion b/data/andropup.expansion similarity index 100% rename from avclass2/data/andropup.expansion rename to data/andropup.expansion diff --git a/avclass2/data/default.expansion b/data/default.expansion similarity index 100% rename from avclass2/data/default.expansion rename to data/default.expansion diff --git a/avclass2/data/default.tagging b/data/default.tagging similarity index 100% rename from avclass2/data/default.tagging rename to data/default.tagging diff --git a/avclass2/data/default.taxonomy b/data/default.taxonomy similarity index 100% rename from avclass2/data/default.taxonomy rename to data/default.taxonomy diff --git a/setup.py b/setup.py index d87b69e..ff933d6 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='AVClass', - version='0.0.1', + version='2.0.0', description='Tag and label malware samples', license='LICENSE', packages=find_packages(), diff --git a/shared/evaluate_clustering.py b/shared/evaluate_clustering.py deleted file mode 100755 index c841d3d..0000000 --- a/shared/evaluate_clustering.py +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python -import sys - -def tp_fp_fn(CORRECT_SET, GUESS_SET): - """ - INPUT: dictionary with the elements in the cluster from the ground truth - (CORRECT_SET) and dictionary with the elements from the estimated cluster - (ESTIMATED_SET). - - OUTPUT: number of True Positives (elements in both clusters), False - Positives (elements only in the ESTIMATED_SET), False Negatives (elements - only in the CORRECT_SET). 
- """ - tp = 0 - fp = 0 - fn = 0 - for elem in GUESS_SET: - # True Positives (elements in both clusters) - if elem in CORRECT_SET: - tp += 1 - else: - # False Positives (elements only in the "estimated cluster") - fp += 1 - for elem in CORRECT_SET: - if elem not in GUESS_SET: - # False Negatives (elements only in the "correct cluster") - fn += 1 - return tp, fp, fn - - -def eval_precision_recall_fmeasure(GROUNDTRUTH_DICT, ESTIMATED_DICT): - """ - INPUT: dictionary with the mapping "element:cluster_id" for both the ground - truth and the ESTIMATED_DICT clustering. - - OUTPUT: average values of Precision, Recall and F-Measure. - """ - # eval: precision, recall, f-measure - tmp_precision = 0 - tmp_recall = 0 - - # build reverse dictionary of ESTIMATED_DICT - rev_est_dict = {} - for k, v in ESTIMATED_DICT.items(): - if v not in rev_est_dict: - rev_est_dict[v] = { k } - else: - rev_est_dict[v].add(k) - - # build reverse dictionary of GROUNDTRUTH_DICT - gt_rev_dict = {} - for k, v in GROUNDTRUTH_DICT.items(): - if v not in gt_rev_dict: - gt_rev_dict[v] = { k } - else: - gt_rev_dict[v].add(k) - - - counter, l = 0, len(ESTIMATED_DICT) - - sys.stderr.write('Calculating precision and recall\n') - - # For each element - for element in ESTIMATED_DICT: - - # Print progress - if counter % 1000 == 0: - sys.stderr.write('\r%d out of %d' % (counter, l)) - sys.stderr.flush() - counter += 1 - - # Get elements in the same cluster (for "ESTIMATED_DICT cluster") - guess_cluster_id = ESTIMATED_DICT[element] - - # Get the list of elements in the same cluster ("correct cluster") - correct_cluster_id = GROUNDTRUTH_DICT[element] - - # Calculate TP, FP, FN - tp, fp, fn = tp_fp_fn(gt_rev_dict[correct_cluster_id], - rev_est_dict[guess_cluster_id]) - - # tmp_precision - p = 1.0*tp/(tp+fp) - tmp_precision += p - # tmp_recall - r = 1.0*tp/(tp+fn) - tmp_recall += r - sys.stderr.write('\r%d out of %d' % (counter, l)) - sys.stderr.write('\n') - precision = 100.0*tmp_precision/len(ESTIMATED_DICT) - 
recall = 100.0*tmp_recall/len(ESTIMATED_DICT) - fmeasure = (2*precision*recall)/(precision+recall) - return precision, recall, fmeasure - - -if __name__ == "__main__": - - # The ground truth. - # Dictionary with mapping: "element : cluster_id". - diz_grth = { - "a": 1, - "b": 1, - "c": 2, - "d": 3 - } - - # An example of an "estimated cluster". - # Dictionary with mapping: "element : cluster_id". - diz_estim = { - "a": 66, - "b": 'malware', - "c": 'goodware', - "d": 'trojan' - } - - # An example of an "estimated cluster": same partitioning as for the ground - # truth, but just different cluster labels. Precision == Recall == - # F-Measure == 100%. - # Dictionary with mapping: "element : cluster_id". - diz_estim_grth = { - "a": 2, - "b": 2, - "c": 66, - "d": 9 - } - - # a sample where estimated != ground truth - sys.stdout.write("Ground truth\n") - sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID")) - for k, v in diz_grth.items(): - sys.stdout.write("%8s --> %10s\n" % (k, v)) - sys.stdout.write("\nEstimated clustering\n") - sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID")) - for k, v in diz_estim.items(): - sys.stdout.write("%8s --> %10s\n" % (k, v)) - # precision, recall, f-measure - p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim) - sys.stdout.write("\nPrecison: %s%%\n" % p) - sys.stdout.write("Recall: %s%%\n" % r) - sys.stdout.write("F-Measure: %s%%\n" % f) From b32ace4db8bfa7f9440832b92e85446f03d669ed Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 12:28:35 -0600 Subject: [PATCH 03/36] entry points --- avclass/input_checker.py | 5 ++++- avclass/labeler.py | 12 +++++++++--- setup.py | 7 ++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/avclass/input_checker.py b/avclass/input_checker.py index ca7c381..1547742 100755 --- a/avclass/input_checker.py +++ b/avclass/input_checker.py @@ -8,7 +8,7 @@ default_exp_file = "data/default.expansion" -if __name__ == '__main__': +def main(): argparser = 
argparse.ArgumentParser(prog='input_checker', description='Checks format of files Tagging, Expansion and Taxonomy.') @@ -48,3 +48,6 @@ sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % ( len(expansion), args.exp)) + +if __name__ == '__main__': + main() diff --git a/avclass/labeler.py b/avclass/labeler.py index c64a636..7bc9b88 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -56,7 +56,8 @@ def list_str(l, sep=", ", prefix=""): out = out + sep + s return out -def main(args): +def main(): + args = parse_args() # Select hash used to identify sample, by default MD5 hash_type = args.hash if args.hash else 'md5' @@ -351,7 +352,7 @@ def main(args): sys.stderr.write('[-] Alias data in %s\n' % (alias_filename)) -if __name__=='__main__': +def parse_args(): argparser = argparse.ArgumentParser(prog='avclass2_labeler', description='''Extracts tags for a set of samples. Also calculates precision and recall if ground truth available''') @@ -471,4 +472,9 @@ def main(args): sys.stderr.write('[-] Using default expansion tags in %s\n' % ( default_exp_file)) - main(args) + return args + + +if __name__=='__main__': + main() + \ No newline at end of file diff --git a/setup.py b/setup.py index ff933d6..fada7e7 100644 --- a/setup.py +++ b/setup.py @@ -14,4 +14,9 @@ tests_require=[ 'pytest', ], -) + entry_points={ + 'console_scripts': [ + 'avclass = avclass.labeler:main', + 'avclass-validate = avclass.input_checker:main', + ], + }) From 4bf29c1210e405802999578840b65edb07ed3946 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 12:57:03 -0600 Subject: [PATCH 04/36] flatten package; clustering cleanup --- avclass/clustering.py | 132 ++++++++ avclass/common.py | 637 +++++++++++++++++++++++++++++++++++++++ avclass/input_checker.py | 2 +- avclass/labeler.py | 4 +- avclass/update.py | 2 +- 5 files changed, 773 insertions(+), 4 deletions(-) create mode 100755 avclass/clustering.py create mode 100755 avclass/common.py diff --git a/avclass/clustering.py 
b/avclass/clustering.py new file mode 100755 index 0000000..c5a349c --- /dev/null +++ b/avclass/clustering.py @@ -0,0 +1,132 @@ +import sys + +from collections import defaultdict +from typing import Dict, Set + + +def tp_fp_fn(expected: Set, guess: Set): + """ + Calculate the true-positives, false-positives, and false-negatives between ``expected`` and ``guess`` + + :param expected: Ground truth set + :param guess: Estimated set + :return: Tuple containing true positive count, false positive count, false negative count + """ + tp = len(guess.intersection(expected)) + fp = len(guess.difference(expected)) + fn = len(expected.difference(guess)) + + return tp, fp, fn + + +def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): + """ + Evaluate the precision, recall, and f-measure for the comparison of ``expected`` to ``guess`` + + :param expected: Dictionary mapping an element to a cluster_id + :param guess: Dictionary mapping an element t a cluster_id + :return: Tuple containing precision, recall, and f-measure values + """ + # eval: precision, recall, f-measure + tmp_precision = 0 + tmp_recall = 0 + + # build reverse dictionary of guess + rev_est_dict = defaultdict(set) + for k, v in guess.items(): + rev_est_dict[v].add(k) + + # build reverse dictionary of expected + gt_rev_dict = defaultdict(set) + for k, v in expected.items(): + gt_rev_dict[v].add(k) + + counter, l = 0, len(guess) + + sys.stderr.write('Calculating precision and recall\n') + + # For each element + for element in guess: + # Print progress + if counter % 1000 == 0: + sys.stderr.write('\r%d out of %d' % (counter, l)) + sys.stderr.flush() + counter += 1 + + # Get elements in the same cluster (for "guess cluster") + guess_cluster_id = guess[element] + + # Get the list of elements in the same cluster ("expected cluster") + correct_cluster_id = expected[element] + + # Calculate TP, FP, FN + tp, fp, fn = tp_fp_fn(gt_rev_dict[correct_cluster_id], + rev_est_dict[guess_cluster_id]) + + # tmp_precision 
+ p = 1.0*tp/(tp+fp) + tmp_precision += p + + # tmp_recall + r = 1.0*tp/(tp+fn) + tmp_recall += r + + sys.stderr.write('\r%d out of %d' % (counter, l)) + sys.stderr.write('\n') + + precision = 100.0 * tmp_precision / len(guess) + recall = 100.0 * tmp_recall / len(guess) + fmeasure = (2 * precision * recall) / (precision + recall) + + return precision, recall, fmeasure + + +if __name__ == "__main__": + # The ground truth. + # Dictionary with mapping: "element : cluster_id". + diz_grth = { + "a": 1, + "b": 1, + "c": 2, + "d": 3 + } + + # An example of an "estimated cluster". + # Dictionary with mapping: "element : cluster_id". + diz_estim = { + "a": 66, + "b": 'malware', + "c": 'goodware', + "d": 'trojan' + } + + # An example of an "estimated cluster": same partitioning as for the ground + # truth, but just different cluster labels. Precision == Recall == + # F-Measure == 100%. + # Dictionary with mapping: "element : cluster_id". + diz_estim_grth = { + "a": 2, + "b": 2, + "c": 66, + "d": 9 + } + + # a sample where estimated != ground truth + sys.stdout.write("Ground truth\n") + sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID")) + + for k, v in diz_grth.items(): + sys.stdout.write("%8s --> %10s\n" % (k, v)) + + sys.stdout.write("\nEstimated clustering\n") + sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID")) + + for k, v in diz_estim.items(): + sys.stdout.write("%8s --> %10s\n" % (k, v)) + + # precision, recall, f-measure + p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim) + + sys.stdout.write("\nPrecison: %s%%\n" % p) + sys.stdout.write("Recall: %s%%\n" % r) + sys.stdout.write("F-Measure: %s%%\n" % f) diff --git a/avclass/common.py b/avclass/common.py new file mode 100755 index 0000000..dc28ff4 --- /dev/null +++ b/avclass/common.py @@ -0,0 +1,637 @@ +import logging +import re +import string +import sys + +from collections import namedtuple +from operator import itemgetter + + +# Set logging +log = logging.getLogger(__name__) + +# 
Prefix to identify platform tags +platform_prefix = "FILE:os:" + +# Default category for tags in taxonomy with no category +uncategorized_cat = "UNC" + +SampleInfo = namedtuple('SampleInfo', + ['md5', 'sha1', 'sha256', 'labels', 'vt_tags']) + +Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l']) + +# AVs to use in suffix removal +suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', + 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', + 'GData', 'Avast', 'Sophos', + 'TrendMicro-HouseCall', 'TrendMicro', + 'NANO-Antivirus', 'Microsoft'} + + +def create_tag(s): + ''' Create a Tag from its string representation ''' + word_list = s.strip().split(":") + if len(word_list) > 1: + name = word_list[-1].lower() + cat = word_list[0].upper() + prefix_l = [x.lower() for x in word_list[1:-1]] + path = cat + for x in prefix_l: + path = path + ':' + x + path = path + ':' + name + else: + name = word_list[0].lower() + cat = uncategorized_cat + prefix_l = [] + path = name + return Tag(name, cat, path, prefix_l) + + +class Taxonomy: + ''' + A taxonomy of tags and generic tokens read from file + ''' + def __init__(self, filepath): + ''' Map tag.name | tag.path -> Tag ''' + self.__tag_map = {} + if filepath: + self.read_taxonomy(filepath) + + def __len__(self): + ''' Taxonomy length is the number of tags it contains ''' + return len(self.__tag_map)//2 + + def is_generic(self, t): + ''' Return true if input is generic, false otherwise ''' + tag = self.__tag_map.get(t, None) + if tag: + return tag.cat == "GEN" + else: + return False + + def is_tag(self, t): + ''' Return true if input is tag, false otherwise ''' + return t in self.__tag_map + + def add_tag(self, s, override=False): + ''' Add tag to taxonomy + If tag already exists with different path, + only replaces if override True ''' + tag = create_tag(s) + t = self.__tag_map.get(tag.name, None) + if t and (t.path != tag.path): + if (not override): + return + else: + log.warn("[Taxonomy] Replacing %s with %s\n" % ( + 
t.path, tag.path)) + del self.__tag_map[t.path] + log.debug("[Taxonomy] Adding tag %s" % s) + self.__tag_map[tag.name] = tag + self.__tag_map[tag.path] = tag + return + + def remove_tag(self, t): + ''' Remove tag from taxonomy. Returns 1 if removed, zero if unknown ''' + tag = self.__tag_map.get(t, None) + if tag: + log.debug("[Taxonomy] Removing tag: %s" % tag.path) + del self.__tag_map[tag.name] + del self.__tag_map[tag.path] + return 1 + else: + return 0 + + def get_category(self, t): + ''' Return category of input tag, UNK if not a tag ''' + tag = self.__tag_map.get(t, None) + if tag: + return tag.cat + else: + return "UNK" + + def get_path(self, t): + ''' Return full path for given tag, or empty string if not a tag ''' + tag = self.__tag_map.get(t, None) + if tag: + return tag.path + else: + return ("UNK:" + t) + + def get_prefix_l(self, t): + ''' Return prefix list for given tag, or empty string if not a tag ''' + tag = self.__tag_map.get(t, None) + if tag: + return tag.prefix_l + else: + return [] + + def get_prefix(self, t): + ''' Return prefix string for given tag, + or empty string if not a tag ''' + tag = self.__tag_map.get(t, None) + if tag: + return tag.prefix_l + else: + return t.path[0:t.path.rfind(':')] + + def get_depth(self, t): + ''' Return depth of tag in taxonomy. + Returns zero if tag not in taxonomy. 
+ A normal tag CAT:name has depth two ''' + tag = self.__tag_map.get(t, None) + if tag: + return len(tag.prefix_l) + 2 + else: + return 0 + + def get_info(self, t): + ''' Return (path,category) for given tag, or UNK:t if not a tag ''' + tag = self.__tag_map.get(t, None) + if tag: + return tag.path, tag.cat + else: + return "UNK:" + t, "UNK" + + def expand(self, t): + ''' Return list of tags in prefix list that are leaves ''' + tag = self.__tag_map.get(t, None) + if tag: + return [t for t in tag.prefix_l if t in self.__tag_map] + else: + return [] + + def platform_tags(self): + ''' Returns list with platform tags in taxonomy ''' + acc = set() + for idx,tag in self.__tag_map.items(): + if tag.path.startswith(platform_prefix): + acc.add(tag.name) + return acc + + def overlaps(self, t1, t2): + ''' Returns true if the path of the given tags overlaps ''' + m1 = self.get_prefix_l(t1) + m2 = self.get_prefix_l(t2) + return (t1 in m2) or (t2 in m1) + + def remove_overlaps(self, l): + ''' Returns list with overlapping tags removed ''' + if not l: + return l + pair_l = sorted([(self.get_depth(t),t) for t in l]) + out_l = [pair_l.pop()[1]] + while pair_l: + t = pair_l.pop()[1] + if (not any(self.overlaps(t, e) for e in out_l)): + out_l.append(t) + return out_l + + def read_taxonomy(self, filepath): + '''Read taxonomy from given file ''' + with open(filepath, 'r') as fd: + for line in fd: + if line.startswith('#') or line == '\n': + continue + self.add_tag(line.strip()) + return + + def to_file(self, filepath): + ''' Output sorted taxonomy to given file ''' + # Open output file + fd = open(filepath, 'w') + # Write sorted tags + tag_l = sorted(self.__tag_map.items(), + key=lambda item : item[1].path, + reverse=False) + idx = 0 + for name,tag in tag_l: + if (idx % 2) == 0: + fd.write(tag.path+"\n") + idx+=1 + # Close output file + fd.close() + + +class Rules: + ''' + Rules are src -> dst1, dst2, ... 
relations + ''' + def __init__(self, filepath): + ''' Map src -> set(dst) ''' + self._rmap = {} + if filepath: + self.read_rules(filepath) + + def __len__(self): + ''' Length is number of rules, i.e., number of src ''' + return len(self._rmap) + + def add_rule(self, src, dst_l, overwrite=False): + ''' Add rule. If rule exists: + if overwrite==True, replace destination list + else append dst_l to current target set ''' + # Remove src from dst_l if it exists + dst_l = filter(lambda x: x != src, dst_l) + # If no destinations, nothing to do + if (not dst_l): + return + log.debug("[Rules] Adding %s -> %s" % (src, dst_l)) + src_tag = create_tag(src) + if overwrite: + target_l = [create_tag(dst).name for dst in dst_l] + self._rmap[src_tag.name] = set(target_l) + else: + curr_dst = self._rmap.get(src_tag.name, set()) + for dst in dst_l: + dst_tag = create_tag(dst) + curr_dst.add(dst_tag.name) + self._rmap[src_tag.name] = curr_dst + return + + def remove_rule(self, src): + l = self._rmap.get(src, []) + if l: + log.debug("[Rules] Removing rule: %s -> %s" % (src, l)) + del self._rmap[src] + return 1 + else: + return 0 + + def get_dst(self, src): + ''' Returns dst list for given src, or empty list if no expansion ''' + return list(self._rmap.get(src, [])) + + def read_rules(self, filepath): + '''Read rules from given file''' + with open(filepath, 'r') as fd: + for line in fd: + if line.startswith('#') or line == '\n': + continue + word_list = line.strip().split() + if len(word_list) > 1: + self.add_rule(word_list[0],word_list[1:]) + return + + def to_file(self, filepath, taxonomy=None): + ''' Output sorted rules to given file + If taxonomy is provided, it outputs full tag path ''' + fd = open(filepath, 'w') + for src,dst_set in sorted(self._rmap.items()): + dst_l = sorted(dst_set, reverse=False) + if taxonomy: + src_path = taxonomy.get_path(src) + path_l = [taxonomy.get_path(t) for t in dst_l] + dst_str = '\t'.join(path_l) + fd.write("%s\t%s\n" % (src_path,dst_str)) + else: + 
dst_str = '\t'.join(dst_l) + fd.write("%s\t%s\n" % (src,dst_str)) + fd.close() + + def expand_src_destinations(self, src): + ''' Return destination list for given src after recursively + following any rules for destinations ''' + dst_set = self._rmap.get(src, set()) + out = set() + while dst_set: + dst = dst_set.pop() + l = self._rmap.get(dst, []) + if l: + for e in l: + if (e not in out) and (e != dst): + dst_set.add(e) + else: + out.add(dst) + return out + + def expand_all_destinations(self): + ''' Return destination list for given src after recursively + following any rules for destinations ''' + src_l = self._rmap.keys() + for src in src_l: + dst_l = self.expand_src_destinations(src) + self._rmap[src] = dst_l + + +class Tagging(Rules): + ''' + Tagging rules have src UNK and dst in taxonomy + ''' + def __init__(self, filepath): + Rules.__init__(self, filepath) + + def validate(self, taxonomy): + ''' Check that tags in tagging rules are in given taxonomy ''' + for tok,tag_l in self._rmap.items(): + for t in tag_l: + if (not taxonomy.is_tag(t)): + sys.stdout.write("[Tagging] %s not in taxonomy\n" % t) + + +class Expansion(Rules): + ''' + Expansion rules have src and dst in taxonomy and + src.category != dst.category + ''' + def __init__(self, filepath): + Rules.__init__(self, filepath) + + def validate(self, taxonomy): + ''' Check that tags in expansion rules are in given taxonomy ''' + for src,dst_set in self._rmap.items(): + if (not taxonomy.is_tag(src)): + sys.stdout.write("[Expansion] %s not in taxonomy\n" % src) + for dst in dst_set: + if (not taxonomy.is_tag(dst)): + sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst) + + +class AvLabels: + ''' + Class to operate on AV labels, + such as extracting the most likely family name. 
+ ''' + def __init__(self, tag_file, exp_file = None, tax_file = None, + av_file = None, aliasdetect=False): + # Read taxonomy + self.taxonomy = Taxonomy(tax_file) + # Read tag rules + self.tagging = Tagging(tag_file) + # Read expansion rules + self.expansions = Expansion(exp_file) + # Read AV engines + self.avs = self.read_avs(av_file) if av_file else None + # Alias statistics initialization + self.aliasdetect = aliasdetect + + @staticmethod + def read_avs(avs_file): + '''Read AV engine set from given file''' + with open(avs_file) as fd: + avs = set(map(str.strip, fd.readlines())) + return avs + + @staticmethod + def get_sample_info_lb(vt_rep): + '''Parse and extract sample information from JSON line + Returns a SampleInfo named tuple + ''' + return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'], + vt_rep['av_labels'], []) + + @staticmethod + def get_sample_info_vt_v2(vt_rep): + '''Parse and extract sample information from JSON line + Returns a SampleInfo named tuple + ''' + label_pairs = [] + # Obtain scan results, if available + try: + scans = vt_rep['scans'] + md5 = vt_rep['md5'] + sha1 = vt_rep['sha1'] + sha256 = vt_rep['sha256'] + except KeyError: + return None + # Obtain labels from scan results + for av, res in scans.items(): + if res['detected']: + label = res['result'] + clean_label = ''.join(filter( + lambda x: x in string.printable, + label)).strip() + label_pairs.append((av, clean_label)) + # Obtain VT tags, if available + vt_tags = vt_rep.get('tags', []) + + return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) + + @staticmethod + def get_sample_info_vt_v3(vt_rep): + '''Parse and extract sample information from JSON line + Returns a SampleInfo named tuple + ''' + label_pairs = [] + # Obtain scan results, if available + try: + scans = vt_rep['data']['attributes']['last_analysis_results'] + md5 = vt_rep['data']['attributes']['md5'] + sha1 = vt_rep['data']['attributes']['sha1'] + sha256 = vt_rep['data']['attributes']['sha256'] + except 
KeyError: + return None + # Obtain labels from scan results + for av, res in scans.items(): + label = res['result'] + if label is not None: + clean_label = ''.join(filter( + lambda x: x in string.printable, + label)).strip() + label_pairs.append((av, clean_label)) + # Obtain VT tags, if available + vt_tags = vt_rep['data']['attributes'].get('tags', []) + + return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) + + + @staticmethod + def is_pup(tag_pairs, taxonomy): + '''This function classifies the sample as PUP or not + by checking if highest ranked CLASS tag contains "grayware" + and is above a predefined threshold + Return: + True/False/None + ''' + threshold = 0.5 + # If no tags, return false + if len(tag_pairs) < 1: + return None + max_ctr = tag_pairs[0][1] + for (tag,ctr) in tag_pairs: + (path, cat) = taxonomy.get_info(tag) + if (cat == "CLASS"): + if ("grayware" in path): + return (float(ctr) >= float(max_ctr)*threshold) + else: + return False + return False + + @staticmethod + def __remove_suffixes(av_name, label): + '''Remove AV specific suffixes from given label + Returns updated label''' + + # Truncate after last '.' + if av_name in suffix_removal_av_set: + label = label.rsplit('.', 1)[0] + + # Truncate after last '.' + # if suffix only contains digits or uppercase (no lowercase) chars + if av_name == 'AVG': + tokens = label.rsplit('.', 1) + if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]): + label = tokens[0] + + # Truncate after last '!' 
+ if av_name == 'Agnitum': + label = label.rsplit('!', 1)[0] + + return label + + + def get_label_tags(self, label, hashes): + ''' Return list of tags in given label + Tokenizes label, filters unneeded tokens, and + applies tagging rules ''' + + # Initialize set of tags to return + # We use a set to avoid duplicate tokens in the same AV label + # This avoids "potentially unwanted" contributing twice BEH:pup + tags = set() + + # If empty label, nothing to do + if not label: + return tags + + # Split label into tokens and process each token + for token in re.split("[^0-9a-zA-Z]", label): + # Convert token to lowercase + token = token.lower() + + # Remove digits at the end + end_len = len(re.findall("\d*$", token)[0]) + if end_len: + token = token[:-end_len] + + # Ignore token if prefix of a hash of the sample + # Most AVs use MD5 prefixes in labels, + # but we check SHA1 and SHA256 as well + hash_token = False + for hash_str in hashes: + if hash_str[0:len(token)] == token: + hash_token = True + break + if hash_token: + continue + + # Ignore generic tokens + if self.taxonomy.is_generic(token): + continue + + # Apply tagging rule + dst_l = self.tagging.get_dst(token) + if dst_l: + # Ignore generic tokens + for t in dst_l: + if not self.taxonomy.is_generic(t): + tags.add(t) + # Add token if longer than 3 characters and no tagging rule + elif len(token) > 3: + tags.add(token) + + # Return tags + return tags + + + def __expand(self, tag_set): + ''' Return expanded set of tags ''' + ret = set() + for t in tag_set: + # Include tag + ret.add(t) + + # Include target of expansion rule in output + ret.update(self.expansions.get_dst(t)) + + # Include implicit expansions in taxonomy + ret.update(self.taxonomy.expand(t)) + + # Return a list for backwards compatibility + return ret + + def get_sample_tags(self, sample_info): + ''' Returns dictionary tag -> AV list of tags for the given sample ''' + + # Whitelist the AVs to filter the ones with meaningful labels + av_whitelist = 
self.avs + # Initialize auxiliary data structures + duplicates = set() + av_dict = {} + + # Process each AV label + for (av_name, label) in sample_info.labels: + # If empty label, nothing to do + if not label: + continue + + ################ + # AV selection # + ################ + if av_whitelist and av_name not in av_whitelist: + continue + + ##################### + # Duplicate removal # + ##################### + + # Emsisoft uses same label as + # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan, + # but suffixes ' (B)' to their label. Remove the suffix. + if label.endswith(' (B)'): + label = label[:-4] + + # F-Secure uses Avira's engine since Nov. 2018 + # but prefixes 'Malware.' to Avira's label. Remove the prefix. + if label.startswith('Malware.'): + label = label[8:] + + # Other engines often use exactly the same label, e.g., + # AVG/Avast + # K7Antivirus/K7GW + # Kaspersky/ZoneAlarm + + # If we have seen the exact same label before, skip + if label in duplicates: + continue + # If not, we add it to duplicates + else: + duplicates.add(label) + + ################## + # Suffix removal # + ################## + label = self.__remove_suffixes(av_name, label) + + ######################################################## + # Tokenization and tagging # + ######################################################## + hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ] + tags = self.get_label_tags(label, hashes) + + ######################################################## + # Expansions # + ######################################################## + # NOTE: Avoiding to do expansion when aliases + if self.aliasdetect: + expanded_tags = tags + else: + expanded_tags = self.__expand(tags) + + ######################################################## + # Stores information that relates AV vendors with tags # + ######################################################## + for t in expanded_tags: + av_dict.setdefault(t, []).append(av_name) + + return av_dict + + 
def rank_tags(self, av_dict, threshold=1): + ''' Return list of (tag, confidence) ranked by decreasing confidence + and filter tags with less or equal threshold confidence ''' + + pairs = ((t, len(avs)) for (t,avs) in av_dict.items() + if len(avs) > threshold) + return sorted(pairs, key=itemgetter(1,0), reverse=True) + diff --git a/avclass/input_checker.py b/avclass/input_checker.py index 1547742..7ccf5bc 100755 --- a/avclass/input_checker.py +++ b/avclass/input_checker.py @@ -1,6 +1,6 @@ import sys import argparse -from avclass.lib import Taxonomy, Tagging, Expansion +from avclass.common import Taxonomy, Tagging, Expansion default_tag_file = "data/default.tagging" diff --git a/avclass/labeler.py b/avclass/labeler.py index 7bc9b88..0649c47 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -6,7 +6,8 @@ from operator import itemgetter -from avclass.lib import AvLabels, clustering as ec +from avclass.common import AvLabels +from avclass import clustering as ec script_dir = os.path.dirname(os.path.abspath(__file__)) @@ -477,4 +478,3 @@ def parse_args(): if __name__=='__main__': main() - \ No newline at end of file diff --git a/avclass/update.py b/avclass/update.py index 14bb1c5..525bc1d 100755 --- a/avclass/update.py +++ b/avclass/update.py @@ -6,7 +6,7 @@ from collections import namedtuple from operator import itemgetter -from avclass.lib import Taxonomy, Expansion, Tagging +from avclass.common import Taxonomy, Expansion, Tagging # Set logging From 6c58d9e3e6fb960de4e1cebfbc11b0ac54838e25 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 15:47:55 -0600 Subject: [PATCH 05/36] add cli entry points; rework validator --- avclass/cli.py | 22 ++++++ avclass/data/__init__.py | 0 {data => avclass/data}/andropup.expansion | 0 {data => avclass/data}/default.expansion | 0 {data => avclass/data}/default.tagging | 0 {data => avclass/data}/default.taxonomy | 0 avclass/input_checker.py | 53 -------------- avclass/util.py | 86 +++++++++++++++++++++++ 
setup.py | 5 +- 9 files changed, 112 insertions(+), 54 deletions(-) create mode 100644 avclass/cli.py create mode 100644 avclass/data/__init__.py rename {data => avclass/data}/andropup.expansion (100%) rename {data => avclass/data}/default.expansion (100%) rename {data => avclass/data}/default.tagging (100%) rename {data => avclass/data}/default.taxonomy (100%) delete mode 100755 avclass/input_checker.py create mode 100755 avclass/util.py diff --git a/avclass/cli.py b/avclass/cli.py new file mode 100644 index 0000000..76e2ad3 --- /dev/null +++ b/avclass/cli.py @@ -0,0 +1,22 @@ +import argparse + +from avclass import util + + +def validate_files(): + parser = argparse.ArgumentParser(description='Checks format of files Tagging, Expansion and Taxonomy.') + parser.add_argument('-exp', + help='expansion file', + default=util.DEFAULT_EXP_PATH) + parser.add_argument('-tag', + help='tagging file', + default=util.DEFAULT_TAG_PATH) + parser.add_argument('-tax', + help='taxonomy file', + default=util.DEFAULT_TAX_PATH) + + args = parser.parse_args() + + taxonomy = util.validate_taxonomy(args.tax) + util.validate_tagging(args.tag, taxonomy) + util.validate_expansion(args.exp, taxonomy) diff --git a/avclass/data/__init__.py b/avclass/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/andropup.expansion b/avclass/data/andropup.expansion similarity index 100% rename from data/andropup.expansion rename to avclass/data/andropup.expansion diff --git a/data/default.expansion b/avclass/data/default.expansion similarity index 100% rename from data/default.expansion rename to avclass/data/default.expansion diff --git a/data/default.tagging b/avclass/data/default.tagging similarity index 100% rename from data/default.tagging rename to avclass/data/default.tagging diff --git a/data/default.taxonomy b/avclass/data/default.taxonomy similarity index 100% rename from data/default.taxonomy rename to avclass/data/default.taxonomy diff --git a/avclass/input_checker.py 
b/avclass/input_checker.py deleted file mode 100755 index 7ccf5bc..0000000 --- a/avclass/input_checker.py +++ /dev/null @@ -1,53 +0,0 @@ -import sys -import argparse -from avclass.common import Taxonomy, Tagging, Expansion - - -default_tag_file = "data/default.tagging" -default_tax_file = "data/default.taxonomy" -default_exp_file = "data/default.expansion" - - -def main(): - argparser = argparse.ArgumentParser(prog='input_checker', - description='Checks format of files Tagging, Expansion and Taxonomy.') - - argparser.add_argument('-tag', - help='tagging file', - default=default_tag_file) - - argparser.add_argument('-tax', - help='taxonomy file', - default=default_tax_file) - - argparser.add_argument('-exp', - help='expansion file', - default=default_exp_file) - - # Parse arguments - args = argparser.parse_args() - - # Normalize taxonomy - taxonomy = Taxonomy(args.tax) - taxonomy.to_file(args.tax) - sys.stdout.write('[-] Normalized %d tags in taxonomy %s\n' % ( - len(taxonomy), args.tax)) - - # Normalize tagging rules - tagging = Tagging(args.tag) - tagging.validate(taxonomy) - # tagging.expand_all_destinations() - tagging.to_file(args.tag) - sys.stdout.write('[-] Normalized %d tagging rules in %s\n' % ( - len(tagging), args.tag)) - - # Normalize expansion rules - expansion = Expansion(args.exp) - expansion.validate(taxonomy) - expansion.to_file(args.exp) - sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % ( - len(expansion), args.exp)) - - -if __name__ == '__main__': - main() diff --git a/avclass/util.py b/avclass/util.py new file mode 100755 index 0000000..7b4bba4 --- /dev/null +++ b/avclass/util.py @@ -0,0 +1,86 @@ +import atexit +import logging +import pkg_resources + +from avclass import data +from avclass.common import Taxonomy, Tagging, Expansion + +from typing import AnyStr + + +__all__ = ( + 'DEFAULT_EXP_PATH', + 'DEFAULT_TAG_PATH', + 'DEFAULT_TAX_PATH', + 'validate_expansion', + 'validate_tagging', + 'validate_taxonomy', +) + + +logger = 
logging.getLogger(__name__) + +DEFAULT_EXP = "default.expansion" +DEFAULT_TAG = "default.tagging" +DEFAULT_TAX = "default.taxonomy" + +DEFAULT_TAG_PATH = None +DEFAULT_TAX_PATH = None +DEFAULT_EXP_PATH = None + +if pkg_resources.resource_exists(data, DEFAULT_EXP): + DEFAULT_EXP_PATH = pkg_resources.resource_filename(data, DEFAULT_EXP) + +if pkg_resources.resource_exists(data, DEFAULT_TAG): + DEFAULT_TAG_PATH = pkg_resources.resource_filename(data, DEFAULT_TAG) + +if pkg_resources.resource_exists(data, DEFAULT_TAX): + DEFAULT_TAX_PATH = pkg_resources.resource_filename(data, DEFAULT_TAX) + +atexit.register(pkg_resources.cleanup_resources) + + +def validate_taxonomy(path: AnyStr): + """ + Validate and normalize a Taxonomy created from ``path`` + + :param path: Location on disk of a Taxonomy file + :return: Taxonomy object + """ + taxonomy = Taxonomy(path) + taxonomy.to_file(path) + + logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path)) + + return taxonomy + + +def validate_tagging(path: AnyStr, taxonomy: Taxonomy): + """ + Validate and normalize Tagging created from ``path`` and verified against ``taxonomy`` + + :param path: Location on disk of a Tagging file + :param taxonomy: Valid Taxonomy object + :return: None + """ + tagging = Tagging(path) + tagging.validate(taxonomy) + # tagging.expand_all_destinations() + tagging.to_file(path) + + logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path)) + + +def validate_expansion(path: AnyStr, taxonomy: Taxonomy): + """ + Validate and normalize Expansion created from ``path`` and verified against ``taxonomy`` + + :param path: Location on disk of an Expansion file + :param taxonomy: Valid Taxonomy object + :return: None + """ + expansion = Expansion(path) + expansion.validate(taxonomy) + expansion.to_file(path) + + logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path)) diff --git a/setup.py b/setup.py index fada7e7..6bcc101 100644 --- a/setup.py +++ 
b/setup.py @@ -7,6 +7,9 @@ description='Tag and label malware samples', license='LICENSE', packages=find_packages(), + package_data={ + 'avclass': ['data/default.*'], + }, install_requires=[], setup_requires=[ 'pytest-runner', @@ -17,6 +20,6 @@ entry_points={ 'console_scripts': [ 'avclass = avclass.labeler:main', - 'avclass-validate = avclass.input_checker:main', + 'avclass-validate = avclass.util:validate_files', ], }) From c845baff2a4eedce7a742dd93b81d53358e97265 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 16:07:18 -0600 Subject: [PATCH 06/36] update cleanup --- avclass/update.py | 84 +++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 47 deletions(-) mode change 100755 => 100644 avclass/update.py diff --git a/avclass/update.py b/avclass/update.py old mode 100755 new mode 100644 index 525bc1d..6d0558c --- a/avclass/update.py +++ b/avclass/update.py @@ -6,12 +6,11 @@ from collections import namedtuple from operator import itemgetter +from avclass import util from avclass.common import Taxonomy, Expansion, Tagging -# Set logging -log = logging.getLogger(__name__) - +logger = logging.getLogger(__name__) # Log warn and above to stderr formatter = logging.Formatter(u'%(message)s') handler_stderr = logging.StreamHandler(sys.stderr) @@ -21,26 +20,17 @@ root.setLevel(logging.DEBUG) root.addHandler(handler_stderr) - -script_dir = os.path.dirname(os.path.abspath(__file__)) -# Default tagging file -default_tagging_file = os.path.join(script_dir, "data/default.tagging") -# Default expansion file -default_expansion_file = os.path.join(script_dir, "data/default.expansion") -# Default taxonomy file -default_taxonomy_file = os.path.join(script_dir, "data/default.taxonomy") - # Threshold for string similarity # sim_threshold = 0.6 -# Relation -Rel = namedtuple('Rel', ['t1', 't2', 't1_num', 't2_num', - 'nalias_num', 'talias_num', 'tinv_alias_num']) +Relation = namedtuple('Relation', ['t1', 't2', 't1_num', 't2_num', + 
'nalias_num', 'talias_num', 'tinv_alias_num']) + class Update: ''' Update Module ''' - def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, + def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, n, t): # Initialize inputs self.__out_taxonomy = in_taxonomy @@ -59,7 +49,7 @@ def num_rules(self): return len(self.rel_set) def is_weak_rel(self, rel): - ''' Return true if relationship is weak, + ''' Return true if relationship is weak, i.e., does not meet thresholds ''' return ((int(rel.nalias_num) < self.__n) or (float(rel.talias_num) < self.__t)) @@ -124,7 +114,7 @@ def add_alias(self, src, dst, dst_prefix): cnt = self.src_map.get(e, 0) if cnt > cnt_max: target = e - # If dst is in tagging, update tagging rule destination, + # If dst is in tagging, update tagging rule destination, l = self.__out_tagging.get_dst(dst) if l: target_l = l @@ -151,11 +141,11 @@ def find_expansions(self): for rel in self.rel_set: p1 = self.__out_taxonomy.get_path(rel.t1) p2 = self.__out_taxonomy.get_path(rel.t2) - log.debug("Processing %s\t%s" % (p1, p2)) + logger.debug("Processing %s\t%s" % (p1, p2)) # Ignore relations where t1 is an alias l = self.__out_tagging.get_dst(rel.t1) if l: - log.debug("Ignoring relation for alias %s" % p1) + logger.debug("Ignoring relation for alias %s" % p1) continue if self.is_expansion_rel(rel): self.add_expansion(rel.t1, [rel.t2]) @@ -191,7 +181,7 @@ def process_relation(self, rel): p1,c1 = self.__out_taxonomy.get_info(rel.t1) p2,c2 = self.__out_taxonomy.get_info(rel.t2) - log.debug("Processing %s\t%s" % (p1, p2)) + logger.debug("Processing %s\t%s" % (p1, p2)) # If both directions strong, then equivalent, i.e., alias if (float(rel.tinv_alias_num) >= args.t): @@ -204,7 +194,7 @@ def process_relation(self, rel): elif (c1 == c2): prefix = p1[0:p1.rfind(':')] else: - log.warn("Equivalent rule with different categories: %s\t%s" % + logger.warn("Equivalent rule with different categories: %s\t%s" % (p1, p2)) return -1 
self.add_alias(t1, t2, prefix) @@ -278,7 +268,7 @@ def run(self): # Do a pass in remaining relations cnt = 0 new_set = set() - log.debug("[-] %03d Processing relations" % num_iter) + logger.debug("[-] %03d Processing relations" % num_iter) while self.rel_set: rel = self.rel_set.pop() # If known relation, continue @@ -306,12 +296,12 @@ def run(self): # self.find_aliases() # Find expansions - log.debug("[-] Finding expansions") + logger.debug("[-] Finding expansions") self.find_expansions() def read_relations(self, filepath): - ''' Returns relations in file as a set + ''' Returns relations in file as a set Filters weak and blacklisted relations ''' rel_set = set() with open(filepath, 'r') as fd: @@ -323,8 +313,8 @@ def read_relations(self, filepath): t1, t2, t1_num, t2_num, nalias_num, talias_num, \ tinv_alias_num = line.strip().split('\t') # Build relation - rel = Rel(t1, t2, t1_num, t2_num, nalias_num, - talias_num, tinv_alias_num) + rel = Relation(t1, t2, t1_num, t2_num, nalias_num, + talias_num, tinv_alias_num) # Ignore weak relations if self.is_weak_rel(rel): continue @@ -332,7 +322,7 @@ def read_relations(self, filepath): if self.is_blacklisted_rel(rel): continue # Ignore known relations - # NOTE: commented since we check if a + # NOTE: commented since we check if a # relation is known before processing it #if self.is_known_rel(rel): # continue @@ -348,16 +338,16 @@ def output_relations(self, filepath): fd = open(filepath, 'w') fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t" "|t1^t2|/|t2|\n") - sorted_rules = sorted(self.rel_set, + sorted_rules = sorted(self.rel_set, key=(lambda r: ( self.__out_taxonomy.get_category(r.t1), - self.__out_taxonomy.get_category(r.t2))), + self.__out_taxonomy.get_category(r.t2))), reverse=False) for rel in sorted_rules: p1,c1 = self.__out_taxonomy.get_info(rel.t1) p2,c2 = self.__out_taxonomy.get_info(rel.t2) fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %( - p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num, + p1, p2, rel.t1_num, 
rel.t2_num, rel.nalias_num, rel.talias_num, rel.tinv_alias_num)) fd.close() @@ -373,35 +363,35 @@ def output_rule_stats(self, fd): c2), 0) + 1 self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1 # Output statistics - cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1,0), + cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1,0), reverse=True) for (c1,c2), cnt in cat_pairs: fd.write("%s\t%s\t%03d\n" % (c1, c2, cnt)) # Print dst statistics - dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1,0), + dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1,0), reverse=False) for dst, cnt in dst_pairs: fd.write("%s\t%03d\n" % (taxonomy.get_path(dst), cnt)) def output(self, out_prefix): if (not out_prefix): - tax_filepath = default_taxonomy_file - tag_filepath = default_tagging_file - exp_filepath = default_expansion_file + tax_filepath = util.DEFAULT_TAX_PATH + tag_filepath = util.DEFAULT_TAG_PATH + exp_filepath = util.DEFAULT_EXP_PATH else: tax_filepath = out_prefix + ".taxonomy" tag_filepath = out_prefix + ".tagging" exp_filepath = out_prefix + ".expansion" taxonomy.to_file(tax_filepath) - log.info('[-] Output %d taxonomy tags to %s' % ( + logger.info('[-] Output %d taxonomy tags to %s' % ( len(taxonomy), tax_filepath)) tagging.expand_all_destinations() tagging.to_file(tag_filepath) - log.info('[-] Output %d tagging rules to %s' % ( + logger.info('[-] Output %d tagging rules to %s' % ( len(tagging), tag_filepath)) expansion.to_file(exp_filepath) - log.info('[-] Output %d expansion rules to %s' % ( + logger.info('[-] Output %d expansion rules to %s' % ( len(expansion), exp_filepath)) @@ -435,15 +425,15 @@ def output(self, out_prefix): argparser.add_argument('-tag', help='file with tagging rules.', - default = default_tagging_file) + default = util.DEFAULT_TAG_PATH) argparser.add_argument('-tax', help='file with taxonomy.', - default = default_taxonomy_file) + default = util.DEFAULT_TAX_PATH) argparser.add_argument('-exp', 
help='file with expansion rules.', - default = default_expansion_file) + default = util.DEFAULT_EXP_PATH) argparser.add_argument('-v', '--verbose', action='store_true', @@ -454,7 +444,7 @@ def output(self, out_prefix): # Check we have the input if not args.alias: - log.error('[-] Please provide an alias file with -alias') + logger.error('[-] Please provide an alias file with -alias') exit(1) # Set logging level @@ -469,23 +459,23 @@ def output(self, out_prefix): # Read taxonomy taxonomy = Taxonomy(args.tax) - log.info('[-] Read %d taxonomy tags from %s' % ( + logger.info('[-] Read %d taxonomy tags from %s' % ( len(taxonomy), args.tax)) # Read tagging rules tagging = Tagging(args.tag) - log.info('[-] Read %d tagging rules from %s' % ( + logger.info('[-] Read %d tagging rules from %s' % ( len(tagging), args.tag)) # Read expansion rules expansion = Expansion(args.exp) - log.info('[-] Read %d expansion rules from %s' % ( + logger.info('[-] Read %d expansion rules from %s' % ( len(expansion), args.exp)) # Build update object update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t) - log.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % ( + logger.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % ( update.num_rules(), args.t, args.n)) # Output initial rules From e8df430b2748514a7e509b8db446cbdd096b004c Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Mon, 11 Jan 2021 16:24:28 -0600 Subject: [PATCH 07/36] fix pkg stuff in util; cleanup labeler --- avclass/labeler.py | 44 ++++++++++++++++---------------------------- avclass/util.py | 18 +++++++++--------- 2 files changed, 25 insertions(+), 37 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index 0649c47..035d4af 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -7,16 +7,7 @@ from operator import itemgetter from avclass.common import AvLabels -from avclass import clustering as ec - - -script_dir = os.path.dirname(os.path.abspath(__file__)) -# Default 
tagging file -default_tag_file = os.path.join(script_dir, "data/default.tagging") -# Default expansion file -default_exp_file = os.path.join(script_dir, "data/default.expansion") -# Default taxonomy file -default_tax_file = os.path.join(script_dir, "data/default.taxonomy") +from avclass import clustering as ec, util def guess_hash(h): @@ -31,6 +22,7 @@ def guess_hash(h): else: return None + def format_tag_pairs(l, taxonomy=None): ''' Return ranked tags as string ''' if not l: @@ -48,6 +40,7 @@ def format_tag_pairs(l, taxonomy=None): out += ",%s|%d" % (p, s) return out + def list_str(l, sep=", ", prefix=""): ''' Return list as a string ''' if not l: @@ -57,6 +50,7 @@ def list_str(l, sep=", ", prefix=""): out = out + sep + s return out + def main(): args = parse_args() # Select hash used to identify sample, by default MD5 @@ -354,7 +348,7 @@ def main(): def parse_args(): - argparser = argparse.ArgumentParser(prog='avclass2_labeler', + argparser = argparse.ArgumentParser(prog='avclass', description='''Extracts tags for a set of samples. Also calculates precision and recall if ground truth available''') @@ -387,15 +381,15 @@ def parse_args(): argparser.add_argument('-tag', help='file with tagging rules.', - default = default_tag_file) + default = util.DEFAULT_TAG_PATH) argparser.add_argument('-tax', help='file with taxonomy.', - default = default_tax_file) + default = util.DEFAULT_TAX_PATH) argparser.add_argument('-exp', help='file with expansion rules.', - default = default_exp_file) + default = util.DEFAULT_EXP_PATH) argparser.add_argument('-av', help='file with list of AVs to use') @@ -435,43 +429,37 @@ def parse_args(): if not args.vt and not args.lb and not args.vtdir and not args.lbdir: sys.stderr.write('One of the following 4 arguments is required: ' - '-vt,-lb,-vtdir,-lbdir\n') + '-vt,-lb,-vtdir,-lbdir\n') exit(1) if (args.vt or args.vtdir) and (args.lb or args.lbdir): sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. 
' - 'Both types of input files cannot be combined.\n') + 'Both types of input files cannot be combined.\n') exit(1) if args.tag: if args.tag == '/dev/null': sys.stderr.write('[-] Using no tagging rules\n') else: - sys.stderr.write('[-] Using tagging rules in %s\n' % ( - args.tag)) + sys.stderr.write('[-] Using tagging rules in %s\n' % (args.tag)) else: - sys.stderr.write('[-] Using default tagging rules in %s\n' % ( - default_tag_file)) + sys.stderr.write('[-] Using default tagging rules in %s\n' % (util.DEFAULT_TAG_PATH)) if args.tax: if args.tax == '/dev/null': sys.stderr.write('[-] Using no taxonomy\n') else: - sys.stderr.write('[-] Using taxonomy in %s\n' % ( - args.tax)) + sys.stderr.write('[-] Using taxonomy in %s\n' % (args.tax)) else: - sys.stderr.write('[-] Using default taxonomy in %s\n' % ( - default_tax_file)) + sys.stderr.write('[-] Using default taxonomy in %s\n' % (util.DEFAULT_TAX_PATH)) if args.exp: if args.exp == '/dev/null': sys.stderr.write('[-] Using no expansion tags\n') else: - sys.stderr.write('[-] Using expansion tags in %s\n' % ( - args.exp)) + sys.stderr.write('[-] Using expansion tags in %s\n' % (args.exp)) else: - sys.stderr.write('[-] Using default expansion tags in %s\n' % ( - default_exp_file)) + sys.stderr.write('[-] Using default expansion tags in %s\n' % (util.DEFAULT_EXP_PATH)) return args diff --git a/avclass/util.py b/avclass/util.py index 7b4bba4..ceaf071 100755 --- a/avclass/util.py +++ b/avclass/util.py @@ -20,22 +20,22 @@ logger = logging.getLogger(__name__) -DEFAULT_EXP = "default.expansion" -DEFAULT_TAG = "default.tagging" -DEFAULT_TAX = "default.taxonomy" +RESOURCE_EXP = "default.expansion" +RESOURCE_TAG = "default.tagging" +RESOURCE_TAX = "default.taxonomy" DEFAULT_TAG_PATH = None DEFAULT_TAX_PATH = None DEFAULT_EXP_PATH = None -if pkg_resources.resource_exists(data, DEFAULT_EXP): - DEFAULT_EXP_PATH = pkg_resources.resource_filename(data, DEFAULT_EXP) +if pkg_resources.resource_exists(data.__name__, RESOURCE_EXP): + 
DEFAULT_EXP_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_EXP) -if pkg_resources.resource_exists(data, DEFAULT_TAG): - DEFAULT_TAG_PATH = pkg_resources.resource_filename(data, DEFAULT_TAG) +if pkg_resources.resource_exists(data.__name__, RESOURCE_TAG): + DEFAULT_TAG_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_TAG) -if pkg_resources.resource_exists(data, DEFAULT_TAX): - DEFAULT_TAX_PATH = pkg_resources.resource_filename(data, DEFAULT_TAX) +if pkg_resources.resource_exists(data.__name__, RESOURCE_TAX): + DEFAULT_TAX_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_TAX) atexit.register(pkg_resources.cleanup_resources) From 992178724222d1f33d0d18eb180609d382a3af8d Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Wed, 13 Jan 2021 11:37:16 -0600 Subject: [PATCH 08/36] incremental --- avclass/common.py | 688 +++++++++++++++++++++++++++------------------- avclass/update.py | 4 +- avclass/util.py | 4 +- 3 files changed, 403 insertions(+), 293 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index dc28ff4..dbc292b 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -5,10 +5,10 @@ from collections import namedtuple from operator import itemgetter +from typing import Any, AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union -# Set logging -log = logging.getLogger(__name__) +logger = logging.getLogger(__name__) # Prefix to identify platform tags platform_prefix = "FILE:os:" @@ -23,14 +23,19 @@ # AVs to use in suffix removal suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', - 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', - 'GData', 'Avast', 'Sophos', - 'TrendMicro-HouseCall', 'TrendMicro', - 'NANO-Antivirus', 'Microsoft'} + 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', + 'GData', 'Avast', 'Sophos', + 'TrendMicro-HouseCall', 'TrendMicro', + 'NANO-Antivirus', 'Microsoft'} -def create_tag(s): - ''' Create a Tag from its string representation ''' +def create_tag(s: AnyStr): + """ + 
Create a Tag from its string representation (path) + + :param s: The string + :return: A Tag object + """ word_list = s.strip().split(":") if len(word_list) > 1: name = word_list[-1].lower() @@ -49,195 +54,264 @@ def create_tag(s): class Taxonomy: - ''' - A taxonomy of tags and generic tokens read from file - ''' - def __init__(self, filepath): - ''' Map tag.name | tag.path -> Tag ''' + """ + Contains tags and generic tokens read from filesystem + """ + def __init__(self, filepath: Optional[AnyStr]): + """ + Initialize and populate the Tag map from ``filepath`` + + :param filepath: Path to taxonomy data + """ self.__tag_map = {} if filepath: self.read_taxonomy(filepath) - def __len__(self): - ''' Taxonomy length is the number of tags it contains ''' - return len(self.__tag_map)//2 - - def is_generic(self, t): - ''' Return true if input is generic, false otherwise ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.cat == "GEN" - else: - return False - - def is_tag(self, t): - ''' Return true if input is tag, false otherwise ''' - return t in self.__tag_map - - def add_tag(self, s, override=False): - ''' Add tag to taxonomy - If tag already exists with different path, - only replaces if override True ''' + def __len__(self) -> int: + """ + The number of tags contained in __tag_map (divided by 2 because we store paths there too) + + :return: The length (int) of the Taxonomy + """ + return len(self.__tag_map)//2 # TODO - perhaps there should be two dicts, one for names, one for paths? 
+ + def is_generic(self, tag: AnyStr) -> bool: + """ + Whether or not the input ``tag`` is generic + + :param tag: The tag + :return: Boolean + """ + t = self.__tag_map.get(tag, None) + return getattr(t, 'cat', None) == 'GEN' + + def is_tag(self, tag: AnyStr) -> bool: + """ + Whether this Taxonomy is aware of ``tag`` + + :param tag: The tag + :return: Boolean + """ + return tag in self.__tag_map + + def add_tag(self, s: AnyStr, override: bool = False): + """ + Add a tag (``s``) to the Taxonomy. Collisions are only replaced if ``override`` is truthy. + + :param s: A string to create a Tag from + :param override: Whether or not to replace a duplicate if present + :return: None + """ tag = create_tag(s) t = self.__tag_map.get(tag.name, None) + if t and (t.path != tag.path): - if (not override): - return - else: - log.warn("[Taxonomy] Replacing %s with %s\n" % ( - t.path, tag.path)) + if override: + logger.warning("[Taxonomy] Replacing %s with %s\n" % t.path, tag.path) del self.__tag_map[t.path] - log.debug("[Taxonomy] Adding tag %s" % s) + else: + return + + logger.debug("[Taxonomy] Adding tag %s" % s) self.__tag_map[tag.name] = tag self.__tag_map[tag.path] = tag - return - - def remove_tag(self, t): - ''' Remove tag from taxonomy. 
Returns 1 if removed, zero if unknown ''' - tag = self.__tag_map.get(t, None) - if tag: - log.debug("[Taxonomy] Removing tag: %s" % tag.path) - del self.__tag_map[tag.name] - del self.__tag_map[tag.path] - return 1 - else: - return 0 - def get_category(self, t): - ''' Return category of input tag, UNK if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.cat - else: - return "UNK" - - def get_path(self, t): - ''' Return full path for given tag, or empty string if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.path - else: - return ("UNK:" + t) - - def get_prefix_l(self, t): - ''' Return prefix list for given tag, or empty string if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.prefix_l - else: - return [] + def remove_tag(self, tag: AnyStr) -> bool: + """ + Remove a Tag from the Taxonomy. - def get_prefix(self, t): - ''' Return prefix string for given tag, - or empty string if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.prefix_l - else: - return t.path[0:t.path.rfind(':')] - - def get_depth(self, t): - ''' Return depth of tag in taxonomy. - Returns zero if tag not in taxonomy. - A normal tag CAT:name has depth two ''' - tag = self.__tag_map.get(t, None) + :param tag: The tag to remove + :return: Whether or not the tag was present + """ + t = self.__tag_map.get(tag, None) if tag: + logger.debug("[Taxonomy] Removing tag: %s" % t.path) + del self.__tag_map[t.name] + del self.__tag_map[t.path] + return t is not None + + def get_category(self, tag: AnyStr) -> AnyStr: + """ + Return the tag's category or "UNK" if it's not a tag. + + :param tag: The tag + :return: The category + """ + t = self.__tag_map.get(tag, None) + return getattr(t, 'cat', 'UNK') + + def get_path(self, tag: AnyStr) -> AnyStr: + """ + Get a tag's full path. 
+ + :param tag: The tag + :return: The tag's path + """ + t = self.__tag_map.get(tag, None) + return getattr(t, 'path', f'UNK:{tag}') + + def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]: + """ + Get a tag's prefix list. + + :param tag: The tag + :return: The tag's prefix list + """ + t = self.__tag_map.get(tag, None) + return getattr(t, 'prefix_l', []) + + def get_prefix(self, tag: AnyStr) -> List[AnyStr]: + """ + Get a tag's prefixes. + + :param tag: The tag + :return: String representation of the tag's full prefix + """ + t = self.__tag_map.get(tag, None) + tag_pfx = tag.path.split(':')[:-1] + return t.prefix_l if t else tag_pfx + + def get_depth(self, tag: AnyStr) -> int: + """ + Determine the "depth" (token count) of the tag + + :param tag: The tag + :return: The depth (int) of the tag + """ + t = self.__tag_map.get(tag, None) + if t: return len(tag.prefix_l) + 2 - else: - return 0 - - def get_info(self, t): - ''' Return (path,category) for given tag, or UNK:t if not a tag ''' - tag = self.__tag_map.get(t, None) - if tag: - return tag.path, tag.cat - else: - return "UNK:" + t, "UNK" - - def expand(self, t): - ''' Return list of tags in prefix list that are leaves ''' - tag = self.__tag_map.get(t, None) - if tag: - return [t for t in tag.prefix_l if t in self.__tag_map] - else: - return [] - - def platform_tags(self): - ''' Returns list with platform tags in taxonomy ''' - acc = set() - for idx,tag in self.__tag_map.items(): - if tag.path.startswith(platform_prefix): - acc.add(tag.name) - return acc - - def overlaps(self, t1, t2): - ''' Returns true if the path of the given tags overlaps ''' + return 0 + + def get_info(self, tag: AnyStr) -> Tuple[AnyStr, AnyStr]: + """ + Get tag info (path, category) or "UNK:tag" + + :param tag: The tag + :return: Tuple containing tag.path and tag.cat + """ + t = self.__tag_map.get(tag, None) + if t: + return t.path, t.cat + return f"UNK:{tag}", "UNK" + + def expand(self, tag: AnyStr) -> List[AnyStr]: + """ + Return tag 
prefixes that are leaf-nodes + + :param tag: The tag + :return: A list of prefixes + """ + t = self.__tag_map.get(tag, None) + if t: + return [x for x in t.prefix_l if x in self.__tag_map] + return [] + + def platform_tags(self) -> Set[AnyStr]: + """ + Returns a set of platform tags in the Taxonomy + + :return: Set of platformn tags + """ + return {tag.name for _, tag in self.__tag_map.items() if tag.path.startswith(platform_prefix)} + + def overlaps(self, t1: AnyStr, t2: AnyStr) -> bool: + """ + Whether or not the two tags overlap + + :param t1: The first Tag + :param t2: The second Tag + :return: Boolean + """ m1 = self.get_prefix_l(t1) m2 = self.get_prefix_l(t2) - return (t1 in m2) or (t2 in m1) + return t1 in m2 or t2 in m1 - def remove_overlaps(self, l): - ''' Returns list with overlapping tags removed ''' + def remove_overlaps(self, l: Collection[AnyStr]) -> Union[Collection[AnyStr], List[AnyStr]]: + """ + Returns list with overlapping tags removed + + :param l: The list + :return: Deduped list + """ + # TODO - code smell if not l: return l - pair_l = sorted([(self.get_depth(t),t) for t in l]) + pair_l = sorted([(self.get_depth(t), t) for t in l]) out_l = [pair_l.pop()[1]] while pair_l: t = pair_l.pop()[1] - if (not any(self.overlaps(t, e) for e in out_l)): + if not any(self.overlaps(t, e) for e in out_l): out_l.append(t) return out_l - def read_taxonomy(self, filepath): - '''Read taxonomy from given file ''' + def read_taxonomy(self, filepath: AnyStr): + """ + Create Taxonomy from file (tab-separated lines) + + :param filepath: The path of the file to read + :return: None + """ with open(filepath, 'r') as fd: for line in fd: - if line.startswith('#') or line == '\n': - continue - self.add_tag(line.strip()) - return - - def to_file(self, filepath): - ''' Output sorted taxonomy to given file ''' - # Open output file - fd = open(filepath, 'w') - # Write sorted tags - tag_l = sorted(self.__tag_map.items(), - key=lambda item : item[1].path, - reverse=False) - idx 
= 0 - for name,tag in tag_l: - if (idx % 2) == 0: - fd.write(tag.path+"\n") - idx+=1 - # Close output file - fd.close() + line = line.strip() + if not line.startswith('#') and line: + self.add_tag(line) + + def to_file(self, filepath: AnyStr): + """ + Write sorted Taxonomy to a file (tab-separated lines) + + :param filepath: The path to write + :return: None + """ + with open(filepath, 'w') as fd: + tag_l = sorted(self.__tag_map.items(), + key=lambda item: item[1].path) + idx = 0 + for name, tag in tag_l: + if (idx % 2) == 0: + fd.write(tag.path + "\n") + idx += 1 class Rules: - ''' - Rules are src -> dst1, dst2, ... relations - ''' - def __init__(self, filepath): - ''' Map src -> set(dst) ''' + """ + Map a single source with one or more destinations + """ + def __init__(self, filepath: Optional[AnyStr]): + """ + Initialize the rule-map and read rules from ``filepath`` + + :param filepath: The file to read from + """ self._rmap = {} if filepath: self.read_rules(filepath) def __len__(self): - ''' Length is number of rules, i.e., number of src ''' + """ + The number of rules/src in the rule-map + + :return: Number of rules + """ return len(self._rmap) - def add_rule(self, src, dst_l, overwrite=False): - ''' Add rule. If rule exists: - if overwrite==True, replace destination list - else append dst_l to current target set ''' + def add_rule(self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False): + """ + Add a rule to the map. On duplicate, append destinations. If ``overwrite`` is set, replace rule src/dst. 
+ + :param src: The source tag + :param dst_l: The destination list + :param overwrite: Whether or not to overwrite duplicates + :return: None + """ # Remove src from dst_l if it exists dst_l = filter(lambda x: x != src, dst_l) - # If no destinations, nothing to do - if (not dst_l): + if not dst_l: return - log.debug("[Rules] Adding %s -> %s" % (src, dst_l)) + + logger.debug("[Rules] Adding %s -> %s" % (src, dst_l)) src_tag = create_tag(src) if overwrite: target_l = [create_tag(dst).name for dst in dst_l] @@ -248,212 +322,248 @@ def add_rule(self, src, dst_l, overwrite=False): dst_tag = create_tag(dst) curr_dst.add(dst_tag.name) self._rmap[src_tag.name] = curr_dst - return - def remove_rule(self, src): - l = self._rmap.get(src, []) - if l: - log.debug("[Rules] Removing rule: %s -> %s" % (src, l)) + def remove_rule(self, src: AnyStr) -> bool: + dst = self._rmap.get(src, []) + if dst: + logger.debug("[Rules] Removing rule: %s -> %s" % (src, dst)) del self._rmap[src] - return 1 - else: - return 0 + return True + return False + + def get_dst(self, src: AnyStr) -> List[AnyStr]: + """ + Returns a the dst belonging to src or an empty list. - def get_dst(self, src): - ''' Returns dst list for given src, or empty list if no expansion ''' + :param src: The source rule + :return: List of dst + """ return list(self._rmap.get(src, [])) - def read_rules(self, filepath): - '''Read rules from given file''' + def read_rules(self, filepath: AnyStr): + """ + Read rules from a file and create the rule-map. 
+ + :param filepath: The path of the file to read + :return: None + """ with open(filepath, 'r') as fd: for line in fd: - if line.startswith('#') or line == '\n': - continue - word_list = line.strip().split() - if len(word_list) > 1: - self.add_rule(word_list[0],word_list[1:]) - return - - def to_file(self, filepath, taxonomy=None): - ''' Output sorted rules to given file - If taxonomy is provided, it outputs full tag path ''' - fd = open(filepath, 'w') - for src,dst_set in sorted(self._rmap.items()): - dst_l = sorted(dst_set, reverse=False) - if taxonomy: - src_path = taxonomy.get_path(src) - path_l = [taxonomy.get_path(t) for t in dst_l] - dst_str = '\t'.join(path_l) - fd.write("%s\t%s\n" % (src_path,dst_str)) - else: - dst_str = '\t'.join(dst_l) - fd.write("%s\t%s\n" % (src,dst_str)) - fd.close() + line = line.strip() + if not line.startswith('#') and line: + word_list = line.split() + if len(word_list) > 1: + self.add_rule(word_list[0], word_list[1:]) + + def to_file(self, filepath: AnyStr, taxonomy: Taxonomy = None): + """ + Write current rules to the file at ``filepath``. + + :param filepath: The path of the file to write + :param taxonomy: A Taxonomy to optionally resolve full tag paths + :return: None + """ + with open(filepath, 'w') as fd: + for src, dst_set in sorted(self._rmap.items()): + dst_l = sorted(dst_set) + if taxonomy: + src_path = taxonomy.get_path(src) + path_l = [taxonomy.get_path(t) for t in dst_l] + dst_str = '\t'.join(path_l) + fd.write("%s\t%s\n" % (src_path, dst_str)) + else: + dst_str = '\t'.join(dst_l) + fd.write("%s\t%s\n" % (src, dst_str)) + + def expand_src_destinations(self, src: AnyStr) -> Set[AnyStr]: + """ + Return a list of all expanded destinations for ``src`` - def expand_src_destinations(self, src): - ''' Return destination list for given src after recursively - following any rules for destinations ''' + :param src: The source + :return: List of expanded destinations + """ + # TODO - this only goes one layer deep it seems. 
Not actually recursive dst_set = self._rmap.get(src, set()) out = set() while dst_set: dst = dst_set.pop() - l = self._rmap.get(dst, []) - if l: - for e in l: - if (e not in out) and (e != dst): - dst_set.add(e) + dst_l = self._rmap.get(dst, []) + if dst_l: + for d in dst_l: + if d not in out and d != dst: + dst_set.add(d) else: out.add(dst) return out def expand_all_destinations(self): - ''' Return destination list for given src after recursively - following any rules for destinations ''' + """ + Expand/resolve all sources in the rule-map + + :return: None + """ src_l = self._rmap.keys() for src in src_l: dst_l = self.expand_src_destinations(src) self._rmap[src] = dst_l -class Tagging(Rules): - ''' - Tagging rules have src UNK and dst in taxonomy - ''' - def __init__(self, filepath): - Rules.__init__(self, filepath) +class Translation(Rules): + """ + Translations are a set of rules that convert between unknown labels and labels that are in our Taxonomy + """ + def __init__(self, filepath: AnyStr): + super().__init__(filepath) - def validate(self, taxonomy): - ''' Check that tags in tagging rules are in given taxonomy ''' - for tok,tag_l in self._rmap.items(): + def validate(self, taxonomy: Taxonomy): + """ + Ensure all "destination" labels are in the Taxonomy. + + :param taxonomy: The Taxonomy to use for checking + :return: None + """ + for tok, tag_l in self._rmap.items(): for t in tag_l: - if (not taxonomy.is_tag(t)): + if not taxonomy.is_tag(t): sys.stdout.write("[Tagging] %s not in taxonomy\n" % t) + # TODO - raise or return False? 
class Expansion(Rules): - ''' - Expansion rules have src and dst in taxonomy and - src.category != dst.category - ''' - def __init__(self, filepath): - Rules.__init__(self, filepath) - - def validate(self, taxonomy): - ''' Check that tags in expansion rules are in given taxonomy ''' - for src,dst_set in self._rmap.items(): - if (not taxonomy.is_tag(src)): + """ + Expansions are rules that allow us to map a single label (src) to all explicit and implicit labels + """ + def __init__(self, filepath: AnyStr): + super().__init__(filepath) + + def validate(self, taxonomy: Taxonomy): + """ + Ensure all "source" and "destination" labels are in the Taxonomy. + + :param taxonomy: The Taxonomy to use for checking + :return: None + """ + for src, dst_set in self._rmap.items(): + if not taxonomy.is_tag(src): sys.stdout.write("[Expansion] %s not in taxonomy\n" % src) + # TODO - raise or return False? for dst in dst_set: - if (not taxonomy.is_tag(dst)): + if not taxonomy.is_tag(dst): sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst) + # TODO - raise or return False? class AvLabels: - ''' - Class to operate on AV labels, - such as extracting the most likely family name. 
- ''' - def __init__(self, tag_file, exp_file = None, tax_file = None, - av_file = None, aliasdetect=False): - # Read taxonomy + """ + Primary class used to interpret AV Labels + """ + def __init__(self, tag_file: AnyStr, exp_file: AnyStr = None, tax_file: AnyStr = None, av_file: AnyStr = None, + alias_detect: AnyStr = False): self.taxonomy = Taxonomy(tax_file) - # Read tag rules - self.tagging = Tagging(tag_file) - # Read expansion rules + self.translations = Translation(tag_file) self.expansions = Expansion(exp_file) - # Read AV engines self.avs = self.read_avs(av_file) if av_file else None # Alias statistics initialization - self.aliasdetect = aliasdetect + self.alias_detect = alias_detect @staticmethod - def read_avs(avs_file): - '''Read AV engine set from given file''' + def read_avs(avs_file: AnyStr) -> Set[AnyStr]: + """ + Read AV engines from ``avs_file`` + + :param avs_file: The file to read + :return: A set containing the engines + """ with open(avs_file) as fd: avs = set(map(str.strip, fd.readlines())) return avs @staticmethod - def get_sample_info_lb(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'], - vt_rep['av_labels'], []) + def get_sample_info_lb(record: Dict) -> SampleInfo: + """ + Convert simplified JSON to a SampleInfo object + + :param record: The JSON record + :return: An instance of SampleInfo + """ + return SampleInfo(record['md5'], record['sha1'], record['sha256'], record['av_labels'], []) @staticmethod - def get_sample_info_vt_v2(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available + def get_sample_info_vt_v2(record): + """ + Convert VT (v2) JSON to a SampleInfo object + + :param record: The JSON record + :return: An instance of SampleInfo + """ try: - scans = vt_rep['scans'] - md5 = vt_rep['md5'] - sha1 = 
vt_rep['sha1'] - sha256 = vt_rep['sha256'] + scans = record['scans'] + md5 = record['md5'] + sha1 = record['sha1'] + sha256 = record['sha256'] except KeyError: return None + # Obtain labels from scan results + label_pairs = [] for av, res in scans.items(): if res['detected']: label = res['result'] - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() + clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip() label_pairs.append((av, clean_label)) - # Obtain VT tags, if available - vt_tags = vt_rep.get('tags', []) + + vt_tags = record.get('tags', []) return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) @staticmethod - def get_sample_info_vt_v3(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available + def get_sample_info_vt_v3(record): + """ + Convert VT (v3) JSON to a SampleInfo object + + :param record: The JSON record + :return: An instance of SampleInfo + """ try: - scans = vt_rep['data']['attributes']['last_analysis_results'] - md5 = vt_rep['data']['attributes']['md5'] - sha1 = vt_rep['data']['attributes']['sha1'] - sha256 = vt_rep['data']['attributes']['sha256'] + scans = record['data']['attributes']['last_analysis_results'] + md5 = record['data']['attributes']['md5'] + sha1 = record['data']['attributes']['sha1'] + sha256 = record['data']['attributes']['sha256'] except KeyError: return None + # Obtain labels from scan results + label_pairs = [] for av, res in scans.items(): label = res['result'] if label is not None: - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() + clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip() label_pairs.append((av, clean_label)) - # Obtain VT tags, if available - vt_tags = vt_rep['data']['attributes'].get('tags', []) - return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) + vt_tags = 
record['data']['attributes'].get('tags', []) + return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) @staticmethod - def is_pup(tag_pairs, taxonomy): - '''This function classifies the sample as PUP or not - by checking if highest ranked CLASS tag contains "grayware" - and is above a predefined threshold - Return: - True/False/None - ''' + def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]: + """ + Attempts to classify a sample (represented by ``tag_pairs``) as a PUP. We accomplish this by checking for the + "grayware" label in the highest ranked CLASS. + + :param tag_pairs: List of tuples containing a label, and rank (int) + :param taxonomy: The Taxonomy + :return: bool or None + """ threshold = 0.5 - # If no tags, return false if len(tag_pairs) < 1: return None + max_ctr = tag_pairs[0][1] - for (tag,ctr) in tag_pairs: - (path, cat) = taxonomy.get_info(tag) - if (cat == "CLASS"): - if ("grayware" in path): - return (float(ctr) >= float(max_ctr)*threshold) + for tag, ctr in tag_pairs: + path, cat = taxonomy.get_info(tag) + if cat == "CLASS": + if "grayware" in path: + return float(ctr) >= float(max_ctr)*threshold else: return False return False @@ -521,7 +631,7 @@ def get_label_tags(self, label, hashes): continue # Apply tagging rule - dst_l = self.tagging.get_dst(token) + dst_l = self.translations.get_dst(token) if dst_l: # Ignore generic tokens for t in dst_l: @@ -614,7 +724,7 @@ def get_sample_tags(self, sample_info): # Expansions # ######################################################## # NOTE: Avoiding to do expansion when aliases - if self.aliasdetect: + if self.alias_detect: expanded_tags = tags else: expanded_tags = self.__expand(tags) diff --git a/avclass/update.py b/avclass/update.py index 6d0558c..d19ef0f 100644 --- a/avclass/update.py +++ b/avclass/update.py @@ -7,7 +7,7 @@ from operator import itemgetter from avclass import util -from avclass.common import Taxonomy, Expansion, Tagging +from avclass.common import Taxonomy, Expansion, 
Translation logger = logging.getLogger(__name__) @@ -463,7 +463,7 @@ def output(self, out_prefix): len(taxonomy), args.tax)) # Read tagging rules - tagging = Tagging(args.tag) + tagging = Translation(args.tag) logger.info('[-] Read %d tagging rules from %s' % ( len(tagging), args.tag)) diff --git a/avclass/util.py b/avclass/util.py index ceaf071..028bc36 100755 --- a/avclass/util.py +++ b/avclass/util.py @@ -3,7 +3,7 @@ import pkg_resources from avclass import data -from avclass.common import Taxonomy, Tagging, Expansion +from avclass.common import Taxonomy, Translation, Expansion from typing import AnyStr @@ -63,7 +63,7 @@ def validate_tagging(path: AnyStr, taxonomy: Taxonomy): :param taxonomy: Valid Taxonomy object :return: None """ - tagging = Tagging(path) + tagging = Translation(path) tagging.validate(taxonomy) # tagging.expand_all_destinations() tagging.to_file(path) From e1a00a56c4a78f502936d705bcd5a86283009877 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Wed, 13 Jan 2021 14:24:16 -0600 Subject: [PATCH 09/36] fix common --- avclass/common.py | 133 +++++++++++++++++++--------------------------- 1 file changed, 55 insertions(+), 78 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index dbc292b..cf79a21 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -1,11 +1,11 @@ import logging +import operator import re import string import sys -from collections import namedtuple -from operator import itemgetter -from typing import Any, AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union +from collections import defaultdict, namedtuple +from typing import AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union logger = logging.getLogger(__name__) @@ -569,10 +569,14 @@ def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]: return False @staticmethod - def __remove_suffixes(av_name, label): - '''Remove AV specific suffixes from given label - Returns updated label''' + def __remove_suffixes(av_name: AnyStr, label: AnyStr) -> 
AnyStr: + """ + Remove vendor-specific suffixes from the label + :param av_name: The AV name to remove + :param label: The label to change + :return: The new label + """ # Truncate after last '.' if av_name in suffix_removal_av_set: label = label.rsplit('.', 1)[0] @@ -590,15 +594,15 @@ def __remove_suffixes(av_name, label): return label + def get_label_tags(self, label: AnyStr, hashes: Collection[AnyStr]) -> Set[AnyStr]: + """ + Tokenize, translate, and filter a label into tags. ``hashes`` are used to provide a dynamic filter of sorts. + We don't want to tokenize parts of the sample's hash which is a common thing for some AV vendors. - def get_label_tags(self, label, hashes): - ''' Return list of tags in given label - Tokenizes label, filters unneeded tokens, and - applies tagging rules ''' - - # Initialize set of tags to return - # We use a set to avoid duplicate tokens in the same AV label - # This avoids "potentially unwanted" contributing twice BEH:pup + :param label: The label to convert + :param hashes: A list of hashes to be used as dynamic filters + :return: A set of tags that were extracted from the label + """ tags = set() # If empty label, nothing to do @@ -618,12 +622,7 @@ def get_label_tags(self, label, hashes): # Ignore token if prefix of a hash of the sample # Most AVs use MD5 prefixes in labels, # but we check SHA1 and SHA256 as well - hash_token = False - for hash_str in hashes: - if hash_str[0:len(token)] == token: - hash_token = True - break - if hash_token: + if any([h.startswith(token) for h in hashes]): continue # Ignore generic tokens @@ -644,9 +643,13 @@ def get_label_tags(self, label, hashes): # Return tags return tags + def __expand(self, tag_set: Set[AnyStr]) -> Set[AnyStr]: + """ + Expand tags into more tags using expansion rules and the Taxonomy - def __expand(self, tag_set): - ''' Return expanded set of tags ''' + :param tag_set: Starting set of tags + :return: Expanded set of tags + """ ret = set() for t in tag_set: # Include tag @@ 
-658,90 +661,64 @@ def __expand(self, tag_set): # Include implicit expansions in taxonomy ret.update(self.taxonomy.expand(t)) - # Return a list for backwards compatibility return ret - def get_sample_tags(self, sample_info): - ''' Returns dictionary tag -> AV list of tags for the given sample ''' + def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]]: + """ + Get a dictionary where the key is a tag and the value is a list of AV engines that confirmed that tag. - # Whitelist the AVs to filter the ones with meaningful labels - av_whitelist = self.avs - # Initialize auxiliary data structures + :param sample_info: The SampleInfo object to inspect + :return: A dictionary where k,v = tag,[av, ...] + """ duplicates = set() - av_dict = {} + av_dict = defaultdict(list) # Process each AV label - for (av_name, label) in sample_info.labels: - # If empty label, nothing to do - if not label: - continue - - ################ - # AV selection # - ################ - if av_whitelist and av_name not in av_whitelist: + for av_name, label in sample_info.labels: + if not label or av_name not in self.avs: continue - ##################### - # Duplicate removal # - ##################### - - # Emsisoft uses same label as + # Emsisoft uses same label as # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan, # but suffixes ' (B)' to their label. Remove the suffix. - if label.endswith(' (B)'): - label = label[:-4] + label = label.rstrip(' (B)') # F-Secure uses Avira's engine since Nov. 2018 # but prefixes 'Malware.' to Avira's label. Remove the prefix. 
- if label.startswith('Malware.'): - label = label[8:] + label = label.lstrip('Malware.') # Other engines often use exactly the same label, e.g., # AVG/Avast # K7Antivirus/K7GW # Kaspersky/ZoneAlarm - # If we have seen the exact same label before, skip if label in duplicates: continue - # If not, we add it to duplicates - else: - duplicates.add(label) - ################## - # Suffix removal # - ################## - label = self.__remove_suffixes(av_name, label) + duplicates.add(label) - ######################################################## - # Tokenization and tagging # - ######################################################## - hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ] + label = self.__remove_suffixes(av_name, label) + hashes = [sample_info.md5, sample_info.sha1, sample_info.sha256] tags = self.get_label_tags(label, hashes) - ######################################################## - # Expansions # - ######################################################## - # NOTE: Avoiding to do expansion when aliases - if self.alias_detect: - expanded_tags = tags - else: - expanded_tags = self.__expand(tags) + # NOTE: Avoid expansion when aliases are set + expanded_tags = tags if self.alias_detect else self.__expand(tags) - ######################################################## - # Stores information that relates AV vendors with tags # - ######################################################## + # store av vendors for each tag for t in expanded_tags: - av_dict.setdefault(t, []).append(av_name) + av_dict[t].append(av_name) return av_dict - def rank_tags(self, av_dict, threshold=1): - ''' Return list of (tag, confidence) ranked by decreasing confidence - and filter tags with less or equal threshold confidence ''' - - pairs = ((t, len(avs)) for (t,avs) in av_dict.items() - if len(avs) > threshold) - return sorted(pairs, key=itemgetter(1,0), reverse=True) + @staticmethod + def rank_tags(av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1) -> 
List[Tuple[AnyStr, int]]: + """ + Get a list of tuples containing a tag and the number of AV that confirmed that tag sorted by number of AV + (descending). + :param av_dict: The AV dictionary (from ``get_sample_tags()``) + :param threshold: The minimum rank/count to include + :return: A sorted list of tag, av-count pairs + """ + pairs = ((t, len(avs)) for t, avs in av_dict.items() if len(avs) > threshold) + return sorted(pairs, key=operator.itemgetter(1, 0), reverse=True) From 1f5ccedc21d651a03cb1393aaab0dced72d19439 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Wed, 13 Jan 2021 14:26:53 -0600 Subject: [PATCH 10/36] fix clustering --- avclass/clustering.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/avclass/clustering.py b/avclass/clustering.py index c5a349c..20d3fe5 100755 --- a/avclass/clustering.py +++ b/avclass/clustering.py @@ -41,7 +41,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): for k, v in expected.items(): gt_rev_dict[v].add(k) - counter, l = 0, len(guess) + counter, gl = 0, len(guess) sys.stderr.write('Calculating precision and recall\n') @@ -49,7 +49,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): for element in guess: # Print progress if counter % 1000 == 0: - sys.stderr.write('\r%d out of %d' % (counter, l)) + sys.stderr.write('\r%d out of %d' % (counter, gl)) sys.stderr.flush() counter += 1 @@ -71,7 +71,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): r = 1.0*tp/(tp+fn) tmp_recall += r - sys.stderr.write('\r%d out of %d' % (counter, l)) + sys.stderr.write('\r%d out of %d' % (counter, gl)) sys.stderr.write('\n') precision = 100.0 * tmp_precision / len(guess) @@ -81,7 +81,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): return precision, recall, fmeasure -if __name__ == "__main__": +def main(): # The ground truth. # Dictionary with mapping: "element : cluster_id". 
diz_grth = { @@ -104,12 +104,12 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): # truth, but just different cluster labels. Precision == Recall == # F-Measure == 100%. # Dictionary with mapping: "element : cluster_id". - diz_estim_grth = { - "a": 2, - "b": 2, - "c": 66, - "d": 9 - } + # diz_estim_grth = { + # "a": 2, + # "b": 2, + # "c": 66, + # "d": 9 + # } # a sample where estimated != ground truth sys.stdout.write("Ground truth\n") @@ -130,3 +130,7 @@ def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): sys.stdout.write("\nPrecison: %s%%\n" % p) sys.stdout.write("Recall: %s%%\n" % r) sys.stdout.write("F-Measure: %s%%\n" % f) + + +if __name__ == "__main__": + main() From f68836d254b6593fa91526c0761858cfda68b1a7 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Wed, 13 Jan 2021 14:28:35 -0600 Subject: [PATCH 11/36] typing --- avclass/clustering.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/avclass/clustering.py b/avclass/clustering.py index 20d3fe5..f6a9b4b 100755 --- a/avclass/clustering.py +++ b/avclass/clustering.py @@ -1,10 +1,10 @@ import sys from collections import defaultdict -from typing import Dict, Set +from typing import Dict, Set, Tuple -def tp_fp_fn(expected: Set, guess: Set): +def tp_fp_fn(expected: Set, guess: Set) -> Tuple[int, int, int]: """ Calculate the true-positives, false-positives, and false-negatives between ``expected`` and ``guess`` @@ -19,7 +19,7 @@ def tp_fp_fn(expected: Set, guess: Set): return tp, fp, fn -def eval_precision_recall_fmeasure(expected: Dict, guess: Dict): +def eval_precision_recall_fmeasure(expected: Dict, guess: Dict) -> Tuple[int, int, int]: """ Evaluate the precision, recall, and f-measure for the comparison of ``expected`` to ``guess`` From 402757ce0563d2a86f1fd0ec26b2e10ca221af50 Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Wed, 13 Jan 2021 15:12:40 -0600 Subject: [PATCH 12/36] cleanup labeler --- avclass/labeler.py | 253 
++++++++++++++++++++------------------------- 1 file changed, 110 insertions(+), 143 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index 035d4af..b96a28f 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -5,12 +5,19 @@ import traceback from operator import itemgetter +from typing import AnyStr, Optional -from avclass.common import AvLabels +from avclass.common import AvLabels, Taxonomy from avclass import clustering as ec, util -def guess_hash(h): +def guess_hash(h: AnyStr) -> Optional[AnyStr]: + """ + Guess hash type based on ``len(h)`` + + :param h: The hash + :return: The hash type (str) + """ ''' Given a hash string, guess the hash type based on the string length ''' hlen = len(h) if hlen == 32: @@ -19,30 +26,48 @@ def guess_hash(h): return 'sha1' elif hlen == 64: return 'sha256' - else: - return None + return None + + +def format_tag_pairs(l, taxonomy: Taxonomy = None) -> AnyStr: + """ + Get ranked tags as a string. -def format_tag_pairs(l, taxonomy=None): - ''' Return ranked tags as string ''' + :param l: + :param taxonomy: + :return: + """ + # TODO - wtf is ``l``? if not l: return "" + if taxonomy is not None: p = taxonomy.get_path(l[0][0]) else: p = l[0][0] + out = "%s|%d" % (p, l[0][1]) - for (t,s) in l[1:]: + for t, s in l[1:]: if taxonomy is not None: p = taxonomy.get_path(t) else: p = t out += ",%s|%d" % (p, s) + return out -def list_str(l, sep=", ", prefix=""): - ''' Return list as a string ''' +def list_str(l, sep: AnyStr = ", ", prefix: AnyStr = "") -> AnyStr: + """ + Return list as a string + + :param l: The list + :param sep: The separator + :param prefix: The prefix + :return: A string representation of the list + """ + # TODO - wtf is ``l``? if not l: return "" out = prefix + l[0] @@ -52,9 +77,10 @@ def list_str(l, sep=", ", prefix=""): def main(): + # TODO - break this function up. 
args = parse_args() # Select hash used to identify sample, by default MD5 - hash_type = args.hash if args.hash else 'md5' + hash_type = args.hash or 'md5' # If ground truth provided, read it from file gt_dict = {} @@ -68,26 +94,26 @@ def main(): hash_type = guess_hash(list(gt_dict.keys())[0]) # Create AvLabels object - av_labels = AvLabels(args.tag, args.exp, args.tax, - args.av, args.aliasdetect) + av_labels = AvLabels(args.tag, args.exp, args.tax, args.av, args.aliasdetect) # Build list of input files # NOTE: duplicate input files are not removed ifile_l = [] - if (args.vt): + if args.vt: ifile_l += args.vt ifile_are_vt = True - if (args.lb): + elif args.lb: ifile_l += args.lb ifile_are_vt = False - if (args.vtdir): - ifile_l += [os.path.join(args.vtdir, - f) for f in os.listdir(args.vtdir)] + elif args.vtdir: + ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)] ifile_are_vt = True - if (args.lbdir): - ifile_l += [os.path.join(args.lbdir, - f) for f in os.listdir(args.lbdir)] + elif args.lbdir: + ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)] ifile_are_vt = False + else: + # TODO - is this reachable? 
+ sys.exit(1) # Select correct sample info extraction function if not ifile_are_vt: @@ -109,19 +135,12 @@ def main(): stats = {'samples': 0, 'noscans': 0, 'tagged': 0, 'maltagged': 0, 'FAM': 0, 'CLASS': 0, 'BEH': 0, 'FILE': 0, 'UNK': 0} - # Process each input file for ifile in ifile_l: - # Open file fd = open(ifile, 'r') - - # Debug info, file processed sys.stderr.write('[-] Processing input file %s\n' % ifile) - # Process all lines in file for line in fd: - - # If blank line, skip - if line == '\n': + if not line.strip(): continue # Debug info @@ -130,19 +149,16 @@ def main(): sys.stderr.flush() vt_all += 1 - # Read JSON line vt_rep = json.loads(line) - - # Extract sample info sample_info = get_sample_info(vt_rep) - # If no sample info, log error and continue if sample_info is None: try: name = vt_rep['md5'] sys.stderr.write('\nNo scans for %s\n' % name) except KeyError: sys.stderr.write('\nCould not process: %s\n' % line) + sys.stderr.flush() stats['noscans'] += 1 continue @@ -152,7 +168,7 @@ def main(): # If the VT report has no AV labels, output and continue if not sample_info.labels: - sys.stdout.write('%s\t-\t[]\n' % (name)) + sys.stdout.write('%s\t-\t[]\n' % name) # sys.stderr.write('\nNo AV labels for %s\n' % name) # sys.stderr.flush() continue @@ -160,8 +176,7 @@ def main(): # Compute VT_Count vt_count = len(sample_info.labels) - # Get the distinct tokens from all the av labels in the report - # And print them. + # Get the distinct tokens from all the av labels in the report and print them. 
try: av_tmp = av_labels.get_sample_tags(sample_info) tags = av_labels.rank_tags(av_tmp) @@ -183,24 +198,21 @@ def main(): token_count_map[curr_tok] = curr_count + 1 for prev_tok in prev_tokens: if prev_tok < curr_tok: - pair = (prev_tok,curr_tok) + pair = prev_tok, curr_tok else: - pair = (curr_tok,prev_tok) + pair = curr_tok, prev_tok pair_count = pair_count_map.get(pair, 0) pair_count_map[pair] = pair_count + 1 prev_tokens.add(curr_tok) # Collect stats - # FIX: should iterate once over tags, - # for both stats and aliasdetect + # TODO - should iterate once over tags for both stats and aliasdetect if tags: stats["tagged"] += 1 if args.stats: - if (vt_count > 3): + if vt_count > 3: stats["maltagged"] += 1 - cat_map = {'FAM': False, 'CLASS': False, - 'BEH': False, 'FILE': False, 'UNK': - False} + cat_map = {'FAM': False, 'CLASS': False, 'BEH': False, 'FILE': False, 'UNK': False} for t in tags: path, cat = av_labels.taxonomy.get_info(t[0]) cat_map[cat] = True @@ -215,21 +227,18 @@ def main(): else: is_pup_str = "\t0" else: - is_pup_str = "" + is_pup_str = "" # Select family for sample if needed, # i.e., for compatibility mode or for ground truth + fam = "SINGLETON:" + name if args.c or args.gt: - fam = "SINGLETON:" + name - # fam = '' - for (t,s) in tags: + for t, s in tags: cat = av_labels.taxonomy.get_category(t) - if (cat == "UNK") or (cat == "FAM"): + if cat in ["UNK", "FAM"]: fam = t break - # Get ground truth family, if available - if args.gt: first_token_dict[name] = fam gt_family = '\t' + gt_dict.get(name, "") else: @@ -247,38 +256,27 @@ def main(): tag_str = format_tag_pairs(tags, av_labels.taxonomy) else: tag_str = format_tag_pairs(tags) - sys.stdout.write('%s\t%d\t%s%s%s%s\n' % - (name, vt_count, tag_str, gt_family, - is_pup_str, vtt)) + sys.stdout.write('%s\t%d\t%s%s%s%s\n' % name, vt_count, tag_str, gt_family, is_pup_str, vtt) else: - sys.stdout.write('%s\t%s%s%s\n' % - (name, fam, gt_family, is_pup_str)) + sys.stdout.write('%s\t%s%s%s\n' % name, fam, 
gt_family, is_pup_str) except: traceback.print_exc(file=sys.stderr) continue - # Debug info sys.stderr.write('\r[-] %d JSON read' % vt_all) sys.stderr.flush() sys.stderr.write('\n') - # Close file fd.close() # Print statistics - sys.stderr.write( - "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" % ( - vt_all, stats['noscans'], vt_all - stats['tagged'], - len(gt_dict))) + sys.stderr.write("[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" % + (vt_all, stats['noscans'], vt_all - stats['tagged'], len(gt_dict))) # If ground truth, print precision, recall, and F1-measure if args.gt: - precision, recall, fmeasure = \ - ec.eval_precision_recall_fmeasure(gt_dict, - first_token_dict) - sys.stderr.write( - "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \ - (precision, recall, fmeasure)) + precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict) + sys.stderr.write("Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % (precision, recall, fmeasure)) # Output stats if args.stats: @@ -291,7 +289,7 @@ def main(): num_maltagged = stats['maltagged'] frac = float(num_maltagged) / float(num_samples) * 100 stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (num_maltagged, frac)) - for c in ['FILE','CLASS','BEH','FAM','UNK']: + for c in ['FILE', 'CLASS', 'BEH', 'FAM', 'UNK']: count = stats[c] frac = float(count) / float(num_maltagged) * 100 stats_fd.write('%s: %d (%.01f%%)\n' % (c, stats[c], frac)) @@ -302,9 +300,8 @@ def main(): avtags_fd = open("%s.avtags" % out_prefix, 'w') for t in sorted(avtags_dict.keys()): avtags_fd.write('%s\t' % t) - pairs = sorted(avtags_dict[t].items(), - key=lambda pair : pair[1], - reverse=True) + pairs = sorted(avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True) + for pair in pairs: avtags_fd.write('%s|%d,' % (pair[0], pair[1])) avtags_fd.write('\n') @@ -312,7 +309,6 @@ def main(): # If alias detection, print map if args.aliasdetect: - # Open alias file alias_filename = out_prefix + 
'.alias' alias_fd = open(alias_filename, 'w+') # Sort token pairs by number of times they appear together @@ -322,13 +318,12 @@ def main(): # pair_count_map.items()) # Output header line - alias_fd.write("# t1\tt2\t|t1|\t|t2|\t" - "|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") + alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") # Compute token pair statistic and output to alias file - for (t1, t2), c in sorted_pairs: + for t1, t2, c in sorted_pairs: n1 = token_count_map[t1] n2 = token_count_map[t2] - if (n1 < n2): + if n1 < n2: x = t1 y = t2 xn = n1 @@ -340,129 +335,101 @@ def main(): yn = n1 f = float(c) / float(xn) finv = float(c) / float(yn) - alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % ( - x, y, xn, yn, c, f, finv)) + alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv)) # Close alias file alias_fd.close() - sys.stderr.write('[-] Alias data in %s\n' % (alias_filename)) + sys.stderr.write('[-] Alias data in %s\n' % alias_filename) def parse_args(): argparser = argparse.ArgumentParser(prog='avclass', - description='''Extracts tags for a set of samples. - Also calculates precision and recall if ground truth available''') + description='Extracts tags for a set of samples. 
Also calculates precision and' + ' recall if ground truth available') - argparser.add_argument('-vt', action='append', - help='file with VT reports ' - '(Can be provided multiple times)') + argparser.add_argument('-vt', action='append', help='file with VT reports (Can be provided multiple times)') - argparser.add_argument('-lb', action='append', - help='file with simplified JSON reports' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(Can be provided multiple times)') + argparser.add_argument('-lb', action='append', help='file with simplified JSON reports ' + '{md5,sha1,sha256,scan_date,av_labels} (Can be provided ' + 'multiple times)') - argparser.add_argument('-vtdir', - help='existing directory with VT reports') + argparser.add_argument('-vtdir', help='existing directory with VT reports') - argparser.add_argument('-lbdir', - help='existing directory with simplified JSON reports') + argparser.add_argument('-lbdir', help='existing directory with simplified JSON reports') - argparser.add_argument('-vt3', action='store_true', - help='input are VT v3 files') + argparser.add_argument('-vt3', action='store_true', help='input are VT v3 files') - argparser.add_argument('-gt', - help='file with ground truth. ' - 'If provided it evaluates clustering accuracy. ' - 'Prints precision, recall, F1-measure.') + argparser.add_argument('-gt', help='file with ground truth. If provided it evaluates clustering accuracy. 
' + 'Prints precision, recall, F1-measure.') - argparser.add_argument('-vtt', - help='Include VT tags in the output.', - action='store_true') + argparser.add_argument('-vtt', help='Include VT tags in the output.', action='store_true') - argparser.add_argument('-tag', - help='file with tagging rules.', - default = util.DEFAULT_TAG_PATH) + argparser.add_argument('-tag', help='file with tagging rules.', default=util.DEFAULT_TAG_PATH) - argparser.add_argument('-tax', - help='file with taxonomy.', - default = util.DEFAULT_TAX_PATH) + argparser.add_argument('-tax', help='file with taxonomy.', default=util.DEFAULT_TAX_PATH) - argparser.add_argument('-exp', - help='file with expansion rules.', - default = util.DEFAULT_EXP_PATH) + argparser.add_argument('-exp', help='file with expansion rules.', default=util.DEFAULT_EXP_PATH) - argparser.add_argument('-av', - help='file with list of AVs to use') + argparser.add_argument('-av', help='file with list of AVs to use') - argparser.add_argument('-avtags', - help='extracts tags per av vendor', - action='store_true') + argparser.add_argument('-avtags', help='extracts tags per av vendor', action='store_true') - argparser.add_argument('-pup', - action='store_true', - help='if used each sample is classified as PUP or not') + argparser.add_argument('-pup', action='store_true', help='if used each sample is classified as PUP or not') - argparser.add_argument('-p', '--path', - help='output.full path for tags', - action='store_true') + argparser.add_argument('-p', '--path', help='output.full path for tags', action='store_true') - argparser.add_argument('-hash', - help='hash used to name samples. Should match ground truth', - choices=['md5', 'sha1', 'sha256']) + argparser.add_argument('-hash', help='hash used to name samples. Should match ground truth', + choices=['md5', 'sha1', 'sha256']) - argparser.add_argument('-c', - help='Compatibility mode. 
Outputs results in AVClass format.', - action='store_true') + argparser.add_argument('-c', help='Compatibility mode. Outputs results in AVClass format.', action='store_true') - argparser.add_argument('-aliasdetect', - action='store_true', - help='if used produce aliases file at end') + argparser.add_argument('-aliasdetect', action='store_true', help='if used produce aliases file at end') - argparser.add_argument('-stats', - action='store_true', - help='if used produce 1 file ' - 'with stats per category ' - '(File, Class, ' - 'Behavior, Family, Unclassified)') + argparser.add_argument('-stats', action='store_true', help='if used produce 1 file with stats per category ' + '(File, Class, Behavior, Family, Unclassified)') args = argparser.parse_args() + # TODO - use non-exclusive group to ensure at least one is selected instead of this if not args.vt and not args.lb and not args.vtdir and not args.lbdir: sys.stderr.write('One of the following 4 arguments is required: ' '-vt,-lb,-vtdir,-lbdir\n') exit(1) + # TODO - use mutex group for this instead of manual check if (args.vt or args.vtdir) and (args.lb or args.lbdir): sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. ' 'Both types of input files cannot be combined.\n') exit(1) + # TODO - consider letting argparse handle this? if args.tag: if args.tag == '/dev/null': sys.stderr.write('[-] Using no tagging rules\n') else: - sys.stderr.write('[-] Using tagging rules in %s\n' % (args.tag)) + sys.stderr.write('[-] Using tagging rules in %s\n' % args.tag) else: - sys.stderr.write('[-] Using default tagging rules in %s\n' % (util.DEFAULT_TAG_PATH)) + sys.stderr.write('[-] Using default tagging rules in %s\n' % util.DEFAULT_TAG_PATH) + # TODO - consider letting argparse handle this? 
if args.tax: if args.tax == '/dev/null': sys.stderr.write('[-] Using no taxonomy\n') else: - sys.stderr.write('[-] Using taxonomy in %s\n' % (args.tax)) + sys.stderr.write('[-] Using taxonomy in %s\n' % args.tax) else: - sys.stderr.write('[-] Using default taxonomy in %s\n' % (util.DEFAULT_TAX_PATH)) + sys.stderr.write('[-] Using default taxonomy in %s\n' % util.DEFAULT_TAX_PATH) + # TODO - consider letting argparse handle this? if args.exp: if args.exp == '/dev/null': sys.stderr.write('[-] Using no expansion tags\n') else: - sys.stderr.write('[-] Using expansion tags in %s\n' % (args.exp)) + sys.stderr.write('[-] Using expansion tags in %s\n' % args.exp) else: - sys.stderr.write('[-] Using default expansion tags in %s\n' % (util.DEFAULT_EXP_PATH)) + sys.stderr.write('[-] Using default expansion tags in %s\n' % util.DEFAULT_EXP_PATH) return args -if __name__=='__main__': +if __name__ == '__main__': main() From f37a47da9f8766215da448fdf8e3ffc4d9595b3a Mon Sep 17 00:00:00 2001 From: Matt Miller Date: Wed, 13 Jan 2021 15:44:18 -0600 Subject: [PATCH 13/36] cleanup update --- avclass/update.py | 334 ++++++++++++++++++++++++---------------------- 1 file changed, 173 insertions(+), 161 deletions(-) diff --git a/avclass/update.py b/avclass/update.py index d19ef0f..a2bc73b 100644 --- a/avclass/update.py +++ b/avclass/update.py @@ -5,6 +5,7 @@ from collections import namedtuple from operator import itemgetter +from typing import AnyStr, Collection, Optional, Set, TextIO from avclass import util from avclass.common import Taxonomy, Expansion, Translation @@ -23,18 +24,14 @@ # Threshold for string similarity # sim_threshold = 0.6 -Relation = namedtuple('Relation', ['t1', 't2', 't1_num', 't2_num', - 'nalias_num', 'talias_num', 'tinv_alias_num']) - +Relation = namedtuple('Relation', ['t1', 't2', 't1_num', 't2_num', 'nalias_num', 'talias_num', 'tinv_alias_num']) class Update: - ''' Update Module ''' - def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, - n, 
t): - # Initialize inputs + def __init__(self, rel_filepath: AnyStr, in_taxonomy: Taxonomy, in_translation: Translation, + in_expansion: Expansion, n, t): self.__out_taxonomy = in_taxonomy - self.__out_tagging = in_tagging + self.__out_translation = in_translation self.__out_expansion = in_expansion self.__n = n self.__t = t @@ -44,80 +41,117 @@ def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion, self.src_map = {} # Read relations from file self.rel_set = self.read_relations(rel_filepath) + self.dst_map = {} + self.cat_pairs_map = {} - def num_rules(self): + # TODO - @property decorator + def num_rules(self) -> int: return len(self.rel_set) - def is_weak_rel(self, rel): - ''' Return true if relationship is weak, - i.e., does not meet thresholds ''' + def is_weak_rel(self, rel: Relation) -> bool: + """ + Boolean whether or not the relationship is considered weak (doesn't meet thresholds). + + :param rel: The relationship + :return: Boolean + """ return ((int(rel.nalias_num) < self.__n) or (float(rel.talias_num) < self.__t)) - def is_blacklisted_rel(self, rel): - ''' Return true if relationship is blacklisted ''' - return (rel.t1 in self.blist) or (rel.t2 in self.blist) + def is_blacklisted_rel(self, rel: Relation) -> bool: + """ + Boolean whether or not the relationship is blacklisted. + + :param rel: The relationship + :return: Boolean + """ + return rel.t1 in self.blist or rel.t2 in self.blist - def is_known_rel(self, rel): - ''' Return true if relationship is known ''' + def is_known_rel(self, rel: Relation) -> bool: + """ + Boolean whether or not the relationship is known. 
+ + :param rel: The relationship + :return: Boolean + """ t1 = rel.t1 t2 = rel.t2 # Known taxonomy relation - if self.__out_taxonomy.overlaps(t1,t2): + if self.__out_taxonomy.overlaps(t1, t2): return True # Known expansion rule t1_dst = self.__out_expansion.get_dst(t1) t2_dst = self.__out_expansion.get_dst(t2) - if (t2 in t1_dst) or (t1 in t2_dst): + if t2 in t1_dst or t1 in t2_dst: return True # Known tagging rule - t1_dst = sorted(self.__out_tagging.get_dst(t1)) - t2_dst = sorted(self.__out_tagging.get_dst(t2)) - if (t2 in t1_dst) or (t1 in t2_dst): + t1_dst = sorted(self.__out_translation.get_dst(t1)) + t2_dst = sorted(self.__out_translation.get_dst(t2)) + if t2 in t1_dst or t1 in t2_dst: return True # Known alias in tagging - if t1_dst and (t1_dst == t2_dst): + if t1_dst and t1_dst == t2_dst: return True return False - def add_tag(self, name, path): - ''' Add tag to taxonomy if not in tagging ''' - l = self.__out_tagging.get_dst(name) - if (not l): + def add_tag(self, name: AnyStr, path: AnyStr): + """ + Add tag to Taxonomy if it's not in Translation rules + + :param name: The name of the tag + :param path: The full path + :return: None + """ + dst = self.__out_translation.get_dst(name) + if not dst: self.__out_taxonomy.add_tag(path) - def add_expansion(self, src, dst_l): + def add_expansion(self, src: AnyStr, dst_l: Collection[AnyStr]): + """ + Add expansion rule to fix destination if the source is in Translation. 
+ + :param src: The source label + :param dst_l: A list of destination labels + :return: None + """ ''' Add expansion rule fixing destination if src in tagging ''' # Select source handling aliases - l = self.__out_tagging.get_dst(src) - if l: - new_src = l[0] + dst = self.__out_translation.get_dst(src) + if dst: + new_src = dst[0] else: new_src = src # Select destinations removing overlaps with existing rule - l = self.__out_expansion.get_dst(src) - if l: - l.extend(dst_l) - target_l = self.__out_taxonomy.remove_overlaps(l) + dst = self.__out_expansion.get_dst(src) + if dst: + dst.extend(dst_l) + target_l = self.__out_taxonomy.remove_overlaps(dst) self.__out_expansion.add_rule(new_src, target_l, True) else: self.__out_expansion.add_rule(new_src, dst_l, True) - def add_alias(self, src, dst, dst_prefix): - ''' Add alias relation to taxonomy, tagging ''' + def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr): + """ + Add alias relation to the Taxonomy and Translation + + :param src: Source alias + :param dst: Destination alias + :param dst_prefix: Destination prefix + :return: None + """ # If src in tagging, use most popular target - l = self.__out_tagging.get_dst(src) + tr_dst = self.__out_translation.get_dst(src) target = dst - if l: + if tr_dst: cnt_max = self.src_map[dst] - for e in l: + for e in tr_dst: cnt = self.src_map.get(e, 0) if cnt > cnt_max: target = e # If dst is in tagging, update tagging rule destination, - l = self.__out_tagging.get_dst(dst) - if l: - target_l = l + tr_dst = self.__out_translation.get_dst(dst) + if tr_dst: + target_l = tr_dst # else add dst to taxonomy else: target_l = [target] @@ -125,10 +159,15 @@ def add_alias(self, src, dst, dst_prefix): # Remove src from taxonomy self.__out_taxonomy.remove_tag(src) # Replace tagging rule - self.__out_tagging.add_rule(src, target_l, True) + self.__out_translation.add_rule(src, target_l, True) + + def is_expansion_rel(self, rel: Relation) -> bool: + """ + Boolean whether or not the 
relation implies an expansion - def is_expansion_rel(self, rel): - ''' Return true if relation implies expansion rule ''' + :param rel: The relation + :return: Boolean + """ c1 = self.__out_taxonomy.get_category(rel.t1) c2 = self.__out_taxonomy.get_category(rel.t2) return (((c1 == "FAM") and (c2 != c1) and (c2 != "UNK")) or @@ -136,15 +175,19 @@ def is_expansion_rel(self, rel): ((c1 == "UNK") and ((c2 == "BEH") or (c2 == "CLASS")))) def find_expansions(self): - ''' Find expansions among relations ''' + """ + Resolve relations that are expansions + + :return: None + """ acc = [] for rel in self.rel_set: p1 = self.__out_taxonomy.get_path(rel.t1) p2 = self.__out_taxonomy.get_path(rel.t2) logger.debug("Processing %s\t%s" % (p1, p2)) # Ignore relations where t1 is an alias - l = self.__out_tagging.get_dst(rel.t1) - if l: + dst = self.__out_translation.get_dst(rel.t1) + if dst: logger.debug("Ignoring relation for alias %s" % p1) continue if self.is_expansion_rel(rel): @@ -153,15 +196,14 @@ def find_expansions(self): for rel in acc: self.rel_set.remove(rel) - #def is_alias_rel(self, rel): + # def is_alias_rel(self, rel): # ''' Return true if relation implies alias rule ''' # c1 = self.__out_taxonomy.get_category(rel.t1) # c2 = self.__out_taxonomy.get_category(rel.t2) # return (((c1 == "UNK") and (c2 == "FAM")) or # ((c1 == "UNK") and (c2 == "UNK"))) - - #def find_aliases(self): + # def find_aliases(self): # ''' Find aliases among relations ''' # for rel in self.rel_set: # c1 = self.__out_taxonomy.get_category(rel.t1) @@ -172,30 +214,32 @@ def find_expansions(self): # self.G.add_edge(rel.t1, rel.t2, score=rel.talias_num) # self.output_components("comp") - def process_relation(self, rel): - ''' Process relation and update taxonomy/tagging correspondingly ''' + def process_relation(self, rel: Relation): + """ + Process relation and update Taxonomy/Translation - # Obtain tag info + :param rel: The relation + :return: + """ t1 = rel.t1 t2 = rel.t2 - p1,c1 = 
self.__out_taxonomy.get_info(rel.t1) - p2,c2 = self.__out_taxonomy.get_info(rel.t2) + p1, c1 = self.__out_taxonomy.get_info(rel.t1) + p2, c2 = self.__out_taxonomy.get_info(rel.t2) logger.debug("Processing %s\t%s" % (p1, p2)) # If both directions strong, then equivalent, i.e., alias - if (float(rel.tinv_alias_num) >= args.t): - if (c1 != "UNK") and (c2 == "UNK"): + if float(rel.tinv_alias_num) >= args.t: + if c1 != "UNK" and c2 == "UNK": prefix = p1[0:p1.rfind(':')] - elif (c1 == "UNK") and (c2 != "UNK"): + elif c1 == "UNK" and c2 != "UNK": prefix = p2[0:p2.rfind(':')] - elif (c1 == "UNK") and (c2 == "UNK"): + elif c1 == "UNK" and c2 == "UNK": prefix = "FAM" - elif (c1 == c2): + elif c1 == c2: prefix = p1[0:p1.rfind(':')] else: - logger.warn("Equivalent rule with different categories: %s\t%s" % - (p1, p2)) + logger.warning("Equivalent rule with different categories: %s\t%s" % (p1, p2)) return -1 self.add_alias(t1, t2, prefix) return 1 @@ -232,7 +276,7 @@ def process_relation(self, rel): self.add_alias(t1, t2, "FAM") return 1 - # FILE -> UNK : alias-file + # FILE -> UNK : alias-file elif (c1 == "FILE") and (c2 == "UNK"): prefix = p1[0:p1.rfind(':')] self.add_alias(t1, t2, prefix) @@ -240,13 +284,12 @@ def process_relation(self, rel): # Same category : alias elif (c1 == "FAM") and (c2 == "FAM"): - #elif c1 == c2: prefix = p2[0:p2.rfind(':')] self.add_alias(t1, t2, prefix) return 1 # Target unknown - elif (c2 == "UNK"): + elif c2 == "UNK": # If tokens are similar, likely family aliases # log.info("Similarity: %.02f" % levenshtein_ratio(t1, t2)) # if (levenshtein_ratio(t1, t2) > sim_threshold): @@ -258,11 +301,14 @@ def process_relation(self, rel): return 0 # Default: review taxonomy - else: - return 0 - + return 0 def run(self): + """ + Run the updater. 
+ + :return: None + """ num_iter = 0 while self.rel_set: # Do a pass in remaining relations @@ -299,8 +345,13 @@ def run(self): logger.debug("[-] Finding expansions") self.find_expansions() + def read_relations(self, filepath: AnyStr) -> Set[Relation]: + """ + Filters weak and blacklisted relations - def read_relations(self, filepath): + :param filepath: The path of the file to read + :return: A set of Relation objects + """ ''' Returns relations in file as a set Filters weak and blacklisted relations ''' rel_set = set() @@ -310,8 +361,7 @@ def read_relations(self, filepath): if line.startswith('#'): continue # Parse line - t1, t2, t1_num, t2_num, nalias_num, talias_num, \ - tinv_alias_num = line.strip().split('\t') + t1, t2, t1_num, t2_num, nalias_num, talias_num, tinv_alias_num = line.strip().split('\t') # Build relation rel = Relation(t1, t2, t1_num, t2_num, nalias_num, talias_num, tinv_alias_num) @@ -324,8 +374,8 @@ def read_relations(self, filepath): # Ignore known relations # NOTE: commented since we check if a # relation is known before processing it - #if self.is_known_rel(rel): - # continue + # if self.is_known_rel(rel): + # continue # Add relation to set rel_set.add(rel) # Add to src_map @@ -334,113 +384,81 @@ def read_relations(self, filepath): return rel_set - def output_relations(self, filepath): - fd = open(filepath, 'w') - fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t" - "|t1^t2|/|t2|\n") - sorted_rules = sorted(self.rel_set, - key=(lambda r: ( - self.__out_taxonomy.get_category(r.t1), - self.__out_taxonomy.get_category(r.t2))), - reverse=False) - for rel in sorted_rules: - p1,c1 = self.__out_taxonomy.get_info(rel.t1) - p2,c2 = self.__out_taxonomy.get_info(rel.t2) - fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %( - p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num, - rel.talias_num, rel.tinv_alias_num)) - fd.close() - - def output_rule_stats(self, fd): - # Initialize maps for statistics - self.dst_map = {} - self.cat_pairs_map = {} + def 
output_relations(self, filepath: AnyStr): + with open(filepath, 'w') as fd: + fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") + sorted_rules = sorted(self.rel_set, + key=lambda r: (self.__out_taxonomy.get_category(r.t1), + self.__out_taxonomy.get_category(r.t2))) + for rel in sorted_rules: + p1, c1 = self.__out_taxonomy.get_info(rel.t1) + p2, c2 = self.__out_taxonomy.get_info(rel.t2) + fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num, + rel.talias_num, rel.tinv_alias_num)) + + def output_rule_stats(self, fd: TextIO): # Compute rule statistics for rel in self.rel_set: c1 = self.__out_taxonomy.get_category(rel.t1) c2 = self.__out_taxonomy.get_category(rel.t2) - self.cat_pairs_map[(c1,c2)] = self.cat_pairs_map.get((c1, - c2), 0) + 1 + self.cat_pairs_map[(c1, c2)] = self.cat_pairs_map.get((c1, c2), 0) + 1 self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1 # Output statistics - cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1,0), - reverse=True) - for (c1,c2), cnt in cat_pairs: + cat_pairs = sorted(update.cat_pairs_map.items(), key=itemgetter(1, 0), reverse=True) + for c1, c2, cnt in cat_pairs: fd.write("%s\t%s\t%03d\n" % (c1, c2, cnt)) # Print dst statistics - dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1,0), - reverse=False) + dst_pairs = sorted(update.dst_map.items(), key=itemgetter(1, 0)) for dst, cnt in dst_pairs: fd.write("%s\t%03d\n" % (taxonomy.get_path(dst), cnt)) - def output(self, out_prefix): - if (not out_prefix): + @staticmethod + def output(prefix: Optional[AnyStr] = None): + if not prefix: tax_filepath = util.DEFAULT_TAX_PATH tag_filepath = util.DEFAULT_TAG_PATH exp_filepath = util.DEFAULT_EXP_PATH else: - tax_filepath = out_prefix + ".taxonomy" - tag_filepath = out_prefix + ".tagging" - exp_filepath = out_prefix + ".expansion" + tax_filepath = prefix + ".taxonomy" + tag_filepath = prefix + ".tagging" + exp_filepath = prefix + ".expansion" + 
taxonomy.to_file(tax_filepath) - logger.info('[-] Output %d taxonomy tags to %s' % ( - len(taxonomy), tax_filepath)) + logger.info('[-] Output %d taxonomy tags to %s' % (len(taxonomy), tax_filepath)) tagging.expand_all_destinations() tagging.to_file(tag_filepath) - logger.info('[-] Output %d tagging rules to %s' % ( - len(tagging), tag_filepath)) + logger.info('[-] Output %d tagging rules to %s' % (len(tagging), tag_filepath)) expansion.to_file(exp_filepath) - logger.info('[-] Output %d expansion rules to %s' % ( - len(expansion), exp_filepath)) + logger.info('[-] Output %d expansion rules to %s' % (len(expansion), exp_filepath)) if __name__ == '__main__': - argparser = argparse.ArgumentParser( - description='''Given a .alias file from the labeler, - generates updates for the taxonomy, tagging, and expansion files.''') + parser = argparse.ArgumentParser(description='Given a .alias file from the labeler, generates updates for the ' + 'taxonomy, tagging, and expansion files.') - argparser.add_argument('-alias', - help='file to parse with alias from labeler' - 'Labeler will run if -alias not present') + parser.add_argument('-alias', help='file to parse with alias from labeler which runs if -alias not present') - argparser.add_argument('-n', - help='Minimum number of times that a pair of tokes have been seen.' - 'Default: 20', - type=int, - default=20) + parser.add_argument('-n', help='Minimum number of times that a pair of tokes have been seen. Default: 20', + type=int, default=20) - argparser.add_argument('-t', - help='Minimum percentage of times two tokens appear together.' - 'Default: 1.94', - type=float, - default=0.94) + parser.add_argument('-t', help='Minimum percentage of times two tokens appear together. 
Default: 1.94', + type=float, default=0.94) - argparser.add_argument('-o', - help='output prefix for files') + parser.add_argument('-o', help='output prefix for files') - argparser.add_argument('-update', - action='store_true', - help='update default taxonomy,tagging,expansion files in place') + parser.add_argument('-update', action='store_true', help='update default taxonomy,tagging,expansion files in place') - argparser.add_argument('-tag', - help='file with tagging rules.', - default = util.DEFAULT_TAG_PATH) + parser.add_argument('-tag', help='file with tagging rules.', default=util.DEFAULT_TAG_PATH) - argparser.add_argument('-tax', - help='file with taxonomy.', - default = util.DEFAULT_TAX_PATH) + parser.add_argument('-tax', help='file with taxonomy.', default=util.DEFAULT_TAX_PATH) - argparser.add_argument('-exp', - help='file with expansion rules.', - default = util.DEFAULT_EXP_PATH) + parser.add_argument('-exp', help='file with expansion rules.', default=util.DEFAULT_EXP_PATH) - argparser.add_argument('-v', '--verbose', - action='store_true', - help='verbose, prints debugging statements.') + parser.add_argument('-v', '--verbose', action='store_true', help='verbose, prints debugging statements.') # Parse arguments - args = argparser.parse_args() + args = parser.parse_args() # Check we have the input if not args.alias: @@ -448,35 +466,31 @@ def output(self, out_prefix): exit(1) # Set logging level - if (args.verbose): + if args.verbose: handler_stderr.setLevel(logging.DEBUG) # Set output prefix if args.o: - out_prefix = args.o + out_prefix = args.o else: - out_prefix = os.path.splitext(args.alias)[0] + out_prefix = os.path.splitext(args.alias)[0] # Read taxonomy taxonomy = Taxonomy(args.tax) - logger.info('[-] Read %d taxonomy tags from %s' % ( - len(taxonomy), args.tax)) + logger.info('[-] Read %d taxonomy tags from %s' % (len(taxonomy), args.tax)) # Read tagging rules tagging = Translation(args.tag) - logger.info('[-] Read %d tagging rules from %s' % ( - 
len(tagging), args.tag)) + logger.info('[-] Read %d tagging rules from %s' % (len(tagging), args.tag)) # Read expansion rules expansion = Expansion(args.exp) - logger.info('[-] Read %d expansion rules from %s' % ( - len(expansion), args.exp)) + logger.info('[-] Read %d expansion rules from %s' % (len(expansion), args.exp)) # Build update object update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t) - logger.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % ( - update.num_rules(), args.t, args.n)) + logger.info('[-] Processing %d relations satisfying t>=%.2f n>=%d' % (update.num_rules(), args.t, args.n)) # Output initial rules update.output_relations(out_prefix + ".orig.rules") @@ -493,6 +507,4 @@ def output(self, out_prefix): else: update.output(out_prefix) - # Output final rules update.output_relations(out_prefix + ".final.rules") - From dd591b1ac7bb7eb74fc9ba3860f0ccdc67291aea Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 1 Feb 2021 14:51:33 -0500 Subject: [PATCH 14/36] Fixed some issues for console execution, black formatting --- avclass/common.py | 157 ++++++++++++++--------- avclass/labeler.py | 305 +++++++++++++++++++++++++++++---------------- 2 files changed, 293 insertions(+), 169 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index cf79a21..9cbe4bc 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -16,17 +16,28 @@ # Default category for tags in taxonomy with no category uncategorized_cat = "UNC" -SampleInfo = namedtuple('SampleInfo', - ['md5', 'sha1', 'sha256', 'labels', 'vt_tags']) +SampleInfo = namedtuple("SampleInfo", ["md5", "sha1", "sha256", "labels", "vt_tags"]) -Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l']) +Tag = namedtuple("Tag", ["name", "cat", "path", "prefix_l"]) # AVs to use in suffix removal -suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', - 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', - 'GData', 'Avast', 'Sophos', - 'TrendMicro-HouseCall', 
'TrendMicro', - 'NANO-Antivirus', 'Microsoft'} +suffix_removal_av_set = { + "Norman", + "Avast", + "Avira", + "Kaspersky", + "ESET-NOD32", + "Fortinet", + "Jiangmin", + "Comodo", + "GData", + "Avast", + "Sophos", + "TrendMicro-HouseCall", + "TrendMicro", + "NANO-Antivirus", + "Microsoft", +} def create_tag(s: AnyStr): @@ -43,8 +54,8 @@ def create_tag(s: AnyStr): prefix_l = [x.lower() for x in word_list[1:-1]] path = cat for x in prefix_l: - path = path + ':' + x - path = path + ':' + name + path = path + ":" + x + path = path + ":" + name else: name = word_list[0].lower() cat = uncategorized_cat @@ -57,6 +68,7 @@ class Taxonomy: """ Contains tags and generic tokens read from filesystem """ + def __init__(self, filepath: Optional[AnyStr]): """ Initialize and populate the Tag map from ``filepath`` @@ -73,7 +85,9 @@ def __len__(self) -> int: :return: The length (int) of the Taxonomy """ - return len(self.__tag_map)//2 # TODO - perhaps there should be two dicts, one for names, one for paths? + return ( + len(self.__tag_map) // 2 + ) # TODO - perhaps there should be two dicts, one for names, one for paths? 
def is_generic(self, tag: AnyStr) -> bool: """ @@ -83,7 +97,7 @@ def is_generic(self, tag: AnyStr) -> bool: :return: Boolean """ t = self.__tag_map.get(tag, None) - return getattr(t, 'cat', None) == 'GEN' + return getattr(t, "cat", None) == "GEN" def is_tag(self, tag: AnyStr) -> bool: """ @@ -138,7 +152,7 @@ def get_category(self, tag: AnyStr) -> AnyStr: :return: The category """ t = self.__tag_map.get(tag, None) - return getattr(t, 'cat', 'UNK') + return getattr(t, "cat", "UNK") def get_path(self, tag: AnyStr) -> AnyStr: """ @@ -148,7 +162,7 @@ def get_path(self, tag: AnyStr) -> AnyStr: :return: The tag's path """ t = self.__tag_map.get(tag, None) - return getattr(t, 'path', f'UNK:{tag}') + return getattr(t, "path", f"UNK:{tag}") def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]: """ @@ -158,7 +172,7 @@ def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]: :return: The tag's prefix list """ t = self.__tag_map.get(tag, None) - return getattr(t, 'prefix_l', []) + return getattr(t, "prefix_l", []) def get_prefix(self, tag: AnyStr) -> List[AnyStr]: """ @@ -168,7 +182,7 @@ def get_prefix(self, tag: AnyStr) -> List[AnyStr]: :return: String representation of the tag's full prefix """ t = self.__tag_map.get(tag, None) - tag_pfx = tag.path.split(':')[:-1] + tag_pfx = tag.path.split(":")[:-1] return t.prefix_l if t else tag_pfx def get_depth(self, tag: AnyStr) -> int: @@ -213,7 +227,11 @@ def platform_tags(self) -> Set[AnyStr]: :return: Set of platformn tags """ - return {tag.name for _, tag in self.__tag_map.items() if tag.path.startswith(platform_prefix)} + return { + tag.name + for _, tag in self.__tag_map.items() + if tag.path.startswith(platform_prefix) + } def overlaps(self, t1: AnyStr, t2: AnyStr) -> bool: """ @@ -227,7 +245,9 @@ def overlaps(self, t1: AnyStr, t2: AnyStr) -> bool: m2 = self.get_prefix_l(t2) return t1 in m2 or t2 in m1 - def remove_overlaps(self, l: Collection[AnyStr]) -> Union[Collection[AnyStr], List[AnyStr]]: + def remove_overlaps( + self, l: 
Collection[AnyStr] + ) -> Union[Collection[AnyStr], List[AnyStr]]: """ Returns list with overlapping tags removed @@ -252,10 +272,10 @@ def read_taxonomy(self, filepath: AnyStr): :param filepath: The path of the file to read :return: None """ - with open(filepath, 'r') as fd: + with open(filepath, "r") as fd: for line in fd: line = line.strip() - if not line.startswith('#') and line: + if not line.startswith("#") and line: self.add_tag(line) def to_file(self, filepath: AnyStr): @@ -265,9 +285,8 @@ def to_file(self, filepath: AnyStr): :param filepath: The path to write :return: None """ - with open(filepath, 'w') as fd: - tag_l = sorted(self.__tag_map.items(), - key=lambda item: item[1].path) + with open(filepath, "w") as fd: + tag_l = sorted(self.__tag_map.items(), key=lambda item: item[1].path) idx = 0 for name, tag in tag_l: if (idx % 2) == 0: @@ -279,6 +298,7 @@ class Rules: """ Map a single source with one or more destinations """ + def __init__(self, filepath: Optional[AnyStr]): """ Initialize the rule-map and read rules from ``filepath`` @@ -297,7 +317,9 @@ def __len__(self): """ return len(self._rmap) - def add_rule(self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False): + def add_rule( + self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False + ): """ Add a rule to the map. On duplicate, append destinations. If ``overwrite`` is set, replace rule src/dst. 
@@ -347,10 +369,10 @@ def read_rules(self, filepath: AnyStr): :param filepath: The path of the file to read :return: None """ - with open(filepath, 'r') as fd: + with open(filepath, "r") as fd: for line in fd: line = line.strip() - if not line.startswith('#') and line: + if not line.startswith("#") and line: word_list = line.split() if len(word_list) > 1: self.add_rule(word_list[0], word_list[1:]) @@ -363,16 +385,16 @@ def to_file(self, filepath: AnyStr, taxonomy: Taxonomy = None): :param taxonomy: A Taxonomy to optionally resolve full tag paths :return: None """ - with open(filepath, 'w') as fd: + with open(filepath, "w") as fd: for src, dst_set in sorted(self._rmap.items()): dst_l = sorted(dst_set) if taxonomy: src_path = taxonomy.get_path(src) path_l = [taxonomy.get_path(t) for t in dst_l] - dst_str = '\t'.join(path_l) + dst_str = "\t".join(path_l) fd.write("%s\t%s\n" % (src_path, dst_str)) else: - dst_str = '\t'.join(dst_l) + dst_str = "\t".join(dst_l) fd.write("%s\t%s\n" % (src, dst_str)) def expand_src_destinations(self, src: AnyStr) -> Set[AnyStr]: @@ -412,6 +434,7 @@ class Translation(Rules): """ Translations are a set of rules that convert between unknown labels and labels that are in our Taxonomy """ + def __init__(self, filepath: AnyStr): super().__init__(filepath) @@ -433,6 +456,7 @@ class Expansion(Rules): """ Expansions are rules that allow us to map a single label (src) to all explicit and implicit labels """ + def __init__(self, filepath: AnyStr): super().__init__(filepath) @@ -457,8 +481,15 @@ class AvLabels: """ Primary class used to interpret AV Labels """ - def __init__(self, tag_file: AnyStr, exp_file: AnyStr = None, tax_file: AnyStr = None, av_file: AnyStr = None, - alias_detect: AnyStr = False): + + def __init__( + self, + tag_file: AnyStr, + exp_file: AnyStr = None, + tax_file: AnyStr = None, + av_file: AnyStr = None, + alias_detect: bool = False, + ): self.taxonomy = Taxonomy(tax_file) self.translations = Translation(tag_file) 
self.expansions = Expansion(exp_file) @@ -486,7 +517,9 @@ def get_sample_info_lb(record: Dict) -> SampleInfo: :param record: The JSON record :return: An instance of SampleInfo """ - return SampleInfo(record['md5'], record['sha1'], record['sha256'], record['av_labels'], []) + return SampleInfo( + record["md5"], record["sha1"], record["sha256"], record["av_labels"], [] + ) @staticmethod def get_sample_info_vt_v2(record): @@ -497,22 +530,24 @@ def get_sample_info_vt_v2(record): :return: An instance of SampleInfo """ try: - scans = record['scans'] - md5 = record['md5'] - sha1 = record['sha1'] - sha256 = record['sha256'] + scans = record["scans"] + md5 = record["md5"] + sha1 = record["sha1"] + sha256 = record["sha256"] except KeyError: return None # Obtain labels from scan results label_pairs = [] for av, res in scans.items(): - if res['detected']: - label = res['result'] - clean_label = ''.join(filter(lambda x: x in string.printable, label)).strip() + if res["detected"]: + label = res["result"] + clean_label = "".join( + filter(lambda x: x in string.printable, label) + ).strip() label_pairs.append((av, clean_label)) - vt_tags = record.get('tags', []) + vt_tags = record.get("tags", []) return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) @@ -525,22 +560,24 @@ def get_sample_info_vt_v3(record): :return: An instance of SampleInfo """ try: - scans = record['data']['attributes']['last_analysis_results'] - md5 = record['data']['attributes']['md5'] - sha1 = record['data']['attributes']['sha1'] - sha256 = record['data']['attributes']['sha256'] + scans = record["data"]["attributes"]["last_analysis_results"] + md5 = record["data"]["attributes"]["md5"] + sha1 = record["data"]["attributes"]["sha1"] + sha256 = record["data"]["attributes"]["sha256"] except KeyError: return None # Obtain labels from scan results label_pairs = [] for av, res in scans.items(): - label = res['result'] + label = res["result"] if label is not None: - clean_label = ''.join(filter(lambda x: x in 
string.printable, label)).strip() + clean_label = "".join( + filter(lambda x: x in string.printable, label) + ).strip() label_pairs.append((av, clean_label)) - vt_tags = record['data']['attributes'].get('tags', []) + vt_tags = record["data"]["attributes"].get("tags", []) return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) @@ -563,7 +600,7 @@ def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]: path, cat = taxonomy.get_info(tag) if cat == "CLASS": if "grayware" in path: - return float(ctr) >= float(max_ctr)*threshold + return float(ctr) >= float(max_ctr) * threshold else: return False return False @@ -579,18 +616,18 @@ def __remove_suffixes(av_name: AnyStr, label: AnyStr) -> AnyStr: """ # Truncate after last '.' if av_name in suffix_removal_av_set: - label = label.rsplit('.', 1)[0] + label = label.rsplit(".", 1)[0] - # Truncate after last '.' + # Truncate after last '.' # if suffix only contains digits or uppercase (no lowercase) chars - if av_name == 'AVG': - tokens = label.rsplit('.', 1) + if av_name == "AVG": + tokens = label.rsplit(".", 1) if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]): label = tokens[0] # Truncate after last '!' 
- if av_name == 'Agnitum': - label = label.rsplit('!', 1)[0] + if av_name == "Agnitum": + label = label.rsplit("!", 1)[0] return label @@ -620,7 +657,7 @@ def get_label_tags(self, label: AnyStr, hashes: Collection[AnyStr]) -> Set[AnySt token = token[:-end_len] # Ignore token if prefix of a hash of the sample - # Most AVs use MD5 prefixes in labels, + # Most AVs use MD5 prefixes in labels, # but we check SHA1 and SHA256 as well if any([h.startswith(token) for h in hashes]): continue @@ -675,17 +712,17 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]] # Process each AV label for av_name, label in sample_info.labels: - if not label or av_name not in self.avs: + if not label or (self.avs and av_name not in self.avs): continue # Emsisoft uses same label as # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan, # but suffixes ' (B)' to their label. Remove the suffix. - label = label.rstrip(' (B)') + label = label.rstrip(" (B)") # F-Secure uses Avira's engine since Nov. 2018 # but prefixes 'Malware.' to Avira's label. Remove the prefix. - label = label.lstrip('Malware.') + label = label.lstrip("Malware.") # Other engines often use exactly the same label, e.g., # AVG/Avast @@ -711,7 +748,9 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]] return av_dict @staticmethod - def rank_tags(av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1) -> List[Tuple[AnyStr, int]]: + def rank_tags( + av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1 + ) -> List[Tuple[AnyStr, int]]: """ Get a list of tuples containing a tag and the number of AV that confirmed that tag sorted by number of AV (descending). 
diff --git a/avclass/labeler.py b/avclass/labeler.py index b96a28f..175e798 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -7,8 +7,14 @@ from operator import itemgetter from typing import AnyStr, Optional -from avclass.common import AvLabels, Taxonomy -from avclass import clustering as ec, util +try: + from avclass.common import AvLabels, Taxonomy + from avclass import clustering as ec, util +except ModuleNotFoundError: + # Helps find the avclasses when run from console + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from avclass.common import AvLabels, Taxonomy + from avclass import clustering as ec, util def guess_hash(h: AnyStr) -> Optional[AnyStr]: @@ -18,14 +24,14 @@ def guess_hash(h: AnyStr) -> Optional[AnyStr]: :param h: The hash :return: The hash type (str) """ - ''' Given a hash string, guess the hash type based on the string length ''' + """ Given a hash string, guess the hash type based on the string length """ hlen = len(h) if hlen == 32: - return 'md5' + return "md5" elif hlen == 40: - return 'sha1' + return "sha1" elif hlen == 64: - return 'sha256' + return "sha256" return None @@ -50,7 +56,7 @@ def format_tag_pairs(l, taxonomy: Taxonomy = None) -> AnyStr: out = "%s|%d" % (p, l[0][1]) for t, s in l[1:]: if taxonomy is not None: - p = taxonomy.get_path(t) + p = taxonomy.get_path(t) else: p = t out += ",%s|%d" % (p, s) @@ -80,14 +86,14 @@ def main(): # TODO - break this function up. 
args = parse_args() # Select hash used to identify sample, by default MD5 - hash_type = args.hash or 'md5' + hash_type = args.hash or "md5" # If ground truth provided, read it from file gt_dict = {} if args.gt: - with open(args.gt, 'r') as gt_fd: + with open(args.gt, "r") as gt_fd: for line in gt_fd: - gt_hash, family = map(str, line.strip().split('\t', 1)) + gt_hash, family = map(str, line.strip().split("\t", 1)) gt_dict[gt_hash] = family # Guess type of hash in ground truth file @@ -132,12 +138,21 @@ def main(): pair_count_map = {} vt_all = 0 avtags_dict = {} - stats = {'samples': 0, 'noscans': 0, 'tagged': 0, 'maltagged': 0, - 'FAM': 0, 'CLASS': 0, 'BEH': 0, 'FILE': 0, 'UNK': 0} + stats = { + "samples": 0, + "noscans": 0, + "tagged": 0, + "maltagged": 0, + "FAM": 0, + "CLASS": 0, + "BEH": 0, + "FILE": 0, + "UNK": 0, + } for ifile in ifile_l: - fd = open(ifile, 'r') - sys.stderr.write('[-] Processing input file %s\n' % ifile) + fd = open(ifile, "r") + sys.stderr.write("[-] Processing input file %s\n" % ifile) for line in fd: if not line.strip(): @@ -145,7 +160,7 @@ def main(): # Debug info if vt_all % 100 == 0: - sys.stderr.write('\r[-] %d JSON read' % vt_all) + sys.stderr.write("\r[-] %d JSON read\n" % vt_all) sys.stderr.flush() vt_all += 1 @@ -154,13 +169,13 @@ def main(): if sample_info is None: try: - name = vt_rep['md5'] - sys.stderr.write('\nNo scans for %s\n' % name) + name = vt_rep["md5"] + sys.stderr.write("\nNo scans for %s\n" % name) except KeyError: - sys.stderr.write('\nCould not process: %s\n' % line) + sys.stderr.write("\nCould not process: %s\n" % line) sys.stderr.flush() - stats['noscans'] += 1 + stats["noscans"] += 1 continue # Sample's name is selected hash type (md5 by default) @@ -168,7 +183,7 @@ def main(): # If the VT report has no AV labels, output and continue if not sample_info.labels: - sys.stdout.write('%s\t-\t[]\n' % name) + sys.stdout.write("%s\t-\t[]\n" % name) # sys.stderr.write('\nNo AV labels for %s\n' % name) # 
sys.stderr.flush() continue @@ -212,7 +227,13 @@ def main(): if args.stats: if vt_count > 3: stats["maltagged"] += 1 - cat_map = {'FAM': False, 'CLASS': False, 'BEH': False, 'FILE': False, 'UNK': False} + cat_map = { + "FAM": False, + "CLASS": False, + "BEH": False, + "FILE": False, + "UNK": False, + } for t in tags: path, cat = av_labels.taxonomy.get_info(t[0]) cat_map[cat] = True @@ -240,7 +261,7 @@ def main(): break first_token_dict[name] = fam - gt_family = '\t' + gt_dict.get(name, "") + gt_family = "\t" + gt_dict.get(name, "") else: gt_family = "" @@ -256,64 +277,75 @@ def main(): tag_str = format_tag_pairs(tags, av_labels.taxonomy) else: tag_str = format_tag_pairs(tags) - sys.stdout.write('%s\t%d\t%s%s%s%s\n' % name, vt_count, tag_str, gt_family, is_pup_str, vtt) + sys.stdout.write( + "%s\t%d\t%s%s%s%s\n" + % (name, vt_count, tag_str, gt_family, is_pup_str, vtt) + ) else: - sys.stdout.write('%s\t%s%s%s\n' % name, fam, gt_family, is_pup_str) + sys.stdout.write("%s\t%s%s%s\n" % name, fam, gt_family, is_pup_str) except: traceback.print_exc(file=sys.stderr) continue - sys.stderr.write('\r[-] %d JSON read' % vt_all) + sys.stderr.write("\r[-] %d JSON read" % vt_all) sys.stderr.flush() - sys.stderr.write('\n') + sys.stderr.write("\n") fd.close() # Print statistics - sys.stderr.write("[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" % - (vt_all, stats['noscans'], vt_all - stats['tagged'], len(gt_dict))) + sys.stderr.write( + "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" + % (vt_all, stats["noscans"], vt_all - stats["tagged"], len(gt_dict)) + ) # If ground truth, print precision, recall, and F1-measure if args.gt: - precision, recall, fmeasure = ec.eval_precision_recall_fmeasure(gt_dict, first_token_dict) - sys.stderr.write("Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % (precision, recall, fmeasure)) + precision, recall, fmeasure = ec.eval_precision_recall_fmeasure( + gt_dict, first_token_dict + ) + sys.stderr.write( + "Precision: 
%.2f\tRecall: %.2f\tF1-Measure: %.2f\n" + % (precision, recall, fmeasure) + ) # Output stats if args.stats: - stats_fd = open("%s.stats" % out_prefix, 'w') + stats_fd = open("%s.stats" % out_prefix, "w") num_samples = vt_all - stats_fd.write('Samples: %d\n' % num_samples) - num_tagged = stats['tagged'] + stats_fd.write("Samples: %d\n" % num_samples) + num_tagged = stats["tagged"] frac = float(num_tagged) / float(num_samples) * 100 - stats_fd.write('Tagged (all): %d (%.01f%%)\n' % (num_tagged, frac)) - num_maltagged = stats['maltagged'] + stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac)) + num_maltagged = stats["maltagged"] frac = float(num_maltagged) / float(num_samples) * 100 - stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (num_maltagged, frac)) - for c in ['FILE', 'CLASS', 'BEH', 'FAM', 'UNK']: + stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac)) + for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]: count = stats[c] frac = float(count) / float(num_maltagged) * 100 - stats_fd.write('%s: %d (%.01f%%)\n' % (c, stats[c], frac)) + stats_fd.write("%s: %d (%.01f%%)\n" % (c, stats[c], frac)) stats_fd.close() # Output vendor info if args.avtags: - avtags_fd = open("%s.avtags" % out_prefix, 'w') + avtags_fd = open("%s.avtags" % out_prefix, "w") for t in sorted(avtags_dict.keys()): - avtags_fd.write('%s\t' % t) - pairs = sorted(avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True) + avtags_fd.write("%s\t" % t) + pairs = sorted( + avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True + ) for pair in pairs: - avtags_fd.write('%s|%d,' % (pair[0], pair[1])) - avtags_fd.write('\n') + avtags_fd.write("%s|%d," % (pair[0], pair[1])) + avtags_fd.write("\n") avtags_fd.close() # If alias detection, print map if args.aliasdetect: - alias_filename = out_prefix + '.alias' - alias_fd = open(alias_filename, 'w+') + alias_filename = out_prefix + ".alias" + alias_fd = open(alias_filename, "w+") # Sort token pairs by number of times 
they appear together - sorted_pairs = sorted( - pair_count_map.items(), key=itemgetter(1)) + sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1)) # sorted_pairs = sorted( # pair_count_map.items()) @@ -335,101 +367,154 @@ def main(): yn = n1 f = float(c) / float(xn) finv = float(c) / float(yn) - alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv)) + alias_fd.write( + "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv) + ) # Close alias file alias_fd.close() - sys.stderr.write('[-] Alias data in %s\n' % alias_filename) + sys.stderr.write("[-] Alias data in %s\n" % alias_filename) def parse_args(): - argparser = argparse.ArgumentParser(prog='avclass', - description='Extracts tags for a set of samples. Also calculates precision and' - ' recall if ground truth available') - - argparser.add_argument('-vt', action='append', help='file with VT reports (Can be provided multiple times)') - - argparser.add_argument('-lb', action='append', help='file with simplified JSON reports ' - '{md5,sha1,sha256,scan_date,av_labels} (Can be provided ' - 'multiple times)') - - argparser.add_argument('-vtdir', help='existing directory with VT reports') - - argparser.add_argument('-lbdir', help='existing directory with simplified JSON reports') - - argparser.add_argument('-vt3', action='store_true', help='input are VT v3 files') - - argparser.add_argument('-gt', help='file with ground truth. If provided it evaluates clustering accuracy. 
' - 'Prints precision, recall, F1-measure.') - - argparser.add_argument('-vtt', help='Include VT tags in the output.', action='store_true') - - argparser.add_argument('-tag', help='file with tagging rules.', default=util.DEFAULT_TAG_PATH) - - argparser.add_argument('-tax', help='file with taxonomy.', default=util.DEFAULT_TAX_PATH) - - argparser.add_argument('-exp', help='file with expansion rules.', default=util.DEFAULT_EXP_PATH) - - argparser.add_argument('-av', help='file with list of AVs to use') - - argparser.add_argument('-avtags', help='extracts tags per av vendor', action='store_true') - - argparser.add_argument('-pup', action='store_true', help='if used each sample is classified as PUP or not') - - argparser.add_argument('-p', '--path', help='output.full path for tags', action='store_true') - - argparser.add_argument('-hash', help='hash used to name samples. Should match ground truth', - choices=['md5', 'sha1', 'sha256']) - - argparser.add_argument('-c', help='Compatibility mode. Outputs results in AVClass format.', action='store_true') - - argparser.add_argument('-aliasdetect', action='store_true', help='if used produce aliases file at end') - - argparser.add_argument('-stats', action='store_true', help='if used produce 1 file with stats per category ' - '(File, Class, Behavior, Family, Unclassified)') + argparser = argparse.ArgumentParser( + prog="avclass", + description="Extracts tags for a set of samples. 
Also calculates precision and" + " recall if ground truth available", + ) + + argparser.add_argument( + "-vt", + action="append", + help="file with VT reports (Can be provided multiple times)", + ) + + argparser.add_argument( + "-lb", + action="append", + help="file with simplified JSON reports " + "{md5,sha1,sha256,scan_date,av_labels} (Can be provided " + "multiple times)", + ) + + argparser.add_argument("-vtdir", help="existing directory with VT reports") + + argparser.add_argument( + "-lbdir", help="existing directory with simplified JSON reports" + ) + + argparser.add_argument("-vt3", action="store_true", help="input are VT v3 files") + + argparser.add_argument( + "-gt", + help="file with ground truth. If provided it evaluates clustering accuracy. " + "Prints precision, recall, F1-measure.", + ) + + argparser.add_argument( + "-vtt", help="Include VT tags in the output.", action="store_true" + ) + + argparser.add_argument( + "-tag", help="file with tagging rules.", default=util.DEFAULT_TAG_PATH + ) + + argparser.add_argument( + "-tax", help="file with taxonomy.", default=util.DEFAULT_TAX_PATH + ) + + argparser.add_argument( + "-exp", help="file with expansion rules.", default=util.DEFAULT_EXP_PATH + ) + + argparser.add_argument("-av", help="file with list of AVs to use") + + argparser.add_argument( + "-avtags", help="extracts tags per av vendor", action="store_true" + ) + + argparser.add_argument( + "-pup", + action="store_true", + help="if used each sample is classified as PUP or not", + ) + + argparser.add_argument( + "-p", "--path", help="output.full path for tags", action="store_true" + ) + + argparser.add_argument( + "-hash", + help="hash used to name samples. Should match ground truth", + choices=["md5", "sha1", "sha256"], + ) + + argparser.add_argument( + "-c", + help="Compatibility mode. 
Outputs results in AVClass format.", + action="store_true", + ) + + argparser.add_argument( + "-aliasdetect", action="store_true", help="if used produce aliases file at end" + ) + + argparser.add_argument( + "-stats", + action="store_true", + help="if used produce 1 file with stats per category " + "(File, Class, Behavior, Family, Unclassified)", + ) args = argparser.parse_args() # TODO - use non-exclusive group to ensure at least one is selected instead of this if not args.vt and not args.lb and not args.vtdir and not args.lbdir: - sys.stderr.write('One of the following 4 arguments is required: ' - '-vt,-lb,-vtdir,-lbdir\n') + sys.stderr.write( + "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n" + ) exit(1) # TODO - use mutex group for this instead of manual check if (args.vt or args.vtdir) and (args.lb or args.lbdir): - sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. ' - 'Both types of input files cannot be combined.\n') + sys.stderr.write( + "Use either -vt/-vtdir or -lb/-lbdir. " + "Both types of input files cannot be combined.\n" + ) exit(1) # TODO - consider letting argparse handle this? if args.tag: - if args.tag == '/dev/null': - sys.stderr.write('[-] Using no tagging rules\n') + if args.tag == "/dev/null": + sys.stderr.write("[-] Using no tagging rules\n") else: - sys.stderr.write('[-] Using tagging rules in %s\n' % args.tag) + sys.stderr.write("[-] Using tagging rules in %s\n" % args.tag) else: - sys.stderr.write('[-] Using default tagging rules in %s\n' % util.DEFAULT_TAG_PATH) + sys.stderr.write( + "[-] Using default tagging rules in %s\n" % util.DEFAULT_TAG_PATH + ) # TODO - consider letting argparse handle this? 
if args.tax: - if args.tax == '/dev/null': - sys.stderr.write('[-] Using no taxonomy\n') + if args.tax == "/dev/null": + sys.stderr.write("[-] Using no taxonomy\n") else: - sys.stderr.write('[-] Using taxonomy in %s\n' % args.tax) + sys.stderr.write("[-] Using taxonomy in %s\n" % args.tax) else: - sys.stderr.write('[-] Using default taxonomy in %s\n' % util.DEFAULT_TAX_PATH) + sys.stderr.write("[-] Using default taxonomy in %s\n" % util.DEFAULT_TAX_PATH) # TODO - consider letting argparse handle this? if args.exp: - if args.exp == '/dev/null': - sys.stderr.write('[-] Using no expansion tags\n') + if args.exp == "/dev/null": + sys.stderr.write("[-] Using no expansion tags\n") else: - sys.stderr.write('[-] Using expansion tags in %s\n' % args.exp) + sys.stderr.write("[-] Using expansion tags in %s\n" % args.exp) else: - sys.stderr.write('[-] Using default expansion tags in %s\n' % util.DEFAULT_EXP_PATH) + sys.stderr.write( + "[-] Using default expansion tags in %s\n" % util.DEFAULT_EXP_PATH + ) return args -if __name__ == '__main__': +if __name__ == "__main__": main() From b832ab9cb4abbb4bb06c656c71549abf40d0ab88 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 1 Feb 2021 16:13:45 -0500 Subject: [PATCH 15/36] Turn Labler into Class, cleanup --- avclass/common.py | 22 +- avclass/labeler.py | 885 ++++++++++++++++++++++++++++++--------------- 2 files changed, 608 insertions(+), 299 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index 9cbe4bc..5533b43 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -5,7 +5,7 @@ import sys from collections import defaultdict, namedtuple -from typing import AnyStr, Collection, Dict, List, Optional, Set, Tuple, Union +from typing import AnyStr, Callable, Collection, Dict, List, Optional, Set, Tuple, Union logger = logging.getLogger(__name__) @@ -497,6 +497,26 @@ def __init__( # Alias statistics initialization self.alias_detect = alias_detect + def get_sample_call(self, data_type: AnyStr) -> Callable: + """ + 
Return the correct parser for the report type + + :param data_type: the type of file vt2, vt3, lb + :return: Callable function that returns SampleInfo + """ + if data_type == "lb": + return self.get_sample_info_lb + elif data_type == "vt" or data_type == "vt2": + return self.get_sample_info_vt_v2 + elif data_type == "vt3": + return self.get_sample_info_vt_v3 + else: + sys.stderr.write( + "Invalid data type for sample: %s (should be vt, vt2, vt3, lb)" + % data_type + ) + return self.get_sample_info_vt_v3 + @staticmethod def read_avs(avs_file: AnyStr) -> Set[AnyStr]: """ diff --git a/avclass/labeler.py b/avclass/labeler.py index 175e798..dbf2202 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -4,8 +4,11 @@ import sys import traceback + +from io import StringIO from operator import itemgetter -from typing import AnyStr, Optional +from pathlib import Path +from typing import AnyStr, Dict, List, NamedTuple, Optional, Tuple, Union try: from avclass.common import AvLabels, Taxonomy @@ -17,127 +20,25 @@ from avclass import clustering as ec, util -def guess_hash(h: AnyStr) -> Optional[AnyStr]: - """ - Guess hash type based on ``len(h)`` - - :param h: The hash - :return: The hash type (str) - """ - """ Given a hash string, guess the hash type based on the string length """ - hlen = len(h) - if hlen == 32: - return "md5" - elif hlen == 40: - return "sha1" - elif hlen == 64: - return "sha256" - - return None - - -def format_tag_pairs(l, taxonomy: Taxonomy = None) -> AnyStr: - """ - Get ranked tags as a string. - - :param l: - :param taxonomy: - :return: - """ - # TODO - wtf is ``l``? 
- if not l: - return "" - - if taxonomy is not None: - p = taxonomy.get_path(l[0][0]) - else: - p = l[0][0] - - out = "%s|%d" % (p, l[0][1]) - for t, s in l[1:]: - if taxonomy is not None: - p = taxonomy.get_path(t) - else: - p = t - out += ",%s|%d" % (p, s) - - return out - - -def list_str(l, sep: AnyStr = ", ", prefix: AnyStr = "") -> AnyStr: - """ - Return list as a string - - :param l: The list - :param sep: The separator - :param prefix: The prefix - :return: A string representation of the list - """ - # TODO - wtf is ``l``? - if not l: - return "" - out = prefix + l[0] - for s in l[1:]: - out = out + sep + s - return out - - -def main(): - # TODO - break this function up. - args = parse_args() - # Select hash used to identify sample, by default MD5 - hash_type = args.hash or "md5" - - # If ground truth provided, read it from file - gt_dict = {} - if args.gt: - with open(args.gt, "r") as gt_fd: - for line in gt_fd: - gt_hash, family = map(str, line.strip().split("\t", 1)) - gt_dict[gt_hash] = family - - # Guess type of hash in ground truth file - hash_type = guess_hash(list(gt_dict.keys())[0]) - - # Create AvLabels object - av_labels = AvLabels(args.tag, args.exp, args.tax, args.av, args.aliasdetect) - - # Build list of input files - # NOTE: duplicate input files are not removed - ifile_l = [] - if args.vt: - ifile_l += args.vt - ifile_are_vt = True - elif args.lb: - ifile_l += args.lb - ifile_are_vt = False - elif args.vtdir: - ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)] - ifile_are_vt = True - elif args.lbdir: - ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)] - ifile_are_vt = False - else: - # TODO - is this reachable? 
- sys.exit(1) - - # Select correct sample info extraction function - if not ifile_are_vt: - get_sample_info = av_labels.get_sample_info_lb - elif args.vt3: - get_sample_info = av_labels.get_sample_info_vt_v3 - else: - get_sample_info = av_labels.get_sample_info_vt_v2 - - # Select output prefix - out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0]) - - # Initialize state +class AVClass2: + output = [] + av_labels = None + hash_type = None + ground_truth = None + get_sample_info = None + console = False + av_tags = False + stats_export = False + compatibility_v1 = False + pup_classify = False + path_export = False + vt_tags = False + vt_all = 0 first_token_dict = {} token_count_map = {} pair_count_map = {} - vt_all = 0 avtags_dict = {} + gt_dict = {} stats = { "samples": 0, "noscans": 0, @@ -150,211 +51,391 @@ def main(): "UNK": 0, } - for ifile in ifile_l: - fd = open(ifile, "r") - sys.stderr.write("[-] Processing input file %s\n" % ifile) - - for line in fd: - if not line.strip(): + def __init__(self, av_labels: AvLabels): + self.av_labels = av_labels + + def run( + self, + files: Union[ + AnyStr, + List[AnyStr], + Path, + List[Path], + StringIO, + List[StringIO], + Dict, + List[Dict], + ], + data_type: str = "vt3", + hash_type: Optional[AnyStr] = "md5", + ground_truth: Optional[AnyStr] = None, + stats_export: bool = False, + vt_tags: bool = False, + av_tags: bool = False, + pup_classify: bool = False, + path_export: bool = False, + compatibility_v1: bool = False, + console: bool = False, + ) -> List[Dict]: + # Set class arguments + self.console = console + self.ground_truth = ground_truth + self.av_tags = av_tags + self.stats_export = stats_export + self.compatibility_v1 = compatibility_v1 + self.pup_classify = pup_classify + self.path_export = path_export + self.vt_tags = vt_tags + + # Select hash used to identify sample, by default MD5 + self.hash_type = self.get_hash_type(hash_type) + + # Select file type used for sampling + self.get_sample_info = 
self.av_labels.get_sample_call(data_type) + + # Select output prefix + out_prefix = os.path.basename(os.path.splitext(files[0])[0]) + + # Process each input file + if not isinstance(files, list): + files = [files] + for ifile in files: + # Open file + if isinstance(ifile, dict): + self.process_line(ifile) continue + elif isinstance(ifile, StringIO): + fd = ifile + else: + fd = open(ifile, "r") + + # Debug info, file processed + self.print_error("[-] Processing input file %s\n" % ifile) + + # Process all lines in file + for line in fd: + self.process_line(line) # Debug info - if vt_all % 100 == 0: - sys.stderr.write("\r[-] %d JSON read\n" % vt_all) - sys.stderr.flush() - vt_all += 1 + self.print_error("\r[-] %d JSON read" % self.vt_all, flush=True) + self.print_error("\n") - vt_rep = json.loads(line) - sample_info = get_sample_info(vt_rep) + # Close file + fd.close() - if sample_info is None: - try: - name = vt_rep["md5"] - sys.stderr.write("\nNo scans for %s\n" % name) - except KeyError: - sys.stderr.write("\nCould not process: %s\n" % line) + # Print statistics + self.print_statistics() - sys.stderr.flush() - stats["noscans"] += 1 - continue + # If ground truth, print precision, recall, and F1-measure + if self.ground_truth: + self.ground_truth_print() - # Sample's name is selected hash type (md5 by default) - name = getattr(sample_info, hash_type) + # Output stats + if self.stats_export: + self.out_stats(out_prefix) - # If the VT report has no AV labels, output and continue - if not sample_info.labels: - sys.stdout.write("%s\t-\t[]\n" % name) - # sys.stderr.write('\nNo AV labels for %s\n' % name) - # sys.stderr.flush() - continue + # Output vendor info + if self.av_tags: + self.out_avtags(out_prefix) - # Compute VT_Count - vt_count = len(sample_info.labels) + # If alias detection, print map + if self.av_labels.alias_detect: + self.alias_detection(out_prefix, path_export) - # Get the distinct tokens from all the av labels in the report and print them. 
- try: - av_tmp = av_labels.get_sample_tags(sample_info) - tags = av_labels.rank_tags(av_tmp) - - # AV VENDORS PER TOKEN - if args.avtags: - for t in av_tmp: - tmap = avtags_dict.get(t, {}) - for av in av_tmp[t]: - ctr = tmap.get(av, 0) - tmap[av] = ctr + 1 - avtags_dict[t] = tmap - - if args.aliasdetect: - prev_tokens = set() - for entry in tags: - curr_tok = entry[0] - curr_count = token_count_map.get(curr_tok, 0) - token_count_map[curr_tok] = curr_count + 1 - for prev_tok in prev_tokens: - if prev_tok < curr_tok: - pair = prev_tok, curr_tok - else: - pair = curr_tok, prev_tok - pair_count = pair_count_map.get(pair, 0) - pair_count_map[pair] = pair_count + 1 - prev_tokens.add(curr_tok) - - # Collect stats - # TODO - should iterate once over tags for both stats and aliasdetect - if tags: - stats["tagged"] += 1 - if args.stats: - if vt_count > 3: - stats["maltagged"] += 1 - cat_map = { - "FAM": False, - "CLASS": False, - "BEH": False, - "FILE": False, - "UNK": False, - } - for t in tags: - path, cat = av_labels.taxonomy.get_info(t[0]) - cat_map[cat] = True - for c in cat_map: - if cat_map[c]: - stats[c] += 1 - - # Check if sample is PUP, if requested - if args.pup: - if av_labels.is_pup(tags, av_labels.taxonomy): - is_pup_str = "\t1" - else: - is_pup_str = "\t0" - else: - is_pup_str = "" - - # Select family for sample if needed, - # i.e., for compatibility mode or for ground truth - fam = "SINGLETON:" + name - if args.c or args.gt: - for t, s in tags: - cat = av_labels.taxonomy.get_category(t) - if cat in ["UNK", "FAM"]: - fam = t - break - - first_token_dict[name] = fam - gt_family = "\t" + gt_dict.get(name, "") - else: - gt_family = "" + return self.output - # Get VT tags as string - if args.vtt: - vtt = list_str(sample_info.vt_tags, prefix="\t") - else: - vtt = "" - - # Print family (and ground truth if available) to stdout - if not args.c: - if args.path: - tag_str = format_tag_pairs(tags, av_labels.taxonomy) - else: - tag_str = format_tag_pairs(tags) - 
sys.stdout.write( - "%s\t%d\t%s%s%s%s\n" - % (name, vt_count, tag_str, gt_family, is_pup_str, vtt) - ) - else: - sys.stdout.write("%s\t%s%s%s\n" % name, fam, gt_family, is_pup_str) - except: - traceback.print_exc(file=sys.stderr) - continue + def process_line(self, line: Union[AnyStr, Dict]): + if isinstance(line, str): + # If blank line, skip + if line == "\n": + return - sys.stderr.write("\r[-] %d JSON read" % vt_all) - sys.stderr.flush() - sys.stderr.write("\n") + # Debug info + if self.vt_all % 100 == 0: + self.print_error("\r[-] %d JSON read\n" % self.vt_all, flush=True) + self.vt_all += 1 - fd.close() + # Read JSON line + vt_rep = json.loads(line) + else: + vt_rep = line - # Print statistics - sys.stderr.write( - "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" - % (vt_all, stats["noscans"], vt_all - stats["tagged"], len(gt_dict)) - ) + # Extract sample info + sample_info = self.get_sample_info(vt_rep) - # If ground truth, print precision, recall, and F1-measure - if args.gt: - precision, recall, fmeasure = ec.eval_precision_recall_fmeasure( - gt_dict, first_token_dict + # If no sample info, log error and continue + if sample_info is None: + try: + name = vt_rep["md5"] + self.print_error("\nNo scans for %s\n" % name, flush=True) + except KeyError: + self.print_error("\nCould not process: %s\n" % line, flush=True) + self.stats["noscans"] += 1 + return + + # Get the distinct tokens from all the av labels in the report + # And print them. 
+ try: + self.get_tokens(sample_info) + except Exception: + traceback.print_exc(file=sys.stderr) + return + + def get_tokens(self, sample_info: NamedTuple): + # Sample's name is selected hash type (md5 by default) + name = getattr(sample_info, self.hash_type) + + # If the VT report has no AV labels, output and continue + if not sample_info.labels: + self.print_output("%s\t-\t[]\n" % (name)) + # self.print_error('\nNo AV labels for %s\n' % name, flush=True) + return + + # AV VENDORS PER TOKEN + av_tmp = self.av_labels.get_sample_tags(sample_info) + if self.av_tags: + self.av_vender_tags(av_tmp) + + tags = self.av_labels.rank_tags(av_tmp) + if self.av_labels.alias_detect: + self.av_vender_tokens(tags) + + # Compute VT_Count + vt_count = len(sample_info.labels) + + # Collect stats + # TODO: should iterate once over tags, + # for both stats and aliasdetect + if tags: + self.collect_stats(tags, vt_count) + + # Select family for sample if needed, + # i.e., for compatibility mode or for ground truth + fam, gt_family = self.get_family(name, tags) + + # Check if sample is PUP, if requested + pup_val = self.is_pup(self.pup_classify, tags) + + # Print family (and ground truth if available) + if self.compatibility_v1: + class_entry = self.avclass1_output( + name=name, + family=fam, + ground_truth=gt_family, + pup_val=pup_val, + vt_count=vt_count, + ) + self.output.append(class_entry) + else: + class_entry = self.avclass2_output( + name=name, + tags=tags, + sample_info=sample_info, + ground_truth=gt_family, + pup_val=pup_val, + vt_count=vt_count, + ) + self.output.append(class_entry) + + def avclass1_output( + self, + name: AnyStr, + family: AnyStr, + ground_truth: AnyStr, + pup_val: Optional[bool], + vt_count: int, + ) -> Dict: + """ + Build the v1 classification entry + + :param name: Hash + :param family: family classification + :param ground_truth: + :param pup_val: is a pup + :param vt_count: + :return: Dict of classification + """ + self.print_output( + "%s\t%s%s%s\n" % 
(name, family, ground_truth, self.get_pup_str(pup_val)) ) - sys.stderr.write( - "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" - % (precision, recall, fmeasure) + # Build json output + values = {"hash": name, "av_count": vt_count, "family": family} + if self.ground_truth: + values["ground_truth"] = ground_truth + if self.pup_classify: + values["pup"] = pup_val + return values + + def avclass2_output( + self, + name: AnyStr, + ground_truth: AnyStr, + pup_val: Optional[bool], + vt_count: int, + tags: List[Tuple], + sample_info: NamedTuple, + ) -> Dict: + """ + Build the v2 classification entry + + :param name: Hash + :param ground_truth: + :param pup_val: is a pup + :param vt_count: + :param tags: List of tags and their count + :param sample_info: + :return: Dict of classification + """ + # Build string output + if self.vt_tags: + vtt = self.list_str(sample_info.vt_tags, prefix="\t") + else: + vtt = "" + tag_str = self.format_tag_pairs_str( + tags, self.av_labels.taxonomy, self.path_export + ) + self.print_output( + "%s\t%d\t%s%s%s%s\n" + % (name, vt_count, tag_str, ground_truth, self.get_pup_str(pup_val), vtt) + ) + # Build json output + tag_dict = self.format_tag_pairs_list( + tags, self.av_labels.taxonomy, self.path_export ) + values = {"hash": name, "av_count": vt_count, "tags": tag_dict} + if self.ground_truth: + values["ground_truth"] = self.gt_dict.get(name, "") + if self.pup_classify: + values["pup"] = pup_val + if self.vt_tags: + values["vt_tags"] = sample_info.vt_tags + return values + + def get_family(self, name: AnyStr, tags: List[Tuple]) -> Tuple: + if self.compatibility_v1 or self.ground_truth: + fam = "SINGLETON:" + name + # fam = '' + for (t, s) in tags: + cat = self.av_labels.taxonomy.get_category(t) + if (cat == "UNK") or (cat == "FAM"): + fam = t + break + else: + fam = "" - # Output stats - if args.stats: - stats_fd = open("%s.stats" % out_prefix, "w") - num_samples = vt_all - stats_fd.write("Samples: %d\n" % num_samples) - num_tagged = 
stats["tagged"] - frac = float(num_tagged) / float(num_samples) * 100 - stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac)) - num_maltagged = stats["maltagged"] - frac = float(num_maltagged) / float(num_samples) * 100 - stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac)) - for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]: - count = stats[c] - frac = float(count) / float(num_maltagged) * 100 - stats_fd.write("%s: %d (%.01f%%)\n" % (c, stats[c], frac)) - stats_fd.close() + # Get ground truth family, if available + if self.ground_truth: + self.first_token_dict[name] = fam + gt_family = "\t" + self.gt_dict.get(name, "") + else: + gt_family = "" + return (fam, gt_family) + + def collect_stats(self, tags: List[Tuple], vt_count: int): + self.stats["tagged"] += 1 + if self.stats_export and vt_count > 3: + self.stats["maltagged"] += 1 + cat_map = { + "FAM": False, + "CLASS": False, + "BEH": False, + "FILE": False, + "UNK": False, + } + for t in tags: + cat = self.av_labels.taxonomy.get_info(t[0])[1] + cat_map[cat] = True + for c in cat_map: + if cat_map[c]: + self.stats[c] += 1 + + def av_vender_tags(self, av_tmp: Dict): + for t in av_tmp: + tmap = self.avtags_dict.get(t, {}) + for av in av_tmp[t]: + ctr = tmap.get(av, 0) + tmap[av] = ctr + 1 + self.avtags_dict[t] = tmap + + def av_vender_tokens(self, tags: List[Tuple]): + prev_tokens = set() + for entry in tags: + curr_tok = entry[0] + curr_count = self.token_count_map.get(curr_tok, 0) + self.token_count_map[curr_tok] = curr_count + 1 + for prev_tok in prev_tokens: + if prev_tok < curr_tok: + pair = (prev_tok, curr_tok) + else: + pair = (curr_tok, prev_tok) + pair_count = self.pair_count_map.get(pair, 0) + self.pair_count_map[pair] = pair_count + 1 + prev_tokens.add(curr_tok) + + def get_pup_str(self, is_pup: Optional[bool] = None) -> AnyStr: + if is_pup is True: + return "\t1" + elif is_pup is False: + return "\t0" + else: + return "" - # Output vendor info - if args.avtags: - avtags_fd 
= open("%s.avtags" % out_prefix, "w") - for t in sorted(avtags_dict.keys()): - avtags_fd.write("%s\t" % t) - pairs = sorted( - avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True + def is_pup(self, pup_classify: bool, tags: List[Tuple]) -> Optional[bool]: + if pup_classify: + if self.av_labels.is_pup(tags, self.av_labels.taxonomy): + is_pup = True + else: + is_pup = False + else: + is_pup = None + return is_pup + + def get_hash_type(self, hash_type: Optional[AnyStr] = None) -> AnyStr: + if self.ground_truth: + with open(self.ground_truth, "r") as gt_fd: + for line in gt_fd: + gt_hash, family = map(str, line.strip().split("\t", 1)) + self.gt_dict[gt_hash] = family + # Guess type of hash in ground truth file + return self.guess_hash(list(self.gt_dict.keys())[0]) + else: + return hash_type if hash_type else "md5" + + def print_statistics(self): + self.print_error( + "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" + % ( + self.vt_all, + self.stats["noscans"], + self.vt_all - self.stats["tagged"], + len(self.gt_dict), ) + ) - for pair in pairs: - avtags_fd.write("%s|%d," % (pair[0], pair[1])) - avtags_fd.write("\n") - avtags_fd.close() + def ground_truth_print(self): + # If ground truth, print precision, recall, and F1-measure + precision, recall, fmeasure = ec.eval_precision_recall_fmeasure( + self.gt_dict, self.first_token_dict + ) + self.print_error( + "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" + % (precision, recall, fmeasure) + ) - # If alias detection, print map - if args.aliasdetect: + def alias_detection(self, out_prefix: AnyStr, path_export: bool = False): + # Open alias file alias_filename = out_prefix + ".alias" alias_fd = open(alias_filename, "w+") # Sort token pairs by number of times they appear together - sorted_pairs = sorted(pair_count_map.items(), key=itemgetter(1)) - # sorted_pairs = sorted( - # pair_count_map.items()) + sorted_pairs = sorted(self.pair_count_map.items(), key=itemgetter(1)) + # sorted_pairs = 
sorted(self.pair_count_map.items()) # Output header line alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") # Compute token pair statistic and output to alias file - for t1, t2, c in sorted_pairs: - n1 = token_count_map[t1] - n2 = token_count_map[t2] + for (t1, t2), c in sorted_pairs: + n1 = self.token_count_map[t1] + n2 = self.token_count_map[t2] if n1 < n2: x = t1 y = t2 @@ -367,12 +448,215 @@ def main(): yn = n1 f = float(c) / float(xn) finv = float(c) / float(yn) + if path_export: + x = self.av_labels.taxonomy.get_path(x) + y = self.av_labels.taxonomy.get_path(y) alias_fd.write( "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv) ) # Close alias file alias_fd.close() - sys.stderr.write("[-] Alias data in %s\n" % alias_filename) + self.print_error("[-] Alias data in %s\n" % (alias_filename)) + + def out_avtags(self, out_prefix: AnyStr): + avtags_fd = open("%s.avtags" % out_prefix, "w") + for t in sorted(self.avtags_dict.keys()): + avtags_fd.write("%s\t" % t) + pairs = sorted( + self.avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True + ) + for pair in pairs: + avtags_fd.write("%s|%d," % (pair[0], pair[1])) + avtags_fd.write("\n") + avtags_fd.close() + + def out_stats(self, out_prefix: AnyStr): + # Output stats + stats_fd = open("%s.stats" % out_prefix, "w") + num_samples = self.vt_all + stats_fd.write("Samples: %d\n" % num_samples) + num_tagged = self.stats["tagged"] + frac = float(num_tagged) / float(num_samples) * 100 + stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac)) + num_maltagged = self.stats["maltagged"] + frac = float(num_maltagged) / float(num_samples) * 100 + stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac)) + for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]: + count = self.stats[c] + frac = float(count) / float(num_maltagged) * 100 + stats_fd.write("%s: %d (%.01f%%)\n" % (c, self.stats[c], frac)) + stats_fd.close() + + def guess_hash(self, h: AnyStr) -> 
Optional[AnyStr]: + """ + Guess hash type based on ``len(h)`` + + :param h: The hash + :return: The hash type (str) + """ + hlen = len(h) + if hlen == 32: + return "md5" + elif hlen == 40: + return "sha1" + elif hlen == 64: + return "sha256" + return None + + def format_tag_pairs_str( + self, tags: List[Tuple], taxonomy: Taxonomy = None, path_export: bool = False + ) -> AnyStr: + """ + Get ranked tags as a string. + + :param tags: + :param taxonomy: + :return: List of tags + """ + if not tags: + return "" + if path_export and taxonomy is not None: + p = taxonomy.get_path(tags[0][0]) + else: + p = tags[0][0] + out = "%s|%d" % (p, tags[0][1]) + for (t, s) in tags[1:]: + if path_export and taxonomy is not None: + p = taxonomy.get_path(t) + else: + p = t + out += ",%s|%d" % (p, s) + return out + + def format_tag_pairs_list( + self, tags: List[Tuple], taxonomy: Taxonomy = None, path_export: bool = False + ) -> List[Dict]: + """ + Get ranked tags as a list dictionary. + + :param tags: + :param taxonomy: + :return: List of tags + """ + out = [] + for (tag, count) in tags: + values = {"tag": tag, "count": count} + if path_export and taxonomy: + values["category"] = taxonomy.get_category(tag) + values["path"] = taxonomy.get_path(tag) + out.append(values) + return out + + def list_str( + self, vt_tags: Optional[Dict], sep: AnyStr = ", ", prefix: AnyStr = "" + ) -> AnyStr: + """ + Return list as a string + + :param vt_tags: The list of virus total tags + :param sep: The separator + :param prefix: The prefix + :return: A string representation of the list + """ + if not vt_tags or len(vt_tags) == 0: + return "" + out = prefix + vt_tags[0] + for s in vt_tags[1:]: + out = out + sep + s + return out + + def print_error(self, output: str = "", flush=False): + if self.console: + # TODO - would this be better? 
print(output, file=sys.stderr, flush=flush, end="") + sys.stderr.write(output) + if flush: + sys.stderr.flush() + + def print_output(self, output: str = ""): + if self.console: + sys.stdout.write(output) + + +def main(): + args = parse_args() + # Create AvLabels object + av_labels = AvLabels( + tag_file=args.tag, + tax_file=args.tax, + exp_file=args.exp, + av_file=args.av, + alias_detect=args.aliasdetect, + ) + # Build list of input files + # TODO: File selection should be rewritten as it is difficult to add new types. + # Would be nice to just have '-i or --input', detect if its a directory or file, + # then use a new arg string to specify the data type ["vt2", "vt3", "lb"] + files, data_type = get_files( + vt=args.vt, + lb=args.lb, + vtdir=args.vtdir, + lbdir=args.lbdir, + vt3=args.vt3, + ) + av_class = AVClass2(av_labels=av_labels) + result = av_class.run( + files=files, + data_type=data_type, + hash_type=args.hash, + stats_export=args.stats, + vt_tags=args.vtt, + av_tags=args.avtags, + ground_truth=args.gt, + pup_classify=args.pup, + path_export=args.path, + compatibility_v1=args.c, + console=not args.json, + ) + if args.json: + print(json.dumps(result)) + + +def get_files( + vt: Optional[str] = None, + lb: Optional[str] = None, + vtdir: Optional[str] = None, + lbdir: Optional[str] = None, + vt3: Optional[bool] = False, +) -> Tuple: + """ + Return list as a string + + :param vt: vt file + :param lb: lb file + :param vtdir: vt directory + :param lbdir: lb directory + :param vt3: vt3 json format + :return: A Tuple of files and type + """ + # NOTE: duplicate input files are not removed + ifile_l = [] + ifile_are_vt = None + if vt: + ifile_l += vt + ifile_are_vt = True + if lb: + ifile_l += lb + ifile_are_vt = False + if vtdir: + ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)] + ifile_are_vt = True + if lbdir: + ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)] + ifile_are_vt = False + + # Select correct sample info extraction function + 
if not ifile_are_vt: + data_type = "lb" + elif vt3: + data_type = "vt3" + else: + data_type = "vt2" + return ifile_l, data_type def parse_args(): @@ -458,6 +742,10 @@ def parse_args(): "-aliasdetect", action="store_true", help="if used produce aliases file at end" ) + argparser.add_argument( + "-json", "--json", action="store_true", help="output console to json" + ) + argparser.add_argument( "-stats", action="store_true", @@ -482,9 +770,10 @@ def parse_args(): ) exit(1) + devnull = "/dev/null" # TODO - consider letting argparse handle this? if args.tag: - if args.tag == "/dev/null": + if args.tag == devnull: sys.stderr.write("[-] Using no tagging rules\n") else: sys.stderr.write("[-] Using tagging rules in %s\n" % args.tag) @@ -495,7 +784,7 @@ def parse_args(): # TODO - consider letting argparse handle this? if args.tax: - if args.tax == "/dev/null": + if args.tax == devnull: sys.stderr.write("[-] Using no taxonomy\n") else: sys.stderr.write("[-] Using taxonomy in %s\n" % args.tax) @@ -504,7 +793,7 @@ def parse_args(): # TODO - consider letting argparse handle this? if args.exp: - if args.exp == "/dev/null": + if args.exp == devnull: sys.stderr.write("[-] Using no expansion tags\n") else: sys.stderr.write("[-] Using expansion tags in %s\n" % args.exp) From 45af907b0580a15637a5561ea5eaad76c8348095 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 1 Feb 2021 17:33:30 -0500 Subject: [PATCH 16/36] Changed Class Name, Deprecated lb, vt, lbdir, vtdir, vt3 Reduced arguments to --input and --type. Handles multiple files or directories. This makes it easier to add additonal inputs, such as metadefender I left the old arguments in there for backward compatibility, but we may just want to remove them. 
--- avclass/labeler.py | 99 +++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 36 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index dbf2202..7f0c85d 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -20,7 +20,7 @@ from avclass import clustering as ec, util -class AVClass2: +class AVClassLabeler: output = [] av_labels = None hash_type = None @@ -113,8 +113,13 @@ def run( self.print_error("[-] Processing input file %s\n" % ifile) # Process all lines in file - for line in fd: - self.process_line(line) + try: + for line in fd: + self.process_line(line) + except json.decoder.JSONDecodeError: + if isinstance(ifile, str): + self.print_error("Error parsing %s (possible incorrect file type\n" % ifile) + continue # Debug info self.print_error("\r[-] %d JSON read" % self.vt_all, flush=True) @@ -588,17 +593,16 @@ def main(): alias_detect=args.aliasdetect, ) # Build list of input files - # TODO: File selection should be rewritten as it is difficult to add new types. 
- # Would be nice to just have '-i or --input', detect if its a directory or file, - # then use a new arg string to specify the data type ["vt2", "vt3", "lb"] files, data_type = get_files( + file_input=args.input, + data_type=args.type, vt=args.vt, lb=args.lb, vtdir=args.vtdir, lbdir=args.lbdir, vt3=args.vt3, ) - av_class = AVClass2(av_labels=av_labels) + av_class = AVClassLabeler(av_labels=av_labels) result = av_class.run( files=files, data_type=data_type, @@ -617,10 +621,12 @@ def main(): def get_files( - vt: Optional[str] = None, - lb: Optional[str] = None, - vtdir: Optional[str] = None, - lbdir: Optional[str] = None, + file_input: Optional[AnyStr]=None, + data_type: Optional[AnyStr]=None, + vt: Optional[AnyStr]=None, + lb: Optional[AnyStr]=None, + vtdir: Optional[AnyStr]=None, + lbdir: Optional[AnyStr]=None, vt3: Optional[bool] = False, ) -> Tuple: """ @@ -633,29 +639,40 @@ def get_files( :param vt3: vt3 json format :return: A Tuple of files and type """ - # NOTE: duplicate input files are not removed ifile_l = [] ifile_are_vt = None - if vt: - ifile_l += vt - ifile_are_vt = True - if lb: - ifile_l += lb - ifile_are_vt = False - if vtdir: - ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)] - ifile_are_vt = True - if lbdir: - ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)] - ifile_are_vt = False + if file_input: + for fi in file_input: + if os.path.isdir(fi): + for f in os.listdir(fi): + dir_file = os.path.join(fi, f) + if dir_file not in ifile_l: + ifile_l.append(dir_file) + elif fi not in ifile_l: + ifile_l.append(fi) + else: + # NOTE: duplicate input files are not removed + if vt: + ifile_l += vt + ifile_are_vt = True + if lb: + ifile_l += lb + ifile_are_vt = False + if vtdir: + ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)] + ifile_are_vt = True + if lbdir: + ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)] + ifile_are_vt = False # Select correct sample info extraction function - if not ifile_are_vt: 
- data_type = "lb" - elif vt3: - data_type = "vt3" - else: - data_type = "vt2" + if not data_type: + if not ifile_are_vt: + data_type = "lb" + elif vt3: + data_type = "vt3" + else: + data_type = "vt2" return ifile_l, data_type @@ -669,24 +686,34 @@ def parse_args(): argparser.add_argument( "-vt", action="append", - help="file with VT reports (Can be provided multiple times)", + help="DEPRECATED (use -i & -type): file with VT reports (Can be provided multiple times)", ) argparser.add_argument( "-lb", action="append", - help="file with simplified JSON reports " + help="DEPRECATED (use -i & -type): file with simplified JSON reports " "{md5,sha1,sha256,scan_date,av_labels} (Can be provided " "multiple times)", ) - argparser.add_argument("-vtdir", help="existing directory with VT reports") + argparser.add_argument("-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports") argparser.add_argument( - "-lbdir", help="existing directory with simplified JSON reports" + "-lbdir", help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports" ) - argparser.add_argument("-vt3", action="store_true", help="input are VT v3 files") + argparser.add_argument("-vt3", action="store_true", help="DEPRECATED (use -type): input are VT v3 files") + + argparser.add_argument( + "-i", "--input", + action="append", + help="input report file or directory (Can be provided multiple times)" + ) + + argparser.add_argument( + "-type", "--type", help="the type of report (vt2, vt3, lb)" + ) argparser.add_argument( "-gt", @@ -756,7 +783,7 @@ def parse_args(): args = argparser.parse_args() # TODO - use non-exclusive group to ensure at least one is selected instead of this - if not args.vt and not args.lb and not args.vtdir and not args.lbdir: + if not args.input and not args.vt and not args.lb and not args.vtdir and not args.lbdir: sys.stderr.write( "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n" ) From 
a1bcb255f7c97b041465a27c8fa1fe5133c54cf3 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 1 Feb 2021 17:42:12 -0500 Subject: [PATCH 17/36] black reformatting --- avclass/labeler.py | 48 ++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index 7f0c85d..b09e3cf 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -118,7 +118,9 @@ def run( self.process_line(line) except json.decoder.JSONDecodeError: if isinstance(ifile, str): - self.print_error("Error parsing %s (possible incorrect file type\n" % ifile) + self.print_error( + "Error parsing %s (possible incorrect file type\n" % ifile + ) continue # Debug info @@ -621,12 +623,12 @@ def main(): def get_files( - file_input: Optional[AnyStr]=None, - data_type: Optional[AnyStr]=None, - vt: Optional[AnyStr]=None, - lb: Optional[AnyStr]=None, - vtdir: Optional[AnyStr]=None, - lbdir: Optional[AnyStr]=None, + file_input: Optional[AnyStr] = None, + data_type: Optional[AnyStr] = None, + vt: Optional[AnyStr] = None, + lb: Optional[AnyStr] = None, + vtdir: Optional[AnyStr] = None, + lbdir: Optional[AnyStr] = None, vt3: Optional[bool] = False, ) -> Tuple: """ @@ -697,24 +699,30 @@ def parse_args(): "multiple times)", ) - argparser.add_argument("-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports") - argparser.add_argument( - "-lbdir", help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports" + "-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports" ) - argparser.add_argument("-vt3", action="store_true", help="DEPRECATED (use -type): input are VT v3 files") + argparser.add_argument( + "-lbdir", + help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports", + ) argparser.add_argument( - "-i", "--input", - action="append", - help="input report file or directory (Can be provided multiple times)" + "-vt3", + action="store_true", + 
help="DEPRECATED (use -type): input are VT v3 files", ) - + argparser.add_argument( - "-type", "--type", help="the type of report (vt2, vt3, lb)" + "-i", + "--input", + action="append", + help="input report file or directory (Can be provided multiple times)", ) + argparser.add_argument("-type", "--type", help="the type of report (vt2, vt3, lb)") + argparser.add_argument( "-gt", help="file with ground truth. If provided it evaluates clustering accuracy. " @@ -783,7 +791,13 @@ def parse_args(): args = argparser.parse_args() # TODO - use non-exclusive group to ensure at least one is selected instead of this - if not args.input and not args.vt and not args.lb and not args.vtdir and not args.lbdir: + if ( + not args.input + and not args.vt + and not args.lb + and not args.vtdir + and not args.lbdir + ): sys.stderr.write( "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n" ) From 5cbe0408cb694fa2a898ada7f115aece3e687c30 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Tue, 2 Feb 2021 17:13:27 -0500 Subject: [PATCH 18/36] Just removed -vt -lb -vtdir -lbdir -vt3, use -i & -t --- avclass/labeler.py | 116 ++++++++------------------------------------- 1 file changed, 20 insertions(+), 96 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index b09e3cf..a9cc928 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -66,7 +66,7 @@ def run( Dict, List[Dict], ], - data_type: str = "vt3", + data_type: Optional[AnyStr] = "vt3", hash_type: Optional[AnyStr] = "md5", ground_truth: Optional[AnyStr] = None, stats_export: bool = False, @@ -91,6 +91,7 @@ def run( self.hash_type = self.get_hash_type(hash_type) # Select file type used for sampling + data_type = data_type if data_type else "vt3" self.get_sample_info = self.av_labels.get_sample_call(data_type) # Select output prefix @@ -572,14 +573,14 @@ def list_str( out = out + sep + s return out - def print_error(self, output: str = "", flush=False): + def print_error(self, output: AnyStr = "", 
flush=False): if self.console: # TODO - would this be better? print(output, file=sys.stderr, flush=flush, end="") sys.stderr.write(output) if flush: sys.stderr.flush() - def print_output(self, output: str = ""): + def print_output(self, output: AnyStr = ""): if self.console: sys.stdout.write(output) @@ -595,19 +596,13 @@ def main(): alias_detect=args.aliasdetect, ) # Build list of input files - files, data_type = get_files( + files = get_files( file_input=args.input, - data_type=args.type, - vt=args.vt, - lb=args.lb, - vtdir=args.vtdir, - lbdir=args.lbdir, - vt3=args.vt3, ) av_class = AVClassLabeler(av_labels=av_labels) result = av_class.run( files=files, - data_type=data_type, + data_type=args.type, hash_type=args.hash, stats_export=args.stats, vt_tags=args.vtt, @@ -624,25 +619,14 @@ def main(): def get_files( file_input: Optional[AnyStr] = None, - data_type: Optional[AnyStr] = None, - vt: Optional[AnyStr] = None, - lb: Optional[AnyStr] = None, - vtdir: Optional[AnyStr] = None, - lbdir: Optional[AnyStr] = None, - vt3: Optional[bool] = False, -) -> Tuple: +) -> List[AnyStr]: """ - Return list as a string - - :param vt: vt file - :param lb: lb file - :param vtdir: vt directory - :param lbdir: lb directory - :param vt3: vt3 json format - :return: A Tuple of files and type + Return List of the files to process + + :param file_input: file or directory to process + :return: List of type str """ ifile_l = [] - ifile_are_vt = None if file_input: for fi in file_input: if os.path.isdir(fi): @@ -652,30 +636,7 @@ def get_files( ifile_l.append(dir_file) elif fi not in ifile_l: ifile_l.append(fi) - else: - # NOTE: duplicate input files are not removed - if vt: - ifile_l += vt - ifile_are_vt = True - if lb: - ifile_l += lb - ifile_are_vt = False - if vtdir: - ifile_l += [os.path.join(vtdir, f) for f in os.listdir(vtdir)] - ifile_are_vt = True - if lbdir: - ifile_l += [os.path.join(lbdir, f) for f in os.listdir(lbdir)] - ifile_are_vt = False - - # Select correct sample info 
extraction function - if not data_type: - if not ifile_are_vt: - data_type = "lb" - elif vt3: - data_type = "vt3" - else: - data_type = "vt2" - return ifile_l, data_type + return ifile_l def parse_args(): @@ -685,35 +646,6 @@ def parse_args(): " recall if ground truth available", ) - argparser.add_argument( - "-vt", - action="append", - help="DEPRECATED (use -i & -type): file with VT reports (Can be provided multiple times)", - ) - - argparser.add_argument( - "-lb", - action="append", - help="DEPRECATED (use -i & -type): file with simplified JSON reports " - "{md5,sha1,sha256,scan_date,av_labels} (Can be provided " - "multiple times)", - ) - - argparser.add_argument( - "-vtdir", help="DEPRECATED (use -i & -type): existing directory with VT reports" - ) - - argparser.add_argument( - "-lbdir", - help="DEPRECATED (use -i & -type) existing directory with simplified JSON reports", - ) - - argparser.add_argument( - "-vt3", - action="store_true", - help="DEPRECATED (use -type): input are VT v3 files", - ) - argparser.add_argument( "-i", "--input", @@ -721,7 +653,9 @@ def parse_args(): help="input report file or directory (Can be provided multiple times)", ) - argparser.add_argument("-type", "--type", help="the type of report (vt2, vt3, lb)") + argparser.add_argument( + "-t", "--type", help="the type of report file (vt2, vt3, lb)" + ) argparser.add_argument( "-gt", @@ -791,25 +725,15 @@ def parse_args(): args = argparser.parse_args() # TODO - use non-exclusive group to ensure at least one is selected instead of this - if ( - not args.input - and not args.vt - and not args.lb - and not args.vtdir - and not args.lbdir - ): - sys.stderr.write( - "One of the following 4 arguments is required: " "-vt,-lb,-vtdir,-lbdir\n" - ) + if not args.input: + sys.stderr.write("Input file / directory is required: " "-i\n") exit(1) - # TODO - use mutex group for this instead of manual check - if (args.vt or args.vtdir) and (args.lb or args.lbdir): + if not args.type: + sys.stderr.write( - 
"Use either -vt/-vtdir or -lb/-lbdir. " - "Both types of input files cannot be combined.\n" + "[-] No type defined, using file type of VirusTotal v3: '-t vt3'\n" ) - exit(1) devnull = "/dev/null" # TODO - consider letting argparse handle this? From 09f2e5eb37f55e4e62ec0f5c2dfb5cc36a24dd53 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Thu, 4 Feb 2021 08:51:18 -0500 Subject: [PATCH 19/36] Tweaks to --input --- avclass/labeler.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index a9cc928..799e889 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -596,7 +596,7 @@ def main(): alias_detect=args.aliasdetect, ) # Build list of input files - files = get_files( + files = get_arg_files( file_input=args.input, ) av_class = AVClassLabeler(av_labels=av_labels) @@ -617,25 +617,24 @@ def main(): print(json.dumps(result)) -def get_files( - file_input: Optional[AnyStr] = None, +def get_arg_files( + file_input: List[AnyStr], ) -> List[AnyStr]: """ Return List of the files to process - :param file_input: file or directory to process + :param file_input: file(s) or directory to process :return: List of type str """ ifile_l = [] - if file_input: - for fi in file_input: - if os.path.isdir(fi): - for f in os.listdir(fi): - dir_file = os.path.join(fi, f) - if dir_file not in ifile_l: - ifile_l.append(dir_file) - elif fi not in ifile_l: - ifile_l.append(fi) + for fi in file_input: + if os.path.isdir(fi): + for f in os.listdir(fi): + dir_file = os.path.join(fi, f) + if dir_file not in ifile_l: + ifile_l.append(dir_file) + elif fi not in ifile_l: + ifile_l.append(fi) return ifile_l @@ -724,13 +723,11 @@ def parse_args(): args = argparser.parse_args() - # TODO - use non-exclusive group to ensure at least one is selected instead of this if not args.input: sys.stderr.write("Input file / directory is required: " "-i\n") exit(1) if not args.type: - sys.stderr.write( "[-] No type defined, using file 
type of VirusTotal v3: '-t vt3'\n" ) From f4efd26e75d56742b30a5eebb4455d8fc3d30430 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Thu, 4 Feb 2021 13:54:34 -0500 Subject: [PATCH 20/36] MetaDefender support --- avclass/common.py | 44 +++++++++++++++++++++++++++++++++++++------- avclass/labeler.py | 2 +- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index 5533b43..05e24dd 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -501,7 +501,7 @@ def get_sample_call(self, data_type: AnyStr) -> Callable: """ Return the correct parser for the report type - :param data_type: the type of file vt2, vt3, lb + :param data_type: the type of file vt2, vt3, lb, md :return: Callable function that returns SampleInfo """ if data_type == "lb": @@ -510,9 +510,11 @@ def get_sample_call(self, data_type: AnyStr) -> Callable: return self.get_sample_info_vt_v2 elif data_type == "vt3": return self.get_sample_info_vt_v3 + elif data_type == "md": + return self.get_sample_info_md else: sys.stderr.write( - "Invalid data type for sample: %s (should be vt, vt2, vt3, lb)" + "Invalid data type for sample: %s (should be vt, vt2, vt3, lb, md)" % data_type ) return self.get_sample_info_vt_v3 @@ -542,9 +544,9 @@ def get_sample_info_lb(record: Dict) -> SampleInfo: ) @staticmethod - def get_sample_info_vt_v2(record): + def get_sample_info_vt_v2(record: Dict) -> SampleInfo: """ - Convert VT (v2) JSON to a SampleInfo object + Convert VirusTotal (v2) JSON to a SampleInfo object :param record: The JSON record :return: An instance of SampleInfo @@ -572,9 +574,9 @@ def get_sample_info_vt_v2(record): return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) @staticmethod - def get_sample_info_vt_v3(record): + def get_sample_info_vt_v3(record: Dict) -> SampleInfo: """ - Convert VT (v3) JSON to a SampleInfo object + Convert VirusTotal (v3) JSON to a SampleInfo object :param record: The JSON record :return: An instance of SampleInfo @@ -602,7 +604,35 @@ def 
get_sample_info_vt_v3(record): return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) @staticmethod - def is_pup(tag_pairs, taxonomy: Taxonomy) -> Optional[bool]: + def get_sample_info_md(record: Dict) -> SampleInfo: + """ + Convert OPSWAT MetaDefender JSON to a SampleInfo object + + :param record: The JSON record + :return: An instance of SampleInfo + """ + try: + scans = record["scan_results"]["scan_details"] + md5 = record["file_info"]["md5"] + sha1 = record["file_info"]["sha1"] + sha256 = record["file_info"]["sha256"] + except KeyError: + return None + + # Obtain labels from scan results + label_pairs = [] + for av, res in scans.items(): + label = res["threat_found"] + if label is not None and res["scan_result_i"] == 1: + clean_label = "".join( + filter(lambda x: x in string.printable, label) + ).strip() + label_pairs.append((av, clean_label)) + + return SampleInfo(md5, sha1, sha256, label_pairs, []) + + @staticmethod + def is_pup(tag_pairs: List[Tuple], taxonomy: Taxonomy) -> Optional[bool]: """ Attempts to classify a sample (represented by ``tag_pairs``) as a PUP. We accomplish this by checking for the "grayware" label in the highest ranked CLASS. 
diff --git a/avclass/labeler.py b/avclass/labeler.py index 799e889..fbee834 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -653,7 +653,7 @@ def parse_args(): ) argparser.add_argument( - "-t", "--type", help="the type of report file (vt2, vt3, lb)" + "-t", "--type", help="the type of report file (vt2, vt3, lb, md)" ) argparser.add_argument( From dd3948f7a6127908e06ce15fbe1d118f0a0cec4d Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Thu, 4 Feb 2021 18:12:39 -0500 Subject: [PATCH 21/36] Copying over some changes from malicialab master (merge prep) --- .gitignore | 3 ++ avclass/common.py | 46 +++++++++++++------------- avclass/labeler.py | 15 ++++++++- avclass/update.py | 82 +++++++++++++++++++++++----------------------- 4 files changed, 81 insertions(+), 65 deletions(-) diff --git a/.gitignore b/.gitignore index 4b38bd9..1438929 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,6 @@ cython_debug/ # PyCharm .idea/ + +# Apple +.DS_Store diff --git a/avclass/common.py b/avclass/common.py index 05e24dd..a799070 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -75,7 +75,7 @@ def __init__(self, filepath: Optional[AnyStr]): :param filepath: Path to taxonomy data """ - self.__tag_map = {} + self._tag_map = {} if filepath: self.read_taxonomy(filepath) @@ -86,7 +86,7 @@ def __len__(self) -> int: :return: The length (int) of the Taxonomy """ return ( - len(self.__tag_map) // 2 + len(self._tag_map) // 2 ) # TODO - perhaps there should be two dicts, one for names, one for paths? 
def is_generic(self, tag: AnyStr) -> bool: @@ -96,7 +96,7 @@ def is_generic(self, tag: AnyStr) -> bool: :param tag: The tag :return: Boolean """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) return getattr(t, "cat", None) == "GEN" def is_tag(self, tag: AnyStr) -> bool: @@ -106,7 +106,7 @@ def is_tag(self, tag: AnyStr) -> bool: :param tag: The tag :return: Boolean """ - return tag in self.__tag_map + return tag in self._tag_map def add_tag(self, s: AnyStr, override: bool = False): """ @@ -117,18 +117,18 @@ def add_tag(self, s: AnyStr, override: bool = False): :return: None """ tag = create_tag(s) - t = self.__tag_map.get(tag.name, None) + t = self._tag_map.get(tag.name, None) if t and (t.path != tag.path): if override: logger.warning("[Taxonomy] Replacing %s with %s\n" % t.path, tag.path) - del self.__tag_map[t.path] + del self._tag_map[t.path] else: return logger.debug("[Taxonomy] Adding tag %s" % s) - self.__tag_map[tag.name] = tag - self.__tag_map[tag.path] = tag + self._tag_map[tag.name] = tag + self._tag_map[tag.path] = tag def remove_tag(self, tag: AnyStr) -> bool: """ @@ -137,11 +137,11 @@ def remove_tag(self, tag: AnyStr) -> bool: :param tag: The tag to remove :return: Whether or not the tag was present """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) if tag: logger.debug("[Taxonomy] Removing tag: %s" % t.path) - del self.__tag_map[t.name] - del self.__tag_map[t.path] + del self._tag_map[t.name] + del self._tag_map[t.path] return t is not None def get_category(self, tag: AnyStr) -> AnyStr: @@ -151,7 +151,7 @@ def get_category(self, tag: AnyStr) -> AnyStr: :param tag: The tag :return: The category """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) return getattr(t, "cat", "UNK") def get_path(self, tag: AnyStr) -> AnyStr: @@ -161,7 +161,7 @@ def get_path(self, tag: AnyStr) -> AnyStr: :param tag: The tag :return: The tag's path """ - t = self.__tag_map.get(tag, None) + t = 
self._tag_map.get(tag, None) return getattr(t, "path", f"UNK:{tag}") def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]: @@ -171,7 +171,7 @@ def get_prefix_l(self, tag: AnyStr) -> List[AnyStr]: :param tag: The tag :return: The tag's prefix list """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) return getattr(t, "prefix_l", []) def get_prefix(self, tag: AnyStr) -> List[AnyStr]: @@ -181,7 +181,7 @@ def get_prefix(self, tag: AnyStr) -> List[AnyStr]: :param tag: The tag :return: String representation of the tag's full prefix """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) tag_pfx = tag.path.split(":")[:-1] return t.prefix_l if t else tag_pfx @@ -192,7 +192,7 @@ def get_depth(self, tag: AnyStr) -> int: :param tag: The tag :return: The depth (int) of the tag """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) if t: return len(tag.prefix_l) + 2 return 0 @@ -204,7 +204,7 @@ def get_info(self, tag: AnyStr) -> Tuple[AnyStr, AnyStr]: :param tag: The tag :return: Tuple containing tag.path and tag.cat """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) if t: return t.path, t.cat return f"UNK:{tag}", "UNK" @@ -216,9 +216,9 @@ def expand(self, tag: AnyStr) -> List[AnyStr]: :param tag: The tag :return: A list of prefixes """ - t = self.__tag_map.get(tag, None) + t = self._tag_map.get(tag, None) if t: - return [x for x in t.prefix_l if x in self.__tag_map] + return [x for x in t.prefix_l if x in self._tag_map] return [] def platform_tags(self) -> Set[AnyStr]: @@ -229,7 +229,7 @@ def platform_tags(self) -> Set[AnyStr]: """ return { tag.name - for _, tag in self.__tag_map.items() + for _, tag in self._tag_map.items() if tag.path.startswith(platform_prefix) } @@ -286,7 +286,7 @@ def to_file(self, filepath: AnyStr): :return: None """ with open(filepath, "w") as fd: - tag_l = sorted(self.__tag_map.items(), key=lambda item: item[1].path) + tag_l = sorted(self._tag_map.items(), key=lambda 
item: item[1].path) idx = 0 for name, tag in tag_l: if (idx % 2) == 0: @@ -784,12 +784,12 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]] duplicates.add(label) - label = self.__remove_suffixes(av_name, label) + label = self._remove_suffixes(av_name, label) hashes = [sample_info.md5, sample_info.sha1, sample_info.sha256] tags = self.get_label_tags(label, hashes) # NOTE: Avoid expansion when aliases are set - expanded_tags = tags if self.alias_detect else self.__expand(tags) + expanded_tags = tags if self.alias_detect else self._expand(tags) # store av vendors for each tag for t in expanded_tags: diff --git a/avclass/labeler.py b/avclass/labeler.py index fbee834..957c1f8 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -1,4 +1,5 @@ import argparse +import gzip import os import json import sys @@ -75,6 +76,7 @@ def run( pup_classify: bool = False, path_export: bool = False, compatibility_v1: bool = False, + gzipped: bool = False, console: bool = False, ) -> List[Dict]: # Set class arguments @@ -108,7 +110,10 @@ def run( elif isinstance(ifile, StringIO): fd = ifile else: - fd = open(ifile, "r") + if gzipped: + fd = gzip.open(ifile, "rt") + else: + fd = open(ifile, "r") # Debug info, file processed self.print_error("[-] Processing input file %s\n" % ifile) @@ -611,6 +616,7 @@ def main(): pup_classify=args.pup, path_export=args.path, compatibility_v1=args.c, + gzipped=args.gzip, console=not args.json, ) if args.json: @@ -662,6 +668,13 @@ def parse_args(): "Prints precision, recall, F1-measure.", ) + argparser.add_argument( + "-gz", + "--gzip", + help="file with JSON reports is gzipped", + action="store_true", + ) + argparser.add_argument( "-vtt", help="Include VT tags in the output.", action="store_true" ) diff --git a/avclass/update.py b/avclass/update.py index a2bc73b..4e9d5ea 100644 --- a/avclass/update.py +++ b/avclass/update.py @@ -30,11 +30,11 @@ class Update: def __init__(self, rel_filepath: AnyStr, in_taxonomy: 
Taxonomy, in_translation: Translation, in_expansion: Expansion, n, t): - self.__out_taxonomy = in_taxonomy - self.__out_translation = in_translation - self.__out_expansion = in_expansion - self.__n = n - self.__t = t + self._out_taxonomy = in_taxonomy + self._out_translation = in_translation + self._out_expansion = in_expansion + self._n = n + self._t = t # Initialize blacklist self.blist = in_taxonomy.platform_tags() # Maps src -> cnt @@ -55,8 +55,8 @@ def is_weak_rel(self, rel: Relation) -> bool: :param rel: The relationship :return: Boolean """ - return ((int(rel.nalias_num) < self.__n) or - (float(rel.talias_num) < self.__t)) + return ((int(rel.nalias_num) < self._n) or + (float(rel.talias_num) < self._t)) def is_blacklisted_rel(self, rel: Relation) -> bool: """ @@ -77,16 +77,16 @@ def is_known_rel(self, rel: Relation) -> bool: t1 = rel.t1 t2 = rel.t2 # Known taxonomy relation - if self.__out_taxonomy.overlaps(t1, t2): + if self._out_taxonomy.overlaps(t1, t2): return True # Known expansion rule - t1_dst = self.__out_expansion.get_dst(t1) - t2_dst = self.__out_expansion.get_dst(t2) + t1_dst = self._out_expansion.get_dst(t1) + t2_dst = self._out_expansion.get_dst(t2) if t2 in t1_dst or t1 in t2_dst: return True # Known tagging rule - t1_dst = sorted(self.__out_translation.get_dst(t1)) - t2_dst = sorted(self.__out_translation.get_dst(t2)) + t1_dst = sorted(self._out_translation.get_dst(t1)) + t2_dst = sorted(self._out_translation.get_dst(t2)) if t2 in t1_dst or t1 in t2_dst: return True # Known alias in tagging @@ -102,9 +102,9 @@ def add_tag(self, name: AnyStr, path: AnyStr): :param path: The full path :return: None """ - dst = self.__out_translation.get_dst(name) + dst = self._out_translation.get_dst(name) if not dst: - self.__out_taxonomy.add_tag(path) + self._out_taxonomy.add_tag(path) def add_expansion(self, src: AnyStr, dst_l: Collection[AnyStr]): """ @@ -116,19 +116,19 @@ def add_expansion(self, src: AnyStr, dst_l: Collection[AnyStr]): """ ''' Add expansion 
rule fixing destination if src in tagging ''' # Select source handling aliases - dst = self.__out_translation.get_dst(src) + dst = self._out_translation.get_dst(src) if dst: new_src = dst[0] else: new_src = src # Select destinations removing overlaps with existing rule - dst = self.__out_expansion.get_dst(src) + dst = self._out_expansion.get_dst(src) if dst: dst.extend(dst_l) - target_l = self.__out_taxonomy.remove_overlaps(dst) - self.__out_expansion.add_rule(new_src, target_l, True) + target_l = self._out_taxonomy.remove_overlaps(dst) + self._out_expansion.add_rule(new_src, target_l, True) else: - self.__out_expansion.add_rule(new_src, dst_l, True) + self._out_expansion.add_rule(new_src, dst_l, True) def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr): """ @@ -140,7 +140,7 @@ def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr): :return: None """ # If src in tagging, use most popular target - tr_dst = self.__out_translation.get_dst(src) + tr_dst = self._out_translation.get_dst(src) target = dst if tr_dst: cnt_max = self.src_map[dst] @@ -149,17 +149,17 @@ def add_alias(self, src: AnyStr, dst: AnyStr, dst_prefix: AnyStr): if cnt > cnt_max: target = e # If dst is in tagging, update tagging rule destination, - tr_dst = self.__out_translation.get_dst(dst) + tr_dst = self._out_translation.get_dst(dst) if tr_dst: target_l = tr_dst # else add dst to taxonomy else: target_l = [target] - self.__out_taxonomy.add_tag('%s:%s' % (dst_prefix, dst)) + self._out_taxonomy.add_tag('%s:%s' % (dst_prefix, dst)) # Remove src from taxonomy - self.__out_taxonomy.remove_tag(src) + self._out_taxonomy.remove_tag(src) # Replace tagging rule - self.__out_translation.add_rule(src, target_l, True) + self._out_translation.add_rule(src, target_l, True) def is_expansion_rel(self, rel: Relation) -> bool: """ @@ -168,8 +168,8 @@ def is_expansion_rel(self, rel: Relation) -> bool: :param rel: The relation :return: Boolean """ - c1 = self.__out_taxonomy.get_category(rel.t1) 
- c2 = self.__out_taxonomy.get_category(rel.t2) + c1 = self._out_taxonomy.get_category(rel.t1) + c2 = self._out_taxonomy.get_category(rel.t2) return (((c1 == "FAM") and (c2 != c1) and (c2 != "UNK")) or ((c1 == "CLASS") and ((c2 == "FILE") or (c2 == "BEH"))) or ((c1 == "UNK") and ((c2 == "BEH") or (c2 == "CLASS")))) @@ -182,11 +182,11 @@ def find_expansions(self): """ acc = [] for rel in self.rel_set: - p1 = self.__out_taxonomy.get_path(rel.t1) - p2 = self.__out_taxonomy.get_path(rel.t2) + p1 = self._out_taxonomy.get_path(rel.t1) + p2 = self._out_taxonomy.get_path(rel.t2) logger.debug("Processing %s\t%s" % (p1, p2)) # Ignore relations where t1 is an alias - dst = self.__out_translation.get_dst(rel.t1) + dst = self._out_translation.get_dst(rel.t1) if dst: logger.debug("Ignoring relation for alias %s" % p1) continue @@ -198,16 +198,16 @@ def find_expansions(self): # def is_alias_rel(self, rel): # ''' Return true if relation implies alias rule ''' - # c1 = self.__out_taxonomy.get_category(rel.t1) - # c2 = self.__out_taxonomy.get_category(rel.t2) + # c1 = self._out_taxonomy.get_category(rel.t1) + # c2 = self._out_taxonomy.get_category(rel.t2) # return (((c1 == "UNK") and (c2 == "FAM")) or # ((c1 == "UNK") and (c2 == "UNK"))) # def find_aliases(self): # ''' Find aliases among relations ''' # for rel in self.rel_set: - # c1 = self.__out_taxonomy.get_category(rel.t1) - # c2 = self.__out_taxonomy.get_category(rel.t2) + # c1 = self._out_taxonomy.get_category(rel.t1) + # c2 = self._out_taxonomy.get_category(rel.t2) # if self.is_alias_rel(rel): # self.G.add_node(rel.t1) # self.G.add_node(rel.t2) @@ -223,8 +223,8 @@ def process_relation(self, rel: Relation): """ t1 = rel.t1 t2 = rel.t2 - p1, c1 = self.__out_taxonomy.get_info(rel.t1) - p2, c2 = self.__out_taxonomy.get_info(rel.t2) + p1, c1 = self._out_taxonomy.get_info(rel.t1) + p2, c2 = self._out_taxonomy.get_info(rel.t2) logger.debug("Processing %s\t%s" % (p1, p2)) @@ -388,19 +388,19 @@ def output_relations(self, filepath: 
AnyStr): with open(filepath, 'w') as fd: fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") sorted_rules = sorted(self.rel_set, - key=lambda r: (self.__out_taxonomy.get_category(r.t1), - self.__out_taxonomy.get_category(r.t2))) + key=lambda r: (self._out_taxonomy.get_category(r.t1), + self._out_taxonomy.get_category(r.t2))) for rel in sorted_rules: - p1, c1 = self.__out_taxonomy.get_info(rel.t1) - p2, c2 = self.__out_taxonomy.get_info(rel.t2) + p1, c1 = self._out_taxonomy.get_info(rel.t1) + p2, c2 = self._out_taxonomy.get_info(rel.t2) fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num, rel.talias_num, rel.tinv_alias_num)) def output_rule_stats(self, fd: TextIO): # Compute rule statistics for rel in self.rel_set: - c1 = self.__out_taxonomy.get_category(rel.t1) - c2 = self.__out_taxonomy.get_category(rel.t2) + c1 = self._out_taxonomy.get_category(rel.t1) + c2 = self._out_taxonomy.get_category(rel.t2) self.cat_pairs_map[(c1, c2)] = self.cat_pairs_map.get((c1, c2), 0) + 1 self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1 # Output statistics From fc2d7def568fecefaac787c602d4367874aadd79 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 11:28:05 -0500 Subject: [PATCH 22/36] AVLabels default --- avclass/cli.py | 63 ++++++++++++++++++++++++++++++++++++++++++++--- avclass/common.py | 7 +++--- avclass/util.py | 55 ----------------------------------------- 3 files changed, 64 insertions(+), 61 deletions(-) diff --git a/avclass/cli.py b/avclass/cli.py index 76e2ad3..0586c3e 100644 --- a/avclass/cli.py +++ b/avclass/cli.py @@ -1,6 +1,63 @@ import argparse +import logging from avclass import util +from avclass.common import Taxonomy, Translation, Expansion +from typing import AnyStr + + +logger = logging.getLogger(__name__) + +__all__ = ( + 'validate_expansion', + 'validate_tagging', + 'validate_taxonomy', +) + +def validate_taxonomy(path: AnyStr): + """ + Validate and normalize a Taxonomy created from 
``path`` + + :param path: Location on disk of a Taxonomy file + :return: Taxonomy object + """ + taxonomy = Taxonomy(path) + taxonomy.to_file(path) + + logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path)) + + return taxonomy + + +def validate_tagging(path: AnyStr, taxonomy: Taxonomy): + """ + Validate and normalize Tagging created from ``path`` and verified against ``taxonomy`` + + :param path: Location on disk of a Tagging file + :param taxonomy: Valid Taxonomy object + :return: None + """ + tagging = Translation(path) + tagging.validate(taxonomy) + # tagging.expand_all_destinations() + tagging.to_file(path) + + logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path)) + + +def validate_expansion(path: AnyStr, taxonomy: Taxonomy): + """ + Validate and normalize Expansion created from ``path`` and verified against ``taxonomy`` + + :param path: Location on disk of an Expansion file + :param taxonomy: Valid Taxonomy object + :return: None + """ + expansion = Expansion(path) + expansion.validate(taxonomy) + expansion.to_file(path) + + logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path)) def validate_files(): @@ -17,6 +74,6 @@ def validate_files(): args = parser.parse_args() - taxonomy = util.validate_taxonomy(args.tax) - util.validate_tagging(args.tag, taxonomy) - util.validate_expansion(args.exp, taxonomy) + taxonomy = validate_taxonomy(args.tax) + validate_tagging(args.tag, taxonomy) + validate_expansion(args.exp, taxonomy) diff --git a/avclass/common.py b/avclass/common.py index afb6b76..a480747 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -4,6 +4,7 @@ import string import sys +from avclass import util from collections import defaultdict, namedtuple from typing import AnyStr, Callable, Collection, Dict, List, Optional, Set, Tuple, Union @@ -505,9 +506,9 @@ class AvLabels: def __init__( self, - tag_file: AnyStr, - exp_file: AnyStr = None, - tax_file: AnyStr = None, + tag_file: 
AnyStr = util.DEFAULT_TAG_PATH, + exp_file: AnyStr = util.DEFAULT_EXP_PATH, + tax_file: AnyStr = util.DEFAULT_TAX_PATH, av_file: AnyStr = None, alias_detect: bool = False, ): diff --git a/avclass/util.py b/avclass/util.py index 028bc36..e5d8bab 100755 --- a/avclass/util.py +++ b/avclass/util.py @@ -1,25 +1,15 @@ import atexit -import logging import pkg_resources from avclass import data -from avclass.common import Taxonomy, Translation, Expansion - -from typing import AnyStr __all__ = ( 'DEFAULT_EXP_PATH', 'DEFAULT_TAG_PATH', 'DEFAULT_TAX_PATH', - 'validate_expansion', - 'validate_tagging', - 'validate_taxonomy', ) - -logger = logging.getLogger(__name__) - RESOURCE_EXP = "default.expansion" RESOURCE_TAG = "default.tagging" RESOURCE_TAX = "default.taxonomy" @@ -39,48 +29,3 @@ atexit.register(pkg_resources.cleanup_resources) - -def validate_taxonomy(path: AnyStr): - """ - Validate and normalize a Taxonomy created from ``path`` - - :param path: Location on disk of a Taxonomy file - :return: Taxonomy object - """ - taxonomy = Taxonomy(path) - taxonomy.to_file(path) - - logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path)) - - return taxonomy - - -def validate_tagging(path: AnyStr, taxonomy: Taxonomy): - """ - Validate and normalize Tagging created from ``path`` and verified against ``taxonomy`` - - :param path: Location on disk of a Tagging file - :param taxonomy: Valid Taxonomy object - :return: None - """ - tagging = Translation(path) - tagging.validate(taxonomy) - # tagging.expand_all_destinations() - tagging.to_file(path) - - logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path)) - - -def validate_expansion(path: AnyStr, taxonomy: Taxonomy): - """ - Validate and normalize Expansion created from ``path`` and verified against ``taxonomy`` - - :param path: Location on disk of an Expansion file - :param taxonomy: Valid Taxonomy object - :return: None - """ - expansion = Expansion(path) - expansion.validate(taxonomy) - 
expansion.to_file(path) - - logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path)) From 6b64ed6ba38efecb66e05d0f7afe57624875d3f8 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 11:44:52 -0500 Subject: [PATCH 23/36] AVClassLabeler default, formatting --- avclass/cli.py | 29 +++++++++++++---------------- avclass/common.py | 21 +++++++++++---------- avclass/labeler.py | 2 +- avclass/util.py | 7 +++---- 4 files changed, 28 insertions(+), 31 deletions(-) diff --git a/avclass/cli.py b/avclass/cli.py index 0586c3e..ad528ff 100644 --- a/avclass/cli.py +++ b/avclass/cli.py @@ -9,11 +9,12 @@ logger = logging.getLogger(__name__) __all__ = ( - 'validate_expansion', - 'validate_tagging', - 'validate_taxonomy', + "validate_expansion", + "validate_tagging", + "validate_taxonomy", ) + def validate_taxonomy(path: AnyStr): """ Validate and normalize a Taxonomy created from ``path`` @@ -24,7 +25,7 @@ def validate_taxonomy(path: AnyStr): taxonomy = Taxonomy(path) taxonomy.to_file(path) - logger.info('[-] Normalized %d tags in taxonomy %s\n' % (len(taxonomy), path)) + logger.info("[-] Normalized %d tags in taxonomy %s\n" % (len(taxonomy), path)) return taxonomy @@ -42,7 +43,7 @@ def validate_tagging(path: AnyStr, taxonomy: Taxonomy): # tagging.expand_all_destinations() tagging.to_file(path) - logger.info('[-] Normalized %d tagging rules in %s\n' % (len(tagging), path)) + logger.info("[-] Normalized %d tagging rules in %s\n" % (len(tagging), path)) def validate_expansion(path: AnyStr, taxonomy: Taxonomy): @@ -57,20 +58,16 @@ def validate_expansion(path: AnyStr, taxonomy: Taxonomy): expansion.validate(taxonomy) expansion.to_file(path) - logger.info('[-] Normalized %d expansion rules in %s\n' % (len(expansion), path)) + logger.info("[-] Normalized %d expansion rules in %s\n" % (len(expansion), path)) def validate_files(): - parser = argparse.ArgumentParser(description='Checks format of files Tagging, Expansion and Taxonomy.') - 
parser.add_argument('-exp', - help='expansion file', - default=util.DEFAULT_EXP_PATH) - parser.add_argument('-tag', - help='tagging file', - default=util.DEFAULT_TAG_PATH) - parser.add_argument('-tax', - help='taxonomy file', - default=util.DEFAULT_TAX_PATH) + parser = argparse.ArgumentParser( + description="Checks format of files Tagging, Expansion and Taxonomy." + ) + parser.add_argument("-exp", help="expansion file", default=util.DEFAULT_EXP_PATH) + parser.add_argument("-tag", help="tagging file", default=util.DEFAULT_TAG_PATH) + parser.add_argument("-tax", help="taxonomy file", default=util.DEFAULT_TAX_PATH) args = parser.parse_args() diff --git a/avclass/common.py b/avclass/common.py index a480747..d812523 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -40,7 +40,8 @@ class Tag: - ''' A Tag in the taxonomy ''' + """ A Tag in the taxonomy """ + def __init__(self, s): word_list = s.strip().split(":") if len(word_list) > 1: @@ -49,8 +50,8 @@ def __init__(self, s): self._prefix_l = [x.lower() for x in word_list[1:-1]] path = self._cat for x in self._prefix_l: - path = path + ':' + x - self._path = path + ':' + self._name + path = path + ":" + x + self._path = path + ":" + self._name else: self._name = word_list[0].lower() self._cat = uncategorized_cat @@ -58,27 +59,27 @@ def __init__(self, s): self._path = self._name def __hash__(self): - ''' Return hash ''' + """ Return hash """ return hash((self._path)) @property def name(self): - ''' Return tag name ''' + """ Return tag name """ return self._name @property def cat(self): - ''' Return tag category ''' + """ Return tag category """ return self._cat @property def path(self): - ''' Return tag path ''' + """ Return tag path """ return self._path @property def prefix_l(self): - ''' Return tag prefix list ''' + """ Return tag prefix list """ return self._prefix_l @@ -107,7 +108,7 @@ def __len__(self) -> int: return len(self._tags) def __iter__(self): - ''' Iterator over the alphabetically sorted tags in the 
taxonomy ''' + """ Iterator over the alphabetically sorted tags in the taxonomy """ return (t for t in sorted(self._tags)) def is_generic(self, tag: AnyStr) -> bool: @@ -522,7 +523,7 @@ def __init__( def get_sample_call(self, data_type: AnyStr) -> Callable: """ Return the correct parser for the report type - + :param data_type: the type of file vt2, vt3, lb, md :return: Callable function that returns SampleInfo """ diff --git a/avclass/labeler.py b/avclass/labeler.py index 957c1f8..c1d7d67 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -52,7 +52,7 @@ class AVClassLabeler: "UNK": 0, } - def __init__(self, av_labels: AvLabels): + def __init__(self, av_labels: AvLabels = AvLabels()): self.av_labels = av_labels def run( diff --git a/avclass/util.py b/avclass/util.py index e5d8bab..8a21db2 100755 --- a/avclass/util.py +++ b/avclass/util.py @@ -5,9 +5,9 @@ __all__ = ( - 'DEFAULT_EXP_PATH', - 'DEFAULT_TAG_PATH', - 'DEFAULT_TAX_PATH', + "DEFAULT_EXP_PATH", + "DEFAULT_TAG_PATH", + "DEFAULT_TAX_PATH", ) RESOURCE_EXP = "default.expansion" @@ -28,4 +28,3 @@ DEFAULT_TAX_PATH = pkg_resources.resource_filename(data.__name__, RESOURCE_TAX) atexit.register(pkg_resources.cleanup_resources) - From c9e9c6741ab8f7b2b2c9313cf46b773c49efa7d1 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 13:42:01 -0500 Subject: [PATCH 24/36] Add stats, avtags, alias to json output --- .gitignore | 5 ++ avclass/labeler.py | 116 ++++++++++++++++++++++++++++++++------------- 2 files changed, 89 insertions(+), 32 deletions(-) diff --git a/.gitignore b/.gitignore index 1438929..bd84ced 100644 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,8 @@ cython_debug/ # Apple .DS_Store + +# Output +*.avtags +*.stats +*.alias diff --git a/avclass/labeler.py b/avclass/labeler.py index c1d7d67..0f0259b 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -22,7 +22,7 @@ class AVClassLabeler: - output = [] + output = {"labels": []} av_labels = None hash_type = None ground_truth = None @@ 
-97,7 +97,10 @@ def run( self.get_sample_info = self.av_labels.get_sample_call(data_type) # Select output prefix - out_prefix = os.path.basename(os.path.splitext(files[0])[0]) + if isinstance(files, list) and isinstance(files[0], str): + out_prefix = os.path.basename(os.path.splitext(files[0])[0]) + else: + out_prefix = None # Process each input file if not isinstance(files, list): @@ -147,7 +150,7 @@ def run( if self.stats_export: self.out_stats(out_prefix) - # Output vendor info + # Output av vendor info if self.av_tags: self.out_avtags(out_prefix) @@ -238,7 +241,7 @@ def get_tokens(self, sample_info: NamedTuple): pup_val=pup_val, vt_count=vt_count, ) - self.output.append(class_entry) + self.output["labels"].append(class_entry) else: class_entry = self.avclass2_output( name=name, @@ -248,7 +251,7 @@ def get_tokens(self, sample_info: NamedTuple): pup_val=pup_val, vt_count=vt_count, ) - self.output.append(class_entry) + self.output["labels"].append(class_entry) def avclass1_output( self, @@ -434,17 +437,28 @@ def ground_truth_print(self): "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % (precision, recall, fmeasure) ) + self.output["ground_truth"] = { + "precision": "%.2f" % precision, + "recall": "%.2f" % recall, + "f1-measure": "%.2f" % fmeasure, + } def alias_detection(self, out_prefix: AnyStr, path_export: bool = False): - # Open alias file - alias_filename = out_prefix + ".alias" - alias_fd = open(alias_filename, "w+") + self.output["alias"] = [] + alias_fd = None + alias_filename = None # Sort token pairs by number of times they appear together sorted_pairs = sorted(self.pair_count_map.items(), key=itemgetter(1)) # sorted_pairs = sorted(self.pair_count_map.items()) - # Output header line - alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") + # Open alias file + if out_prefix: + alias_filename = out_prefix + ".alias" + alias_fd = open(alias_filename, "w+") + # Output header line + alias_fd.write( + "# 
t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n" + ) # Compute token pair statistic and output to alias file for (t1, t2), c in sorted_pairs: n1 = self.token_count_map[t1] @@ -464,41 +478,79 @@ def alias_detection(self, out_prefix: AnyStr, path_export: bool = False): if path_export: x = self.av_labels.taxonomy.get_path(x) y = self.av_labels.taxonomy.get_path(y) - alias_fd.write( - "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv) + self.output["alias"].append( + { + "tag1_label": x, + "tag2_label": y, + "tag1": xn, + "tag2": yn, + "tag1^tag2": c, + "tag1^tag2/tag1": f, + "tag1^tag2/tag2": finv, + } ) - # Close alias file - alias_fd.close() - self.print_error("[-] Alias data in %s\n" % (alias_filename)) + if out_prefix: + alias_fd.write( + "%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % (x, y, xn, yn, c, f, finv) + ) + if out_prefix: + # Close alias file + alias_fd.close() + self.print_error("[-] Alias data in %s\n" % (alias_filename)) def out_avtags(self, out_prefix: AnyStr): - avtags_fd = open("%s.avtags" % out_prefix, "w") - for t in sorted(self.avtags_dict.keys()): - avtags_fd.write("%s\t" % t) + if out_prefix: + avtags_fd = open("%s.avtags" % out_prefix, "w") + for t in sorted(self.avtags_dict.keys()): + avtags_fd.write("%s\t" % t) + pairs = sorted( + self.avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True + ) + for pair in pairs: + avtags_fd.write("%s|%d," % (pair[0], pair[1])) + avtags_fd.write("\n") + avtags_fd.close() + self.output["av_tags"] = {} + for tag in sorted(self.avtags_dict.keys()): + self.output["av_tags"][tag] = [] pairs = sorted( - self.avtags_dict[t].items(), key=lambda pair: pair[1], reverse=True + self.avtags_dict[tag].items(), key=lambda pair: pair[1], reverse=True ) for pair in pairs: - avtags_fd.write("%s|%d," % (pair[0], pair[1])) - avtags_fd.write("\n") - avtags_fd.close() + self.output["av_tags"][tag].append({"name": pair[0], "count": pair[1]}) def out_stats(self, out_prefix: AnyStr): # Output stats - 
stats_fd = open("%s.stats" % out_prefix, "w") num_samples = self.vt_all - stats_fd.write("Samples: %d\n" % num_samples) num_tagged = self.stats["tagged"] - frac = float(num_tagged) / float(num_samples) * 100 - stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, frac)) + tag_frac = float(num_tagged) / float(num_samples) * 100 + num_maltagged = self.stats["maltagged"] - frac = float(num_maltagged) / float(num_samples) * 100 - stats_fd.write("Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, frac)) - for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]: - count = self.stats[c] + maltag_frac = float(num_maltagged) / float(num_samples) * 100 + if out_prefix: + stats_fd = open("%s.stats" % out_prefix, "w") + stats_fd.write("Samples: %d\n" % num_samples) + stats_fd.write("Tagged (all): %d (%.01f%%)\n" % (num_tagged, tag_frac)) + stats_fd.write( + "Tagged (VT>3): %d (%.01f%%)\n" % (num_maltagged, maltag_frac) + ) + for c in ["FILE", "CLASS", "BEH", "FAM", "UNK"]: + count = self.stats[c] + frac = float(count) / float(num_maltagged) * 100 + stats_fd.write("%s: %d (%.01f%%)\n" % (c, self.stats[c], frac)) + stats_fd.close() + self.output["stats"] = { + "samples": num_samples, + "tagged_all": {"count": num_tagged, "ratio": "%.01f%%" % tag_frac}, + "tagged_vt3": {"count": num_maltagged, "ratio": "%.01f%%" % maltag_frac}, + "category": [], + } + for cat in ["FILE", "CLASS", "BEH", "FAM", "UNK"]: + count = self.stats[cat] frac = float(count) / float(num_maltagged) * 100 - stats_fd.write("%s: %d (%.01f%%)\n" % (c, self.stats[c], frac)) - stats_fd.close() + self.output["stats"]["category"].append( + {cat: {"count": count, "ratio": "%.01f%%" % frac}} + ) def guess_hash(self, h: AnyStr) -> Optional[AnyStr]: """ From 72c42a0fb5ce741483ec0cd3ae63112e3eb16c2f Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 13:59:42 -0500 Subject: [PATCH 25/36] Just detect gz by magic bytes --- avclass/labeler.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff 
--git a/avclass/labeler.py b/avclass/labeler.py index 0f0259b..2343fe4 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -76,7 +76,6 @@ def run( pup_classify: bool = False, path_export: bool = False, compatibility_v1: bool = False, - gzipped: bool = False, console: bool = False, ) -> List[Dict]: # Set class arguments @@ -113,7 +112,7 @@ def run( elif isinstance(ifile, StringIO): fd = ifile else: - if gzipped: + if self.is_gz_file(ifile): fd = gzip.open(ifile, "rt") else: fd = open(ifile, "r") @@ -630,6 +629,10 @@ def list_str( out = out + sep + s return out + def is_gz_file(self, filepath): + with open(filepath, "rb") as test_f: + return test_f.read(2) == b"\x1f\x8b" + def print_error(self, output: AnyStr = "", flush=False): if self.console: # TODO - would this be better? print(output, file=sys.stderr, flush=flush, end="") @@ -668,7 +671,6 @@ def main(): pup_classify=args.pup, path_export=args.path, compatibility_v1=args.c, - gzipped=args.gzip, console=not args.json, ) if args.json: @@ -707,7 +709,7 @@ def parse_args(): "-i", "--input", action="append", - help="input report file or directory (Can be provided multiple times)", + help="input report file (plain or gzip) or directory. 
(Can be provided multiple times)", ) argparser.add_argument( @@ -720,13 +722,6 @@ def parse_args(): "Prints precision, recall, F1-measure.", ) - argparser.add_argument( - "-gz", - "--gzip", - help="file with JSON reports is gzipped", - action="store_true", - ) - argparser.add_argument( "-vtt", help="Include VT tags in the output.", action="store_true" ) From 7fb6bd1b3ced5c5c6bfe8c922584fff0513aa222 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 15:30:26 -0500 Subject: [PATCH 26/36] Readme update, path_export fix, renamed class argument --- README.md | 216 ++++++++++++++++++++++++++++++++++----------- avclass/labeler.py | 18 ++-- setup.py | 2 +- 3 files changed, 176 insertions(+), 60 deletions(-) diff --git a/README.md b/README.md index c6eae7a..6d92cf4 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ -# AVClass and AVClass2 +# AVClass -AVClass and AVClass2 are Python tools to tag / label malware samples. -You give them as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) -and they output tags extracted from the AV labels of each sample. -The original AVClass only outputs family names (i.e., family tags). -By default, it outputs the most likely family for each sample (e.g., *zbot*, *virut*). +AVClass is a Python package / command line tool to tag / label malware samples. +You input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) +and it outputs tags extracted from the AV labels of each sample. +AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). +It can also be run in compatibility mode `-c` (AVClass 1.x) to only output the family names (i.e., family tags). It can also output a ranking of all alternative family names it found for each sample. 
-The newer AVClass2, in addition to family names, also outputs other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). -A quick example helps illustrating the differences. If you run AVClass2 on our example input file: + +A quick example helps illustrating the differences of compatibility mode. If you run AVClass on our example input file: ```shell -$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p +$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p ``` the output on stdout is: @@ -27,40 +27,150 @@ was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. -If you instead run AVClass on the same input file: +If you instead run AVClass on the same input file in compatibility mode: ```shell -$./avclass/avclass_labeler.py -lb examples/malheurReference_lb.json +$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -c ``` -the output looks like this: +the output looks like this, which simply reports the most common family name for each sample. ``` aca2d12934935b070df8f50e06a20539 adrotator 67d15459e1f85898851148511c86d88d adultbrowser ``` -which simply reports the most common family name for each sample. - -In a nutshell, that is the main difference between both tools. -Of course, there are more options for both tools, -which you can read about in their corresponding README files. +The output can also be formatted as **JSON**. +```shell +$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p -json +``` +the output on stdout is: -## Which one should I use? 
+```json +{ + "labels": [ + { + "hash": "aca2d12934935b070df8f50e06a20539", + "av_count": 33, + "tags": [ + { + "tag": "grayware", + "count": 9, + "category": "CLASS", + "path": "CLASS:grayware" + }, + { + "tag": "adware", + "count": 9, + "category": "CLASS", + "path": "CLASS:grayware:adware" + }, + { + "tag": "windows", + "count": 8, + "category": "FILE", + "path": "FILE:os:windows" + }, + { + "tag": "adrotator", + "count": 8, + "category": "FAM", + "path": "FAM:adrotator" + }, + { + "tag": "execdownload", + "count": 3, + "category": "BEH", + "path": "BEH:execdownload" + }, + { + "tag": "downloader", + "count": 3, + "category": "CLASS", + "path": "CLASS:downloader" + }, + { + "tag": "zlob", + "count": 2, + "category": "FAM", + "path": "FAM:zlob" + } + ] + } + ] +} +``` -AVClass2 is the newer tool and it extracts more information -from the input AV labels. -So, if you are new to AVClass and AVClass2, we recommend trying it out first. +Or it can be used as a Python package: +```py +import json +from avclass.labeler import AVClassLabeler + +av_class = AVClassLabeler() +result = av_class.run( + files="./examples/malheurReference_lb.json", + data_type="lb", + path_export=True, +) +print(json.dumps(result)) +``` +the output on stdout is: -However, there are several reasons to keep AVClass around. -First, it is more mature and used by many analysts, -so we want to preserve backwards compatibility. -Second, for some applications only family names are needed and -for that AVClass is enough. -Third, AVClass is faster than AVClass2 since it extracts less info. -The lower runtime is nice when processing millions of samples and -not requiring the extra tags AVClass2 provides. 
+```json +{ + "labels": [ + { + "hash": "aca2d12934935b070df8f50e06a20539", + "av_count": 33, + "tags": [ + { + "tag": "grayware", + "count": 9, + "category": "CLASS", + "path": "CLASS:grayware" + }, + { + "tag": "adware", + "count": 9, + "category": "CLASS", + "path": "CLASS:grayware:adware" + }, + { + "tag": "windows", + "count": 8, + "category": "FILE", + "path": "FILE:os:windows" + }, + { + "tag": "adrotator", + "count": 8, + "category": "FAM", + "path": "FAM:adrotator" + }, + { + "tag": "execdownload", + "count": 3, + "category": "BEH", + "path": "BEH:execdownload" + }, + { + "tag": "downloader", + "count": 3, + "category": "CLASS", + "path": "CLASS:downloader" + }, + { + "tag": "zlob", + "count": 2, + "category": "FAM", + "path": "FAM:zlob" + } + ] + } + ] +} +``` ## References @@ -80,16 +190,15 @@ The design and evaluation of AVClass2 is detailed in our AVClass2: Massive Malware Tag Extraction from AV Labels. In proceedings of the Annual Computer Security Applications Conference, December 2020. -## Why are AVClass and AVClass2 useful? +## Why is AVClass useful? Because a lot of times security researchers want to extract family and other information from AV labels, but this process is not as simple as it looks, especially if you need to do it for large numbers (e.g., millions) of samples. -Some advantages of AVClass and AVClass2 are: +Some advantages of AVClass are: 1. *Automatic.* They remove manual analysis limitations on the size of the -input -dataset. +input dataset. 2. *Vendor-agnostic.* They operate on the labels of any available set of AV engines, which can vary from sample to sample. @@ -100,7 +209,7 @@ engines, e.g., Windows or Android malware. 4. *Does not require executables.* AV labels can be obtained from online services like VirusTotal using a sample's hash, even when the executable is not available. -5. *Quantified accuracy.* We have evaluated AVClass and AVClass2 on millions of +5. 
*Quantified accuracy.* We have evaluated AVClass 2.x on millions of samples and publicly available malware datasets with ground truth. Evaluation details are in the RAID 2016 and ACSAC 2020 papers. @@ -110,21 +219,21 @@ these tools. ## Limitations -The main limitations of AVClass and AVClass2 are that its output depends +The main limitation of AVClass is that the output depends on the input AV labels. -Both tools try to compensate for the noise on the AV labels, +The tool tries to compensate for the noise on the AV labels, but cannot identify tags if AV engines do not provide non-generic tokens in the labels of a sample. -In particular, they cannot tag samples if at least 2 AV engines +In particular, it cannot tag samples if at least 2 AV engines do not agree on a tag. -Still, there are many samples that both tools can tag -and thus we believe you will find them useful. +Still, there are many samples that it can tag +and thus we believe you will find it useful. We recommend you to read the RAID 2016 and ACSAC 2020 papers for more details. ## Input JSON format -AVClass and AVClass2 support three input JSON formats: +AVClass supports four input JSON formats: 1. VirusTotal v2 API JSON reports (*-vt file*), where each line in the input *file* should be the full JSON of a @@ -133,7 +242,7 @@ e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apike There is an example VirusTotal v2 input file in examples/vtv2_sample.json ```shell -$./avclass2/avclass2_labeler.py -vt examples/vtv2_sample.json -p > output.txt +$./avclass/labeler.py -i examples/vtv2_sample.json -t vt2 -p > output.txt ``` 2. 
VirusTotal v3 API JSON reports (*-vt file -vt3*), @@ -142,7 +251,7 @@ e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} There is an example VirusTotal v3 input file in examples/vtv3_sample.json ```shell -$./avclass2/avclass2_labeler.py -vt examples/vtv3_sample.json -p -vt3 > output.txt +$./avclass/labeler.py -i examples/vtv3_sample.json -p -t vt3 > output.txt ``` 3. Simplified JSON (*-lb file*), @@ -152,16 +261,23 @@ with (at least) these fields: There is an example of such input file in *examples/malheurReference_lb.json* ```shell -$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p > output.txt +$./avclass/labeler.py -i examples/malheurReference_lb.json -t lb -p > output.txt +``` + +4. Metadefender JSON (*-md file*), +where each line in *file* should be a JSON + +```shell +$./avclass/labeler.py -i examples/malheurReference_lb.json -t md -p > output.txt ``` **Why have a simplified JSON format?** We believe most users will get the AV labels using VirusTotal. -However, AVClass and AVClass2 are IO-bound and a VirusTotal report +However, AVClass is IO-bound and a VirusTotal report in addition to the AV labels and hashes includes -much other data that the tools do not need. -Thus, when applying AVClass or AVClass2 to millions of samples, +a lot of other data that the tools do not need. +Thus, when applying AVClass to millions of samples, reducing the input file size by removing unnnecessary data significantly improves efficiency. Furthermore, users could obtain AV labels from other sources and @@ -170,8 +286,8 @@ the easier to convert those AV labels into an input file. ## Dependencies -AVClass and AVClass2 are both written in Python. -They should both run on Python versions above 2.7 and 3.0. +AVClass is written in Python. +It should be run on Python versions >= 3.6. They do not require installing any dependencies. @@ -182,11 +298,11 @@ pull request through GitHub. 
## License -AVClass and AVClass2 are both released under the MIT license +AVClass is released under the MIT license ## Contributors Several members of the MaliciaLab at the [IMDEA Software Institute](http://software.imdea.org) -have contributed code to AVClasss and AVClass2: +have contributed code to AVClass: Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, Silvia Sebastián, and Juan Caballero. diff --git a/avclass/labeler.py b/avclass/labeler.py index 2343fe4..481b984 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -30,7 +30,7 @@ class AVClassLabeler: console = False av_tags = False stats_export = False - compatibility_v1 = False + family_only = False pup_classify = False path_export = False vt_tags = False @@ -75,7 +75,7 @@ def run( av_tags: bool = False, pup_classify: bool = False, path_export: bool = False, - compatibility_v1: bool = False, + family_only: bool = False, console: bool = False, ) -> List[Dict]: # Set class arguments @@ -83,7 +83,7 @@ def run( self.ground_truth = ground_truth self.av_tags = av_tags self.stats_export = stats_export - self.compatibility_v1 = compatibility_v1 + self.family_only = family_only self.pup_classify = pup_classify self.path_export = path_export self.vt_tags = vt_tags @@ -232,7 +232,7 @@ def get_tokens(self, sample_info: NamedTuple): pup_val = self.is_pup(self.pup_classify, tags) # Print family (and ground truth if available) - if self.compatibility_v1: + if self.family_only: class_entry = self.avclass1_output( name=name, family=fam, @@ -307,7 +307,7 @@ def avclass2_output( else: vtt = "" tag_str = self.format_tag_pairs_str( - tags, self.av_labels.taxonomy, self.path_export + tags=tags, taxonomy=self.av_labels.taxonomy, path_export=self.path_export ) self.print_output( "%s\t%d\t%s%s%s%s\n" @@ -315,7 +315,7 @@ def avclass2_output( ) # Build json output tag_dict = self.format_tag_pairs_list( - tags, self.av_labels.taxonomy, self.path_export + tags=tags, taxonomy=self.av_labels.taxonomy, 
path_export=self.path_export ) values = {"hash": name, "av_count": vt_count, "tags": tag_dict} if self.ground_truth: @@ -327,7 +327,7 @@ def avclass2_output( return values def get_family(self, name: AnyStr, tags: List[Tuple]) -> Tuple: - if self.compatibility_v1 or self.ground_truth: + if self.family_only or self.ground_truth: fam = "SINGLETON:" + name # fam = '' for (t, s) in tags: @@ -605,7 +605,7 @@ def format_tag_pairs_list( out = [] for (tag, count) in tags: values = {"tag": tag, "count": count} - if path_export and taxonomy: + if path_export and taxonomy is not None: values["category"] = taxonomy.get_category(tag) values["path"] = taxonomy.get_path(tag) out.append(values) @@ -670,7 +670,7 @@ def main(): ground_truth=args.gt, pup_classify=args.pup, path_export=args.path, - compatibility_v1=args.c, + family_only=args.c, console=not args.json, ) if args.json: diff --git a/setup.py b/setup.py index 6bcc101..fa4ba97 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,6 @@ entry_points={ 'console_scripts': [ 'avclass = avclass.labeler:main', - 'avclass-validate = avclass.util:validate_files', + 'avclass-validate = avclass.cli:validate_files', ], }) From 127c96df6d2e723d3e72522f4cb7b5bea135e03a Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 17:33:32 -0500 Subject: [PATCH 27/36] Readme, setup --- README.md | 100 ++++++++++++++++++++++++++++++++++++++------- avclass/cli.py | 13 ++++-- avclass/labeler.py | 14 +++---- avclass/update.py | 5 ++- setup.py | 1 + 5 files changed, 108 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 6d92cf4..3b15139 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,23 @@ +# Install +```shell +$ git clone http://.../avclass +$ cd avclass +$ pip3 install . +``` + # AVClass AVClass is a Python package / command line tool to tag / label malware samples. You input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) and it outputs tags extracted from the AV labels of each sample. 
-AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). -It can also be run in compatibility mode `-c` (AVClass 1.x) to only output the family names (i.e., family tags). -It can also output a ranking of all alternative family names it found for each sample. +AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). It can also output a ranking of all alternative family names it found for each sample. +There is also a compatibility mode `-c` (AVClass 1.x) that will only output the family names (i.e., family tags). -A quick example helps illustrating the differences of compatibility mode. If you run AVClass on our example input file: +A quick example helps illustrating the differences of 1.x compatibility mode. If you run AVClass on our example input file: ```shell -$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p +$ avclass -i ./examples/malheurReference_lb.json -t lb -p ``` the output on stdout is: @@ -27,10 +33,10 @@ was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. -If you instead run AVClass on the same input file in compatibility mode: +If you instead run AVClass on the same input file in compatibility mode `-c`: ```shell -$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -c +$ avclass -i ./examples/malheurReference_lb.json -t lb -c ``` the output looks like this, which simply reports the most common family name for each sample. 
@@ -43,11 +49,11 @@ aca2d12934935b070df8f50e06a20539 adrotator The output can also be formatted as **JSON**. ```shell -$ python3 ./avclass/labeler.py -i ./examples/malheurReference_lb.json -t lb -p -json +$ avclass -i ./examples/malheurReference_lb.json -t lb -p -json ``` the output on stdout is: -```json +```yaml { "labels": [ { @@ -117,7 +123,7 @@ print(json.dumps(result)) ``` the output on stdout is: -```json +```yaml { "labels": [ { @@ -172,6 +178,72 @@ the output on stdout is: } ``` +## Update Module + +The update module can be used to suggest additions and changes to the input +taxonomy, tagging rules, and expansion rules. +Using the update module consists of two steps. +The first step is obtaining an alias file from the labeler: + +```shell +$ avclass -i ./examples/malheurReference_lb.json -t lb -aliasdetect +``` + +The above command will create a file named \<data_file\>.alias, +malheurReference_lb.alias in our example. This file has 7 columns: + +1. t1: token that is an alias +2. t2: tag for which t1 is an alias +3. |t1|: number of input samples where t1 was observed +4. |t2|: number of input samples where t2 was observed +5. |t1^t2|: number of input samples where both t1 and t2 were observed +6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. +7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. + + +The Update Module takes the above file as input with the -alias option, +as well as the default taxonomy, tagging, and expansion files in the data directory. +It outputs updated taxonomy, tagging, and expansion files that include the +suggested additions and changes. + +```shell +$ avclass-update -alias malheurReference_lb.alias -o output_prefix +``` + +This will produce three files: +output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. 
+You can diff the output and input files to analyze the proposed changes. + +You can also modify the input taxonomy, tagging, and expansion rules in place, +rather than producing new files: + + +```shell +$ avclass-update -alias malheurReference_lb.alias -update +``` + + +## Customizing AVClass + +AVClass is fully customizable: +Tagging, Expansion and Taxonomy files can be easily modified by the analyst +either manually or by running the update module. + +If you change those files manually, we recommend running +afterwards the input checker script to keep them tidy. +It sorts the tags in the taxonomy and performs some basic cleaning like +removing redundant entries: + +```shell +$ avclass-validate -tax taxonomy_file -tag tagging_file -exp expansion_file +``` + +If the modifications are in the default files in the data directory you can simply run: + +```shell +$ avclass-validate +``` + ## References The design and evaluation of AVClass is detailed in our @@ -242,7 +314,7 @@ e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apike There is an example VirusTotal v2 input file in examples/vtv2_sample.json ```shell -$./avclass/labeler.py -i examples/vtv2_sample.json -t vt2 -p > output.txt +$ avclass -i examples/vtv2_sample.json -t vt2 -p > output.txt ``` 2. VirusTotal v3 API JSON reports (*-vt file -vt3*), @@ -251,7 +323,7 @@ e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} There is an example VirusTotal v3 input file in examples/vtv3_sample.json ```shell -$./avclass/labeler.py -i examples/vtv3_sample.json -p -t vt3 > output.txt +$ avclass -i examples/vtv3_sample.json -p -t vt3 > output.txt ``` 3. Simplified JSON (*-lb file*), @@ -261,14 +333,14 @@ with (at least) these fields: There is an example of such input file in *examples/malheurReference_lb.json* ```shell -$./avclass/labeler.py -i examples/malheurReference_lb.json -t lb -p > output.txt +$ avclass -i examples/malheurReference_lb.json -t lb -p > output.txt +``` + +4. 
Metadefender JSON (*-md file*), where each line in *file* should be a JSON ```shell -$./avclass/labeler.py -i examples/malheurReference_lb.json -t md -p > output.txt +$ avclass -i examples/malheurReference_lb.json -t md -p > output.txt ``` **Why have a simplified JSON format?** diff --git a/avclass/cli.py b/avclass/cli.py index ad528ff..1fa35ac 100644 --- a/avclass/cli.py +++ b/avclass/cli.py @@ -25,7 +25,8 @@ def validate_taxonomy(path: AnyStr): taxonomy = Taxonomy(path) taxonomy.to_file(path) - logger.info("[-] Normalized %d tags in taxonomy %s\n" % (len(taxonomy), path)) + print("[-] Normalized %d tags in taxonomy %s" % (len(taxonomy), path)) + logger.info("[-] Normalized %d tags in taxonomy %s" % (len(taxonomy), path)) return taxonomy @@ -43,7 +44,8 @@ def validate_tagging(path: AnyStr, taxonomy: Taxonomy): # tagging.expand_all_destinations() tagging.to_file(path) - logger.info("[-] Normalized %d tagging rules in %s\n" % (len(tagging), path)) + print("[-] Normalized %d tagging rules in %s" % (len(tagging), path)) + logger.info("[-] Normalized %d tagging rules in %s" % (len(tagging), path)) def validate_expansion(path: AnyStr, taxonomy: Taxonomy): @@ -58,7 +60,8 @@ def validate_expansion(path: AnyStr, taxonomy: Taxonomy): expansion.validate(taxonomy) expansion.to_file(path) - logger.info("[-] Normalized %d expansion rules in %s\n" % (len(expansion), path)) + print("[-] Normalized %d expansion rules in %s" % (len(expansion), path)) + logger.info("[-] Normalized %d expansion rules in %s" % (len(expansion), path)) def validate_files(): @@ -74,3 +77,7 @@ def validate_files(): taxonomy = validate_taxonomy(args.tax) validate_tagging(args.tag, taxonomy) validate_expansion(args.exp, taxonomy) + + +if __name__ == "__main__": + validate_files() diff --git a/avclass/labeler.py b/avclass/labeler.py index 481b984..d3885be 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -479,13 +479,13 @@ def alias_detection(self, out_prefix: AnyStr, path_export: bool = False): y 
= self.av_labels.taxonomy.get_path(y) self.output["alias"].append( { - "tag1_label": x, - "tag2_label": y, - "tag1": xn, - "tag2": yn, - "tag1^tag2": c, - "tag1^tag2/tag1": f, - "tag1^tag2/tag2": finv, + "alias_token": x, + "alias_tag": y, + "count_token": xn, + "count_tag": yn, + "ratio": c, + "ratio_token": f, + "raiio_tag": finv, } ) if out_prefix: diff --git a/avclass/update.py b/avclass/update.py index 4e9d5ea..5adaf54 100644 --- a/avclass/update.py +++ b/avclass/update.py @@ -433,7 +433,7 @@ def output(prefix: Optional[AnyStr] = None): logger.info('[-] Output %d expansion rules to %s' % (len(expansion), exp_filepath)) -if __name__ == '__main__': +def main(): parser = argparse.ArgumentParser(description='Given a .alias file from the labeler, generates updates for the ' 'taxonomy, tagging, and expansion files.') @@ -508,3 +508,6 @@ def output(prefix: Optional[AnyStr] = None): update.output(out_prefix) update.output_relations(out_prefix + ".final.rules") + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index fa4ba97..10f35fc 100644 --- a/setup.py +++ b/setup.py @@ -21,5 +21,6 @@ 'console_scripts': [ 'avclass = avclass.labeler:main', 'avclass-validate = avclass.cli:validate_files', + 'avclass-update = avclass.update:main', ], }) From fb6d28dbab9573524e5c6791a04268bb964979b0 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 18:02:30 -0500 Subject: [PATCH 28/36] Readme updates --- .gitignore | 1 + README.md | 93 +++++++---------- avclass/README.md | 261 ---------------------------------------------- 3 files changed, 40 insertions(+), 315 deletions(-) delete mode 100644 avclass/README.md diff --git a/.gitignore b/.gitignore index bd84ced..9558345 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,4 @@ cython_debug/ *.avtags *.stats *.alias +*.labels diff --git a/README.md b/README.md index 3b15139..075e64d 100644 --- a/README.md +++ b/README.md @@ -121,62 +121,47 @@ result = av_class.run( ) print(json.dumps(result)) ``` -the 
output on stdout is: +## Labeling: Ground Truth Evaluation + +If you have family ground truth for some malware samples, i.e., +you know the true family for those samples, you can evaluate the accuracy +of the family tags output by AVClass2 on those samples with respect to that ground truth. +The evaluation metrics used are precision, recall, and F1 measure. +See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. +Note that the ground truth evaluation does not apply to non-family tags, +i.e., it only evaluates the output of the compatibility mode. + +```shell +$ avclass -i ./examples/malheurReference_lb.json -t lb -gt ./examples/malheurReference_gt.tsv > malheurReference.labels +``` + +The output includes these lines: + +``` +Calculating precision and recall +3131 out of 3131 +Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 +``` + +Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns: -```yaml -{ - "labels": [ - { - "hash": "aca2d12934935b070df8f50e06a20539", - "av_count": 33, - "tags": [ - { - "tag": "grayware", - "count": 9, - "category": "CLASS", - "path": "CLASS:grayware" - }, - { - "tag": "adware", - "count": 9, - "category": "CLASS", - "path": "CLASS:grayware:adware" - }, - { - "tag": "windows", - "count": 8, - "category": "FILE", - "path": "FILE:os:windows" - }, - { - "tag": "adrotator", - "count": 8, - "category": "FAM", - "path": "FAM:adrotator" - }, - { - "tag": "execdownload", - "count": 3, - "category": "BEH", - "path": "BEH:execdownload" - }, - { - "tag": "downloader", - "count": 3, - "category": "CLASS", - "path": "CLASS:downloader" - }, - { - "tag": "zlob", - "count": 2, - "category": "FAM", - "path": "FAM:zlob" - } - ] - } - ] -} ``` +aca2d12934935b070df8f50e06a20539 ADROTATOR +``` + +which indicates that sample aca2d12934935b070df8f50e06a20539 is known +to be of the *ADROTATOR* family. 
+Each sample in the input file should also appear in the ground truth file. +Note that the particular label assigned to each family does not matter. +What matters is that all samples in the same family are assigned +the same family name (i.e., the same string in the second column) + +The ground truth can be obtained from publicly available malware datasets. +The one in *../examples/malheurReference_gt.tsv* comes from the +[Malheur](http://www.mlsec.org/malheur/) dataset. +There are other public datasets with ground truth such as +[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or +[Malicia](http://malicia-project.com/dataset.html). ## Update Module diff --git a/avclass/README.md b/avclass/README.md deleted file mode 100644 index 83dfaad..0000000 --- a/avclass/README.md +++ /dev/null @@ -1,261 +0,0 @@ -# AVClass2 - -AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). - -You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) -and it outputs tags observed in the AV labels, ranked by decreasing popularity. - -The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper. - -> Silvia Sebastián, Juan Caballero. -AVClass2: Massive Malware Tag Extraction from AV Labels. -In proceedings of the Annual Computer Security Applications Conference, December 2020. - -In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module. 
- - -## Labeling - -The labeler takes as input a JSON file with the AV labels of malware samples -(-vt or -lb options), -a file with the taxonomy (-tax option), -a file with tagging rules (-tag option), and -a file with expansion rules (-exp option). -It outputs a set of ranked tags. -If you do not provide taxonomy, expansion or tagging files, -the default ones in the data folder are used. - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -``` - -The above command labels the samples whose AV labels are in -the ../examples/malheurReference_lb.json file. -It prints the results to stdout. -The output looks like this: - -``` -aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2 -67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2 -``` - -which means sample *aca2d12934935b070df8f50e06a20539* -was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, -8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, -3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. -Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them -consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
- -The -p option outputs the full path of each tag in the taxonomy: - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p -``` - -The above command line outputs: - -``` -aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 -67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 -``` - -where each tag has been replaced by its taxonomy path, which starts with the category in capitals, -followed by the path in the category (if any), and the tag itself, all separated by colons. -For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, -*CLASS:grayware* that *grayware* is a malware class, and -*CLASS:grayware:adware* that *adware* is a subclass of *grayware*. - -**Compatibility mode** - -The compatibility -c option makes AVClass2 output the same format as AVClass. - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c -``` - -outputs: - -``` -bb23e1d296cf01bbaf32ed3938f9b0b8 allaple -cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349 -``` - -As in AVClass, the output contains only the family name, -which corresponds to the highest ranked family tag, all other tags are ignored. -Samples for which a family cannot be obtained are labeled as singletons with their hash. - -It is important to note that AVClass2 compatibility mode results can differ from AVClass results -on the same input file. -The differences in family names are due to differences between the generics and aliases files -used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. -In the future, we may change AVClass to use the taxonomy and rules from AVClass2 -as input (instead of the generics and aliases files) -to minimize such differences and avoid maintaining different data files. 
- - -## Input JSON format - -AVClass2 supports three input JSON formats: - -1. VirusTotal v2 API JSON reports (*-vt file*), -where each line in the input *file* should be the full JSON of a -VirusTotal v2 API response to the */file/report* endpoint, -e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash} -There is an example VirusTotal v2 input file in examples/vtv2_sample.json - -2. VirusTotal v3 API JSON reports (*-vt file -vt3*), -where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, -e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} -There is an example VirusTotal v3 input file in examples/vtv3_sample.json - -3. Simplified JSON (*-lb file*), -where each line in *file* should be a JSON -with (at least) these fields: -{md5, sha1, sha256, av_labels}. -There is an example of such input file in *examples/malheurReference_lb.json* - - -**Multiple input files** - -AVClass2 can handle multiple input files putting the results in the same output files -(if you want results in separate files, process each input file separately). - -It is possible to provide the -vt and -lb input options multiple times. - -```shell -$./avclass2_labeler.py -vt -vt -``` -```shell -$./avclass2_labeler.py -lb -lb -``` - -There are also -vtdir and -lbdir options that can be used to provide -an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: - -```shell -$./avclass2_labeler.py -vtdir -``` - -It is also possible to combine -vt with -vtdir and -lb with -lbdir, -but you cannot combine input files of different format. Thus, this command works: - -```shell -$./avclass2_labeler.py -vt -vtdir -``` - -But, this one throws an error: - -```shell -$./avclass2_labeler.py -vt -lb -``` - -At this point you have read the most important information on how to use AVClass2. 
-The following sections describe steps that most users will not need. - -## Labeling: Ground Truth Evaluation - -If you have family ground truth for some malware samples, i.e., -you know the true family for those samples, you can evaluate the accuracy -of the family tags output by AVClass2 on those samples with respect to that ground truth. -The evaluation metrics used are precision, recall, and F1 measure. -See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. -Note that the ground truth evaluation does not apply to non-family tags, -i.e., it only evaluates the output of the compatibility mode. - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels -``` - -The output includes these lines: - -``` -Calculating precision and recall -3131 out of 3131 -Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 -``` - -Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns: - -``` -aca2d12934935b070df8f50e06a20539 ADROTATOR -``` - -which indicates that sample aca2d12934935b070df8f50e06a20539 is known -to be of the *ADROTATOR* family. -Each sample in the input file should also appear in the ground truth file. -Note that the particular label assigned to each family does not matter. -What matters is that all samples in the same family are assigned -the same family name (i.e., the same string in the second column) - -The ground truth can be obtained from publicly available malware datasets. -The one in *../examples/malheurReference_gt.tsv* comes from the -[Malheur](http://www.mlsec.org/malheur/) dataset. -There are other public datasets with ground truth such as -[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or -[Malicia](http://malicia-project.com/dataset.html). 
- -## Update Module - -The update module can be used to suggest additions and changes to the input -taxonomy, tagging rules, and expansion rules. -Using the update module comprises of two steps. -The first step is obtaining an alias file from the labeler: - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect -``` - -The above command will create a file named \.alias, -malheurReference_lb.alias in our example. This file has 7 columns: - -1. t1: token that is an alias -2. t2: tag for which t1 is an alias -3. |t1|: number of input samples where t1 was observed -4. |t2|: number of input samples where t2 was observed -5. |t1^t2|: number of input samples where both t1 and t2 were observed -6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. -7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. - - -The Update Module takes the above file as input with the -alias option, -as well as the default taxonomy, tagging, and expansion files in the data directory. -It outputs updated taxonomy, tagging, and expansion files that include the -suggested additions and changes. - -```shell -$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix -``` - -This will produce three files: -output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. -You can diff the output and input files to analyze the proposed changes. - -You can also modify the input taxonomy, tagging, and expansion rules in place, -rather than producing new files: - - -```shell -$./avclass2_update_module.py -alias malheurReference_lb.alias -update -``` - - -## Customizing AVClass2 - -AVClass2 is fully customizable: -Tagging, Expansion and Taxonomy files can be easily modified by the analyst -either manually or by running the update module. 
- -If you change those files manually, we recommend running -afterwards the input checker script to keep them tidy. -It sorts the tags in the taxonomy and performs some basic cleaning like -removing redundant entries: - -```shell -$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file -``` - -If the modifications are in the default files in the data directory you can simply run: - -```shell -$./avclass2_input_checker.py -``` From 2e87fb696990cc38f17ee3812eefe44aeec657a7 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Mon, 8 Feb 2021 18:05:56 -0500 Subject: [PATCH 29/36] Readme formatting --- README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 075e64d..684fb7a 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,18 @@ -# Install -```shell -$ git clone http://.../avclass -$ cd avclass -$ pip3 install . -``` - # AVClass - AVClass is a Python package / command line tool to tag / label malware samples. You input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) and it outputs tags extracted from the AV labels of each sample. AVClass will output the family names, along with other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). It can also output a ranking of all alternative family names it found for each sample. There is also a compatibility mode `-c` (AVClass 1.x) that will only output the family names (i.e., family tags). +## Install +```shell +$ git clone http://.../avclass +$ cd avclass +$ pip3 install . +``` +## Examples A quick example helps illustrating the differences of 1.x compatibility mode. 
If you run AVClass on our example input file: ```shell From 343aa5556811a0810171c096671070354af835ce Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Tue, 9 Feb 2021 16:59:00 -0500 Subject: [PATCH 30/36] Moved class variables into __init__ so they are reset --- avclass/labeler.py | 58 ++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/avclass/labeler.py b/avclass/labeler.py index d3885be..b0e362f 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -22,38 +22,36 @@ class AVClassLabeler: - output = {"labels": []} - av_labels = None - hash_type = None - ground_truth = None - get_sample_info = None - console = False - av_tags = False - stats_export = False - family_only = False - pup_classify = False - path_export = False - vt_tags = False - vt_all = 0 - first_token_dict = {} - token_count_map = {} - pair_count_map = {} - avtags_dict = {} - gt_dict = {} - stats = { - "samples": 0, - "noscans": 0, - "tagged": 0, - "maltagged": 0, - "FAM": 0, - "CLASS": 0, - "BEH": 0, - "FILE": 0, - "UNK": 0, - } - def __init__(self, av_labels: AvLabels = AvLabels()): self.av_labels = av_labels + self.output = {"labels": []} + self.hash_type = None + self.ground_truth = None + self.get_sample_info = None + self.console = False + self.av_tags = False + self.stats_export = False + self.family_only = False + self.pup_classify = False + self.path_export = False + self.vt_tags = False + self.vt_all = 0 + self.first_token_dict = {} + self.token_count_map = {} + self.pair_count_map = {} + self.avtags_dict = {} + self.gt_dict = {} + self.stats = { + "samples": 0, + "noscans": 0, + "tagged": 0, + "maltagged": 0, + "FAM": 0, + "CLASS": 0, + "BEH": 0, + "FILE": 0, + "UNK": 0, + } def run( self, From 2924a66e4358a6dcfe396f14b7bbc943d1820be1 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Wed, 17 Feb 2021 13:03:44 -0500 Subject: [PATCH 31/36] Copied updates from common --- avclass/common.py | 32 +++++++++++++++++--------------- 1 file 
changed, 17 insertions(+), 15 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index d812523..d03c43a 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -328,7 +328,7 @@ def __init__(self, filepath: Optional[AnyStr]): :param filepath: The file to read from """ - self._rmap = {} + self._src_map = {} if filepath: self.read_rules(filepath) @@ -338,7 +338,7 @@ def __len__(self): :return: Number of rules """ - return len(self._rmap) + return len(self._src_map) def add_rule( self, src: AnyStr, dst_l: Collection[AnyStr] = None, overwrite: bool = False @@ -360,19 +360,19 @@ def add_rule( src_tag = Tag(src) if overwrite: target_l = [Tag(dst).name for dst in dst_l] - self._rmap[src_tag.name] = set(target_l) + self._src_map[src_tag.name] = set(target_l) else: - curr_dst = self._rmap.get(src_tag.name, set()) + curr_dst = self._src_map.get(src_tag.name, set()) for dst in dst_l: dst_tag = Tag(dst) curr_dst.add(dst_tag.name) - self._rmap[src_tag.name] = curr_dst + self._src_map[src_tag.name] = curr_dst def remove_rule(self, src: AnyStr) -> bool: - dst = self._rmap.get(src, []) + dst = self._src_map.get(src, []) if dst: logger.debug("[Rules] Removing rule: %s -> %s" % (src, dst)) - del self._rmap[src] + del self._src_map[src] return True return False @@ -383,7 +383,7 @@ def get_dst(self, src: AnyStr) -> List[AnyStr]: :param src: The source rule :return: List of dst """ - return list(self._rmap.get(src, [])) + return list(self._src_map.get(src, [])) def read_rules(self, filepath: AnyStr): """ @@ -409,7 +409,7 @@ def to_file(self, filepath: AnyStr, taxonomy: Taxonomy = None): :return: None """ with open(filepath, "w") as fd: - for src, dst_set in sorted(self._rmap.items()): + for src, dst_set in sorted(self._src_map.items()): dst_l = sorted(dst_set) if taxonomy: src_path = taxonomy.get_path(src) @@ -428,11 +428,11 @@ def expand_src_destinations(self, src: AnyStr) -> Set[AnyStr]: :return: List of expanded destinations """ # TODO - this only goes one layer deep 
it seems. Not actually recursive - dst_set = self._rmap.get(src, set()) + dst_set = self._src_map.get(src, set()) out = set() while dst_set: dst = dst_set.pop() - dst_l = self._rmap.get(dst, []) + dst_l = self._src_map.get(dst, []) if dst_l: for d in dst_l: if d not in out and d != dst: @@ -447,10 +447,10 @@ def expand_all_destinations(self): :return: None """ - src_l = self._rmap.keys() + src_l = self._src_map.keys() for src in src_l: dst_l = self.expand_src_destinations(src) - self._rmap[src] = dst_l + self._src_map[src] = dst_l class Translation(Rules): @@ -468,7 +468,9 @@ def validate(self, taxonomy: Taxonomy): :param taxonomy: The Taxonomy to use for checking :return: None """ - for tok, tag_l in self._rmap.items(): + for tok, tag_l in self._src_map.items(): + if taxonomy.is_tag(tok): + sys.stdout.write("[Tagging] SRC %s in taxonomy\n" % tok) for t in tag_l: if not taxonomy.is_tag(t): sys.stdout.write("[Tagging] %s not in taxonomy\n" % t) @@ -490,7 +492,7 @@ def validate(self, taxonomy: Taxonomy): :param taxonomy: The Taxonomy to use for checking :return: None """ - for src, dst_set in self._rmap.items(): + for src, dst_set in self._src_map.items(): if not taxonomy.is_tag(src): sys.stdout.write("[Expansion] %s not in taxonomy\n" % src) # TODO - raise or return False? 
From b2ce1afe7cee35145aeafb09d374a6d605fb9d1f Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Thu, 18 Feb 2021 10:34:50 -0500 Subject: [PATCH 32/36] platonK fix for parsing of VT file reports from VT file feed APIv3 --- avclass/common.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/avclass/common.py b/avclass/common.py index d03c43a..7e0eea4 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -606,11 +606,13 @@ def get_sample_info_vt_v3(record: Dict) -> SampleInfo: :param record: The JSON record :return: An instance of SampleInfo """ + if 'data' in record: + record = record['data'] try: - scans = record["data"]["attributes"]["last_analysis_results"] - md5 = record["data"]["attributes"]["md5"] - sha1 = record["data"]["attributes"]["sha1"] - sha256 = record["data"]["attributes"]["sha256"] + scans = record["attributes"]["last_analysis_results"] + md5 = record["attributes"]["md5"] + sha1 = record["attributes"]["sha1"] + sha256 = record["attributes"]["sha256"] except KeyError: return None @@ -624,7 +626,7 @@ def get_sample_info_vt_v3(record: Dict) -> SampleInfo: ).strip() label_pairs.append((av, clean_label)) - vt_tags = record["data"]["attributes"].get("tags", []) + vt_tags = record["attributes"].get("tags", []) return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags) From b113a69ce56f0edb19935dbe6ecad8c4823215d6 Mon Sep 17 00:00:00 2001 From: ElJeffe Date: Tue, 9 Mar 2021 10:23:04 -0500 Subject: [PATCH 33/36] MISP --- avclass/common.py | 9 +- avclass/data/default.taxonomy | 44 + .../cluster/avclass.json | 1288 ++++++++++++++++- .../avclass2.json => misp/galaxy/avclass.json | 2 +- 4 files changed, 1324 insertions(+), 19 deletions(-) rename avclass/data/misp/cluster/avclass2.json => misp/cluster/avclass.json (85%) rename avclass/data/misp/galaxy/avclass2.json => misp/galaxy/avclass.json (87%) diff --git a/avclass/common.py b/avclass/common.py index 7e0eea4..946f7ad 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -32,6 
+32,11 @@ "GData", "Avast", "Sophos", + "BitDefenderTheta", + "Alibaba", + "Tencent", + "Cyren", + "Arcabit", "TrendMicro-HouseCall", "TrendMicro", "NANO-Antivirus", @@ -606,8 +611,8 @@ def get_sample_info_vt_v3(record: Dict) -> SampleInfo: :param record: The JSON record :return: An instance of SampleInfo """ - if 'data' in record: - record = record['data'] + if "data" in record: + record = record["data"] try: scans = record["attributes"]["last_analysis_results"] md5 = record["attributes"]["md5"] diff --git a/avclass/data/default.taxonomy b/avclass/data/default.taxonomy index 68a416a..963b8da 100644 --- a/avclass/data/default.taxonomy +++ b/avclass/data/default.taxonomy @@ -15,6 +15,7 @@ BEH:filemodify BEH:files BEH:hostsmodify BEH:infosteal +BEH:infosteal:coinstealer BEH:infosteal:gamethief BEH:inject BEH:irc @@ -98,9 +99,11 @@ FAM:adrotator FAM:adrotoob FAM:adultbrowser FAM:adviator +FAM:adwind FAM:adwk FAM:adwo FAM:aesads +FAM:agenttesla FAM:agobot FAM:agvd FAM:ahmyth @@ -136,7 +139,9 @@ FAM:asacub FAM:asprox FAM:autoins FAM:autosus +FAM:avemaria FAM:axespy +FAM:azorult FAM:badda FAM:badnews FAM:badpac @@ -150,6 +155,7 @@ FAM:basbanke FAM:basebridge FAM:basepay FAM:bauts +FAM:bazar FAM:bebeg FAM:becou FAM:beebone @@ -198,6 +204,7 @@ FAM:cardserv FAM:cashon FAM:cellshark FAM:centim +FAM:cerberus FAM:cerekv FAM:cheica FAM:chir @@ -207,15 +214,18 @@ FAM:cleaman FAM:clevernet FAM:clientor FAM:clinator +FAM:clipbanker FAM:cmccwm FAM:cnbtech FAM:cnzz +FAM:cobaltstrike FAM:coinhive FAM:coldfusion FAM:commplat FAM:conduit FAM:conficker FAM:contactscollector +FAM:conti FAM:cooee FAM:coogos FAM:coolmirage @@ -230,6 +240,7 @@ FAM:crusewind FAM:cryptodef FAM:cryptolocker FAM:cryptowall +FAM:crysis FAM:crytex FAM:cryxos FAM:ctchm @@ -246,6 +257,7 @@ FAM:dasu FAM:datacollector FAM:daws FAM:dbtes +FAM:deathransom FAM:deblio FAM:defmid FAM:delbar @@ -268,6 +280,7 @@ FAM:dorifel FAM:dorkbot FAM:dougalek FAM:dowgin +FAM:downeks FAM:downloadadmin FAM:downloadassistant 
FAM:downloadguide @@ -289,9 +302,11 @@ FAM:dusvext FAM:dynamer FAM:easyroot FAM:egame +FAM:egregor FAM:egroupdial FAM:ejik FAM:elite +FAM:emotet FAM:emudbot FAM:eorezo FAM:equationdrug @@ -384,6 +399,7 @@ FAM:ginamster FAM:ginko FAM:ginmaster FAM:gizmo +FAM:glupteba FAM:gobot FAM:golddream FAM:goldentouch @@ -391,7 +407,10 @@ FAM:gomanag FAM:gomunc FAM:gonesixty FAM:goodnews +FAM:goodor +FAM:gootkit FAM:gorillaprice +FAM:gozi FAM:gpspy FAM:grabos FAM:graybird @@ -417,6 +436,7 @@ FAM:hipposms FAM:honli FAM:hotbar FAM:hotclip +FAM:houdini FAM:hoverwatch FAM:hqowdo FAM:hqwar @@ -497,6 +517,7 @@ FAM:kyview FAM:laconic FAM:lardlond FAM:laroux +FAM:lazagne FAM:ldpinch FAM:leadbolt FAM:leapp @@ -505,6 +526,7 @@ FAM:lemon FAM:lethic FAM:letv FAM:lien +FAM:limerat FAM:linkular FAM:lirose FAM:lmir @@ -599,10 +621,12 @@ FAM:myteam FAM:mytrackp FAM:mywebsearch FAM:nandrobox +FAM:nanocore FAM:navbar FAM:nawiaiad FAM:necro FAM:necurs +FAM:neoreklami FAM:neospy FAM:neshta FAM:netbox @@ -630,6 +654,7 @@ FAM:opencandy FAM:openinstall FAM:opfake FAM:optix +FAM:orcusrat FAM:outbrowse FAM:oveead FAM:paccy @@ -644,6 +669,7 @@ FAM:penguin FAM:perflogger FAM:perkel FAM:petrolin +FAM:petya FAM:phonespy FAM:picsys FAM:piom @@ -674,6 +700,7 @@ FAM:pushad FAM:pushe FAM:puxis FAM:pykspa +FAM:quasar FAM:qbot FAM:qexma FAM:qplus @@ -681,11 +708,13 @@ FAM:qqrob FAM:qumi FAM:quozha FAM:qushu +FAM:raccoon FAM:raden FAM:ramnit FAM:ranky FAM:rasteal FAM:razam +FAM:razy FAM:rbot FAM:rebhip FAM:recmads @@ -697,12 +726,14 @@ FAM:reflod FAM:refog FAM:regon FAM:relevantknowledge +FAM:remcos FAM:renocide FAM:renos FAM:reporo FAM:reptilicus FAM:resharer FAM:reveton +FAM:revetrat FAM:revmob FAM:riltok FAM:rimod @@ -715,11 +746,13 @@ FAM:rootmaster FAM:rootnik FAM:rootsmart FAM:rotexy +FAM:rozena FAM:rufraud FAM:rukometa FAM:rungbu FAM:ruskill FAM:rusms +FAM:ryuk FAM:sacti FAM:sacto FAM:sadenav @@ -727,10 +760,12 @@ FAM:sadpor FAM:sahat FAM:sakezon FAM:sality +FAM:samsam FAM:sanctionedmedia FAM:sandr 
FAM:savemy FAM:scam +FAM:schwarzesonne FAM:sckeylog FAM:sdbot FAM:seaweth @@ -741,6 +776,7 @@ FAM:severs FAM:sfone FAM:shastrosms FAM:shedun +FAM:shelma FAM:sheridroid FAM:shixot FAM:shiz @@ -808,6 +844,7 @@ FAM:spyapp FAM:spybubble FAM:spydealer FAM:spyeye +FAM:spygate FAM:spyhasb FAM:spynote FAM:spyoo @@ -826,6 +863,7 @@ FAM:suaban FAM:suggestor FAM:supking FAM:svpeng +FAM:swrort FAM:swisyn FAM:swizzor FAM:systemmonitor @@ -872,6 +910,7 @@ FAM:tracer FAM:tracker FAM:trackerfree FAM:trackplus +FAM:trickbot FAM:trclick FAM:tridrongo FAM:troom @@ -895,6 +934,7 @@ FAM:usteal FAM:utchi FAM:uupay FAM:uuserv +FAM:valyria FAM:vapsup FAM:vdloader FAM:verti @@ -907,6 +947,7 @@ FAM:virusdoctor FAM:virut FAM:viser FAM:vittalia +FAM:vjworm FAM:vkemag FAM:vktihs FAM:vmvol @@ -1022,6 +1063,7 @@ FILE:packed:aspack FILE:packed:asprotect FILE:packed:beroexepacker FILE:packed:bobsoft +FILE:packed:confuser FILE:packed:decrypter FILE:packed:encryptpe FILE:packed:enigma @@ -1033,6 +1075,7 @@ FILE:packed:krunchy FILE:packed:maskpe FILE:packed:molebox FILE:packed:morphine +FILE:packed:multipacked FILE:packed:nakedpack FILE:packed:niceprotect FILE:packed:npack @@ -1057,6 +1100,7 @@ FILE:packed:upack FILE:packed:upx FILE:packed:vmprotect FILE:packed:yoda +FILE:proglang:autohk FILE:proglang:autoit FILE:proglang:delphi FILE:proglang:java diff --git a/avclass/data/misp/cluster/avclass2.json b/misp/cluster/avclass.json similarity index 85% rename from avclass/data/misp/cluster/avclass2.json rename to misp/cluster/avclass.json index a8a7d33..933e133 100644 --- a/avclass/data/misp/cluster/avclass2.json +++ b/misp/cluster/avclass.json @@ -134,6 +134,19 @@ "uuid": "e56915a8-a345-316a-9f69-d9e62a68c753", "value": "spyeye" }, + { + "description": "FAM:spygate", + "meta": { + "refs": [ + "https://www.fortiguard.com/encyclopedia/virus/8225407", + "https://www.rekings.com/spygate-rat-3-2/", + "https://www.symantec.com/security_response/attacksignatures/detail.jsp%3Fasid%3D27950", + 
"http://spygate-rat.blogspot.lu/" + ] + }, + "uuid": "793d27f3-f060-49f2-b572-8bc6fcdbbdef", + "value": "spygate" + }, { "description": "FAM:spyhasb", "meta": { @@ -341,6 +354,19 @@ "uuid": "b94f39e6-7997-373b-8d67-ae62d889e110", "value": "svpeng" }, + { + "description": "FAM:swrort", + "meta": { + "refs": [ + "https://blog.malwarebytes.com/detections/trojan-swrort/", + "https://malpedia.caad.fkie.fraunhofer.de/details/ps1.swrort" + ], + "synonyms": [], + "type": [] + }, + "uuid": "048b948f-5e4d-4e6f-a0b5-54157cf03c86", + "value": "swrort" + }, { "description": "FAM:swisyn", "meta": { @@ -871,6 +897,69 @@ "uuid": "42603f75-a6b9-3091-bf23-c2fb545fad56", "value": "trclick" }, + { + "description": "FAM:trickbot", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.trickbot", + "https://www.cybereason.com/blog/triple-threat-emotet-deploys-trickbot-to-steal-data-spread-ryuk-ransomware", + "https://blog.malwarebytes.com/threat-analysis/2017/08/trickbot-comes-with-new-tricks-attacking-outlook-and-browsing-data/", + "http://www.vkremez.com/2017/11/lets-learn-trickbot-socks5-backconnect.html", + "https://blog.trendmicro.com/trendlabs-security-intelligence/trickbot-adds-remote-application-credential-grabbing-capabilities-to-its-repertoire/", + "http://www.vkremez.com/2017/12/lets-learn-introducing-new-trickbot.html", + "https://blog.trendmicro.com/trendlabs-security-intelligence/trickbot-shows-off-new-trick-password-grabber-module", + "https://www.fidelissecurity.com/threatgeek/2016/10/trickbot-we-missed-you-dyre", + "https://www.flashpoint-intel.com/blog/trickbot-account-checking-hybrid-attack-model/", + "http://www.peppermalware.com/2019/03/quick-analysis-of-trickbot-sample-with.html", + "https://blog.malwarebytes.com/threat-analysis/2016/10/trick-bot-dyrezas-successor/", + "https://www.youtube.com/watch?v=KMcSAlS9zGE", + "https://www.crowdstrike.com/blog/sin-ful-spiders-wizard-spider-and-lunar-spider-sharing-the-same-web/", + 
"https://www.arbornetworks.com/blog/asert/trickbot-banker-insights/", + "https://blog.malwarebytes.com/threat-analysis/malware-threat-analysis/2018/11/whats-new-trickbot-deobfuscating-elements/", + "https://www.trustwave.com/Resources/SpiderLabs-Blog/Tale-of-the-Two-Payloads-%E2%80%93-TrickBot-and-Nitol/", + "http://www.vkremez.com/2018/04/lets-learn-trickbot-implements-network.html", + "https://securityintelligence.com/trickbot-takes-to-latin-america-continues-to-expand-its-global-reach/", + "https://qmemcpy.io/post/reverse-engineering-malware-trickbot-part-2-loader", + "https://www.fireeye.com/blog/threat-research/2019/01/a-nasty-trick-from-credential-theft-malware-to-business-disruption.html", + "https://securityintelligence.com/trickbots-cryptocurrency-hunger-tricking-the-bitcoin-out-of-wallets/", + "https://blog.fraudwatchinternational.com/malware/trickbot-malware-works", + "https://www.blueliv.com/research/trickbot-banking-trojan-using-eflags-as-an-anti-hook-technique/", + "https://f5.com/labs/articles/threat-intelligence/malware/trickbot-expands-global-targets-beyond-banks-and-payment-processors-to-crms", + "https://f5.com/labs/articles/threat-intelligence/malware/little-trickbot-growing-up-new-campaign-24412", + "https://github.com/JR0driguezB/malware_configs/tree/master/TrickBot", + "https://escinsecurity.blogspot.de/2018/01/weekly-trickbot-analysis-end-of-wc-22.html", + "https://www.webroot.com/blog/2018/03/21/trickbot-banking-trojan-adapts-new-module/", + "https://www.fortinet.com/blog/threat-research/deep-analysis-of-trickbot-new-module-pwgrab.html", + "https://www.securityartwork.es/wp-content/uploads/2017/06/Informe_Evoluci%C3%B3n_Trickbot.pdf", + "https://blogs.forcepoint.com/security-labs/trickbot-spread-necurs-botnet-adds-nordic-countries-its-targets", + "http://blog.fortinet.com/2016/12/06/deep-analysis-of-the-online-banking-botnet-trickbot", + "https://www.cyberbit.com/blog/endpoint-security/latest-trickbot-variant-has-new-tricks-up-its-sleeve/", 
+ "http://www.malware-traffic-analysis.net/2018/02/01/", + "https://www.cert.pl/en/news/single/detricking-trickbot-loader/", + "https://www.trendmicro.com/vinfo/us/security/news/cybercrime-and-digital-threats/evolving-trickbot-adds-detection-evasion-and-screen-locking-features", + "https://securityintelligence.com/tricks-of-the-trade-a-deeper-look-into-trickbots-machinations/", + "http://www.pwc.co.uk/issues/cyber-security-data-privacy/research/trickbots-bag-of-tricks.html", + "https://qmemcpy.io/post/reverse-engineering-malware-trickbot-part-3-core", + "https://www.ringzerolabs.com/2017/07/trickbot-banking-trojan-doc00039217doc.html", + "https://www.youtube.com/watch?v=EdchPEHnohw", + "https://sysopfb.github.io/malware/2018/04/16/trickbot-uacme.html", + "https://blog.talosintelligence.com/2018/07/smoking-guns-smoke-loader-learned-new.html", + "https://www.vkremez.com/2018/11/lets-learn-introducing-latest-trickbot.html", + "https://www.youtube.com/watch?v=lTywPmZEU1A", + "https://qmemcpy.github.io/post/reverse-engineering-malware-trickbot-part-1-packer", + "https://www.botconf.eu/wp-content/uploads/2016/11/2016-LT09-TrickBot-Adams.pdf", + "https://www.flashpoint-intel.com/blog/new-version-trickbot-adds-worm-propagation-module/" + ], + "synonyms": [ + "thetrick", + "trickLoader", + "trickster" + ], + "type": [] + }, + "uuid": "b104ec95-e1bd-44c1-a193-d979bedc0a98", + "value": "trickbot" + }, { "description": "FAM:tridrongo", "meta": { @@ -1130,6 +1219,27 @@ "uuid": "ceea3c58-4d4a-34c5-9bf7-a7b621a0157b", "value": "uuserv" }, + { + "description": "FAM:valyria", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/ps1.powerstats", + "https://www.clearskysec.com/muddywater-operations-in-lebanon-and-oman/", + "https://unit42.paloaltonetworks.com/unit42-muddying-the-water-targeted-attacks-in-the-middle-east/", + "https://www.fireeye.com/blog/threat-research/2018/03/iranian-threat-group-updates-ttps-in-spear-phishing-campaign.html", + 
"https://blog.malwarebytes.com/threat-analysis/2017/09/elaborate-scripting-fu-used-in-espionage-attack-against-saudi-arabia-government_entity/", + "https://reaqta.com/2017/11/muddywater-apt-targeting-middle-east/", + "https://blog.trendmicro.com/trendlabs-security-intelligence/campaign-possibly-connected-muddywater-surfaces-middle-east-central-asia/", + "https://www.clearskysec.com/muddywater-targets-kurdish-groups-turkish-orgs/" + ], + "synonyms": [ + "powerstats" + ], + "type": [] + }, + "uuid": "9f958d8b-0489-40e5-91b4-aa780fc90393", + "value": "valyria" + }, { "description": "FAM:vapsup", "meta": { @@ -1284,6 +1394,21 @@ "uuid": "e6645d41-384f-3030-a76f-e12c94d6a39d", "value": "vittalia" }, + { + "description": "FAM:vjworm", + "meta": { + "refs": [ + "https://cofense.com/vjw0rm-malware-heres-watch/", + "https://www.trendmicro.com/vinfo/us/threat-encyclopedia/malware/js_vjworm.i" + ], + "synonyms": [ + "vjw0rm" + ], + "type": [] + }, + "uuid": "d2d82d11-f174-4804-abf5-2b81740b5993", + "value": "vjworm" + }, { "description": "FAM:vkemag", "meta": { @@ -2587,17 +2712,20 @@ "meta": { "refs": [], "synonyms": [ + "binder", "cryp", "crypt", "crypted", "crypter", "cryptic", + "cryptoobfuscator", "cryptor", "encpk", "genpack", "krypt", "kryptik", - "kryptk", + "kryptk", + "genkryptik", "malcrypt", "malob", "malpack", @@ -2611,6 +2739,7 @@ "pakes", "suspiciouspacker", "susppack", + "vbinder", "vbcrypt", "vbkrypt", "vbpack", @@ -2629,6 +2758,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "b1c0c358-3a33-37db-974d-d4b8c31e45d3", "value": "armadillo" }, @@ -2639,6 +2774,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "48eb867b-53ea-344f-8006-7756cdca4be9", "value": "aspack" }, @@ -2649,6 +2790,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": 
"eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "cd8af59b-808b-3692-8068-2df3fd6a3ab6", "value": "asprotect" }, @@ -2659,6 +2806,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "acebc64c-4419-3034-aeb7-62657e281101", "value": "beroexepacker" }, @@ -2669,9 +2822,31 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "32b30e14-264e-3482-aa8f-e741f3f9ea29", "value": "bobsoft" }, + { + "description": "FILE:packed:confuser", + "meta": { + "refs": [], + "synonyms": ["confuserex"], + "type": [] + }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], + "uuid": "f0c8b58d-040b-4956-9160-d17b6a6064e9", + "value": "confuser" + }, { "description": "FILE:packed:decrypter", "meta": { @@ -2679,6 +2854,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "11c5c00e-d453-39b9-b127-4c01be2227fb", "value": "decrypter" }, @@ -2689,16 +2870,30 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "b94cd8d8-48cf-38c0-b296-52036803a04f", "value": "encryptpe" }, { "description": "FILE:packed:enigma", "meta": { - "refs": [], - "synonyms": [], + "refs": "https://enigmaprotector.com/", + "synonyms": [ + "enigmaprotector" + ], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "2f5b61f3-5b5f-3eac-bbe6-32eb728cba6e", "value": "enigma" }, @@ -2709,6 +2904,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": 
"19aa389c-ff65-36c7-8d79-e109e356866f", "value": "execryptor" }, @@ -2719,6 +2920,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "01f1aec1-33c8-3cbe-aa9b-5c78b8db18bf", "value": "exestealth" }, @@ -2729,6 +2936,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "138e2027-887b-3d4a-8baf-f265cda803ec", "value": "expressor" }, @@ -2739,6 +2952,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "62775d3e-9738-37eb-8521-b6086b278380", "value": "jiagu" }, @@ -2751,6 +2970,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "0599aec6-6542-3014-b36a-85eb7b1acc50", "value": "krunchy" }, @@ -2763,6 +2988,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "ac7fb78f-1cfb-3f53-a654-ebe0c02c61c1", "value": "maskpe" }, @@ -2773,6 +3004,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "10398ebf-c9c9-3620-b718-40934daa60f3", "value": "molebox" }, @@ -2783,9 +3020,31 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "cae3f868-9601-30b9-9e88-f5bf724c1fd3", "value": "morphine" }, + { + "description": "FILE:packed:multipacked", + "meta": { + "refs": "https://encyclopedia.kaspersky.com/knowledge/multipacked/", + "synonyms": [], + "type": [] + }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], + "uuid": 
"59f26518-641f-4c07-9aed-445d60158789", + "value": "multipacked" + }, { "description": "FILE:packed:nakedpack", "meta": { @@ -2793,6 +3052,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "030e9fd5-99c8-3c33-9896-d5711b54219a", "value": "nakedpack" }, @@ -2803,6 +3068,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "9ebe63ac-3a1f-3a64-afed-8adcec5f2fff", "value": "niceprotect" }, @@ -2813,6 +3084,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "fe292fb8-4e92-3def-8506-ee3278f457af", "value": "npack" }, @@ -2823,6 +3100,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "3fa37052-f024-3c63-89be-d8c366d6f83e", "value": "nspack" }, @@ -2833,6 +3116,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "5cea2ead-364f-3ff7-8dbf-9d0c0952c70c", "value": "obsidium" }, @@ -2843,6 +3132,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "1bf29bd3-1e76-330c-87ee-c186adf9c646", "value": "packman" }, @@ -2853,6 +3148,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "fe384ea4-109d-35ec-9f00-fea5f69914fd", "value": "pearmor" }, @@ -2863,6 +3164,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "634815d7-0cd8-37da-b20f-b126a744b825", "value": "pecompact" }, 
@@ -2873,6 +3180,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "70e0e627-e1a0-344a-a4cd-825e9de1b86f", "value": "pecrypt" }, @@ -2883,6 +3196,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "36224c37-5a49-336f-9fa9-42db1db78cff", "value": "pespin" }, @@ -2895,6 +3214,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "a515ed68-5ab7-3a7a-ba4d-63ff5edba892", "value": "polycrypt" }, @@ -2905,6 +3230,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "26ee092f-b61c-3774-90ab-632d902a9690", "value": "punisher" }, @@ -2915,6 +3246,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "d9221f76-bbe9-3936-a6f4-cc2582ed8ae8", "value": "rcryptor" }, @@ -2925,6 +3262,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "a95dcb8b-fe56-31a7-80df-6764ed4b4759", "value": "rlpack" }, @@ -2935,6 +3278,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "73ab92ef-dbf4-38d3-8687-e9df33211bd1", "value": "sdprotector" }, @@ -2945,6 +3294,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "0e88fad9-544f-37fa-b4fd-13c91ea8afdd", "value": "secapk" }, @@ -2955,6 +3310,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + 
"type": "subtechnique-of" + } + ], "uuid": "e10e4847-cad5-32e3-ae8d-3f08d2c4baac", "value": "secneo" }, @@ -2965,6 +3326,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "8cb96214-6910-3124-89fa-301eccee65ea", "value": "simplepack" }, @@ -2975,6 +3342,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "6cf84378-cc5e-3e92-8a13-87697f25808f", "value": "telock" }, @@ -2985,6 +3358,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "b247e2db-6c54-318f-95a4-8d3c0a0b384b", "value": "themida" }, @@ -2995,6 +3374,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "a550b64f-0f43-3c19-b372-fff40ddf8002", "value": "upack" }, @@ -3005,6 +3390,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "8a0a984c-44b7-38ed-8a8c-6e55a4c8d888", "value": "upx" }, @@ -3017,6 +3408,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "b077a697-29f1-3780-9e37-88791a9e70c8", "value": "vmprotect" }, @@ -3031,9 +3428,27 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "eb6d3fe4-5b8e-306c-84ef-960f4e14cf1a", + "type": "subtechnique-of" + } + ], "uuid": "199805da-3a4c-3927-b32c-e6eb4b61f903", "value": "yoda" }, + { + "description": "FILE:proglang:autohk", + "meta": { + "refs": [], + "synonyms": [ + "autohotkey" + ], + "type": [] + }, + "uuid": "5391799c-5a56-4e63-8438-eba2b331e65d", + "value": "autohk" + }, { "description": "FILE:proglang:autoit", "meta": { @@ -3329,6 +3744,23 @@ "uuid": 
"46e0b44b-cf35-3472-9941-1bc1ea14943e", "value": "gizmo" }, + { + "description": "FAM:glupteba", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.glupteba", + "http://resources.infosecinstitute.com/tdss4-part-1/", + "http://malwarefor.me/2015-04-13-nuclear-ek-glupteba-and-operation-windigo/", + "https://www.welivesecurity.com/2014/03/18/operation-windigo-the-vivisection-of-a-large-linux-server-side-credential-stealing-malware-campaign/", + "https://www.welivesecurity.com/2011/03/02/tdl4-and-glubteba-piggyback-piggybugs/", + "https://www.welivesecurity.com/2018/03/22/glupteba-no-longer-windigo/" + ], + "synonyms": [], + "type": [] + }, + "uuid": "0409c6ab-133a-448e-a2cf-c2c7f55d4100", + "value": "glupteba" + }, { "description": "FAM:gobot", "meta": { @@ -3410,6 +3842,50 @@ "uuid": "c9431d80-ca53-3176-877b-e283aa3f9f11", "value": "goodnews" }, + { + "description": "FAM:goodor", + "meta": { + "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.goodor", + "synonyms": [ + "fuerboos" + ], + "type": [] + }, + "uuid": "2115d439-31e9-45bd-ae00-dcb2fa5cde9c", + "value": "goodor" + }, + { + "description": "FAM:gootkit", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.gootkit", + "https://www.lexsi.com/securityhub/homer-simpson-brian-krebs-rencontrent-zeus-gootkit/", + "http://blog.cert.societegenerale.com/2015/04/analyzing-gootkits-persistence-mechanism.html", + "https://securityintelligence.com/gootkit-developers-dress-it-up-with-web-traffic-proxy/", + "https://forums.juniper.net/t5/Security-Now/New-Gootkit-Banking-Trojan-variant-pushes-the-limits-on-evasive/ba-p/319055", + "https://www.f5.com/labs/articles/threat-intelligence/tackling-gootkit-s-traps", + "https://securelist.com/blog/research/76433/inside-the-gootkit-cc-server/", + "https://www.us-cert.gov/ncas/alerts/TA16-336A", + "http://www.vkremez.com/2018/04/lets-learn-in-depth-dive-into-gootkit.html", + 
"https://securityintelligence.com/gootkit-bobbing-and-weaving-to-avoid-prying-eyes/", + "https://www.youtube.com/watch?v=242Tn0IL2jE", + "http://www.kernelmode.info/forum/viewtopic.php?f=16&t=3669", + "https://www.s21sec.com/en/blog/2016/05/reverse-engineering-gootkit/", + "http://blog.trendmicro.com/trendlabs-security-intelligence/fake-judicial-spam-leads-to-backdoor-with-fake-certificate-authority/", + "https://news.drweb.com/show/?i=4338&lng=en", + "https://www.youtube.com/watch?v=QgUlPvEE4aw", + "https://www.cyphort.com/angler-ek-leads-to-fileless-gootkit/" + ], + "synonyms": [ + "xswkit", + "talalpek", + "waldek" + ], + "type": [] + }, + "uuid": "beb37b7f-d2f9-47c5-a53b-4b5bb7db9cdf", + "value": "gootkit" + }, { "description": "FAM:gorillaprice", "meta": { @@ -3422,6 +3898,27 @@ "uuid": "83db9052-450f-30d9-90f0-e9ccffb3b348", "value": "gorillaprice" }, + { + "description": "FAM:gozi", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.gozi", + "http://blog.malwaremustdie.org/2013/02/the-infection-of-styx-exploit-kit.html", + "https://www.secureworks.com/research/gozi", + "https://lokalhost.pl/gozi_tree.txt", + "https://blog.gdatasoftware.com/2016/11/29325-analysis-ursnif-spying-on-your-data-since-2007", + "http://researchcenter.paloaltonetworks.com/2017/02/unit42-banking-trojans-ursnif-global-distribution-networks-identified/" + ], + "synonyms": [ + "papras", + "snifula", + "ursnif" + ], + "type": [] + }, + "uuid": "5fac06c6-010a-4af3-99c9-cb0052057bdf", + "value": "gozi" + }, { "description": "FAM:gpspy", "meta": { @@ -3616,6 +4113,21 @@ "uuid": "f801c366-5b0c-39da-9afb-ca515528bd99", "value": "hiddenapp" }, + { + "description": "FAM:hiddentear", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.hiddentear", + "https://www.tripwire.com/state-of-security/security-data-protection/cyber-security/hidden-tear-project-forbidden-fruit-is-the-sweetest/", + 
"https://twitter.com/struppigel/status/950787783353884672", + "https://github.com/goliate/hidden-tear" + ], + "synonyms": [], + "type": [] + }, + "uuid": "06e7d142-3ad1-40b6-b231-41dd47465ac3", + "value": "hiddentear" + }, { "description": "FAM:hiddnad", "meta": { @@ -3718,6 +4230,25 @@ "uuid": "373d306c-aaa3-38e3-b839-7dd39b51e89a", "value": "hotclip" }, + { + "description": "FAM:houdini", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.houdini", + "http://researchcenter.paloaltonetworks.com/2016/10/unit42-houdinis-magic-reappearance/?adbsc=social67221546&adbid=790972447373668352&adbpl=tw&adbpr=4487645412", + "http://blogs.360.cn/post/analysis-of-apt-c-37.html" + ], + "synonyms": [ + "dunihi", + "dinihou", + "hworm", + "jenxcus" + ], + "type": [] + }, + "uuid": "1f268f26-ad8b-4e4d-9efb-661904171c2a", + "value": "houdini" + }, { "description": "FAM:hoverwatch", "meta": { @@ -4637,6 +5168,19 @@ "uuid": "84da1a17-5013-3775-a58f-f913b62180ad", "value": "laroux" }, + { + "description": "FAM:lazagne", + "meta": { + "refs": [ + "https://attack.mitre.org/software/S0349", + "https://github.com/AlessandroZ/LaZagne" + ], + "synonyms": [], + "type": [] + }, + "uuid": "a8b9546f-1a91-468d-9304-3f6654d39352", + "value": "lazagne" + }, { "description": "FAM:ldpinch", "meta": { @@ -4725,6 +5269,21 @@ "uuid": "3388273e-3d1e-32a3-afb6-a0ade35d91b2", "value": "lien" }, + { + "description": "FAM:limerat", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.limerat", + "https://www.youtube.com/watch?v=x-g-ZLeX8GM", + "https://blog.yoroi.company/research/limerat-spreads-in-the-wild/", + "https://github.com/NYAN-x-CAT/Lime-RAT/" + ], + "synonyms": [], + "type": [] + }, + "uuid": "1ea6f1b4-cf3d-40aa-981f-31a1efbd819f", + "value": "limerat" + }, { "description": "FAM:linkular", "meta": { @@ -4803,6 +5362,22 @@ "uuid": "bc6ae7c3-eff8-3b11-999b-4248e5d93073", "value": "loapi" }, + { + "description": "FAM:loda", + "meta": { + "refs": 
[ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.loda", + "https://www.proofpoint.com/us/threat-insight/post/introducing-loda-malware", + "https://zerophagemalware.com/2018/01/23/maldoc-rtf-drop-loda-logger/" + ], + "synonyms": [ + "nymeria" + ], + "type": [] + }, + "uuid": "f6203215-d07e-4108-bb75-ee5ad7e9dbfc", + "value": "loda" + }, { "description": "FAM:lockactivity", "meta": { @@ -5787,6 +6362,24 @@ "uuid": "da9dcdba-19b8-34b7-9647-94983e0f04ed", "value": "nandrobox" }, + { + "description": "FAM:nanocore", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.nanocore", + "https://www.fireeye.com/blog/threat-research/2017/09/apt33-insights-into-iranian-cyber-espionage.html", + "https://www.symantec.com/blogs/threat-intelligence/elfin-apt33-espionage", + "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/", + "https://www.bleepingcomputer.com/news/security/nanocore-rat-author-gets-33-months-in-prison/" + ], + "synonyms": [ + "nancrat" + ], + "type": [] + }, + "uuid": "048b948f-5e4d-4e6f-a0b5-54157cf03c86", + "value": "nanocore" + }, { "description": "FAM:navbar", "meta": { @@ -5827,6 +6420,25 @@ "uuid": "ddf07c01-91d3-35ab-b393-3afabe39dff7", "value": "necurs" }, + { + "description": "FAM:neoreklami", + "meta": { + "refs": [ + "https://blog.malwarebytes.com/detections/adware-neoreklami/", + "https://www.microsoft.com/en-us/wdsi/threats/malware-encyclopedia-description?Name=Adware:Win32/Neoreklami" + ], + "synonyms": [], + "type": [] + }, + "related": [ + { + "dest-uuid": "590eeb1a-2742-3301-94bf-9fa0856e959c", + "type": "variant-of" + } + ], + "uuid": "665ecfd3-b713-465f-879b-5182203b4a8b", + "value": "neoreklami" + }, { "description": "FAM:neospy", "meta": { @@ -5880,7 +6492,8 @@ "synonyms": [ "netweird", "weecnaw", - "wirenet" + "wirenet", + "netwire" ], "type": [] }, @@ -6140,6 +6753,26 @@ "uuid": "bd3777dc-6822-36d9-b57b-fe623ddb0170", "value": "optix" }, + { + 
"description": "FAM:orcusrat", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.orcus_rat", + "https://orcustechnologies.com/", + "https://blog.fortinet.com/2017/12/07/a-peculiar-case-of-orcus-rat-targeting-bitcoin-investors", + "https://www.canada.ca/en/radio-television-telecommunications/news/2019/03/crtc-and-rcmp-national-division-execute-warrants-in-malware-investigation.html", + "https://krebsonsecurity.com/2016/07/canadian-man-is-author-of-popular-orcus-rat/", + "https://krebsonsecurity.com/2019/04/canadian-police-raid-orcus-rat-author/", + "http://researchcenter.paloaltonetworks.com/2016/08/unit42-orcus-birth-of-an-unusual-plugin-builder-rat/" + ], + "synonyms": [ + "orcus" + ], + "type": [] + }, + "uuid": "0c57b2b4-b545-4b5d-bd27-b102b635e432", + "value": "orcusrat" + }, { "description": "FAM:outbrowse", "meta": { @@ -6303,6 +6936,25 @@ "uuid": "457c9036-a4bf-355e-844a-e74dd69c80e7", "value": "petrolin" }, + { + "description": "FAM:petya", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.petya", + "https://blog.malwarebytes.com/threat-analysis/2016/05/petya-and-mischa-ransomware-duet-p1/", + "https://blog.malwarebytes.com/threat-analysis/2016/07/third-time-unlucky-improved-petya-is-out/", + "https://blog.malwarebytes.com/cybercrime/2017/07/keeping-up-with-the-petyas-demystifying-the-malware-family/", + "https://blog.malwarebytes.com/malwarebytes-news/2017/07/bye-bye-petya-decryptor-old-versions-released/", + "https://blog.malwarebytes.com/threat-analysis/2016/04/petya-ransomware/" + ], + "synonyms": [ + "petr" + ], + "type": [] + }, + "uuid": "c4324143-3921-4771-a9b2-f15ae2b3777f", + "value": "petya" + }, { "description": "FAM:phonespy", "meta": { @@ -6706,6 +7358,33 @@ "uuid": "e49cc0f4-649c-344a-b75a-c6187d57e721", "value": "qqrob" }, + { + "description": "FAM:quasar", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.quasar_rat", + 
"https://researchcenter.paloaltonetworks.com/2018/01/unit42-vermin-quasar-rat-custom-malware-used-ukraine/", + "https://www.fireeye.com/blog/threat-research/2019/04/spear-phishing-campaign-targets-ukraine-government.html", + "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/", + "https://github.com/quasar/QuasarRAT/tree/master/Client", + "https://www.volexity.com/blog/2018/06/07/patchwork-apt-group-targets-us-think-tanks/", + "https://www.pwc.co.uk/cyber-security/pdf/cloud-hopper-annex-b-final.pdf", + "http://researchcenter.paloaltonetworks.com/2017/01/unit42-downeks-and-quasar-rat-used-in-recent-targeted-attacks-against-governments", + "https://documents.trendmicro.com/assets/tech-brief-untangling-the-patchwork-cyberespionage-group.pdf?platform=hootsuite", + "https://ti.360.net/blog/articles/analysis-of-apt-c-09-target-china/", + "https://www.symantec.com/blogs/threat-intelligence/elfin-apt33-espionage", + "https://twitter.com/malwrhunterteam/status/789153556255342596", + "https://www.welivesecurity.com/2018/07/17/deep-dive-vermin-rathole/" + ], + "synonyms": [ + "quasar_rat", + "quasarrat" + ], + "type": [] + }, + "uuid": "620903d7-42ed-4a16-b3df-4ca6076d9f31", + "value": "quasar" + }, { "description": "FAM:qumi", "meta": { @@ -6738,6 +7417,26 @@ "uuid": "0761227e-1a79-3a7b-bda6-b5962458e4ec", "value": "qushu" }, + { + "description": "FAM:raccoon", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.raccoon", + "https://www.secfreaks.gr/2019/12/in-depth-analysis-of-an-infostealer-raccoon.html", + "https://www.bitdefender.com/files/News/CaseStudies/study/289/Bitdefender-WhitePaper-Fallout.pdf", + "https://www.cybereason.com/blog/hunting-raccoon-stealer-the-new-masked-bandit-on-the-block", + "https://www.cynexlink.com/2020/12/26/raccoon-malware-a-threat-to-cybersecurity/" + ], + "synonyms": [ + "mohazo", + "racealer", + "racoon" + ], + "type": [] + }, + "uuid": 
"b934637a-5c8d-43bc-b595-61e8acd9af78", + "value": "raccoon" + }, { "description": "FAM:raden", "meta": { @@ -6800,6 +7499,25 @@ "uuid": "ac2392d0-c38e-3909-aabf-5e632062f24d", "value": "razam" }, + { + "description": "FAM:razy", + "meta": { + "refs": [ + "https://www.microsoft.com/en-us/wdsi/threats/malware-encyclopedia-description?Name=Trojan:Win32/Razy.A", + "https://threatpost.com/razy-browser-extensions-theft/141181/" + ], + "synonyms": [], + "type": [] + }, + "related": [ + { + "dest-uuid": "636db272-61ce-3e0f-ad64-d77048b05066", + "type": "variant-of" + } + ], + "uuid": "aad8ce8e-0c82-42f7-a63c-bbfe85c015b6", + "value": "razy" + }, { "description": "FAM:rbot", "meta": { @@ -6923,6 +7641,31 @@ "uuid": "4cbb8478-d6ca-3efc-bc72-9be4ffde7073", "value": "relevantknowledge" }, + { + "description": "FAM:remcos", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.remcos", + "https://www.riskiq.com/blog/labs/spear-phishing-turkish-defense-contractors/", + "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/", + "http://malware-traffic-analysis.net/2017/12/22/index.html", + "https://www.symantec.com/blogs/threat-intelligence/elfin-apt33-espionage", + "https://blog.fortinet.com/2017/02/14/remcos-a-new-rat-in-the-wild-2", + "https://krabsonsecurity.com/2018/03/02/analysing-remcos-rats-executable/", + "https://myonlinesecurity.co.uk/fake-order-spoofed-from-finchers-ltd-sankyo-rubber-delivers-remcos-rat-via-ace-attachments/", + "https://blog.talosintelligence.com/2018/08/picking-apart-remcos.html", + "https://secrary.com/ReversingMalware/RemcosRAT/" + ], + "synonyms": [ + "remcosrat", + "remvio", + "socmer" + ], + "type": [] + }, + "uuid": "1b2a647e-35a2-418d-95e2-e77e0423060b", + "value": "remcos" + }, { "description": "FAM:renocide", "meta": { @@ -6993,6 +7736,24 @@ "uuid": "94805334-1d2e-3621-aa6d-0b3dc8e0405d", "value": "reveton" }, + { + "description": "FAM:revetrat", + "meta": { + 
"refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.revenge_rat", + "https://isc.sans.edu/diary/rss/22590", + "https://researchcenter.paloaltonetworks.com/2018/08/unit42-gorgon-group-slithering-nation-state-cybercrime/", + "http://blog.deniable.org/blog/2016/08/26/lurking-around-revenge-rat/" + ], + "synonyms": [ + "revenge", + "revet" + ], + "type": [] + }, + "uuid": "2326ae09-ab18-41cd-8f87-187853a8623f", + "value": "revetrat" + }, { "description": "FAM:revmob", "meta": { @@ -7122,6 +7883,19 @@ "uuid": "1eb2ee9d-7dff-3816-8641-ab772d90cb54", "value": "rotexy" }, + { + "description": "FAM:rozena", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.rozena", + "https://www.gdatasoftware.com/blog/2018/06/30862-fileless-malware-rozena" + ], + "synonyms": [], + "type": [] + }, + "uuid": "106d00a7-2044-46b6-9c08-35eb775764df", + "value": "rozena" + }, { "description": "FAM:rufraud", "meta": { @@ -7176,6 +7950,31 @@ "uuid": "1b7e36bf-e9dd-33c8-a9af-ad56c3c07f2b", "value": "rusms" }, + { + "description": "FAM:ryuk", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.ryuk", + "https://www.cybereason.com/blog/triple-threat-emotet-deploys-trickbot-to-steal-data-spread-ryuk-ransomware", + "https://research.checkpoint.com/ryuk-ransomware-targeted-campaign-break/", + "https://www.latimes.com/local/lanow/la-me-ln-times-delivery-disruption-20181229-story.html", + "https://www.crowdstrike.com/blog/big-game-hunting-with-ryuk-another-lucrative-targeted-ransomware/", + "https://www.fireeye.com/blog/threat-research/2019/01/a-nasty-trick-from-credential-theft-malware-to-business-disruption.html", + "https://www.fireeye.com/blog/threat-research/2019/04/pick-six-intercepting-a-fin6-intrusion.html", + "https://securingtomorrow.mcafee.com/other-blogs/mcafee-labs/ryuk-ransomware-attack-rush-to-attribution-misses-the-point/" + ], + "synonyms": [], + "type": [] + }, + "related": [ + { + "dest-uuid": 
"a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "abf8f3dc-4dbf-47b3-95fd-b35ac2ed3f46", + "value": "ryuk" + }, { "description": "FAM:sacti", "meta": { @@ -7265,6 +8064,26 @@ "uuid": "5ff7793d-c1c4-380d-900e-d9aa6a409915", "value": "sality" }, + { + "description": "FAM:samsam", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.samsam", + "http://blog.talosintel.com/2016/03/samsam-ransomware.html", + "https://www.sophos.com/en-us/medialibrary/pdfs/technical-papers/samsam-ransomware-chooses-its-targets-carefully-wpna.aspx", + "https://www.crowdstrike.com/blog/an-in-depth-analysis-of-samsam-ransomware-and-boss-spider/", + "https://www.justice.gov/opa/pr/two-iranian-men-indicted-deploying-ransomware-extort-hospitals-municipalities-and-public", + "https://nakedsecurity.sophos.com/2018/05/01/samsam-ransomware-a-mean-old-dog-with-a-nasty-new-trick-report/", + "http://blog.talosintelligence.com/2018/01/samsam-evolution-continues-netting-over.html" + ], + "synonyms": [ + "samas" + ], + "type": [] + }, + "uuid": "d00e9064-f1e9-4696-87dc-13031aa4553d", + "value": "samsam" + }, { "description": "FAM:sanctionedmedia", "meta": { @@ -7310,6 +8129,20 @@ "uuid": "b568a5b2-1008-33cb-85ba-c461018fc2c8", "value": "scam" }, + { + "description": "FAM:schwarzesonne", + "meta": { + "refs": [ + "https://www.microsoft.com/en-us/wdsi/threats/malware-encyclopedia-description?Name=Trojan:Win32/SchwarzeSonne!MSR", + "https://www.fortiguard.com/encyclopedia/virus/7488534", + "https://www.virusradar.com/en/Win32_SchwarzeSonne.BL/description" + ], + "synonyms": [], + "type": [] + }, + "uuid": "a4c164cc-a4a1-4b7f-a9a2-c664f6b461d4", + "value": "schwarzesonne" + }, { "description": "FAM:sckeylog", "meta": { @@ -7431,6 +8264,27 @@ "uuid": "ad21874f-d8c7-33d7-9527-c9d666171aa8", "value": "shedun" }, + { + "description": "FAM:shelma", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.doghousepower", + 
"http://www1.paladion.net/hubfs/Newsletter/DogHousePower-%20Newly%20Identified%20Python-Based%20Ransomware.pdf" + ], + "synonyms": [ + "doghousepower" + ], + "type": [] + }, + "related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "47ff7101-61d2-464f-8210-6fe26cac2772", + "value": "shelma" + }, { "description": "FAM:sheridroid", "meta": { @@ -8239,6 +9093,7 @@ "refs": [], "synonyms": [ "encoder", + "diskcoder", "filecoder", "ransomcrypt", "trojanransom" @@ -8309,11 +9164,14 @@ "refs": [], "synonyms": [ "banker", - "datasetaler", + "bitstealer", + "datastealer", + "discostealer", "delfsnif", "delpbanc", "infostealer", "monitor", + "passwordstealera", "pswtool", "pwsteal", "pwstealer", @@ -8326,6 +9184,22 @@ "uuid": "c65071d8-2bad-302b-8646-d309f7705fdb", "value": "infosteal" }, + { + "description": "BEH:infosteal:coinstealer", + "meta": { + "refs": [], + "synonyms": [], + "type": [] + }, + "related": [ + { + "dest-uuid": "c65071d8-2bad-302b-8646-d309f7705fdb", + "type": "subtechnique-of" + } + ], + "uuid": "e8e60d44-4950-4671-b56e-707d6ce0b2f6", + "value": "coinstealer" + }, { "description": "BEH:infosteal:gamethief", "meta": { @@ -8335,6 +9209,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "c65071d8-2bad-302b-8646-d309f7705fdb", + "type": "subtechnique-of" + } + ], "uuid": "b87b252e-b364-3cbb-92cf-939b2343b0bc", "value": "gamethief" }, @@ -8416,6 +9296,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "aecd212a-8701-3527-bbde-8cd36b405f93", + "type": "subtechnique-of" + } + ], "uuid": "1b4d1d8e-9cbf-3f9b-8308-23e6de3456fd", "value": "killsectool" }, @@ -8852,6 +9738,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "0fa687bd-caa9-32b1-a77e-13b98dc83685", + "type": "variant-of" + } + ], "uuid": "590eeb1a-2742-3301-94bf-9fa0856e959c", "value": "adware" }, @@ -8864,6 +9756,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "0fa687bd-caa9-32b1-a77e-13b98dc83685", + "type": 
"variant-of" + } + ], "uuid": "3ce6bd72-2133-35f8-b5a9-3d22c5e55a93", "value": "casino" }, @@ -8883,6 +9781,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "590eeb1a-2742-3301-94bf-9fa0856e959c", + "type": "subtechnique-of" + } + ], "uuid": "636db272-61ce-3e0f-ad64-d77048b05066", "value": "multiplug" }, @@ -8904,30 +9808,42 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "0fa687bd-caa9-32b1-a77e-13b98dc83685", + "type": "variant-of" + } + ], "uuid": "b6d3ea56-83b3-3524-a2f5-c87ce2ed0aab", "value": "tool" }, { - "description": "CLASS:hoax", + "description": "CLASS:grayware:tool:remoteadmin", "meta": { "refs": [], - "synonyms": [ - "joke" - ], + "synonyms": [], "type": [] }, - "uuid": "e7bd337d-700c-376b-ac75-61c85dd8a246", - "value": "hoax" + "related": [ + { + "dest-uuid": "b6d3ea56-83b3-3524-a2f5-c87ce2ed0aab", + "type": "variant-of" + } + ], + "uuid": "e43ecd9a-2734-34d6-b24f-77be13f4b9cd", + "value": "remoteadmin" }, { - "description": "CLASS:grayware:tool:remoteadmin", + "description": "CLASS:hoax", "meta": { "refs": [], - "synonyms": [], + "synonyms": [ + "joke" + ], "type": [] }, - "uuid": "e43ecd9a-2734-34d6-b24f-77be13f4b9cd", - "value": "remoteadmin" + "uuid": "e7bd337d-700c-376b-ac75-61c85dd8a246", + "value": "hoax" }, { "description": "CLASS:hoax:smshoax", @@ -8993,6 +9909,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "64376dc5-0640-33a5-ba0e-1a4b71922c06", + "type": "variant-of" + } + ], "uuid": "3265ee34-384e-3dc8-9652-19d88d4374cb", "value": "bitcoinminer" }, @@ -9096,6 +10018,12 @@ "synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "973cc9e5-32ab-3403-9ead-eb941690fc23", + "type": "subtechnique-of" + } + ], "uuid": "b7be1d66-ac27-3b2c-8361-a652564ec2e3", "value": "prepender" }, @@ -9109,6 +10037,12 @@ ], "type": [] }, + "related": [ + { + "dest-uuid": "90bb8141-d467-3376-8c85-4e0ec9a2be05", + "type": "uses" + } + ], "uuid": "f0b15f66-0eae-37d8-bf08-eeca70557795", "value": "worm" }, @@ -9119,6 +10053,12 @@ 
"synonyms": [], "type": [] }, + "related": [ + { + "dest-uuid": "f0b15f66-0eae-37d8-bf08-eeca70557795", + "type": "subtechnique-of" + } + ], "uuid": "980f8421-cccd-3c17-b998-1ab1b7c7bdb9", "value": "emailworm" }, @@ -9377,6 +10317,32 @@ "uuid": "805e91bc-aaed-380d-97f7-7d9ae2ab4703", "value": "adviator" }, + { + "description": "FAM:adwind", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/jar.adwind", + "https://blogs.seqrite.com/evolution-of-jrat-java-malware/", + "https://www.fortinet.com/blog/threat-research/new-jrat-adwind-variant-being-spread-with-package-delivery-scam.html", + "http://blog.trendmicro.com/trendlabs-security-intelligence/spam-remote-access-trojan-adwind-jrat", + "http://malware-traffic-analysis.net/2017/07/04/index.html", + "https://codemetrix.net/decrypting-adwind-jrat-jbifrost-trojan/", + "https://gist.github.com/herrcore/8336975475e88f9bc539d94000412885", + "https://blog.talosintelligence.com/2018/09/adwind-dodgesav-dde.html" + ], + "synonyms": [ + "AlienSpy", + "Frutas", + "JBifrost", + "JSocket", + "Sockrat", + "UNRECOM" + ], + "type": [] + }, + "uuid": "04e324c1-a981-4bf7-aab4-d64d0dacae51", + "value": "adwind" + }, { "description": "FAM:adwk", "meta": { @@ -9409,6 +10375,25 @@ "uuid": "1bcf8191-2d6d-3f3e-a114-7df87b8aafcd", "value": "aesads" }, + { + "description": "FAM:agenttesla", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.agent_tesla", + "https://researchcenter.paloaltonetworks.com/2017/09/unit42-analyzing-various-layers-agentteslas-packing/", + "https://malwarebreakdown.com/2018/01/11/malspam-entitled-invoice-attched-for-your-reference-delivers-agent-tesla-keylogger/", + "https://www.zscaler.com/blogs/research/agent-tesla-keylogger-delivered-using-cybersquatting", + "https://blog.fortinet.com/2017/06/28/in-depth-analysis-of-net-malware-javaupdtr", + "https://www.fortinet.com/blog/threat-research/analysis-of-new-agent-tesla-spyware-variant.html", + 
"https://thisissecurity.stormshield.com/2018/01/12/agent-tesla-campaign/", + "https://blogs.forcepoint.com/security-labs/part-two-camouflage-netting" + ], + "synonyms": [], + "type": [] + }, + "uuid": "bd4238f7-fbfc-4ad8-b42e-247013c6df3d", + "value": "agenttesla" + }, { "description": "FAM:agobot", "meta": { @@ -9803,6 +10788,21 @@ "uuid": "4b0a463a-269d-3d58-ae8c-7935c51aa9bc", "value": "autosus" }, + { + "description": "FAM:avemaria", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.ave_maria", + "https://blog.yoroi.company/research/the-ave_maria-malware/" + ], + "synonyms": [ + "AVE_MARIA" + ], + "type": [] + }, + "uuid": "a006993a-6e83-4fb5-a6a6-d67a7dc71c23", + "value": "avemaria" + }, { "description": "FAM:axespy", "meta": { @@ -9813,6 +10813,31 @@ "uuid": "7bce3d09-df0c-3593-8353-7812dd205844", "value": "axespy" }, + { + "description": "FAM:azorult", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.azorult", + "https://www.bleepingcomputer.com/news/security/azorult-trojan-serving-aurora-ransomware-by-malactor-oktropys/", + "https://blog.minerva-labs.com/puffstealer-evasion-in-a-cloak-of-multiple-layers", + "https://malwarebreakdown.com/2017/07/24/the-seamless-campaign-drops-ramnit-follow-up-malware-azorult-stealer-smoke-loader-etc/", + "https://www.proofpoint.com/us/threat-insight/post/threat-actors-using-legitimate-paypal-accounts-to-distribute-chthonic-banking-trojan", + "http://www.vkremez.com/2017/07/lets-learn-reversing-credential-and.html", + "https://blog.minerva-labs.com/azorult-now-as-a-signed-google-update", + "https://www.proofpoint.com/us/threat-insight/post/new-version-azorult-stealer-improves-loading-features-spreads-alongside", + "https://malwarebreakdown.com/2017/11/12/seamless-campaign-delivers-ramnit-via-rig-ek-at-188-225-82-158-follow-up-malware-is-azorult-stealer/", + "https://www.blueliv.com/blog-news/research/azorult-crydbrox-stops-sells-malware-credential-stealer/", + 
"https://research.checkpoint.com/the-emergence-of-the-new-azorult-3-3/" + ], + "synonyms": [ + "puffstealer", + "rultazo" + ], + "type": [] + }, + "uuid": "fc17c756-528b-416e-907d-9d1ef7403df1", + "value": "azorult" + }, { "description": "FAM:badda", "meta": { @@ -9967,6 +10992,27 @@ "uuid": "40f45119-a4ce-335c-be07-c46ff67f3dcf", "value": "bauts" }, + { + "description": "FAM:bazar", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.bazarbackdoor", + "https://www.advanced-intel.com/post/anatomy-of-attack-inside-bazarbackdoor-to-ryuk-ransomware-one-group-via-cobalt-strike", + "https://www.pandasecurity.com/en/mediacenter/business/bazarbackdoor-trickbot-backdoor/" + ], + "synonyms": [ + "bazarbackdoor", + "beerbot", + "bazarcall", + "kegtap", + "team9backdoor", + "bazaloader" + ], + "type": [] + }, + "uuid": "2f6e812e-16a6-4fbc-9273-1aebc12b7d3d", + "value": "bazar" + }, { "description": "FAM:bebeg", "meta": { @@ -10185,7 +11231,11 @@ "description": "FAM:bladabindi", "meta": { "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.njrat", - "synonyms": [], + "synonyms": [ + "bladabi", + "bladabindinet", + "njrat" + ], "type": [] }, "uuid": "470bf5fe-81e2-3da1-a4da-6a1680119a0f", @@ -10519,6 +11569,20 @@ "uuid": "a4e78673-2014-3dbd-bf93-628bc644a872", "value": "centim" }, + { + "description": "FAM:cerberus", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/apk.cerberus", + "https://attack.mitre.org/software/S0480", + "https://www.threatfabric.com/blogs/cerberus-a-new-banking-trojan-from-the-underworld.html" + ], + "synonyms": [], + "type": [] + }, + "uuid": "bfd0098a-822d-436b-b751-1c61ff661cfe", + "value": "cerberus" + }, { "description": "FAM:cerekv", "meta": { @@ -10620,6 +11684,22 @@ "uuid": "d46db949-1fd3-303c-9bf1-b56f86d9077b", "value": "clinator" }, + { + "description": "FAM:clipbanker", + "meta": { + "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.clipbanker", + "synonyms": [], + 
"type": [] + }, + "related": [ + { + "dest-uuid": "c65071d8-2bad-302b-8646-d309f7705fdb", + "type": "variant-of" + } + ], + "uuid": "21529e81-1aea-4435-a407-c5016653d63d", + "value": "clipbanker" + }, { "description": "FAM:cmccwm", "meta": { @@ -10650,6 +11730,31 @@ "uuid": "5c5aa6ae-b94a-31df-bea0-4e672b746664", "value": "cnzz" }, + { + "description": "FAM:cobaltstrike", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.cobalt_strike", + "https://www.fireeye.com/blog/threat-research/2017/06/phished-at-the-request-of-counsel.html", + "https://www.symantec.com/connect/blogs/odinaff-new-trojan-used-high-level-financial-attacks", + "https://github.com/JPCERTCC/aa-tools/blob/master/cobaltstrikescan.py", + "https://blogs.jpcert.or.jp/en/2018/08/volatility-plugin-for-detecting-cobalt-strike-beacon.html", + "https://blog.cobaltstrike.com/", + "https://www.cobaltstrike.com/support", + "https://www.fireeye.com/blog/threat-research/2018/11/not-so-cozy-an-uncomfortable-examination-of-a-suspected-apt29-phishing-campaign.html", + "http://blog.morphisec.com/new-global-attack-on-point-of-sale-systems", + "https://www.lac.co.jp/lacwatch/people/20180521_001638.html", + "https://401trg.com/burning-umbrella/ ", + "https://www.pentestpartners.com/security-blog/cobalt-strike-walkthrough-for-red-teamers/", + "https://pylos.co/2018/11/18/cozybear-in-from-the-cold/", + "http://cyberforensicator.com/2018/12/23/dissecting-cozy-bears-malicious-lnk-file/" + ], + "synonyms": [], + "type": [] + }, + "uuid": "008947a7-9634-4f83-851b-f65e1a0f2f0c", + "value": "cobaltstrike" + }, { "description": "FAM:coinhive", "meta": { @@ -10712,6 +11817,25 @@ "uuid": "875a27b7-cc81-3b09-8c23-d2c7b1bd6ac4", "value": "contactscollector" }, + { + "description": "FAM:conti", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.conti", + "https://www.carbonblack.com/blog/tau-threat-discovery-conti-ransomware/" + ], + "synonyms": [], + "type": [] + }, + 
"related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "7d7da922-9df0-4184-944c-215b74c8095b", + "value": "conti" + }, { "description": "FAM:cooee", "meta": { @@ -11065,6 +12189,24 @@ "uuid": "96986f73-ee4f-330c-92f9-805d05e6f44b", "value": "dbtes" }, + { + "description": "FAM:deathransom", + "meta": { + "refs": "https://malpedia.caad.fkie.fraunhofer.de/details/win.deathransom", + "synonyms": [ + "wacatac" + ], + "type": [] + }, + "related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "bc154e88-a6ae-4f5d-b029-bbd3b8acf587", + "value": "deathransom" + }, { "description": "FAM:deblio", "meta": { @@ -11124,6 +12266,33 @@ "uuid": "9b9eaf63-3447-3349-b955-6b62a9809d85", "value": "detroie" }, + { + "description": "FAM:crysis", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.dharma", + "https://www.carbonblack.com/2018/07/10/carbon-black-tau-threat-analysis-recent-dharma-ransomware-highlights-attackers-continued-use-open-source-tools/", + "https://www.bleepingcomputer.com/news/security/new-arena-crysis-ransomware-variant-released/" + ], + "synonyms": [ + "crusis", + "dharma", + "phobos", + "arena", + "wadhrama", + "ncov" + ], + "type": [] + }, + "related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "689e26c6-8cf6-4ce0-ba87-cad1377996ae", + "value": "crysis" + }, { "description": "FAM:dianjin", "meta": { @@ -11329,6 +12498,19 @@ "uuid": "3eb5f701-637e-3b03-ac32-47f59641c718", "value": "dowgin" }, + { + "description": "FAM:downeks", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.downeks", + "http://researchcenter.paloaltonetworks.com/2017/01/unit42-downeks-and-quasar-rat-used-in-recent-targeted-attacks-against-governments/?adbsc=social69739136&adbid=826218465723756545&adbpl=tw&adbpr=4487645412" + ], + "synonyms": [], + "type": [] + }, 
+ "uuid": "c8149b45-7d28-421e-bc6f-25c4b8698b92", + "value": "downeks" + }, { "description": "FAM:downloadadmin", "meta": { @@ -11582,6 +12764,32 @@ "uuid": "88fec24c-acb5-3403-b8bf-2da120708b5c", "value": "egame" }, + { + "description": "FAM:egregor", + "meta": { + "ransomnotes-filenames": [ + "RECOVER-FILES.txt" + ], + "ransomnotes-refs": [ + "https://www.bleepstatic.com/images/news/columns/week-in-ransomware/2020/september/25/egregor.jpg" + ], + "refs": [ + "https://www.appgate.com/news-press/appgate-labs-analyzes-new-family-of-ransomware-egregor", + "https://www.bleepingcomputer.com/news/security/crytek-hit-by-egregor-ransomware-ubisoft-data-leaked/", + "https://cybersecuritynews.com/egregor-ransomware/" + ], + "synonyms": [], + "type": [] + }, + "related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "75c5da38-3097-4e81-9c34-188cfbec1596", + "value": "egregor" + }, { "description": "FAM:egroupdial", "meta": { @@ -11622,6 +12830,54 @@ "uuid": "d43481d8-9186-33cc-8974-75fb3f7a357d", "value": "elite" }, + { + "description": "FAM:emotet", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.emotet", + "https://blog.trendmicro.com/trendlabs-security-intelligence/ursnif-emotet-dridex-and-bitpaymer-gangs-linked-by-a-similar-loader/", + "http://blog.trendmicro.com/trendlabs-security-intelligence/emotet-returns-starts-spreading-via-spam-botnet/", + "https://www.fortinet.com/blog/threat-research/deep-analysis-of-new-emotet-variant-part-2.html", + "https://www.spamhaus.org/news/article/783/emotet-adds-a-further-layer-of-camouflage", + "https://isc.sans.edu/forums/diary/Emotet+infections+and+followup+malware/24532/", + "https://www.welivesecurity.com/2018/11/09/emotet-launches-major-new-spam-campaign/", + "https://github.com/d00rt/emotet_research", + "https://blog.kryptoslogic.com/malware/2018/08/01/emotet.html", + "https://www.us-cert.gov/ncas/alerts/TA18-201A", + 
"https://portswigger.net/daily-swig/emotet-trojan-implicated-in-wolverine-solutions-ransomware-attack", + "https://blog.trendmicro.com/trendlabs-security-intelligence/new-emotet-hijacks-windows-api-evades-sandbox-analysis/", + "https://blog.kryptoslogic.com/malware/2018/10/31/emotet-email-theft.html", + "http://blog.fortinet.com/2017/05/03/deep-analysis-of-new-emotet-variant-part-1", + "https://www.intezer.com/mitigating-emotet-the-most-common-banking-trojan/", + "https://maxkersten.nl/binary-analysis-course/malware-analysis/emotet-droppers/", + "https://research.checkpoint.com/emotet-tricky-trojan-git-clones/", + "https://www.cert.pl/en/news/single/analysis-of-emotet-v4/", + "https://www.symantec.com/blogs/threat-intelligence/evolution-emotet-trojan-distributor", + "https://www.crowdstrike.com/blog/meet-crowdstrikes-adversary-of-the-month-for-february-mummy-spider/", + "https://www.melani.admin.ch/melani/de/home/dokumentation/newsletter/Trojaner_Emotet_greift_Unternehmensnetzwerke_an.html", + "https://persianov.net/emotet-malware-analysis-part-1", + "https://persianov.net/emotet-malware-analysis-part-2", + "https://int0xcc.svbtle.com/dissecting-emotet-s-network-communication-protocol", + "https://blog.trendmicro.com/trendlabs-security-intelligence/exploring-emotet-examining-emotets-activities-infrastructure/", + "https://paste.cryptolaemus.com", + "https://cloudblogs.microsoft.com/microsoftsecure/2017/11/06/mitigating-and-eliminating-info-stealing-qakbot-and-emotet-in-corporate-networks/?source=mmpc", + "https://www.spamtitan.com/blog/emotet-malware-revives-old-email-conversations-threads-to-increase-infection-rates/", + "https://www.fidelissecurity.com/threatgeek/2017/07/emotet-takes-wing-spreader", + "https://securelist.com/analysis/publications/69560/the-banking-trojan-emotet-detailed-analysis/", + "https://feodotracker.abuse.ch/?filter=version_e", + "https://www.gdata.de/blog/2017/10/30110-emotet-beutet-outlook-aus", + 
"https://malfind.com/index.php/2018/07/23/deobfuscating-emotets-powershell-payload/", + "https://medium.com/@0xd0cf11e/analyzing-emotet-with-ghidra-part-1-4da71a5c8d69" + ], + "synonyms": [ + "geodo", + "heodo" + ], + "type": [] + }, + "uuid": "054e50ca-aeec-428e-91a5-f45e4029a073", + "value": "emotet" + }, { "description": "FAM:emudbot", "meta": { diff --git a/avclass/data/misp/galaxy/avclass2.json b/misp/galaxy/avclass.json similarity index 87% rename from avclass/data/misp/galaxy/avclass2.json rename to misp/galaxy/avclass.json index 656826f..8a95d0b 100644 --- a/avclass/data/misp/galaxy/avclass2.json +++ b/misp/galaxy/avclass.json @@ -1,6 +1,6 @@ { "description": "A malware galaxy based on AvClass", - "icon": "", + "icon": "optin-monster", "name": "AvClass", "namespace": "misp", "type": "avclass", From 5e6dd1d8f2c45cb083229b9f6326c25fbb26e831 Mon Sep 17 00:00:00 2001 From: Jeffrey Gentes Date: Wed, 15 Feb 2023 20:44:26 -0500 Subject: [PATCH 34/36] Update based on malicialab master --- avclass/common.py | 23 ++++++- avclass/data/default.tagging | 91 ++++++++++++++++++++++++++- avclass/data/default.taxonomy | 27 ++++++-- avclass/labeler.py | 4 +- avclass/update.py | 6 +- examples/metadefender_sample.json | 1 + misp/cluster/avclass.json | 101 +++++++++++++++++++++++++++++- 7 files changed, 239 insertions(+), 14 deletions(-) create mode 100644 examples/metadefender_sample.json diff --git a/avclass/common.py b/avclass/common.py index 946f7ad..a6a439c 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -30,7 +30,6 @@ "Jiangmin", "Comodo", "GData", - "Avast", "Sophos", "BitDefenderTheta", "Alibaba", @@ -116,6 +115,12 @@ def __iter__(self): """ Iterator over the alphabetically sorted tags in the taxonomy """ return (t for t in sorted(self._tags)) + def is_hex(self, tag: AnyStr) -> bool: + # exclude generic hex tags like 004bc24a + return bool(re.search(r"\d", tag)) and bool( + re.fullmatch(r"[0-9a-fA-F]+", tag) + ) + def is_generic(self, tag: AnyStr) -> bool: """ 
Whether or not the input ``tag`` is generic @@ -748,6 +753,10 @@ def get_label_tags(self, label: AnyStr, hashes: Collection[AnyStr]) -> Set[AnySt if self.taxonomy.is_generic(token): continue + # Ignore hex tokens + if self.taxonomy.is_hex(token): + continue + # Apply tagging rule dst_l = self.translations.get_dst(token) if dst_l: @@ -829,6 +838,18 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]] return av_dict + def get_sample_vt_count(self, sample_info): + ''' Return number of detections for sample + in the provided AV whitelist (if any) ''' + if self.avs is None: + return len(sample_info.labels) + else: + cnt = 0 + for (av_name, label) in sample_info.labels: + if av_name in self.avs: + cnt += 1 + return cnt + @staticmethod def rank_tags( av_dict: Dict[AnyStr, List[AnyStr]], threshold: int = 1 diff --git a/avclass/data/default.tagging b/avclass/data/default.tagging index bbdaa98..7de3e8f 100644 --- a/avclass/data/default.tagging +++ b/avclass/data/default.tagging @@ -13,6 +13,7 @@ addisplay adware addrop adware adfltnet amonetize adgazele adgazelle +adhubllka deathransom adiwky airpush adknowledge adware adload adware @@ -26,14 +27,18 @@ adtrafficanalysis winkad adwareeorezo eorezo afoynq ksapp agemt domob +agensla agenttesla +agentesla agenttesla agewap opfake agile biige agilebinary biige agnsmit infectionads +ainslot blackshades airad airinstaller airadinstaller airinstaller airinstall airinstaller akan winwebsec +alienspy adwind allad airpush almanahe alman alureon tdss @@ -96,8 +101,14 @@ banloader rimod basebrid basebridge batteryd fakedoc batterydoctor fakedoc +bazaloader bazar +bazarldr bazar +bazarloader bazar +bazdor bazar +bazzarldr bazar bbridge basebridge bckdr backdoor +beacon cobaltstrike bean nandrobox bearshare bandoo beita beitaad @@ -113,6 +124,8 @@ bitminer bitcoinminer bjlog zegost bkdr backdoor blackice whiteice +bladabi bladabindi +blanajog spygate blic whiteice blocal vmvol blocker killsectool @@ -121,9 
+134,11 @@ botnet gidix bototer wapomi boxer fakeinst boxersms fakeinst +bozok bezigate braininst installbrain brantall installbrain brappware multiplug +breu darkkomet browsepulse browsefox browsermodifier multiplug browserplugin multiplug @@ -144,7 +159,9 @@ c2lop swizzor cabby dalexis caphaw shylock casonline casino +cassiopeia blackshades cawitt smsbot +cebruser cerberus ceeinject inject cellphonetrack mytrackp cellspy mobilespy @@ -158,6 +175,7 @@ chinky vobfus chydo pykspa cidox vundo cimag hiloti +cinarat quasar cinmeng cinmus citirevo vundo clemag cleaman @@ -170,7 +188,10 @@ clickspring purityscan clientconnect opencandy climap androrat clkpotato hotbar +cloudatlas neoreklami clspring purityscan +cobalt cobaltstrike +cobaltstr cobaltstrike cobbler focobers cobblerone focobers cobbleronea focobers @@ -183,6 +204,7 @@ coinminer miner coldfuson coldfusion collector autoins comet darkkomet +cometer cobaltstrike cometsys darkkomet cometsystems darkkomet condestil firseria @@ -198,6 +220,7 @@ cracktool tool crisis morcut crori crossrider crosate svpeng +crusis crysis crwind crusewind cryp packed crypt packed @@ -209,16 +232,19 @@ cryptinno installcore cryptodefense cryptodef cryptominer miner cryptor packed +cryptz rozena cson simbot ctblocker dalexis cudos fosniw cupi smssend +cybergate rebhip cybota cycbot cycler unruy dadmin downloadadmin dailer dialer dalamodo cossta damaged corrupted +darkcomet darkkomet darksnow whiteice datasetaler infosteal daytre upatre @@ -236,11 +262,14 @@ derdroi simbad desktoplightning cashon detroi detroie detroia detroie +dexcrypt mbrlock +dharma crysis dial dialer dialers dialer dialpass egroupdial dialplatform dialer didat dabom +dinihou jenxcus diple vobfus directdown directdownloader dizhi lecna @@ -309,13 +338,16 @@ droppr downloader dropr downloader duel loveletter dumobove hiddad +dunihi jenxcus duptwux lolbot +dwnld downloader dwnldr downloader dwonk pykspa easydl amonetize echiui invis ecsys mailcab egbii biige 
+egregorransom egregor egroup egroupdial eicar testvirus electron sytro @@ -327,6 +359,7 @@ emagsoftware smsreg email spam emailspy maistealer emerleox fujacks +emotetcrypt emotet emud emudbot encoder filecrypt ransomware encpk packed @@ -335,6 +368,7 @@ epicgames gamevance epicplay gamevance eqdrug equationdrug equation equationdrug +eregorcrypt egregor erop smssend escape laroux escop laroux @@ -382,6 +416,7 @@ fakeupdates gamex fakmod fakeapp fakromup soft32downloader faktvx fakeangry +fareitvb fareit farex fearso fastsave megasearch fastsaveapp megasearch @@ -427,6 +462,7 @@ freeandspy freespy freepds hotclip frogonal ginmaster fujack fujacks +fullscreen lockscreen funclub smssend funweb mywebsearch fynloski darkkomet @@ -478,6 +514,7 @@ gmeil gamex gnurbulf rungbu goidu oveead goldclick hiddad +goldeneye petya gonca gonesixty gone gonesixty gonfu droidkungfu @@ -503,6 +540,7 @@ hacyayu winwebsec hamob fakeflash hdusafe wapron helldoor hilldoor +hellokitty deathransom hellospy spyoo hiddenad hiddad hiddeninstall jsmshider @@ -518,8 +556,10 @@ homepage browsermodify hongtoutou adrd horse trojan hosts-modifier hostsmodify +houdini jenxcus hublo crytex huigezi hupigon +hworm jenxcus hype loadmoney hyteod kovter iadpush dowgin @@ -579,6 +619,7 @@ jedan kuguo jelbrus techsnab joke hoax joleee tedroo +jrat adwind juched griptolo kaka telman kanav alyak @@ -596,6 +637,8 @@ kibi ksapp kichhoat smsreg killav killsectool killfiles files +kitty deathransom +kittycrypt deathransom kituri placms kkrunchy krunchy klevate webprefix @@ -607,6 +650,8 @@ kometa rukometa kongfu droidkungfu kouto koutodoor koyotelab bandoo +kpotsteal kpot +kpotstealer kpot krademok darkkomet kranxpay mmarketpay krypt packed @@ -644,6 +689,8 @@ llond lardlond loadmoneyent loadmoney locker lockscreen locm locmg +loda nymeria +lodarat nymeria lohmys midia looked viking loorp wapomi @@ -673,6 +720,7 @@ malpe corrupted manalo laroux mandaph socks marketpay mmarketpay +maskit khalesi massmailer spam 
master masterkey maxplus zeroaccess @@ -724,7 +772,9 @@ morstar firseria morstars firseria mosky skymobi mostofate softomate +mozaakai bazar mplug multiplug +mrophine morphine msilobfuscator msil packed mspyonline mspy msteal maistealer @@ -735,6 +785,7 @@ muldrop downloader multibardown multibar multibardownloader multibar multiinstall vilsel +multipacked packed multipluggen multiplug musictoolbar bandoo mutibar multibar @@ -744,13 +795,19 @@ mw97 macro mytrack mytrackp nabucur virlock najin feejar +nancrat nanocore nandrob nandrobox +nanobot nanocore +negasteal agenttesla nemucod smsreg +neobar neoreklami neshuta neshta netboxserver netbox neteyes ipamor netfilter network netweird netwiredrc +netwire netwiredrc +netwired netwiredrc networm worm newyearl plankton nextup verti @@ -764,6 +821,8 @@ nimnul wapomi ninebox kuguo nioserv nocoma nisev nocoma +njrat bladabindi +noancooe nanocore nofear fearso nofer fearso noico zdtad @@ -808,6 +867,8 @@ optinstall ibryte optiuminstaller ibryte optixp optix optixpro optix +orcusrat orcus +orcusrot orcus osx mac osx32 mac otran vobfus @@ -843,7 +904,9 @@ perfectkeylogger perflogger perfkey perflogger perfloger perflogger perkele perkel +petr petya petrolan petrolin +petrwrap petya philis viking pigeon hupigon pigetrl lockscreen @@ -871,6 +934,7 @@ polipos cardserv polycryptt polycrypt polyransom virlock pony fareit +ponystealer fareit popeler firseria popov fakeinst popuppers soft32downloader @@ -915,6 +979,7 @@ qakbot qbot qhost hostsmodify qhosts hostsmodify qqrobber qqrob +quasarrat quasar qukart berbew qvod wapomi rabbhome fjcon @@ -932,6 +997,7 @@ ratab mamianune razel rasteal raziel rasteal recal mogap +recam netwiredrc recordpage browsefox redirector network reefwal kalfere @@ -941,7 +1007,12 @@ relevant relevantknowledge relik updtkiller remtasu xtrat renamer files +reposfxg trickbot reptilic reptilicus +rescoms remcos +revenge revetrat +revengerat revetrat +revet revetrat revtcp metasploit rimecud palevo risk 
grayware @@ -967,6 +1038,7 @@ rugo hotbar runitslf looper runonce chir runouce chir +ruyk ryuk safekidzone sakezon sahagent sahat saho wroba @@ -978,9 +1050,11 @@ salitystub sality salload sality salpack sality salrenmetie sality +samas samsam sambamedia softpulse sancmed sanctionedmedia sandrorat sandr +sasfis oficla saveshare megasearch scareware rogueware scavir fakeinst @@ -999,6 +1073,7 @@ securitydefender defmid securitytool tool secxplod securityxploded secxploded securityxploded +sekhmet egregor selfdel beebone sendpay shastrosms sensode zxshell @@ -1007,6 +1082,8 @@ serpip morto sethom hiddad sexxoo redmobile sexyclip smssend +shadebot blackshades +shakblades blackshades sharestar gappusin shell shellcode shellkode shellcode @@ -1023,6 +1100,7 @@ sinodo sinowal sintal plankton sirefef zeroaccess skanik smssend +skeeeyah avemaria skywiper flame slybdb blohi smabo adialer @@ -1055,6 +1133,9 @@ sndapps typstu sneakytrail installerex sniffer network sobot clientor +sodin revil +sodinokib revil +sodinokibi revil soft32down soft32downloader soft32download soft32downloader softbase softobase @@ -1113,6 +1194,7 @@ suspiciouspacker packed susppack packed sventore firseria swiftbrowse browsefox +swrort rozena system droidkungfu systemfix fakesysdef systemsecurity winwebsec @@ -1146,12 +1228,16 @@ tinbelog nandrobox tiny small tklocker lockscreen tonclank plankton +toobpug neoreklami toorch rootnik tophos stegvob torchmedia bandoo torpump winpump tovkater installmonster towelexploit towel +trick trickbot +trickbotcrypt trickbot +trickpak trickbot trj trojan trjdown downloader trojan trjndwnlder downloader trojan @@ -1229,6 +1315,7 @@ w2km macro w32 windows w64 windows w97m macro +wadhrama crysis wakeful cardserv wali wapomi walkfree kalfere @@ -1253,6 +1340,7 @@ websearch search webtoolbar toolbar wedownload soft32downloader weecnaw netwiredrc +weenloc lockscreen weiyi smforw whboy fujacks whistle whistlesoftware @@ -1262,6 +1350,7 @@ win windows win32 windows win64 
windows winge cardserv +winlock lockscreen winnt windows winsoft fosniw winsxsbot sfone @@ -1313,4 +1402,4 @@ zona zvuzona zpack packed zsone raden zwunzi zwangi -zybut shiz +zybut shiz \ No newline at end of file diff --git a/avclass/data/default.taxonomy b/avclass/data/default.taxonomy index 963b8da..2bb344e 100644 --- a/avclass/data/default.taxonomy +++ b/avclass/data/default.taxonomy @@ -43,6 +43,7 @@ BEH:spam BEH:tor BEH:vmdetect BEH:whatsapp +BEH:windef CLASS:apt CLASS:backdoor CLASS:bot @@ -116,6 +117,7 @@ FAM:allaple FAM:alman FAM:alyak FAM:amonetize +FAM:amphitryon FAM:androidarmour FAM:androidlost FAM:androrat @@ -166,15 +168,18 @@ FAM:berbew FAM:bertle FAM:betterad FAM:bettersurf +FAM:bezigate FAM:bgserv FAM:bicololo FAM:bifrose +FAM:bifrost FAM:biige FAM:binka FAM:bips FAM:birele FAM:bitrep FAM:blacklister +FAM:blackshades FAM:bladabindi FAM:blohi FAM:blueguard @@ -190,6 +195,7 @@ FAM:brontok FAM:browsefox FAM:bruad FAM:bublik +FAM:buhtrap FAM:bundlore FAM:buterat FAM:buzus @@ -275,6 +281,7 @@ FAM:dofoil FAM:dogowar FAM:domaiq FAM:domob +FAM:donoff FAM:dorfdo FAM:dorifel FAM:dorkbot @@ -373,6 +380,7 @@ FAM:fujacks FAM:gabas FAM:gabpath FAM:gamania +FAM:gamaredon FAM:gamarue FAM:gambler FAM:gamclk @@ -409,6 +417,7 @@ FAM:gonesixty FAM:goodnews FAM:goodor FAM:gootkit +FAM:gorgon FAM:gorillaprice FAM:gozi FAM:gpspy @@ -476,6 +485,7 @@ FAM:irtard FAM:itracker FAM:jayqa FAM:jeefo +FAM:jenxcus FAM:jfpush FAM:jiead FAM:jifake @@ -495,6 +505,7 @@ FAM:kasidet FAM:katrep FAM:kelihos FAM:kgbspy +FAM:khalesi FAM:kidlogger FAM:kimia FAM:kingroot @@ -507,6 +518,7 @@ FAM:koobface FAM:korgo FAM:koutodoor FAM:kovter +FAM:kpot FAM:krefel FAM:kronos FAM:ksapp @@ -536,6 +548,7 @@ FAM:loapi FAM:lockactivity FAM:locmg FAM:loic +FAM:lokibot FAM:lolbot FAM:lollipop FAM:loodos @@ -550,6 +563,7 @@ FAM:lucky FAM:lxasj FAM:lynep FAM:mabezat +FAM:macrobe FAM:magiccasino FAM:mailcab FAM:maistealer @@ -564,6 +578,7 @@ FAM:masplot FAM:masspr FAM:maxapp FAM:mazarbot +FAM:mbrlock 
FAM:mecor FAM:medfos FAM:mediafinder @@ -626,6 +641,7 @@ FAM:navbar FAM:nawiaiad FAM:necro FAM:necurs +FAM:nemim FAM:neoreklami FAM:neospy FAM:neshta @@ -640,9 +656,11 @@ FAM:nocoma FAM:notifyer FAM:nqshield FAM:nymaim +FAM:nymeria FAM:obtes FAM:ocikq FAM:odpa +FAM:oficla FAM:oimobi FAM:oivim FAM:oixal @@ -654,7 +672,7 @@ FAM:opencandy FAM:openinstall FAM:opfake FAM:optix -FAM:orcusrat +FAM:orcus FAM:outbrowse FAM:oveead FAM:paccy @@ -700,11 +718,11 @@ FAM:pushad FAM:pushe FAM:puxis FAM:pykspa -FAM:quasar FAM:qbot FAM:qexma FAM:qplus FAM:qqrob +FAM:quasar FAM:qumi FAM:quozha FAM:qushu @@ -734,6 +752,7 @@ FAM:reptilicus FAM:resharer FAM:reveton FAM:revetrat +FAM:revil FAM:revmob FAM:riltok FAM:rimod @@ -910,8 +929,8 @@ FAM:tracer FAM:tracker FAM:trackerfree FAM:trackplus -FAM:trickbot FAM:trclick +FAM:trickbot FAM:tridrongo FAM:troom FAM:truedownloader @@ -1205,4 +1224,4 @@ GEN:undef GEN:undefined GEN:unknown GEN:variant -GEN:website +GEN:website \ No newline at end of file diff --git a/avclass/labeler.py b/avclass/labeler.py index b0e362f..ae22a0d 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -213,8 +213,8 @@ def get_tokens(self, sample_info: NamedTuple): if self.av_labels.alias_detect: self.av_vender_tokens(tags) - # Compute VT_Count - vt_count = len(sample_info.labels) + # Compute VT_Count (using list of AV engines if provided) + vt_count = self.av_labels.get_sample_vt_count(sample_info) # Collect stats # TODO: should iterate once over tags, diff --git a/avclass/update.py b/avclass/update.py index 5adaf54..9615207 100644 --- a/avclass/update.py +++ b/avclass/update.py @@ -437,12 +437,12 @@ def main(): parser = argparse.ArgumentParser(description='Given a .alias file from the labeler, generates updates for the ' 'taxonomy, tagging, and expansion files.') - parser.add_argument('-alias', help='file to parse with alias from labeler which runs if -alias not present') + parser.add_argument('-alias', help='input file with alias from labeler. 
Mandatory.') - parser.add_argument('-n', help='Minimum number of times that a pair of tokes have been seen. Default: 20', + parser.add_argument('-n', help='Minimum number of times that a pair of tokens have been seen. Default: 20', type=int, default=20) - parser.add_argument('-t', help='Minimum percentage of times two tokens appear together. Default: 1.94', + parser.add_argument('-t', help='Minimum percentage of times two tokens appear together. Default: 0.94', type=float, default=0.94) parser.add_argument('-o', help='output prefix for files') diff --git a/examples/metadefender_sample.json b/examples/metadefender_sample.json new file mode 100644 index 0000000..0577345 --- /dev/null +++ b/examples/metadefender_sample.json @@ -0,0 +1 @@ +{"data_id": "49f8ca95f24a45ce9b7feb41b484e165", "dlp_info": {}, "extracted_files": {"files_extracted_count": 4, "files_in_archive": [{"data_id": "0dba93e893a64e42b2aad42996d52fb2", "detected_by": 0, "display_name": "reedmi.cvl", "file_size": 251124, "file_type": "application/vnd.rar", "file_type_description": "WinRAR Compressed Archive", "process_info": {"blocked_reason": "Encrypted Archive", "progress_percentage": 100, "result": "Blocked", "verdicts": ["Encrypted Archive"]}, "progress_percentage": 100, "scan_all_result_a": "Encrypted Archive", "scan_all_result_i": 12, "scanned_with": 29}, {"data_id": "220373b076e74ab09ae49b9879617b9b", "detected_by": 2, "display_name": "elp.bat", "file_size": 670, "file_type": "text/plain", "file_type_description": "ASCII Text", "process_info": {"blocked_reason": "Infected", "progress_percentage": 100, "result": "Blocked", "verdicts": ["Infected"]}, "progress_percentage": 100, "scan_all_result_a": "Infected", "scan_all_result_i": 1, "scanned_with": 29}, {"data_id": "b43c3a5ba47249a6b99632d9fb0563c5", "detected_by": 1, "display_name": "extraPFZ.exe", "file_size": 564896, "file_type": "application/x-dosexec", "file_type_description": "Executable File", "process_info": {"blocked_reason": "Infected", 
"progress_percentage": 100, "result": "Blocked", "verdicts": ["Infected"]}, "progress_percentage": 100, "scan_all_result_a": "Infected", "scan_all_result_i": 1, "scanned_with": 29}, {"data_id": "662a175c7783459b9afa28b1d33d1379", "detected_by": 0, "display_name": "svideo.vbs", "file_size": 81, "file_type": "text/plain", "file_type_description": "ASCII Text", "process_info": {"blocked_reason": "", "progress_percentage": 100, "result": "Allowed", "verdicts": ["No Threat Detected"]}, "progress_percentage": 100, "scan_all_result_a": "No Threat Detected", "scan_all_result_i": 0, "scanned_with": 29}], "first_index": 0, "page_size": 50, "total_extracted_files": 4, "worst_data_id": "49f8ca95f24a45ce9b7feb41b484e165"}, "file_info": {"display_name": "2c6110a76dda8da49195052fa561ab8b8278c02df400124e46d26d2df228b70b", "file_size": 988643, "file_type": "application/vnd.microsoft.portable-executable", "file_type_description": "Self-extracting Executable File", "md5": "33ca3e86d783234092e52369e1b6bb83", "sha1": "653ab54e15b01473943cd897ded24f742b0193c5", "sha256": "2c6110a76dda8da49195052fa561ab8b8278c02df400124e46d26d2df228b70b", "upload_timestamp": "2021-01-29T22:53:45.604Z"}, "process_info": {"blocked_reason": "Infected", "file_type_skipped_scan": false, "post_processing": {"actions_failed": "", "actions_ran": "", "converted_destination": "", "converted_to": "", "copy_move_destination": ""}, "processing_time": 20516, "profile": "File process", "progress_percentage": 100, "queue_time": 1219, "result": "Blocked", "user_agent": "", "username": "", "verdicts": ["Infected"]}, "scan_results": {"data_id": "49f8ca95f24a45ce9b7feb41b484e165", "last_file_scanned": "reedmi.cvl", "progress_percentage": 100, "scan_all_result_a": "Infected", "scan_all_result_i": 1, "scan_details": {"AegisLab": {"def_time": "2021-01-29T12:48:00.000Z", "eng_id": "aegislab_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 9, "threat_found": "", "wait_time": 1366}, "Ahnlab": {"def_time": 
"2021-01-30T00:00:00.000Z", "eng_id": "ahnlab_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 24, "threat_found": "Malware/Win32.Generic", "wait_time": 1351}, "Antiy": {"def_time": "2021-01-29T15:48:00.000Z", "eng_id": "antiy_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 20, "threat_found": "", "wait_time": 1355}, "Avira": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "avira_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 1, "threat_found": "TR/Drop.Agent.xlojg", "wait_time": 1374}, "BitDefender": {"def_time": "2021-01-29T13:19:00.000Z", "eng_id": "bitdefender_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 140, "threat_found": "Trojan.Dropper.ZME", "wait_time": 1501}, "ByteHero": {"def_time": "2021-01-27T00:00:00.000Z", "eng_id": "bytehero_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 680, "threat_found": "", "wait_time": 1352}, "ClamAV": {"def_time": "2021-01-28T07:28:06.000Z", "eng_id": "clamav_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 1125, "threat_found": "", "wait_time": 1438}, "Comodo": {"def_time": "2021-01-29T05:05:50.000Z", "eng_id": "comodo_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 26, "threat_found": "Malware", "wait_time": 1349}, "Cyren": {"def_time": "2021-01-29T14:35:00.000Z", "eng_id": "cyren_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 94, "threat_found": "", "wait_time": 1547}, "ESET": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "eset_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 50, "threat_found": "", "wait_time": 1544}, "Emsisoft": {"def_time": "2021-01-29T12:07:00.000Z", "eng_id": "emsisoft_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 1202, "threat_found": "Trojan.Dropper.ZME (B)", "wait_time": 1502}, "Filseclab": {"def_time": "2021-01-27T23:08:00.000Z", "eng_id": "filseclab_1_windows", "location": "local", "scan_result_i": 0, 
"scan_time": 411, "threat_found": "", "wait_time": 1527}, "Huorong": {"def_time": "2021-01-29T09:24:00.000Z", "eng_id": "huorong_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 260, "threat_found": "", "wait_time": 1584}, "Ikarus": {"def_time": "2021-01-29T13:13:30.000Z", "eng_id": "ikarus_1_windows", "location": "local", "scan_result_i": 3, "scan_time": 235, "threat_found": "The archive is password protected or the given password is invalid.", "wait_time": 1594}, "K7": {"def_time": "2021-01-29T11:16:00.000Z", "eng_id": "k7_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 12, "threat_found": "Trojan ( 005631561 )", "wait_time": 1363}, "McAfee": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "mcafee_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 61, "threat_found": "RDN/Dridex", "wait_time": 1549}, "NANOAV": {"def_time": "2021-01-29T11:38:00.000Z", "eng_id": "nano_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 28, "threat_found": "Trojan.Win32.Dridex.icipbk", "wait_time": 1519}, "NetGate": {"def_time": "2021-01-24T04:10:00.000Z", "eng_id": "netgate_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 64, "threat_found": "", "wait_time": 1561}, "Quick Heal": {"def_time": "2021-01-29T06:52:00.000Z", "eng_id": "quickheal_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 51, "threat_found": "Backdoor.Dridex", "wait_time": 1559}, "Sophos": {"def_time": "2021-01-29T00:12:00.000Z", "eng_id": "sophos_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 238, "threat_found": "", "wait_time": 1591}, "Symantec": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "symantec_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 21, "threat_found": "", "wait_time": 1464}, "TACHYON": {"def_time": "2021-01-29T00:00:00.000Z", "eng_id": "nprotect_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 76, "threat_found": "", "wait_time": 1549}, 
"TrendMicro": {"def_time": "2021-01-27T20:22:00.000Z", "eng_id": "trendmicro_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 1388, "threat_found": "", "wait_time": 1441}, "TrendMicro House Call": {"def_time": "2021-01-28T22:14:00.000Z", "eng_id": "trendmicrohousecall_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 1281, "threat_found": "", "wait_time": 1454}, "Vir.IT eXplorer": {"def_time": "2021-01-29T12:10:00.000Z", "eng_id": "viritexplorer_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 72, "threat_found": "", "wait_time": 1569}, "VirusBlokAda": {"def_time": "2021-01-29T08:04:00.000Z", "eng_id": "virusblokada_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 492, "threat_found": "", "wait_time": 1493}, "Windows Defender": {"def_time": "2021-01-29T07:07:36.000Z", "eng_id": "windowsdefender_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 760, "threat_found": "", "wait_time": 1334}, "Xvirus Personal Guard": {"def_time": "2021-01-28T05:47:00.000Z", "eng_id": "xviruspersonalguard_1_windows", "location": "local", "scan_result_i": 1, "scan_time": 825, "threat_found": "Suspicious:NewThreat.179", "wait_time": 1363}, "Zillya!": {"def_time": "2021-01-28T07:07:00.000Z", "eng_id": "zillya_1_windows", "location": "local", "scan_result_i": 0, "scan_time": 10, "threat_found": "", "wait_time": 1475}}, "start_time": "2021-01-29T22:53:46.823Z", "total_avs": 29, "total_time": 20516}, "vulnerability_info": {}, "yara_info": {}} diff --git a/misp/cluster/avclass.json b/misp/cluster/avclass.json index 933e133..7bf7521 100644 --- a/misp/cluster/avclass.json +++ b/misp/cluster/avclass.json @@ -2730,6 +2730,7 @@ "malob", "malpack", "msilobfuscator", + "msilkrypt", "nsanti", "obfus", "obfusc", @@ -2737,6 +2738,7 @@ "obfuscated", "obfuscator", "pakes", + "packer", "suspiciouspacker", "susppack", "vbinder", @@ -3453,7 +3455,9 @@ "description": "FILE:proglang:autoit", "meta": { "refs": [], - "synonyms": 
[], + "synonyms": [ + "autoitscript" + ], "type": [] }, "uuid": "e16e2760-e497-3e39-9ca2-68a23ccd2b4f", @@ -5412,6 +5416,19 @@ "uuid": "0c6ba93f-a1bc-3e31-bf0e-ffab207c80f8", "value": "loic" }, + { + "description": "FAM:lokibot", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/apk.lokibot", + "https://www.threatfabric.com/blogs/lokibot_the_first_hybrid_android_malware.html" + ], + "synonyms": [], + "type": [] + }, + "uuid": "14b91559-69a4-4f1c-aeac-346be227d08d", + "value": "lokibot" + }, { "description": "FAM:lolbot", "meta": { @@ -8793,6 +8810,23 @@ "uuid": "0e0ea1ba-65d6-3132-b36c-48cd50ca03cd", "value": "smszombie" }, + { + "description": "FAM:snatch", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.snatch" + ], + "synonyms": [] + }, + "related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "5a5cdf0a-ead8-4399-848a-44a9a48cb237", + "value": "snatch" + }, { "description": "FAM:snowfox", "meta": { @@ -8817,6 +8851,28 @@ "uuid": "ab23cc7e-2a93-36e8-a0b7-91bf87e0a142", "value": "socks" }, + { + "description": "FAM:revil", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.revil", + "https://blog.talosintelligence.com/2019/04/sodinokibi-ransomware-exploits-weblogic.html" + ], + "synonyms": [ + "sodinokibi", + "sodin", + "sodinoransom" + ] + }, + "related": [ + { + "dest-uuid": "a8f4f4fa-1558-3f16-ac56-bce903a83b22", + "type": "variant-of" + } + ], + "uuid": "a3d6c162-a51c-4b25-a0cd-7d89037140a3", + "value": "revil" + }, { "description": "FAM:soft32downloader", "meta": { @@ -8968,6 +9024,7 @@ "meta": { "refs": [], "synonyms": [ + "autoruns", "autoruner", "autorunerent" ], @@ -9165,6 +9222,7 @@ "synonyms": [ "banker", "bitstealer", + "cookiesstealer", "datastealer", "discostealer", "delfsnif", @@ -9172,6 +9230,7 @@ "infostealer", "monitor", "passwordstealera", + "poscardstealer", "pswtool", "pwsteal", "pwstealer", @@ 
-10542,6 +10601,19 @@ "uuid": "5e9086ac-ea73-306b-8faf-fd96bd20e8a2", "value": "amonetize" }, + { + "description": "FAM:ammyy", + "meta": { + "refs": [], + "synonyms": [ + "fakeammyy", + "ammyyadmin" + ], + "type": [] + }, + "uuid": "4dc7b45d-cff8-447a-97a6-2181f7b7773d", + "value": "ammyy" + }, { "description": "FAM:androidarmour", "meta": { @@ -11006,7 +11078,9 @@ "bazarcall", "kegtap", "team9backdoor", - "bazaloader" + "bazaloader", + "bazarloader", + "bazloader" ], "type": [] }, @@ -11412,6 +11486,26 @@ "uuid": "cd7498b7-68c7-34ba-bdfd-9381aa879adb", "value": "bublik" }, + { + "description": "FAM:buhtrap", + "meta": { + "refs": [ + "https://malpedia.caad.fkie.fraunhofer.de/details/win.buhtrap", + "https://malware-research.org/carbanak-source-code-leaked/", + "https://www.symantec.com/connect/blogs/russian-bank-employees-received-fake-job-offers-targeted-email-attack", + "https://www.welivesecurity.com/2015/04/09/operation-buhtrap/", + "https://www.group-ib.com/brochures/gib-buhtrap-report.pdf", + "https://www.arbornetworks.com/blog/asert/diving-buhtrap-banking-trojan-activity/", + "https://blog.dcso.de/pegasus-buhtrap-analysis-of-the-malware-stage-based-on-the-leaked-source-code/" + ], + "synonyms": [ + "ratopak" + ], + "type": [] + }, + "uuid": "71e031ee-50e8-46a8-bda8-c4ae9c0012de", + "value": "buhtrap" + }, { "description": "FAM:bundlore", "meta": { @@ -13127,6 +13221,7 @@ "fakebrows", "fakeicq", "fakeinstall", + "fakeinstaller", "fakeinsthw", "fakeinstsms", "fodeg", @@ -13854,5 +13949,5 @@ "value": "geinimi" } ], - "version": 0.1 + "version": 0.2 } \ No newline at end of file From 2039d304d8026f97d10b54d6e41645a608806215 Mon Sep 17 00:00:00 2001 From: Jeffrey Gentes Date: Fri, 17 Feb 2023 22:20:54 -0500 Subject: [PATCH 35/36] tagging and taxonomy updates --- avclass/data/default.tagging | 166 ++++++++++++++++++++++++---------- avclass/data/default.taxonomy | 13 ++- 2 files changed, 130 insertions(+), 49 deletions(-) diff --git a/avclass/data/default.tagging 
b/avclass/data/default.tagging index 7de3e8f..e983863 100644 --- a/avclass/data/default.tagging +++ b/avclass/data/default.tagging @@ -43,6 +43,7 @@ allad airpush almanahe alman alureon tdss amab mobidash +ammyyadmin ammyy amorba ipamor andef fkdefend andr android @@ -77,14 +78,19 @@ arcadeparlor gamevance arcadeweb gamevance archsms smshoax arcparlor gamevance +arena crysis armour androidarmour arto renos artro renos aservicea kuguo +autohotkey autohk +autoitscript autoit autokms winactivator -autoruner autorun vobfus -autorunerent autorun palevo +autoruner autorun +autorunerent autorun +autoruns autorun avalod sinowal +ave_maria avemaria aveasms smskey avkill killsectool bacteraloh sality @@ -102,15 +108,19 @@ basebrid basebridge batteryd fakedoc batterydoctor fakedoc bazaloader bazar +bazarbackdoor bazar +bazarcall bazar bazarldr bazar bazarloader bazar bazdor bazar +bazloader bazar bazzarldr bazar bbridge basebridge bckdr backdoor beacon cobaltstrike bean nandrobox bearshare bandoo +beerbot bazar beita beitaad bergat xtrat bertlea bertle @@ -119,12 +129,15 @@ betterinstaller somoto bflient palevo bibean faketimer biez loadmoney +binder packed bitcoin bitcoinminer bitminer bitcoinminer +bitstealer infosteal bjlog zegost bkdr backdoor blackice whiteice bladabi bladabindi +bladabindinet bladabindi blanajog spygate blic whiteice blocal vmvol @@ -190,6 +203,7 @@ climap androrat clkpotato hotbar cloudatlas neoreklami clspring purityscan +cmkfkw 5wfo cobalt cobaltstrike cobaltstr cobaltstrike cobbler focobers @@ -208,14 +222,17 @@ cometer cobaltstrike cometsys darkkomet cometsystems darkkomet condestil firseria +confuserex confuser contrand sckeylog controlrandom sckeylog +cookiesstealer infosteal coolpaperleak coolwall copycat airpush corrupt corrupted cosha lovetrap counterclank plankton crack tool +cracks wzteam cracktool tool crisis morcut crori crossrider @@ -231,6 +248,7 @@ cryptic packed cryptinno installcore cryptodefense cryptodef cryptominer miner 
+cryptoobfuscator packed cryptor packed cryptz rozena cson simbot @@ -247,17 +265,18 @@ damaged corrupted darkcomet darkkomet darksnow whiteice datasetaler infosteal +datastealer infosteal daytre upatre ddlight droiddreamlight dealcabby adpeak debris gamarue delf delphi delfiles filedelete -delfinject delphi inject -delfloader delphi downloader -delfsnif delphi infosteal -delpbanc delphi infosteal -delpdldr delphi downloader +delfinject delphi +delfloader delphi +delfsnif delphi +delpbanc delphi +delpdldr delphi derdroi simbad desktoplightning cashon detroi detroie @@ -269,9 +288,11 @@ dialers dialer dialpass egroupdial dialplatform dialer didat dabom -dinihou jenxcus +dinihou houdini diple vobfus directdown directdownloader +discostealer infosteal +diskcoder filecrypt dizhi lecna dldr downloader dldrop downloader @@ -281,10 +302,11 @@ dloader downloader dloadr downloader dloadware adware dnschanger dnsmodify -docdl downloader msoffice -docdrop downloader msoffice -docdrp downloader msoffice +docdl downloader +docdrop downloader +docdrp downloader dogbite dogowar +doghousepower shelma dogwar dogowar doidroot rooter domainiq domaiq @@ -338,7 +360,7 @@ droppr downloader dropr downloader duel loveletter dumobove hiddad -dunihi jenxcus +dunihi houdini duptwux lolbot dwnld downloader dwnldr downloader @@ -361,9 +383,10 @@ emailspy maistealer emerleox fujacks emotetcrypt emotet emud emudbot -encoder filecrypt ransomware +encoder filecrypt encpk packed engwings cardserv +enigmaprotector enigma epicgames gamevance epicplay gamevance eqdrug equationdrug @@ -386,6 +409,7 @@ extrat xtrat eydrop dinwod fakapp styricka fakealert rogueware +fakeammyy ammyy fakeav rogueware fakebattscar fakedoc fakebrows fakeinst @@ -395,6 +419,7 @@ fakedefender fkdefend fakefldr fakefolder fakeicq fakeinst fakeinstall fakeinst +fakeinstaller fakeinst fakeinsthw fakeinst fakeinstsms fakeinst fakejoboffer fakejob @@ -402,7 +427,7 @@ fakelogosms fakelogo fakelt elite fakemini opfake fakemms 
fakeplayer -fakems fakepublisher signed +fakems fakepublisher fakengry fakeangry fakenotify opfake fakeplay fakeplayer @@ -426,7 +451,7 @@ fenomen fenomengame fenomengamet fenomengame fenservice fengvi fidgo opfake -filecoder filecrypt ransomware +filecoder filecrypt filehunter winpump fileinfector infector filesearch amonetize @@ -455,12 +480,14 @@ fokonge droidkungfu foncysms foncy foran anforen fraud rogueware -fraudload downloader rogueware +fraudload downloader fraudtool tool freeandroidspy freespy freeandspy freespy freepds hotclip frogonal ginmaster +frutas adwind +fuerboos goodor fujack fujacks fullscreen lockscreen funclub smssend @@ -476,6 +503,7 @@ gamevancecs gamevance gampass gamethief ganelp griptolo gaobot agobot +gaslome loosemaque gasms gambler gastab gabas gavir viking @@ -489,8 +517,10 @@ geksone crytex gemest smishing genericab wroba genericgb basebridge +genkryptik packed genpack packed gentroj trojan +geodo emotet gepat airpush getextension eorezo getfaster 4shared @@ -542,6 +572,7 @@ hdusafe wapron helldoor hilldoor hellokitty deathransom hellospy spyoo +heodo emotet hiddenad hiddad hiddeninstall jsmshider hidrag jeefo @@ -556,10 +587,9 @@ homepage browsermodify hongtoutou adrd horse trojan hosts-modifier hostsmodify -houdini jenxcus hublo crytex huigezi hupigon -hworm jenxcus +hworm houdini hype loadmoney hyteod kovter iadpush dowgin @@ -609,17 +639,20 @@ intex intexdial intexus intexdial invader daws ipatre upatre -ircbot bot irc +ircbot bot ispyoo spyoo j2me java jackpos jinupd jadtre wapomi javak suggestor +jbifrost adwind jedan kuguo jelbrus techsnab +jenxcus houdini joke hoax joleee tedroo jrat adwind +jsocket adwind juched griptolo kaka telman kanav alyak @@ -627,6 +660,7 @@ kasandra sandr kashu sality kazaa benjamin keepmusic hiddad +kegtap bazar keji basebridge kelvin smssend kernelpatch geral @@ -689,8 +723,7 @@ llond lardlond loadmoneyent loadmoney locker lockscreen locm locmg -loda nymeria -lodarat nymeria +lodarat loda lohmys 
midia looked viking loorp wapomi @@ -700,7 +733,7 @@ lotuseed lotusid lower airpush lozfoon loozfon macosx mac -macrodown downloader macro +macrodown downloader madanf virut madang virut madangel virut @@ -760,6 +793,7 @@ mobkong smssend mobspy trackplus mobsqueeze fakedoc mofksys swisyn +mohazo raccoon monad damon monderb vundo monitor infosteal @@ -775,7 +809,8 @@ mostofate softomate mozaakai bazar mplug multiplug mrophine morphine -msilobfuscator msil packed +msilkrypt packed +msilobfuscator msil mspyonline mspy msteal maistealer mswdm ipamor @@ -798,6 +833,7 @@ najin feejar nancrat nanocore nandrob nandrobox nanobot nanocore +ncov crysis negasteal agenttesla nemucod smsreg neobar neoreklami @@ -836,6 +872,7 @@ nsanti packed nuwar tibs nyearleaker airpush nyleaker airpush +nymeria loda o97m macro obfus packed obfusc packed @@ -867,8 +904,8 @@ optinstall ibryte optiuminstaller ibryte optixp optix optixpro optix -orcusrat orcus -orcusrot orcus +orcus orcusrat +orcusrot orcusrat osx mac osx32 mac otran vobfus @@ -878,13 +915,16 @@ overt sadenav overtls sadenav ozotshielder kmin pace socks +packer packed padobot korgo padodor berbew pakes packed panda zbot pandaent zbot pandora nandrobox +papras gozi parnian smssend +passwordstealera infosteal patch filemodify patched filemodify patcher filemodify @@ -908,6 +948,7 @@ petr petya petrolan petrolin petrwrap petya philis viking +phobos crysis pigeon hupigon pigetrl lockscreen pikor wapomi @@ -942,12 +983,14 @@ porn porndialer porndial porndialer pornlocker lockscreen portscan network +poscardstealer infosteal positivefinds browsefox positmob fakeinst potentially grayware poweliks wowlik powerliks wowlik powerpack linkular +powerstats valyria powessere wowlik pp97m macro preloader megasearch @@ -962,6 +1005,7 @@ protil wapomi provar fakeinst pswtool infosteal pua grayware +puffstealer azorult pup grayware pupil plemood purity purityscan @@ -979,23 +1023,29 @@ qakbot qbot qhost hostsmodify qhosts hostsmodify qqrobber 
qqrob +quasar_rat quasar quasarrat quasar qukart berbew qvod wapomi rabbhome fjcon rabidog dogowar +racealer raccoon +racoon raccoon rahack allaple rahiwi brontok raideloz vobfus ramdo redyms ranck ranky +randaev exrand ransom ransomware -ransomcrypt filecrypt ransomware -ransomlock lockscreen ransomware +ransomcrypt filecrypt +ransomlock lockscreen rapiddown firseria ratab mamianune +ratopak buhtrap razel rasteal raziel rasteal +rdpdos rdpkill recal mogap recam netwiredrc recordpage browsefox @@ -1005,7 +1055,9 @@ refogkeylogger refog regie fosniw relevant relevantknowledge relik updtkiller +remcosrat remcos remtasu xtrat +remvio remcos renamer files reposfxg trickbot reptilic reptilicus @@ -1016,7 +1068,7 @@ revet revetrat revtcp metasploit rimecud palevo risk grayware -risktool grayware tool +risktool grayware riskware grayware rivalgame gamevance rkdoor koutodoor @@ -1035,6 +1087,7 @@ ropin leadbolt rorpian zeroaccess ruftar usteal rugo hotbar +rultazo azorult runitslf looper runonce chir runouce chir @@ -1132,10 +1185,14 @@ snadapps typstu sndapps typstu sneakytrail installerex sniffer network +snifula gozi sobot clientor +sockrat adwind +socmer remcos sodin revil sodinokib revil sodinokibi revil +sodinoransom revil soft32down soft32downloader soft32download soft32downloader softbase softobase @@ -1155,7 +1212,7 @@ spacer unruy spakrab vidro spambot spam spammer spam -spamtool spam tool +spamtool spam spatet rebhip spdupmypc speedingupmypc speedupmypc uniblue @@ -1201,6 +1258,7 @@ systemsecurity winwebsec systex daws systro sytro sysvenfak loadmoney +talalpek gootkit talklog talkw taojin taojinstar tapsnake gpspy @@ -1209,11 +1267,13 @@ tatus tetus tazebama mabezat tdownloader installerex tdssrt tdss +team9backdoor bazar tedro tedroo temai ksapp tepfer fareit test testvirus testfile testvirus +thetrick trickbot tibspak tibs tibspk tibs tibsys tibser @@ -1237,26 +1297,29 @@ tovkater installmonster towelexploit towel trick trickbot trickbotcrypt trickbot 
+trickloader trickbot trickpak trickbot +trickster trickbot trj trojan -trjdown downloader trojan -trjndwnlder downloader trojan +trjdown downloader +trjndwnlder downloader troj trojan -trojanapt apt trojan -trojanbanker infosteal trojan -trojanclicker adware clicker trojan -trojandldr downloader trojan -trojandownloader downloader trojan -trojandropper downloader trojan -trojandwnldr downloader trojan -trojanfakeav alertuser rogueware trojan +trojanapt apt +trojanbanker infosteal +trojanclicker adware +trojandldr downloader +trojandownloader downloader +trojandropper downloader +trojandwnldr downloader +trojanfakeav alertuser trojanhorse trojan -trojanproxy proxy trojan -trojanpsw infosteal trojan -trojanransom filecrypt ransomware trojan -trojansms sms trojan -trojanspy spyware trojan +trojanproxy proxy +trojanpsw infosteal +trojanransom filecrypt +trojansms sms +trojanspy spyware trojware trojan +trollster kefamad truedown truedownloader tsuploader installerex tufei tufik @@ -1274,24 +1337,27 @@ ultradownload vilsel ultradownloads vilsel umeng gumen unix linux +unrecom adwind unsafe grayware unwanted grayware unwnt grayware updatekiller updtkiller updtkill updtkiller uracto maistealer +ursnif gozi uuser uuserv uxipp yzhc valhalla xorala valla xorala vbccrypt vobfus -vbcrypt packed visualbasic -vbinject inject visualbasic -vbkrypt packed visualbasic +vbcrypt packed +vbinder packed +vbinject inject +vbkrypt packed vbna vobfus vbobf vobfus vbobfus vobfus -vbpack packed visualbasic +vbpack packed vernet dusvext vertex dusvext vertexb dusvext @@ -1305,6 +1371,7 @@ virtob virut virtool tool vitallia vittalia vjadtre wapomi +vjw0rm vjworm vmdetector vmdetect vmpbad vmprotect vnfraye dusvext @@ -1315,8 +1382,10 @@ w2km macro w32 windows w64 windows w97m macro +wacatac deathransom wadhrama crysis wakeful cardserv +waldek gootkit wali wapomi walkfree kalfere walksteal walkinwat @@ -1371,6 +1440,7 @@ xloader wroba xpack packed xpiro expiro xsider jsmshider +xswkit gootkit 
xtoober karagany xtreme xtrat xworm loveletter diff --git a/avclass/data/default.taxonomy b/avclass/data/default.taxonomy index 2bb344e..8404f50 100644 --- a/avclass/data/default.taxonomy +++ b/avclass/data/default.taxonomy @@ -78,6 +78,7 @@ CLASS:worm CLASS:worm:emailworm FAM:1clickdownload FAM:4shared +FAM:5wfo FAM:abeciv FAM:accutrack FAM:acecard @@ -116,6 +117,7 @@ FAM:aliyuncs FAM:allaple FAM:alman FAM:alyak +FAM:ammyy FAM:amonetize FAM:amphitryon FAM:androidarmour @@ -323,6 +325,7 @@ FAM:etumbot FAM:ewind FAM:expiro FAM:expressdownloader +FAM:exrand FAM:faceniff FAM:fakeangry FAM:fakeapp @@ -437,6 +440,7 @@ FAM:haynu FAM:hero FAM:hiddad FAM:hiddenapp +FAM:hiddentear FAM:hiddnad FAM:highster FAM:hilldoor @@ -503,6 +507,7 @@ FAM:kapratect FAM:karagany FAM:kasidet FAM:katrep +FAM:kefamad FAM:kelihos FAM:kgbspy FAM:khalesi @@ -547,12 +552,14 @@ FAM:loadmoney FAM:loapi FAM:lockactivity FAM:locmg +FAM:loda FAM:loic FAM:lokibot FAM:lolbot FAM:lollipop FAM:loodos FAM:looper +FAM:loosemaque FAM:loozfon FAM:lotusid FAM:lovefraud @@ -673,6 +680,7 @@ FAM:openinstall FAM:opfake FAM:optix FAM:orcus +FAM:orcusrat FAM:outbrowse FAM:oveead FAM:paccy @@ -734,6 +742,7 @@ FAM:rasteal FAM:razam FAM:razy FAM:rbot +FAM:rdpkill FAM:rebhip FAM:recmads FAM:redalert @@ -838,6 +847,7 @@ FAM:smssend FAM:smsspy FAM:smsthief FAM:smszombie +FAM:snatch FAM:snowfox FAM:socks FAM:soft32downloader @@ -882,9 +892,9 @@ FAM:suaban FAM:suggestor FAM:supking FAM:svpeng -FAM:swrort FAM:swisyn FAM:swizzor +FAM:swrort FAM:systemmonitor FAM:systush FAM:sytro @@ -1006,6 +1016,7 @@ FAM:wowlik FAM:wqmobile FAM:wroba FAM:wtaspin +FAM:wzteam FAM:xavierad FAM:xinhua FAM:xolco From 2f013f89da93011f96e926fc3840ff91bf85697a Mon Sep 17 00:00:00 2001 From: Jeffrey Gentes Date: Tue, 21 Feb 2023 22:10:40 -0500 Subject: [PATCH 36/36] Allow AVLabels to be init with Classes --- avclass/common.py | 54 ++++++++++++++++++++++++++++------------------ avclass/labeler.py | 10 ++++----- 2 files changed, 38 insertions(+), 26 
deletions(-) diff --git a/avclass/common.py b/avclass/common.py index a6a439c..0ae4436 100755 --- a/avclass/common.py +++ b/avclass/common.py @@ -4,7 +4,6 @@ import string import sys -from avclass import util from collections import defaultdict, namedtuple from typing import AnyStr, Callable, Collection, Dict, List, Optional, Set, Tuple, Union @@ -44,7 +43,7 @@ class Tag: - """ A Tag in the taxonomy """ + """A Tag in the taxonomy""" def __init__(self, s): word_list = s.strip().split(":") @@ -63,27 +62,27 @@ def __init__(self, s): self._path = self._name def __hash__(self): - """ Return hash """ + """Return hash""" return hash((self._path)) @property def name(self): - """ Return tag name """ + """Return tag name""" return self._name @property def cat(self): - """ Return tag category """ + """Return tag category""" return self._cat @property def path(self): - """ Return tag path """ + """Return tag path""" return self._path @property def prefix_l(self): - """ Return tag prefix list """ + """Return tag prefix list""" return self._prefix_l @@ -112,14 +111,18 @@ def __len__(self) -> int: return len(self._tags) def __iter__(self): - """ Iterator over the alphabetically sorted tags in the taxonomy """ + """Iterator over the alphabetically sorted tags in the taxonomy""" return (t for t in sorted(self._tags)) def is_hex(self, tag: AnyStr) -> bool: - # exclude generic hex tags like 004bc24a - return bool(re.search(r"\d", tag)) and bool( - re.fullmatch(r"[0-9a-fA-F]+", tag) - ) + """ + Whether or not the input ``tag`` is hex + Exclude generic hex tags like 004bc24a + + :param tag: The tag + :return: Boolean + """ + return bool(re.search(r"\d", tag)) and bool(re.fullmatch(r"[0-9a-fA-F]+", tag)) def is_generic(self, tag: AnyStr) -> bool: """ @@ -512,22 +515,31 @@ def validate(self, taxonomy: Taxonomy): # TODO - raise or return False? 
-class AvLabels: +class AVLabels: """ Primary class used to interpret AV Labels """ def __init__( self, - tag_file: AnyStr = util.DEFAULT_TAG_PATH, - exp_file: AnyStr = util.DEFAULT_EXP_PATH, - tax_file: AnyStr = util.DEFAULT_TAX_PATH, + translations: Union[AnyStr, Translation] = None, + expansions: Union[AnyStr, Expansion] = None, + taxonomy: Union[AnyStr, Taxonomy] = None, av_file: AnyStr = None, alias_detect: bool = False, ): - self.taxonomy = Taxonomy(tax_file) - self.translations = Translation(tag_file) - self.expansions = Expansion(exp_file) + if isinstance(taxonomy, Taxonomy): + self.taxonomy = taxonomy + else: + self.taxonomy = Taxonomy(taxonomy) + if isinstance(translations, Translation): + self.translations = translations + else: + self.translations = Translation(translations) + if isinstance(expansions, Expansion): + self.expansions = expansions + else: + self.expansions = Expansion(expansions) self.avs = self.read_avs(av_file) if av_file else None # Alias statistics initialization self.alias_detect = alias_detect @@ -839,8 +851,8 @@ def get_sample_tags(self, sample_info: SampleInfo) -> Dict[AnyStr, List[AnyStr]] return av_dict def get_sample_vt_count(self, sample_info): - ''' Return number of detections for sample - in the provided AV whitelist (if any) ''' + """Return number of detections for sample + in the provided AV whitelist (if any)""" if self.avs is None: return len(sample_info.labels) else: diff --git a/avclass/labeler.py b/avclass/labeler.py index ae22a0d..784e220 100755 --- a/avclass/labeler.py +++ b/avclass/labeler.py @@ -12,17 +12,17 @@ from typing import AnyStr, Dict, List, NamedTuple, Optional, Tuple, Union try: - from avclass.common import AvLabels, Taxonomy + from avclass.common import AVLabels, Taxonomy from avclass import clustering as ec, util except ModuleNotFoundError: # Helps find the avclasses when run from console sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - from avclass.common import AvLabels, 
Taxonomy + from avclass.common import AVLabels, Taxonomy from avclass import clustering as ec, util class AVClassLabeler: - def __init__(self, av_labels: AvLabels = AvLabels()): + def __init__(self, av_labels: AVLabels = AVLabels()): self.av_labels = av_labels self.output = {"labels": []} self.hash_type = None @@ -645,8 +645,8 @@ def print_output(self, output: AnyStr = ""): def main(): args = parse_args() - # Create AvLabels object - av_labels = AvLabels( + # Create AVLabels object + av_labels = AVLabels( tag_file=args.tag, tax_file=args.tax, exp_file=args.exp,