diff --git a/.gitignore b/.gitignore index 0d20b64..d004991 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,132 @@ -*.pyc +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# TODO +TODO* diff --git a/EVALUATION.md b/EVALUATION.md index 50bc0d6..667afa7 100644 --- a/EVALUATION.md +++ b/EVALUATION.md @@ -11,6 +11,16 @@ This page is a work in progress to explain common errors that we have observed, so that they can be avoided. We'll try to update this page as we observe new issues. +Here is the quick summary if you are in a hurry: + +1. AVClass2 has superseded AVClass, so your evaluation should include AVClass2, not only the original AVClass. +2. For malware labeling, please use AVClass2 compatibility mode (-c command line option). +3. Tagging more samples is not an evaluation goal by itself, the tags need to be accurate. For example, it is known that allowing tags from a single AV engine or ignoring generic tags will enable tagging more samples, but it will introduce incorrect tags. +4. You need ground truth to evaluate the accuracy/precision/recall of AVClass/AVClass2. +5. You should also evaluate scalability (runtime and memory usage) since that is a major design goal of AVClass/AVClass2. +6. Note that AVClass2 and AVClass may not provide the same family tag for all samples when run on the same AV labels. +7. AVClass2/AVClass are not malware detection tools, please do not try to evaluate them for that scenario. + ## Which tool should I evaluate and compare with? You should evaluate and compare with AVClass2. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..8747980 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include avclass2/data/* diff --git a/README.md b/README.md index dcda698..668d5e2 100644 --- a/README.md +++ b/README.md @@ -1,218 +1,441 @@ -# AVClass and AVClass2 +# AVClass + +AVClass is a Python package and command line tool to tag / label +malware samples. +You input the AV labels for a large number of malware samples +(e.g., VirusTotal JSON reports) +and it outputs tags extracted from the AV labels of each sample. + +AVClass can output the most likely family name for each sample, +as well as the list of all tags identified, +ranked by decreasing popularity. +Beyond family names, tags can capture +the malware class (e.g., *worm*, *ransomware*, *grayware*), +behaviors (e.g., *spam*, *ddos*), and +file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). + +If you are wondering if this is AVClass or AVClass2, +the answer is this is the right repository in both cases. +The old AVClass code was deprecated and AVClass2 was renamed as AVClass. +A longer explanation is below. + +## Installation +```shell +pip install avclass2 +``` -AVClass and AVClass2 are Python tools to tag / label malware samples. -You give them as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) -and they output tags extracted from the AV labels of each sample. -The original AVClass only outputs family names (i.e., family tags). -By default, it outputs the most likely family for each sample (e.g., *zbot*, *virut*). -It can also output a ranking of all alternative family names it found for each sample. -The newer AVClass2, in addition to family names, also outputs other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). +## Examples -A quick example helps illustrating the differences. 
If you run AVClass2 on our example input file: +To extract all tags for each sample run: ```shell -$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p +avclass -vt examples/vtv2_sample.json -p ``` -the output on stdout is: +the output on stdout will be: ``` -aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 -67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 +602695c8f2ad76564bddcaf47b76edff 52 FAM:zeroaccess|19,FILE:os:windows|16,BEH:server|8,CLASS:backdoor|8,FILE:packed|7 +f117cc1477513cb181cc2e9fcaab39b2 39 CLASS:rogueware|15,BEH:alertuser|15,FILE:os:windows|11,FAM:winwebsec|4,CLASS:grayware|4,CLASS:grayware:tool|3,FILE:packed|3 ``` -which means sample *aca2d12934935b070df8f50e06a20539* -was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, -8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, -3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. -Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them -consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. - -If you instead run AVClass on the same input file: +which means sample *602695c8f2ad76564bddcaf47b76edff* +was flagged by 52 AV engines and that +19 of them agree it is from the *zeroaccess* family, +16 that it runs on *windows*, +8 that it is a *backdoor*, and +7 that it is a *packed* file. +Sample *f117cc1477513cb181cc2e9fcaab39b2* is flagged by 39 AV engines and +15 of them mention its class to be *rogueware*, +15 mention that it has the *alertuser* behavior, +11 that it runs on *windows*, +4 that it belongs to the *winwebsec* family, +and so on. 
+ +Most users will be interested in obtaining the most likely family name +for each sample and may not care about other tags. +For that you can use the compatibility _-c_ option: ```shell -$./avclass/avclass_labeler.py -lb examples/malheurReference_lb.json +avclass -vt examples/vtv2_sample.json -c ``` -the output looks like this: +the output on stdout will be: ``` -aca2d12934935b070df8f50e06a20539 adrotator -67d15459e1f85898851148511c86d88d adultbrowser -``` +602695c8f2ad76564bddcaf47b76edff zeroaccess +f117cc1477513cb181cc2e9fcaab39b2 winwebsec +``` which simply reports the most common family name for each sample. -In a nutshell, that is the main difference between both tools. -Of course, there are more options for both tools, -which you can read about in their corresponding README files. - - -## Which tool should I use? - -You should use AVClass2. It is the newer tool, extracts more information -from the input AV labels, and has a compatibility mode to be used for -family labeling in the same way that the original AVClass. - -AVClass is no longer updated and may be deprecated in the near future. -For example, we add family aliases and generic terms from time to time -for AVClass2, but we are not currently adding them for AVClass. - -The main reason to keep the original AVClass around is because in research, -reproducibility can be important. -Other researchers may have their own reasons to compare against an older tool. -However, as a researcher, if you compare against the original AVClass, -you should also include AVClass2 in the comparison. - -The only benefit of the original AVClass is that it is slightly -faster than AVClass2 since it extracts less info. -It is unlikely that the lower runtime makes a significant difference -for most users, but worth mentioning it in case you process many -millions of samples and do not require the extra info AVClass2 provides. 
- -## Evaluating and comparing with AVClass/AVClass2 - -Other researchers may want to independently evaluate AVClass/AVClass2 and -to compare it with their own approaches. -We encourage such evaluation, feedback on limitations, and proposals for -improvement. -However, we have observed a number of common errors in such evaluations that -should be avoided. Below is the quick summary. -A more detailed explanation is in the [evaluation page](EVALUATION.md) - -1. AVClass2 has superseeded AVClass, so your evaluation should include AVClass2, not only the original AVClass. -2. For malware labeling, please use AVClass2 compatibility mode (-c command line option). -3. Tagging more samples is not an evaluation goal by itself, the tags need to be accurate. For example, it is known that allowing tags from a single AV engine or ignoring generic tags will enable tagging more samples, but it will introduce incorrect tags. -4. You need ground truth to evaluate the accuracy/precision/recall of AVClass/AVClass2. -5. You should also evaluate scalability (runtime and memory usage) since that is a major design goal of AVClass/AVClass2 -6. Note that AVClass2 and AVClass may not provide the same family tag for all samples when run on the same AV labels. -7. AVClass2/AVClass are not malware detection tools, please do not try to evaluate them for that scenario. - -## References - -The design and evaluation of AVClass is detailed in our -[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): - -> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. -AVClass: A Tool for Massive Malware Labeling. -In Proceedings of the International Symposium on Research in -Attacks, Intrusions and Defenses, -September 2016. 
+For some samples, AVClass compatibility mode may return: -The design and evaluation of AVClass2 is detailed in our -[ACSAC 2020 paper](https://arxiv.org/pdf/2006.10615.pdf): +``` +f465a2c1b852373c72a1ccd161fbe94c SINGLETON:f465a2c1b852373c72a1ccd161fbe94c +``` -> Silvia Sebastián, Juan Caballero. -AVClass2: Massive Malware Tag Extraction from AV Labels. -In proceedings of the Annual Computer Security Applications Conference, -December 2020. +This means that AVClass was not able to identify a family name for that sample. +AVClass uses the SINGLETON:hash terminology, +(e.g., instead of an empty string or NULL) +so that the second column can be used as a cluster identifier where +each unlabeled sample is placed in its own cluster. +This prevents considering that all unlabeled samples are part of the +same family / cluster. -## Why are AVClass and AVClass2 useful? +## Why is AVClass useful? -Because a lot of times security researchers want to extract family and other -information from AV labels, but this process is not as simple as it looks, -especially if you need to do it for large numbers (e.g., millions) of samples. -Some advantages of AVClass and AVClass2 are: +Because a lot of times security researchers want to extract family and other +information from AV labels, but this process is not as simple as it looks, +especially if you need to do it for large numbers (e.g., millions) of samples. +Some advantages of AVClass are: -1. *Automatic.* They remove manual analysis limitations on the size of the -input -dataset. +1. *Automatic.* It avoids manual work that does not scale for large datasets. -2. *Vendor-agnostic.* They operate on the labels of any available set of AV +2. *Vendor-agnostic.* It operates on the labels of any available set of AV engines, which can vary from sample to sample. -3. *Cross-platform.* They can be used for any platforms supported by AV +3. 
*Cross-platform.* It can be used for any platforms supported by AV engines, e.g., Windows or Android malware. -4. *Does not require executables.* AV labels can be obtained from online services - like VirusTotal using a sample's hash, even when the executable is not available. +4. *Does not require executables.* AV labels can be obtained from online +services like VirusTotal using a sample's hash, +even when the executable is not available. -5. *Quantified accuracy.* We have evaluated AVClass and AVClass2 on millions of -samples and publicly available malware datasets with ground truth. -Evaluation details are in the RAID 2016 and ACSAC 2020 papers. +5. *Quantified accuracy.* We have evaluated AVClass on millions of +samples and publicly available malware datasets with ground truth. +Evaluation details are in the RAID 2016 and ACSAC 2020 papers +(see References section). -6. *Open source.* The code is available and we are happy to incorporate -suggestions and improvements so that the security community benefits from -these tools. +6. *Open source.* The code is available and we are happy to incorporate +suggestions and improvements so that the security community benefits from +the tool. ## Limitations -The main limitations of AVClass and AVClass2 are that its output depends -on the input AV labels. -Both tools try to compensate for the noise on the AV labels, -but cannot identify tags if AV engines do not provide non-generic tokens -in the labels of a sample. -In particular, they cannot tag samples if at least 2 AV engines -do not agree on a tag. +The main limitation of AVClass is that its output depends +on the input AV labels. +AVClass tries to compensate for the noise on the AV labels, +but cannot identify tags if AV engines do not provide non-generic tokens +in the labels of a sample. +In particular, it cannot tag samples if at least 2 AV engines +do not agree on a tag. 
Still, there are many samples that both tools can tag and thus we believe you will find them useful. We recommend you to read the RAID 2016 and ACSAC 2020 papers for more details. +## Is this AVClass or AVClass2? + +The short answer is that the current code in this repo is +based on the code of AVClass2. +The original AVClass code has been deprecated. +Below, we detail this process. + +We originally published AVClass in RAID 2016 and made its code +available in this repository in July 2016. +AVClass extracted only the family names from the input samples. + +We published AVClass2 in ACSAC 2020 and made its code +available in this repository in September 2020. +AVClass2 extracted all tags from the input samples and included a +compatibility _-c_ option to provide only the family names in the +same format as the original AVClass. + +For 2.5 years, both tools were available in this repository in +separate directories. +In February 2023, we decided to deprecate the original AVClass code, +rename AVClass2 as AVClass, and +release a PyPI package to ease installation. + ## Input JSON format -AVClass and AVClass2 support three input JSON formats: +AVClass supports three input JSON formats: -1. VirusTotal v2 API JSON reports (*-vt file*), -where each line in the input *file* should be the full JSON of a +1. VirusTotal v2 API JSON reports (*-vt file*), +where each line in the input *file* should be the full JSON of a VirusTotal v2 API response to the */file/report* endpoint, e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash} There is an example VirusTotal v2 input file in examples/vtv2_sample.json ```shell -$./avclass2/avclass2_labeler.py -vt examples/vtv2_sample.json -p > output.txt +avclass -vt examples/vtv2_sample.json -p > output.txt ``` -2. 
VirusTotal v3 API JSON reports (*-vt file -vt3*), -where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, +2. VirusTotal v3 API JSON reports (*-vt file -vt3*), +where each line in the input *file* should be the full JSON of a +VirusTotal API version 3 response with a *File* object report, e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} There is an example VirusTotal v3 input file in examples/vtv3_sample.json ```shell -$./avclass2/avclass2_labeler.py -vt examples/vtv3_sample.json -p -vt3 > output.txt +avclass -vt examples/vtv3_sample.json -p -vt3 > output.txt ``` 3. Simplified JSON (*-lb file*), -where each line in *file* should be a JSON +where each line in *file* should be a JSON with (at least) these fields: -{md5, sha1, sha256, av_labels}. +{md5, sha1, sha256, av_labels}. There is an example of such input file in *examples/malheurReference_lb.json* ```shell -$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p > output.txt +avclass -lb examples/malheurReference_lb.json -p > output.txt ``` **Why have a simplified JSON format?** -We believe most users will get the AV labels using VirusTotal. -However, AVClass and AVClass2 are IO-bound and a VirusTotal report -in addition to the AV labels and hashes includes -much other data that the tools do not need. -Thus, when applying AVClass or AVClass2 to millions of samples, -reducing the input file size by removing unnnecessary data -significantly improves efficiency. -Furthermore, users could obtain AV labels from other sources and -the simpler the input JSON format, +We believe most users will get the AV labels using VirusTotal. +However, AVClass is IO-bound and a VirusTotal report +in addition to the AV labels and hashes includes +much other data that the tool does not need. 
+Thus, when applying AVClass to millions of samples, +reducing the input file size by removing unnecessary data +significantly improves efficiency. +Furthermore, users could obtain AV labels from other sources and +the simpler the input JSON format, the easier to convert those AV labels into an input file. +**Multiple input files** + +AVClass can handle multiple input files putting the results in the +same output files +(if you want results in separate files, process each input file separately). + +It is possible to provide the -vt and -lb input options multiple times. + +```shell +avclass -vt <file1> -vt <file2> +``` +```shell +avclass -lb <file1> -lb <file2> +``` + +There are also -vtdir and -lbdir options that can be used to provide +an input directory where all files are VT (-vtdir) or simplified (-lbdir) +JSON reports: + +```shell +avclass -vtdir <directory> +``` + +It is also possible to combine -vt with -vtdir and -lb with -lbdir, +but you cannot combine input files of different formats. +Thus, this command works: + +```shell +avclass -vt <file1> -vtdir <directory> +``` + +But, this one throws an error: + +```shell +avclass -vt <file1> -lb <file2> +``` + +At this point you have read the most important information on +how to use AVClass. +The following sections describe steps that most users will not need. + +## Labeling: Using only Selected AV Engines + +By default, AVClass will use the labels of all AV engines that appear in +the input reports. +If you want to limit AVClass to use only the labels of certain AV engines, +you can use the -av option to pass it a file where each line has the name of +an AV engine (case-sensitive). 
+ +For example, you could create a file engines.txt with three lines: +Agnitum +Symantec +TotalDefense + +```shell +avclass -av engines.txt -vt ../examples/vtv2_sample.json > example.labels +``` + +would output into example.labels: +``` +602695c8f2ad76564bddcaf47b76edff 2 +f117cc1477513cb181cc2e9fcaab39b2 3 winwebsec|2 +``` + +where only the labels of Agnitum, Symantec, and TotalDefense have been used +to extract tags. +Note that the number of detections is with respect to the provided engines, +i.e., even if the first sample has 52 detections, +only 2 of the 3 selected engines detected it. + + +## Labeling: Ground Truth Evaluation + +If you have family ground truth for some malware samples, +i.e., you know the true family for those samples, +you can evaluate the accuracy of the family tags output by AVClass on +those samples with respect to that ground truth. +The evaluation metrics used are precision, recall, and F1 measure. +See our +[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. +Note that the ground truth evaluation does not apply to non-family tags, +i.e., it only evaluates the output of the compatibility mode. + +```shell +avclass -lb examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels +``` + +The output includes these lines: + +``` +Calculating precision and recall +3131 out of 3131 +Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 +``` + +Each line in the *examples/malheurReference_gt.tsv* file has +two **tab-separated** columns: + +``` +aca2d12934935b070df8f50e06a20539 ADROTATOR +``` + +which indicates that sample aca2d12934935b070df8f50e06a20539 is known +to be of the *ADROTATOR* family. +Each sample in the input file should also appear in the ground truth file. +Note that the particular label assigned to each family does not matter. 
+What matters is that all samples in the same family are assigned +the same family name (i.e., the same string in the second column). + +The ground truth can be obtained from publicly available malware datasets. +The one in *../examples/malheurReference_gt.tsv* comes from the +[Malheur](http://www.mlsec.org/malheur/) dataset. +There are other public datasets with ground truth such as +[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or +[Malicia](http://malicia-project.com/dataset.html). + + +## Update Module + +The update module can be used to suggest additions and changes to the input +taxonomy, tagging rules, and expansion rules. +By default, AVClass uses the default taxonomy, tagging, and expansion files +included in the repository. +Thus, we expect that most users will not need to run the update module. +But, below we explain how to run it in case you need to. + +Using the update module comprises two steps. +The first step is obtaining an alias file: + +```shell +avclass -lb ../examples/malheurReference_lb.json -aliasdetect +``` + +The above command will create a file named \<file\>.alias, +malheurReference_lb.alias in our example. This file has 7 columns: + +1. t1: token that is an alias +2. t2: tag for which t1 is an alias +3. |t1|: number of input samples where t1 was observed +4. |t2|: number of input samples where t2 was observed +5. |t1^t2|: number of input samples where both t1 and t2 were observed +6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. +7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. + +The Update Module takes the above file as input with the -alias option, +as well as the default taxonomy, tagging, and expansion files +in the data directory. +It outputs updated taxonomy, tagging, and expansion files that include the +suggested additions and changes. 
+ +```shell +avclass-update -alias malheurReference_lb.alias -o output_prefix +``` + +This will produce three files: +output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. +You can diff the output and input files to analyze the proposed changes. + +You can also modify the input taxonomy, tagging, and expansion rules in place, +rather than producing new files: + +```shell +avclass-update -alias malheurReference_lb.alias -update +``` + +## Customizing AVClass + +AVClass is fully customizable: +Tagging, Expansion and Taxonomy files can be easily modified by the analyst +either manually or by running the update module. + +If you change those files manually, we recommend running +afterwards the normalization script to keep them tidy. +It sorts the tags in the taxonomy and performs some basic cleaning like +removing redundant entries: + +```shell +avclass-normalize -tax mytaxonomy -tag mytagging -exp myexpansions +``` + +If the modifications are in the default files in the data directory you can +simply run: + +```shell +avclass-normalize +``` + +## Evaluating and comparing with AVClass + +Other researchers may want to independently evaluate AVClass/AVClass2 and +to compare it with their own approaches. +We encourage such evaluation, feedback on limitations, and proposals for +improvement. +However, we have observed a number of common errors in such evaluations that +should be avoided. +Thus, if you need to compare your approach with AVClass/AVClass2, +please read the [evaluation page](EVALUATION.md) + ## Dependencies -AVClass and AVClass2 are both written in Python. -They should both run on Python versions above 2.7 and 3.0. +AVClass is written in Python. +It should run on Python versions above 2.7 and 3.0. -They do not require installing any dependencies. +It does not require installing any dependencies. 
## Support and Contributing -If you have issues or want to contribute, please file a issue or perform a +If you have issues or want to contribute, please file an issue or perform a pull request through GitHub. ## License -AVClass and AVClass2 are both released under the MIT license +AVClass is released under the MIT license + +## References + +The design and evaluation of AVClass is detailed in our +[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): + +> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero.
+AVClass: A Tool for Massive Malware Labeling.
+In Proceedings of the International Symposium on Research in +Attacks, Intrusions and Defenses, +September 2016. + +The design and evaluation of AVClass2 is detailed in our +[ACSAC 2020 paper](https://arxiv.org/pdf/2006.10615.pdf): + +> Silvia Sebastián, Juan Caballero.
+AVClass2: Massive Malware Tag Extraction from AV Labels.
+In proceedings of the Annual Computer Security Applications Conference, +December 2020. ## Contributors -Several members of the MaliciaLab at the [IMDEA Software Institute](http://software.imdea.org) -have contributed code to AVClasss and AVClass2: -Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, Silvia Sebastián, and Juan Caballero. +Several members of the MaliciaLab at the +[IMDEA Software Institute](http://software.imdea.org) +have contributed to AVClass: +Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, +Silvia Sebastián, and Juan Caballero. + +GitHub users with significant contributions to AVClass include +(let us know if you believe you should be listed here): +[eljeffeg](https://github.com/eljeffeg) diff --git a/avclass/README.md b/avclass/README.md deleted file mode 100644 index 07fb2ec..0000000 --- a/avclass/README.md +++ /dev/null @@ -1,401 +0,0 @@ -# AVClass - -AVClass is a malware labeling tool. - -You give it as input the AV labels for a large number of -malware samples (e.g., VirusTotal JSON reports) and it outputs the most -likely family name for each sample that it can extract from the AV labels. -It can also output a ranking of all alternative names it found for each sample. - -The design and evaluation of AVClass is detailed in our -[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): - -> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. -AVClass: A Tool for Massive Malware Labeling. -In Proceedings of the International Symposium on Research in -Attacks, Intrusions and Defenses, -September 2016. - -In a nutshell, AVClass comprises two phases: -preparation (optional) and labeling. -Code for both is included, -but most users will be only interested in the labeling, which outputs the -family name for the samples. -The preparation produces a list of aliases and generic tokens -used by the labeling. 
-If you use our default aliases and generic tokens lists, -you do not need to run the preparation. - - -## Labeling - -The labeler takes as input -a JSON file with the AV labels of malware samples (-vt or -lb options), -a file with generic tokens (-gen option), -and a file with aliases (-alias option). -It outputs the most likely family name for each sample. -If you do not provide alias or generic tokens files, -the default ones in the *data* folder are used. - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v > malheurReference.labels -``` - -The above command labels the samples whose AV labels are in the -*../examples/malheurReference_lb.json* file. -It prints the results to stdout, -which we redirect to the *malheurReference.labels* file. -The output looks like this: - -``` -aca2d12934935b070df8f50e06a20539 adrotator -67d15459e1f85898851148511c86d88d adultbrowser -``` - -which means sample aca2d12934935b070df8f50e06a20539 is most likely -from the *adrotator* family and -67d15459e1f85898851148511c86d88d from the *adultbrowser* family. - -The verbose (-v) option makes it output an extra -*malheurReference_lb.verbose* file -with all families extracted for each sample ranked by the number of AV -engines that use that family. -The file looks like this: - -``` -aca2d12934935b070df8f50e06a20539 [(u'adrotator', 8), (u'zlob', 2)] -ee90a64fcfaa54a314a7b5bfe9b57357 [(u'swizzor', 19)] -f465a2c1b852373c72a1ccd161fbe94c SINGLETON:f465a2c1b852373c72a1ccd161fbe94c -``` - -which means that for sample aca2d12934935b070df8f50e06a20539 -there are 8 AV engines assigning *adrotator* as the family and -another 2 assigning *zlob*. -Thus, *adrotator* is the most likely family. -On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV -engines assigning *swizzor* as family, -and no other family was found. -The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c -no family name was found in the AV labels. 
-Thus, the sample is placed by himself in a singleton cluster -with the name of the cluster being the sample's hash. - -Note that the sum of the number of AV engines may not equal the number -of AV engines with a label for that sample in the input file -because the labels of some AV engines may only include generic tokens -that are removed by AVClass. - -## Input JSON format - -AVClass supports three input JSON formats: - -1. VirusTotal v2 API JSON reports (*-vt file*), -where each line in the input *file* should be the full JSON of a -VirusTotal v2 API response to the */file/report* endpoint, -e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash} -There is an example VirusTotal v2 input file in examples/vtv2_sample.json - -2. VirusTotal v3 API JSON reports (*-vt file -vt3*), -where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, -e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} -There is an example VirusTotal v3 input file in examples/vtv3_sample.json - -3. Simplified JSON (*-lb file*), -where each line in *file* should be a JSON -with (at least) these fields: -{md5, sha1, sha256, av_labels}. -There is an example of such input file in *examples/malheurReference_lb.json* - - -**Multiple input files** - -AVClass can handle multiple input files putting the results in the same output files -(if you want results in separate files, process each input file separately). - -It is possible to provide the -vt and -lb input options multiple times. 
- -```shell -$./avclass_labeler.py -vt -vt -``` -```shell -$./avclass_labeler.py -lb -lb -``` - -There are also -vtdir and -lbdir options that can be used to provide -an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: - -```shell -$./avclass_labeler.py -vtdir -``` - -It is also possible to combine -vt with -vtdir and -lb with -lbdir, -but you cannot combine input files of different format. Thus, this command works: - -```shell -$./avclass_labeler.py -vt -vtdir -``` - -But, this one throws an error: - -```shell -$./avclass_labeler.py -vt -lb -``` - -## Labeling: Family Ranking - -AVClass has a -fam option to output a file with a ranking of the -families assigned to the input samples. - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -fam > malheurReference.labels -``` - -will produce a file called *malheurReference_lb.families* with two columns: - -``` -virut 441 -allaple 301 -podnuha 300 -``` - -indicating that 441 samples were classified in the virut family, -301 as allaple, and 300 as podnuha. - -This option is very similar to using the following shell command: - -```shell -$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr -``` - -The main difference is that using the -fam option all SINGLETON samples, -i.e., those for which no label was found, -are grouped into a fake *SINGLETONS* family, -while the shell command would leave each singleton as a separate family. - - -## Labeling: PUP Classification - -AVClass also has a -pup option to classify a sample as -Potentially Unwanted Program (PUP) or malware. -This classification looks for PUP-related keywords -(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our -[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf): - -> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero. -Certified PUP: Abuse in Authenticode Code Signing. 
-In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015 - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup > malheurReference.labels -``` - -With the -pup option the output of the *malheurReference.labels* file -looks like this: - -``` -aca2d12934935b070df8f50e06a20539 adrotator 1 -67d15459e1f85898851148511c86d88d adultbrowser 0 -``` - -The digit at the end is a Boolean flag that -indicates sample aca2d12934935b070df8f50e06a20539 is -(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not. - -In our experience the PUP classification is conservative, -i.e., if it says the sample is PUP, it most likely is. -But, if it says that it is not PUP, it could still be PUP if the AV labels -do not contain PUP-related keywords. -Note that it is possible that some samples from a family get -the PUP flag while other samples from the same family do not -because the PUP-related keywords may not appear in the labels of -all samples from the same family. -To address this issue, you can combine the -pup option with the -fam option. -This combination will add into the families file the classification of the -family as malware or PUP, based on a majority vote among the samples in a -family. - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup -fam > malheurReference.labels -``` - -will produce a file called *malheurReference_lb.families* with five columns: - -``` -# Family Total Malware PUP FamType -virut 441 441 0 malware -magiccasino 173 0 173 pup -ejik 168 124 44 malware -``` - -For virut, the numbers indicate all the 441 virut samples are classified -as malware, and thus the last column states that virut is a malware family. -For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP. -For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, -so the family is classified as malware. 
- - -## Labeling: Ground Truth Evaluation - -If you have ground truth for some malware samples, -i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that -ground truth. -The evaluation metrics used are precision, recall, and F1 measure. -See our RAID 2016 paper above for their definition. - -```shell -$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -gt ../examples/malheurReference_gt.tsv -eval > malheurReference.labels -``` - -The output includes these lines: - -``` -Calculating precision and recall -3131 out of 3131 -Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 -``` - -The last line corresponds to the accuracy metrics obtained by -comparing AVClass results with the provided ground truth. - -Each line in the *../examples/malheurReference_gt.tsv* file has -two **tab-separated** columns: - -``` -0058780b175c3ce5e244f595951f611b8a24bee2 CASINO -``` - -which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 -is known to be of the *CASINO* family. -Each sample in the input file should also appear in the ground truth file. -Note that the particular label assigned to each family does not matter. -What matters is that all samples in the same family are assigned the -same family name (i.e., the same string in the second column) - -The ground truth can be obtained from publicly available malware -datasets. -The one in *../examples/malheurReference_gt.tsv* comes from the -[Malheur](http://www.mlsec.org/malheur/) dataset. -There are other public datasets with ground truth such as -[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or -[Malicia](http://malicia-project.com/dataset.html). - - -## Preparation: Generic Token Detection - -The labeling takes as input a file with generic tokens that should be -ignored in the AV labels, e.g., trojan, virus, generic, linux. -By default, the labeling uses the *data/default.generics* -generic tokens file. 
-You can edit that file to add additional generic tokens you feel -we are missing. - -In our RAID 2016 paper we describe an automatic approach to -identify generic tokens, which **requires ground truth**, -i.e., it requires knowing the true family for each input sample. -Not only that, but **the ground truth should be large**, -i.e., contain at least one hundred thousand samples. -In our work we identified generic tokens using as ground truth -the concatenation of all datasets for which we had ground truth. -This requirement of a large ground truth dataset is why we expect most users -will skip this step and simply use our provided default file. - -If you want to test generic token detection you can do: - -```shell - $./avclass_generic_detect.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv -tgen 10 > malheurReference.gen -``` - -Each line in the *../examples/malheurReference_gt.tsv* file has -two **tab-separated** columns: - -``` -0058780b175c3ce5e244f595951f611b8a24bee2 CASINO -``` - -which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 -is known to be of the *CASINO* family. - -The *-tgen 10* option is a threshold for the minimum number of families -where a token has to be observed to be considered generic. -If the option is ommitted, the default threshold of 8 is used. - -The above command outputs two files: -*malheurReference.gen* and *malheurReference_lb.gen*. -Each of them has 2 columns: token and number of families where the token -was observed. -File *malheurReference.gen* is the final output with the detected -generic tokens for which the number of families is above -the given threshold. -The file *malheurReference_lb.gen* has this information for all tokens. -Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. - -However, note that in the above command you are trying to identify generic -tokens from a small dataset since Drebin only contains 3K labeled samples. 
-Thus, *malheurReference.gen* only contains 25 identified generic tokens. -Using those 25 generic tokens will produce significantly worse results -than using the generic tokens in *data/default.generics*. -For more details you can refer to our RAID 2016 paper. - - -## Preparation: Alias Detection - -Different vendors may assign different names (i.e., aliases) for the same -family. For example, some vendors may use *zeus* and others *zbot* -as aliases for the same malware family. -The labeling takes as input a file with aliases that should be merged. -By default, the labeling uses the *data/default.aliases* aliases file. -You can edit that file to add additional aliases you feel we are missing. - -In our RAID 2016 paper we describe an automatic approach -to identify aliases. -Our alias detection approach -**requires as input the AV labels for large set of samples**, -e.g., several million samples. -In contrast with the generic token detection, the input samples for -alias detection **do not need to be labeled**, -i.e., no need to know their family. -In our work we identified aliases using as input the largest of our -unlabeled datasets, which contained nearly 8M samples. -This requirement of a large input dataset is why we expect most users -will skip this step and simply use our provided default file. - -If you want to test alias detection you can do: - -```shell -$./avclass_alias_detect.py -lb ../examples/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases -``` - -The -nalias threshold provides the minimum number of samples two tokens -need to be observed in to be considered aliases. -If the option is not provided the default is 20. - -The -talias threshold provides the minimum fraction of times that -the samples appear together. -If the is not provided the default is 0.94 (94%). - -The above command outputs two files: -*malheurReference.aliases* and *malheurReference_lb.alias*. -Each of them has 6 columns: -1. 
t1: token that is an alias -2. t2: family for which t1 is an alias -3. |t1|: number of input samples where t1 was observed -4. |t2|: number of input samples where t2 was observed -5. |t1^t2|: number of input samples where both t1 and t2 were observed -6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 -were observed over the number of input samples where t1 was observed. - -File *malheurReference.aliases* is the final output with the -detected aliases that satisfy the -nalias and -talias thresholds. -The file *malheurReference_lb.alias* has this information for all tokens. -Thus, *malheurReference.aliases* is a subset -of *malheurReference_lb.alias*. - -However, note that in the above command you are trying to identify aliases -from a small dataset since Drebin only contains 3K samples. -Thus, *malheurReference.aliases* only contains 6 identified aliases. -Using those 6 aliases will produce significantly worse results than using -the aliases in *data/default.aliases*. -As mentioned, to improve the identified aliases you should provide as -input several million samples. -For more details you can refer to our RAID 2016 paper. 
- diff --git a/avclass/__init__.py b/avclass/__init__.py new file mode 100644 index 0000000..538e9fd --- /dev/null +++ b/avclass/__init__.py @@ -0,0 +1,13 @@ +import os + +AVCLASS_ROOT = os.path.dirname(os.path.abspath(__file__)) +DATA_FOLDER = os.path.join(AVCLASS_ROOT, 'data/') + +RESOURCE_TAG = "default.tagging" +RESOURCE_TAX = "default.taxonomy" +RESOURCE_EXP = "default.expansion" + +DEFAULT_TAX_PATH = os.path.join(DATA_FOLDER, RESOURCE_TAX) +DEFAULT_TAG_PATH = os.path.join(DATA_FOLDER, RESOURCE_TAG) +DEFAULT_EXP_PATH = os.path.join(DATA_FOLDER, RESOURCE_EXP) + diff --git a/avclass/avclass_alias_detect.py b/avclass/avclass_alias_detect.py deleted file mode 100755 index 6111a94..0000000 --- a/avclass/avclass_alias_detect.py +++ /dev/null @@ -1,89 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -AVClass Alias detect -''' -import sys -import argparse -import subprocess -import os - - -def main(args): - # Set input switch - itype = '-vt' if args.vt else '-lb' - ifile = args.vt if args.vt else args.lb - - # Set generic tokens file if provided - gen_switch = "-gen " + args.gen if args.gen else "" - sys.stderr.write('Switch: %s\n' % (gen_switch)) - - # Run avclass_labeler - sys.stderr.write('[-] Running avclass_labeler on %s\n' % (ifile)) - FNULL = open(os.devnull, 'w') - labeler = subprocess.Popen(\ - "python3 avclass_labeler.py %s %s %s -alias /dev/null -aliasdetect" % - (itype, ifile, gen_switch), shell=True, stdout=FNULL) - labeler.wait() - - # Process alias file - sys.stderr.write('[-] Processing token pairs.\n') - alias_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.alias' - with open(alias_fname, 'r') as fr: - for pos, line in enumerate(fr): - cline = line.strip('\n') - # Print headers - if not pos: - sys.stdout.write("%s\n" % cline) - continue - t1, t2, t1_num, t2_num, nalias_num, talias_num = cline.split('\t') - if int(nalias_num) > args.nalias and\ - float(talias_num) > args.talias: - sys.stdout.write("%s\n" % cline) - - # Done - 
sys.stderr.write('[-] Done.\n') - - -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass_alias_detect', - description='''Given a collection of VT reports it detects aliases - used by AVs. It runs the avclass_labeler with specific arguments - and processes the output.''') - - argparser.add_argument('-vt', - help='file to parse with full VT reports ' - '(REQUIRED if -lb argument not present)') - - argparser.add_argument('-lb', - help='file to parse with subset of VT reports' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(REQUIRED if -vt not present)') - - argparser.add_argument('-gen', - help='file with generic tokens.') - - argparser.add_argument('-nalias', - help='Minimum number of times that a pair of tokes have been seen.' - 'Default: 20', - type=int, - default = 20) - - argparser.add_argument('-talias', - help='Minimum percentage of times two tokens appear together.' - 'Default: 0.94', - type=float, - default = 0.94) - - args = argparser.parse_args() - - if not args.vt and not args.lb: - sys.stderr.write('Argument -vt or -lb is required\n') - exit(1) - - if args.vt and args.lb: - sys.stderr.write('Use either -vt or -lb argument, not both.\n') - exit(1) - - main(args) - diff --git a/avclass/avclass_generic_detect.py b/avclass/avclass_generic_detect.py deleted file mode 100755 index f7b74f4..0000000 --- a/avclass/avclass_generic_detect.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -''' -AVClass Generic detect -''' -import sys -import argparse -import subprocess -import os - - -def main(args): - # Set input switch - itype = '-vt' if args.vt else '-lb' - ifile = args.vt if args.vt else args.lb - - # Run avclass_labeler - sys.stderr.write('[-] Running avclass_labeler on %s\n' % (ifile)) - FNULL = open(os.devnull, 'w') - labeler = subprocess.Popen(\ - "python3 avclass_labeler.py %s %s -alias /dev/null"\ - " -gen /dev/null -gendetect -gt %s" % - (itype, ifile, args.gt), shell=True, stdout=FNULL) - 
labeler.wait() - - # Process generic tokens file - sys.stderr.write('[-] Processing results.\n') - gen_fname = os.path.basename(os.path.splitext(ifile)[0]) + '.gen' - with open(gen_fname, 'r') as fr: - for pos, line in enumerate(fr): - cline = line.strip('\n') - # Print headers - if not pos: - sys.stdout.write("%s\n" % cline) - continue - token, fam_num = cline.split('\t') - if int(fam_num) > args.tgen: - sys.stdout.write("%s\n" % cline) - - # Done - sys.stderr.write('[-] Done.\n') - - -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass_alias_detect', - description='''Given a collection of VT reports and the family - names of these samples (i.e., groundtruth) it generates a list - of generic tokens to be excluded from labeling.''') - - argparser.add_argument('-vt', - help='file to parse with full VT reports ' - '(REQUIRED if -lb argument not present)') - - argparser.add_argument('-lb', - help='file to parse with subset of VT reports' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(REQUIRED if -vt not present)') - - argparser.add_argument('-tgen', - help='Minimum number of families that a token appears. 
' - 'Default: 8', - type=int, - default = 8) - - argparser.add_argument('-gt', - help='file with ground truth') - - args = argparser.parse_args() - - if not args.vt and not args.lb: - sys.stderr.write('Argument -vt or -lb is required\n') - exit(1) - - if args.vt and args.lb: - sys.stderr.write('Use either -vt or -lb argument, not both.\n') - exit(1) - - if not args.gt: - sys.stderr.write('Generic token detection needs groundtruth (-gt)\n') - exit(1) - - main(args) - diff --git a/avclass/avclass_labeler.py b/avclass/avclass_labeler.py deleted file mode 100755 index 043a5fd..0000000 --- a/avclass/avclass_labeler.py +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/env python3 -''' -AVClass labeler -''' - -import os -import sys -path = os.path.dirname(os.path.abspath(__file__)) -libpath = os.path.join(path, 'lib/') -sharedpath = os.path.join(path, '../shared/') -sys.path.insert(1, libpath) -sys.path.insert(1, sharedpath) -import argparse -from avclass_common import AvLabels -from operator import itemgetter -import evaluate_clustering as ec -import json -import traceback - -# Default alias file -default_alias_file = os.path.join(path, "data/default.aliases") -# Default generic tokens file -default_gen_file = os.path.join(path, "data/default.generics") - -def guess_hash(h): - '''Given a hash string, guess the hash type based on the string length''' - hlen = len(h) - if hlen == 32: - return 'md5' - elif hlen == 40: - return 'sha1' - elif hlen == 64: - return 'sha256' - else: - return None - -def main(args): - # Select hash used to identify sample, by default MD5 - hash_type = args.hash if args.hash else 'md5' - - # If ground truth provided, read it from file - gt_dict = {} - if args.gt: - with open(args.gt, 'r') as gt_fd: - for line in gt_fd: - gt_hash, family = map(str.lower, line.strip().split('\t', 1)) - gt_dict[gt_hash] = family - - # Guess type of hash in ground truth file - hash_type = guess_hash(list(gt_dict.keys())[0]) - - # Create AvLabels object - av_labels = 
AvLabels(args.gen, args.alias, args.av) - - # Build list of input files - # NOTE: duplicate input files are not removed - ifile_l = [] - if (args.vt): - ifile_l += args.vt - ifile_are_vt = True - if (args.lb): - ifile_l += args.lb - ifile_are_vt = False - if (args.vtdir): - ifile_l += [os.path.join(args.vtdir, f) for f in os.listdir(args.vtdir)] - ifile_are_vt = True - if (args.lbdir): - ifile_l += [os.path.join(args.lbdir, f) for f in os.listdir(args.lbdir)] - ifile_are_vt = False - - # Select correct sample info extraction function - if not ifile_are_vt: - get_sample_info = av_labels.get_sample_info_lb - elif args.vt3: - get_sample_info = av_labels.get_sample_info_vt_v3 - else: - get_sample_info = av_labels.get_sample_info_vt_v2 - - # Select output prefix - out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0]) - - # If verbose, open log file - if args.verbose: - log_filename = out_prefix + '.verbose' - verb_fd = open(log_filename, 'w+') - - # Initialize state - first_token_dict = {} - token_count_map = {} - pair_count_map = {} - token_family_map = {} - fam_stats = {} - vt_all = 0 - vt_empty = 0 - singletons = 0 - - # Process each input file - for ifile in ifile_l: - # Open file - fd = open(ifile, 'r') - - # Debug info, file processed - sys.stderr.write('[-] Processing input file %s\n' % ifile) - - # Process all lines in file - for line in fd: - - # If blank line, skip - if line == '\n': - continue - - # Debug info - if vt_all % 100 == 0: - sys.stderr.write('\r[-] %d JSON read' % vt_all) - sys.stderr.flush() - vt_all += 1 - - # Read JSON line and extract sample info (i.e., hashes and labels) - vt_rep = json.loads(line) - sample_info = get_sample_info(vt_rep) - if sample_info is None: - try: - name = vt_rep['md5'] - sys.stderr.write('\nNo AV labels for %s\n' % name) - except KeyError: - sys.stderr.write('\nCould not process: %s\n' % line) - sys.stderr.flush() - vt_empty += 1 - continue - - # Sample's name is selected hash type (md5 by default) - name = 
getattr(sample_info, hash_type) - - # If the VT report has no AV labels, continue - if not sample_info[3]: - vt_empty += 1 - sys.stderr.write('\nNo AV labels for %s\n' % name) - sys.stderr.flush() - continue - - # Get the distinct tokens from all the av labels in the report - # And print them. If not verbose, print the first token. - # If verbose, print the whole list - try: - # Get distinct tokens from AV labels - tokens = list(av_labels.get_family_ranking(sample_info).items()) - - # If alias detection, populate maps - if args.aliasdetect: - prev_tokens = set() - for entry in tokens: - curr_tok = entry[0] - curr_count = token_count_map.get(curr_tok) - if curr_count: - token_count_map[curr_tok] = curr_count + 1 - else: - token_count_map[curr_tok] = 1 - for prev_tok in prev_tokens: - if prev_tok < curr_tok: - pair = (prev_tok,curr_tok) - else: - pair = (curr_tok,prev_tok) - pair_count = pair_count_map.get(pair) - if pair_count: - pair_count_map[pair] = pair_count + 1 - else: - pair_count_map[pair] = 1 - prev_tokens.add(curr_tok) - - # If generic token detection, populate map - if args.gendetect and args.gt: - for entry in tokens: - curr_tok = entry[0] - curr_fam_set = token_family_map.get(curr_tok) - family = gt_dict[name] if name in gt_dict else None - if curr_fam_set and family: - curr_fam_set.add(family) - elif family: - token_family_map[curr_tok] = set(family) - - # Top candidate is most likely family name - if tokens: - family = tokens[0][0] - is_singleton = False - else: - family = "SINGLETON:" + name - is_singleton = True - singletons += 1 - - # Check if sample is PUP, if requested - if args.pup: - is_pup = av_labels.is_pup(sample_info[3]) - if is_pup: - is_pup_str = "\t1" - else: - is_pup_str = "\t0" - else: - is_pup = None - is_pup_str = "" - - # Build family map for precision, recall, computation - first_token_dict[name] = family - - # Get ground truth family, if available - if args.gt: - gt_family = '\t' + gt_dict[name] if name in gt_dict else "" - else: 
- gt_family = "" - - # Print family (and ground truth if available) to stdout - sys.stdout.write('%s\t%s%s%s\n' % (name, family, gt_family, - is_pup_str)) - - # If verbose, print tokens (and ground truth if available) - # to log file - if args.verbose: - verb_fd.write('%s\t%s%s%s\n' % ( - name, tokens, gt_family, is_pup_str)) - - # Store family stats (if required) - if args.fam: - if is_singleton: - ff = 'SINGLETONS' - else: - ff = family - try: - numAll, numMal, numPup = fam_stats[ff] - except KeyError: - numAll = 0 - numMal = 0 - numPup = 0 - - numAll += 1 - if args.pup: - if is_pup: - numPup += 1 - else: - numMal += 1 - fam_stats[ff] = (numAll, numMal, numPup) - - except: - traceback.print_exc(file=sys.stderr) - continue - - # Debug info - sys.stderr.write('\r[-] %d JSON read' % vt_all) - sys.stderr.flush() - sys.stderr.write('\n') - - # Close file - fd.close() - - # Print statistics - sys.stderr.write( - "[-] Samples: %d NoLabels: %d Singletons: %d " - "GroundTruth: %d\n" % ( - vt_all, vt_empty, singletons, len(gt_dict))) - - # If ground truth, print precision, recall, and F1-measure - if args.gt and args.eval: - precision, recall, fmeasure = \ - ec.eval_precision_recall_fmeasure(gt_dict, - first_token_dict) - sys.stderr.write( \ - "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \ - (precision, recall, fmeasure)) - - # If generic token detection, print map - if args.gendetect: - # Open generic tokens file - gen_filename = out_prefix + '.gen' - gen_fd = open(gen_filename, 'w+') - # Output header line - gen_fd.write("Token\t#Families\n") - sorted_pairs = sorted(token_family_map.items(), - key=lambda x: len(x[1]) if x[1] else 0, - reverse=True) - for (t,fset) in sorted_pairs: - gen_fd.write("%s\t%d\n" % (t, len(fset))) - - # Close generic tokens file - gen_fd.close() - sys.stderr.write('[-] Generic token data in %s\n' % (gen_filename)) - - # If alias detection, print map - if args.aliasdetect: - # Open alias file - alias_filename = out_prefix + '.alias' - 
alias_fd = open(alias_filename, 'w+') - # Sort token pairs by number of times they appear together - sorted_pairs = sorted( - pair_count_map.items(), key=itemgetter(1)) - # Output header line - alias_fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\n") - # Compute token pair statistic and output to alias file - for (t1,t2),c in sorted_pairs: - n1 = token_count_map[t1] - n2 = token_count_map[t2] - if (n1 < n2): - x = t1 - y = t2 - xn = n1 - yn = n2 - else: - x = t2 - y = t1 - xn = n2 - yn = n1 - f = float(c) / float(xn) - alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\n" % ( - x,y,xn,yn,c,f)) - # Close alias file - alias_fd.close() - sys.stderr.write('[-] Alias data in %s\n' % (alias_filename)) - - # If family statistics, output to file - if args.fam: - # Open family file - fam_filename = out_prefix + '.families' - fam_fd = open(fam_filename, 'w+') - # Output header line - if args.pup: - fam_fd.write("# Family\tTotal\tMalware\tPUP\tFamType\n") - else: - fam_fd.write("# Family\tTotal\n") - # Sort map - sorted_pairs = sorted(fam_stats.items(), key=itemgetter(1), - reverse=True) - # Print map contents - for (f,fstat) in sorted_pairs: - if args.pup: - if fstat[1] > fstat[2]: - famType = "malware" - else: - famType = "pup" - fam_fd.write("%s\t%d\t%d\t%d\t%s\n" % (f, fstat[0], fstat[1], - fstat[2], famType)) - else: - fam_fd.write("%s\t%d\n" % (f, fstat[0])) - # Close file - fam_fd.close() - sys.stderr.write('[-] Family data in %s\n' % (fam_filename)) - - # Close log file - if args.verbose: - sys.stderr.write('[-] Verbose output in %s\n' % (log_filename)) - verb_fd.close() - - - -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass_labeler', - description='''Extracts the family of a set of samples. 
- Also calculates precision and recall if ground truth available''') - - argparser.add_argument('-vt', action='append', - help='file with VT reports ' - '(Can be provided multiple times)') - - argparser.add_argument('-lb', action='append', - help='file with simplified JSON reports ' - '{md5,sha1,sha256,scan_date,av_labels} ' - '(Can be provided multiple times)') - - argparser.add_argument('-vtdir', - help='existing directory with VT reports') - - argparser.add_argument('-lbdir', - help='existing directory with simplified JSON reports') - - argparser.add_argument('-gt', - help='file with ground truth') - - argparser.add_argument('-eval', - action='store_true', - help='if used it evaluates clustering accuracy.' - ' Prints precision, recall, F1-measure. Requires -gt parameter') - - argparser.add_argument('-alias', - help='file with aliases.', - default = default_alias_file) - - argparser.add_argument('-gen', - help='file with generic tokens.', - default = default_gen_file) - - argparser.add_argument('-av', - help='file with list of AVs to use') - - argparser.add_argument('-pup', - action='store_true', - help='if used each sample is classified as PUP or not') - - argparser.add_argument('-gendetect', - action='store_true', - help='if used produce generics file at end. Requires -gt parameter') - - argparser.add_argument('-aliasdetect', - action='store_true', - help='if used produce aliases file at end') - - argparser.add_argument('-v', '--verbose', - action='store_true', - help='output .verbose file with distinct tokens') - - argparser.add_argument('-hash', - help='hash used to name samples. 
Should match ground truth', - choices=['md5', 'sha1', 'sha256']) - - argparser.add_argument('-fam', - action='store_true', - help='if used produce families file with PUP/malware counts per family') - - argparser.add_argument('-vt3', action='store_true', - help='input are VT v3 files') - - args = argparser.parse_args() - - if not args.vt and not args.lb and not args.vtdir and not args.lbdir: - sys.stderr.write('One of the following 4 arguments is required: ' - '-vt,-lb,-vtdir,-lbdir\n') - exit(1) - - if (args.vt or args.vtdir) and (args.lb or args.lbdir): - sys.stderr.write('Use either -vt/-vtdir or -lb/-lbdir. ' - 'Both types of input files cannot be combined.\n') - exit(1) - - if args.gendetect and not args.gt: - sys.stderr.write('Generic token detection requires -gt param\n') - exit(1) - - if args.eval and not args.gt: - sys.stderr.write('Evaluating clustering accuracy needs -gt param\n') - exit(1) - - if args.alias: - if args.alias == '/dev/null': - sys.stderr.write('[-] Using no aliases\n') - args.alias = None - else: - sys.stderr.write('[-] Using aliases in %s\n' % ( - args.alias)) - else: - sys.stderr.write('[-] Using generic aliases in %s\n' % ( - default_alias_file)) - - if args.gen: - if args.gen == '/dev/null': - sys.stderr.write('[-] Using no generic tokens\n') - args.gen = None - else: - sys.stderr.write('[-] Using generic tokens in %s\n' % ( - args.gen)) - else: - sys.stderr.write('[-] Using default generic tokens in %s\n' % ( - default_gen_file)) - - main(args) diff --git a/avclass2/lib/avclass2_common.py b/avclass/common.py old mode 100755 new mode 100644 similarity index 98% rename from avclass2/lib/avclass2_common.py rename to avclass/common.py index a86877b..51b869c --- a/avclass2/lib/avclass2_common.py +++ b/avclass/common.py @@ -1,11 +1,8 @@ -''' -Main AVClass class -''' - -import sys +import logging import re import string -import logging +import sys + from collections import OrderedDict as OrdDict from collections import namedtuple from 
operator import itemgetter, attrgetter @@ -23,11 +20,22 @@ ['md5', 'sha1', 'sha256', 'labels', 'vt_tags']) # AVs to use in suffix removal -suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', - 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', - 'GData', 'Sophos', - 'TrendMicro-HouseCall', 'TrendMicro', - 'NANO-Antivirus', 'Microsoft'} +suffix_removal_av_set = { + 'Norman', + 'Avast', + 'Avira', + 'Kaspersky', + 'ESET-NOD32', + 'Fortinet', + 'Jiangmin', + 'Comodo', + 'GData', + 'Sophos', + 'TrendMicro-HouseCall', + 'TrendMicro', + 'NANO-Antivirus', + 'Microsoft' +} class Tag: ''' A Tag in the taxonomy ''' diff --git a/avclass2/data/andropup.expansion b/avclass/data/andropup.expansion similarity index 100% rename from avclass2/data/andropup.expansion rename to avclass/data/andropup.expansion diff --git a/avclass/data/default.aliases b/avclass/data/default.aliases deleted file mode 100644 index d9ed41c..0000000 --- a/avclass/data/default.aliases +++ /dev/null @@ -1,559 +0,0 @@ -oneclickdownload 1clickdownload -4share 4shared -getfaster 4shared -activshop activshopper -adgazele adgazelle -smabo adialer -dealcabby adpeak -adswo adwo -gaobot agobot -airad airinstaller -airadinstaller airinstaller -airinstall airinstaller -rahack allaple -starman allaple -almanahe alman -kanav alyak -adfltnet amonetize -easydl amonetize -filesearch amonetize -imonetize amonetize -armour androidarmour -climap androrat -arcparlor arcadeparlor -badday badda -bearshare bandoo -ilivid bandoo -koyotelab bandoo -musictoolbar bandoo -searchsuite bandoo -seasuite bandoo -torchmedia bandoo -basebrid basebridge -batteryd batterydoctor -fakebattscar batterydoctor -klezer beebone -selfdel beebone -kazaa benjamin -qukart berbew -padodor berbew -bertle bertle -bertlea bertle -serbg bgserv -midgare bifrose -egbii biige -widoman bmmedia -bobic bobax -boxersms boxer -smsboxer boxer -browsepulse browsefox -dragonbranch browsefox -expressfind browsefox -glassbottle browsefox -greatfind browsefox 
-liteweb browsefox -positivefinds browsefox -recordpage browsefox -rollaround browsefox -salecharger browsefox -strongsignal browsefox -swiftbrowse browsefox -wanderburst browsefox -yontoo browsefox -yotoon browsefox -bundl bundlore -installvibe bundlore -buzb bzub -desktoplightning cashon -dowcen centim -chinesehacker chir -runonce chir -runouce chir -cinmeng cinmus -clemag cleaman -clientconnect conduit -searchprotect conduit -kucirc cosmu -overdoom cosmu -dalamodo cossta -putalol couponmarvel -crori crossrider -geksone crytex -hublo crytex -cybota cycbot -gbot cycbot -goolbot cycbot -cabby dalexis -ctblocker dalexis -elenoocka dalexis -comet darkkomet -cometsys darkkomet -cometsystems darkkomet -finloski darkkomet -fynloski darkkomet -krademok darkkomet -montiera delbar -cheval detroie -detroi detroie -detroia detroie -eydrop dinwod -directdown directdownloader -indirect directdownloader -zadved dlhelper -dogbite dogowar -dogwar dogowar -rabidog dogowar -domainiq domaiq -domalq domaiq -domlq domaiq -payint domaiq -tugspay domaiq -downloadmin downloadadmin -downloadasist downloadassistant -downloaderguide downloadguide -drdelux droiddeluxe -ddlight droiddreamlight -lightdd droiddreamlight -fokonge droidkungfu -kongfu droidkungfu -kungfu droidkungfu -ibashade drolnux -dialpass egroupdial -egroup egroupdial -exedial egroupdial -instantaccess egroupdial -emud emudbot -adwareeorezo eorezo -getextension eorezo -tuto4pc eorezo -eqdrug equationdrug -equation equationdrug -xpiro expiro -yourfiledownloader expressdownloader -fakerecovery fakesysdef -prodatect fakesysdef -systemfix fakesysdef -tepfer fareit -farex fearso -nofear fearso -nofer fearso -fenomen fenomengame -fenomengamet fenomengame -condestil firseria -downloadmr firseria -firser firseria -firseriainstaller firseria -fiseria firseria -morstar firseria -morstars firseria -popeler firseria -rapiddown firseria -solimba firseria -sventore firseria -flyagent flystudio -flystud flystudio -cobbler focobers 
-cobblerone focobers -cudos fosniw -regie fosniw -winsoft fosniw -emerleox fujacks -fujack fujacks -whboy fujacks -gaba gabpath -androm gamarue -andromeda gamarue -bundpil gamarue -debris gamarue -dromedan gamarue -lilu gamarue -wauchos gamarue -arcadeparlor gamevance -arcadeweb gamevance -epicgames gamevance -epicplay gamevance -gamevancecs gamevance -gvance gamevance -rivalgame gamevance -juched ganelp -waps gappusin -wapsx gappusin -geimini geinimi -geinim geinimi -kernelpatch geral -livesoft getnow -livesoftaction getnow -frogonal ginmaster -gingermaster ginmaster -gmaster ginmaster -ghostbot gobot -gdream golddream -glodream golddream -gprice gorillaprice -spysheriff harnig -helldoor hilldoor -hippo hipposms -hipsmser hipposms -hispo hipposms -banach hotbar -clickpotato hotbar -clkpotato hotbar -pinball hotbar -rugo hotbar -screensaver hotbar -zango hotbar -freepds hotclip -huigezi hupigon -pigeon hupigon -optimum ibryte -optimuminstall ibryte -optimuminstaller ibryte -optinstall ibryte -optiuminstaller ibryte -ickboy icekboy -iceboy icekboy -installcube icloader -iconos iconosys -iconosis iconosys -inboxtoolbar inbox -dowins inservice -inservc inservice -braininst installbrain -brantall installbrain -ibrain installbrain -clickrun installcore -clickrunsoftware installcore -cryptinno installcore -installco installcore -installrex installerex -sneakytrail installerex -tdownloader installerex -tsuploader installerex -webpick installerex -installq installiq -installmet installmetrix -instmonetizer installmonetizer -installmon installmonster -installmonst installmonster -installmonstr installmonster -monstruos installmonster -tovkater installmonster -intex intexdial -intexus intexdial -neteyes ipamor -mswdm ipamor -amorba ipamor -hidrag jeefo -jackpos jinupd -plosa karagany -xtoober karagany -kgbkeylogger kgbspy -elkern klez -padobot korgo -rkdoor koutodoor -hyteod kovter -lacon laconic -escape laroux -escop laroux -manalo laroux -linkun linkular -powerpack 
linkular -legendmir lmir -legmir lmir -lemir lmir -biez loadmoney -gldct loadmoney -ldmon loadmoney -loadmoneyent loadmoney -odyssey loadmoney -ogimant loadmoney -plocust loadmoney -duptwux lolbot -duel loveletter -mixor loveletter -xworm loveletter -tazebama mabezat -ratab mamianune -midhos medfos -magmedia mediamagnet -mmag mediamagnet -downloadnsave megasearch -fastsave megasearch -fastsaveapp megasearch -preloader megasearch -saveshare megasearch -morefi memery -lohmys midia -marketpay mmarketpay -mmarket mmarketpay -mmarketp mmarketpay -fipp morto -serpip morto -mspyonline mspy -multibardown multibar -multibardownloader multibar -mutibar multibar -ticno multibar -mplug multiplug -licat murofet -funweb mywebsearch -mindspark mywebsearch -nandrob nandrobox -neshuta neshta -netboxserver netbox -bespal netins -netweird netwiredrc -weecnaw netwiredrc -wirenet netwiredrc -nickispy nickyspy -nickspy nickyspy -conduit opencandy -optixp optix -optixpro optix -bflient palevo -pilleuz palevo -rimecud palevo -pate parite -pinfi parite -perfectkeylogger perflogger -perfkey perflogger -perfloger perflogger -petrolan petrolin -yoof picsys -fixflo pioneer -flofix pioneer -floxif pioneer -floxlib pioneer -apperhand plankton -plangton plankton -pupil plemood -purplemood plemood -purple plemood -gulpix plugx -poisonivy poison -polipos polip -screenblaze prosti -acute pullupdate -clickspring purityscan -clspring purityscan -purity purityscan -chydo pykspa -dwonk pykspa -pykse pykspa -qakbot qbot -qqrobber qqrob -zsone raden -protexor ramnit -rmnet ramnit -ranck ranky -dracur rebhip -spatet rebhip -spyrat rebhip -refogkeylogger refog -relevant relevantknowledge -rknowledge relevantknowledge -arto renos -codecpack renos -codepack renos -banloader rimod -mutopy rodecap -ggsmart rootsmart -kometa rukometa -gnurbulf rungbu -overt sadenav -overtls sadenav -sahagent sahat -shopathome sahat -safekidzone sakezon -kashu sality -kuku sality -saldrop sality -salicode sality -salitystub 
sality -salload sality -salpack sality -salrenmetie sality -stubofsality sality -sancmed sanctionedmedia -contrand sckeylog -controlrandom sckeylog -sckeylogger sckeylog -sclog sckeylog -softcentral sckeylog -secxplod securityxploded -secxploded securityxploded -winsxsbot sfone -ibank shiz -pinny shiz -shifu shiz -zybut shiz -shohdi shodi -caphaw shylock -opclose sillyfdc -cson simbot -rodricter simda -avalod sinowal -sinodo sinowal -wplug slugin -wplugin slugin -koceg socks -mandaph socks -pace socks -fakromup soft32downloader -popuppers soft32downloader -soft32down soft32downloader -soft32download soft32downloader -wedownload soft32downloader -softbase softobase -bxib softonic -softonicdownloader softonic -driverupd softpulse -sambamedia softpulse -softpules softpulse -betterinstaller somoto -mazel somoto -somato somoto -somotobetterinstaller somoto -somotoltd somoto -optimizerpro speedingupmypc -spdupmypc speedingupmypc -superoptimizer speedingupmypc -superpctools speedingupmypc -spyeyes spyeye -spyweep spyeye -square squarenet -javak suggestor -steekt steek -tophos stegvob -mofksys swisyn -c2lop swizzor -electron sytro -soltern sytro -systro sytro -taojin taojinstar -alureon tdss -olmarik tdss -tidserv tdss -tdssrt tdss -jelbrus techsnab -privitize techsnab -joleee tedroo -tedro tedroo -gael tenga -gaelicum tenga -licum tenga -nuwar tibs -peacomm tibs -tibspk tibs -zhelatin tibs -tinbakd tinba -pirrit tirrip -pirritsuggestor tirrip -inffinity toggle -inffinityinternet toggle -stufik tufik -tufei tufik -twetty twetti -speedupmypc uniblue -bandito unruy -banito unruy -cycler unruy -spacer unruy -cryptodef upatre -daytre upatre -ipatre upatre -waski upatre -yarwi upatre -gupboot urelas -plite urelas -ruftar usteal -nextup verti -lavandos vidro -spakrab vidro -gavir viking -looked viking -philis viking -multiinstall vilsel -ultradownload vilsel -ultradownloads vilsel -vils vilsel -nabucur virlock -polyransom virlock -virransom virlock -angel virut -angryangel virut 
-guarder virut -madanf virut -madang virut -madangel virut -vetor virut -virtob virut -vserv viser -vitallia vittalia -changeup vobfus -chinky vobfus -diple vobfus -meredrop vobfus -pronny vobfus -purora vobfus -vbccrypt vobfus -vbna vobfus -vbobfus vobfus -wbna vobfus -vflood vtflooder -vflooder vtflooder -wanna wannacry -wanacry wannacry -wannacrypt wannacry -wannacryptor wannacry -jadtre wapomi -loorp wapomi -mikcer wapomi -nimnul wapomi -otwycal wapomi -pikor wapomi -pikorms wapomi -protil wapomi -qvod wapomi -simfect wapomi -vjadtre wapomi -wali wapomi -stration warezov -webalt webalta -bulknet webprefix -klevate webprefix -blackice whiteice -blic whiteice -darksnow whiteice -autokms winactivator -kmsauto winactivator -hackkms winactivator -statblaster winfetcher -akan winwebsec -livesecurity winwebsec -mbro winwebsec -systemsecurity winwebsec -poweliks wowlik -powerliks wowlik -powessere wowlik -appquanta wkload -valla xorala -valhalla xorala -extrat xtrat -remtasu xtrat -xtreme xtrat -zbomber zombbomber -panda zbot -zbocheman zbot -zeus zbot -bjlog zegost -zeno zenosearch -maxplus zeroaccess -maxplusent zeroaccess -pmax zeroaccess -sirefef zeroaccess -smadow zeroaccess -zaccess zeroaccess -zona zvuzona -onestep zwangi -zwunzi zwangi diff --git a/avclass2/data/default.expansion b/avclass/data/default.expansion similarity index 100% rename from avclass2/data/default.expansion rename to avclass/data/default.expansion diff --git a/avclass/data/default.generics b/avclass/data/default.generics deleted file mode 100644 index 1fbef42..0000000 --- a/avclass/data/default.generics +++ /dev/null @@ -1,418 +0,0 @@ -# Architecture / OS -win -win32 -w32 -win64 -w64 -winnt -linux -unix -android -androidos -andr -macosx -osx -osx32 - -# Malicious software -malware -malicious -malagent -maldroid -dangerousobject - -# Heuristic detection -generic -generik -gen -agen -genmalicious -generickd -tsgeneric -genericr -heuristic -heur -siggen -genetic -genome -cloud -kcloud -memscan 
-high -score -attribute -advml -bloodhound -sape -maltrec -symvt -igeneric -eheur -posible -undefined -static - -# Malware classes -trojan -horse -troj -trj -trojanhorse -trojware -trojanransom -trojanspy -trojanapt -trojanclicker -trojanfakeav -trojanpsw -worm -networm -hllw -virus -fileinfector -infector -prepender -hllp -rootkit -spyware -ddos -flooder -dialer -porndialer -porn -backdoor -bkdr -keylog -keylogger -datastealer -stealer -infostealer -pwstealer -banker -monitor -mailer -email -emailworm -massmailer -smtp -stmp -spam -spammer -spambot -ransom -ransomlock -ransomcrypt -ransomware -filecoder -filecryptor -rogue -fakeav -fakealert -clicker -adclicker -click -miner -coinmine -coinminer -bitcoinminer -bitcoin -btcmine -bitminer -trojansms -smssend -searcher -phishing - -# Macro -macro -badmacro -maliciousmacro -w97m -o97m -x97m -pp97m -mw97 -w2km -mo97 -x2km - -# Downloader -downloader -downldr -dloader -dwnldr -dldr -dloadr -dloade -download -dload -downware -downagent -dropper -drop -dropr -dldrop -exedrop -mdropper -muldrop -droppr -trojandropper -trojandownloader -trojandwnldr -trjndwnlder -exedown -downldexe -dropped -docdl -docdrop -docdrp -macrodown -downloadware -dloadware - -# PUP -pup -pua -adware -potentially -unwanted -not-a-virus -riskware -risk -grayware -unwnt -addisplay -adknowledge -adload -applicunwnt -adplugin -plugin -downad -toolbar -webtoolbar -casino -casonline -install -installer -bundle -bundler -bundled -bundleapp -bundleinstaller -softwarebundler -nsis -browsermodifier -unsafe -securityrisk - -# Suspicious -suspected -suspect -suspicious -susp -suspic -suspectcrc -reputation -behaveslike -lookslike -variant -based -possible -threat -probably -confidence -highconfidence - -# Unclassified -unknown -unclassifiedmalware -undef - -# Behavior: injection -injector -inject -injecter -vbinject -injcrypt -injected - -# Behavior: homepage modification -homepage -startpage - -# Behavior: kill -avkill -killav -antiav -antifw -blocker - -# 
Behavior: signed -fakems -signed - -# Behavior: proxy -proxy -trojanproxy - -# Behavior: autorun -autorun -autoruner -starter - -# Behavior: network -netfilter -redirector -sniffer -portscan - -# Behavior: files -killfiles -renamer - -# Behavior: services -servstart -server - -# Behavior: VM detect -vmdetect -vmdetector - -# Packer -packer -cryptor -crypter -obfuscator -msilobfuscator -encoder - -# Packed -packed -malpack -encpk -malob -cryp -crypt -crypted -cryptic -genpack -krypt -kryptk -kryptik -obfuscated -obfus -obfusc -obfuscate -malcrypt -vbcrypt -vbkrypt -vbpack -xpack -zpack -susppack -suspiciouspacker - -# Packed (specific packers) -asprotect -nspack -pecompact -upack -themida -vmprotect - -# Program -program -application -appl -software -file - -# File types -text -html -script -word -msword -excel -msexcel -office -msoffice -shellcode -shellkode -msil -java -j2me -fakedoc -fakepdf -webpage -iframe -powershell -perl -python -flash -jpeg -autoit - -# Patch -pepatch -patchfile -patched -patcher - -# Exploit -exploit -expl - -# Corrupted -damaged -corrupt -pemalform -malpe - -# Tools -tool -risktool -securitytool -fraudtool -virtool -keygen -hack -hacktool -hktl -spamtool -crack -cracktool - -# Small -small -tiny - -# Generic families -agent -eldorado -artemis -krap -kazy -katusha -pornoasset -foreign -symmi -jorik -graftor -strictor - -# Test -test -testvirus - -# Misc -password -website -encodefeature -multi -normal -other -optional -access -onion - diff --git a/avclass2/data/default.tagging b/avclass/data/default.tagging similarity index 100% rename from avclass2/data/default.tagging rename to avclass/data/default.tagging diff --git a/avclass2/data/default.taxonomy b/avclass/data/default.taxonomy similarity index 100% rename from avclass2/data/default.taxonomy rename to avclass/data/default.taxonomy diff --git a/avclass2/data/misp/cluster/avclass2.json b/avclass/data/misp/cluster/avclass2.json similarity index 100% rename from 
avclass2/data/misp/cluster/avclass2.json rename to avclass/data/misp/cluster/avclass2.json diff --git a/avclass2/data/misp/galaxy/avclass2.json b/avclass/data/misp/galaxy/avclass2.json similarity index 100% rename from avclass2/data/misp/galaxy/avclass2.json rename to avclass/data/misp/galaxy/avclass2.json diff --git a/shared/evaluate_clustering.py b/avclass/evaluate.py similarity index 100% rename from shared/evaluate_clustering.py rename to avclass/evaluate.py diff --git a/avclass2/avclass2_labeler.py b/avclass/labeler.py similarity index 94% rename from avclass2/avclass2_labeler.py rename to avclass/labeler.py index ea172ed..577b9bb 100755 --- a/avclass2/avclass2_labeler.py +++ b/avclass/labeler.py @@ -1,27 +1,24 @@ #!/usr/bin/env python3 -''' -AVClass2 labeler -''' -import os -import sys -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(1, os.path.join(script_dir, 'lib/')) -sys.path.insert(1, os.path.join(script_dir, '../shared/')) import argparse -from avclass2_common import AvLabels -from operator import itemgetter -import evaluate_clustering as ec +import gzip import json +import os +import sys import traceback -import gzip -# Default tagging file -default_tag_file = os.path.join(script_dir, "data/default.tagging") -# Default expansion file -default_exp_file = os.path.join(script_dir, "data/default.expansion") -# Default taxonomy file -default_tax_file = os.path.join(script_dir, "data/default.taxonomy") +from operator import itemgetter + +try: + from avclass import DEFAULT_TAX_PATH, DEFAULT_TAG_PATH, DEFAULT_EXP_PATH + from avclass.common import AvLabels, Taxonomy + from avclass import evaluate as ec +except ModuleNotFoundError: + # Helps find the avclasses when run from console + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from avclass import DEFAULT_TAX_PATH, DEFAULT_TAG_PATH, DEFAULT_EXP_PATH + from avclass.common import AvLabels, Taxonomy + from avclass import evaluate as ec def guess_hash(h): 
''' Given a hash string, guess the hash type based on the string length ''' @@ -61,7 +58,10 @@ def list_str(l, sep=", ", prefix=""): out = out + sep + s return out -def main(args): +def main(): + # Parse arguments + args = parse_args() + # Select hash used to identify sample, by default MD5 hash_type = args.hash if args.hash else 'md5' @@ -362,8 +362,8 @@ def main(args): sys.stderr.write('[-] Alias data in %s\n' % (alias_filename)) -if __name__=='__main__': - argparser = argparse.ArgumentParser(prog='avclass2_labeler', +def parse_args(): + argparser = argparse.ArgumentParser(prog='avclass', description='''Extracts tags for a set of samples. Also calculates precision and recall if ground truth available''') @@ -400,15 +400,15 @@ def main(args): argparser.add_argument('-tag', help='file with tagging rules.', - default = default_tag_file) + default = DEFAULT_TAG_PATH) argparser.add_argument('-tax', help='file with taxonomy.', - default = default_tax_file) + default = DEFAULT_TAX_PATH) argparser.add_argument('-exp', help='file with expansion rules.', - default = default_exp_file) + default = DEFAULT_EXP_PATH) argparser.add_argument('-av', help='file with list of AVs to use') @@ -464,7 +464,7 @@ def main(args): args.tag)) else: sys.stderr.write('[-] Using default tagging rules in %s\n' % ( - default_tag_file)) + DEFAULT_TAG_PATH)) if args.tax: if args.tax == '/dev/null': @@ -474,7 +474,7 @@ def main(args): args.tax)) else: sys.stderr.write('[-] Using default taxonomy in %s\n' % ( - default_tax_file)) + DEFAULT_TAX_PATH)) if args.exp: if args.exp == '/dev/null': @@ -484,6 +484,9 @@ def main(args): args.exp)) else: sys.stderr.write('[-] Using default expansion tags in %s\n' % ( - default_exp_file)) + DEFAULT_EXP_PATH)) + + return args - main(args) +if __name__ == "__main__": + main() diff --git a/avclass/lib/avclass_common.py b/avclass/lib/avclass_common.py deleted file mode 100755 index 29e308e..0000000 --- a/avclass/lib/avclass_common.py +++ /dev/null @@ -1,336 +0,0 @@ 
-''' -Main AVClass class -''' - -import re -import string -from collections import OrderedDict as OrdDict -from collections import namedtuple -from operator import itemgetter, attrgetter - -SampleInfo = namedtuple('SampleInfo', - ['md5', 'sha1', 'sha256', 'labels']) - -# AVs to use in is_pup method -pup_av_set = {'Malwarebytes', 'K7AntiVirus', 'Avast', - 'AhnLab-V3', 'Kaspersky', 'K7GW', 'Ikarus', - 'Fortinet', 'Antiy-AVL', 'Agnitum', 'ESET-NOD32'} - -# Tokens that indicate PUP used by is_pup method -pup_tokens = {'PUA', 'Adware', 'PUP', 'Unwanted', 'Riskware', 'grayware', - 'Unwnt', 'Adknowledge', 'toolbar', 'casino', 'casonline', - 'AdLoad', 'not-a-virus'} - -# AVs to use in suffix removal -suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky', - 'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo', - 'GData', 'Sophos', - 'TrendMicro-HouseCall', 'TrendMicro', - 'NANO-Antivirus', 'Microsoft'} - -class AvLabels: - ''' - Class to operate on AV labels, - such as extracting the most likely family name. 
- ''' - def __init__(self, gen_file = None, alias_file = None, av_file = None): - - # Read generic token set from file - self.gen_set = self.read_generics(gen_file) if gen_file else set() - - # Read aliases map from file - self.aliases_map = self.read_aliases(alias_file) if alias_file else {} - - # Read AV engine set from file - self.avs = self.read_avs(av_file) if av_file else None - - @staticmethod - def read_aliases(alfile): - '''Read aliases map from given file''' - if alfile is None: - return {} - almap = {} - with open(alfile, 'r') as fd: - for line in fd: - alias, token = line.strip().split()[0:2] - almap[alias] = token - return almap - - @staticmethod - def read_generics(generics_file): - '''Read generic token set from given file''' - gen_set = set() - with open(generics_file) as gen_fd: - for line in gen_fd: - if line.startswith('#') or line == '\n': - continue - gen_set.add(line.strip()) - return gen_set - - @staticmethod - def read_avs(avs_file): - '''Read AV engine set from given file''' - with open(avs_file) as fd: - avs = set(map(str.strip, fd.readlines())) - return avs - - @staticmethod - def get_sample_info_lb(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'], - vt_rep['av_labels']) - - @staticmethod - def get_sample_info_vt_v2(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available - try: - scans = vt_rep['scans'] - md5 = vt_rep['md5'] - sha1 = vt_rep['sha1'] - sha256 = vt_rep['sha256'] - except KeyError: - return None - # Obtain labels from scan results - for av, res in scans.items(): - if res['detected']: - label = res['result'] - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() - label_pairs.append((av, clean_label)) - - return SampleInfo(md5, sha1, sha256, label_pairs) - - 
@staticmethod - def get_sample_info_vt_v3(vt_rep): - '''Parse and extract sample information from JSON line - Returns a SampleInfo named tuple - ''' - label_pairs = [] - # Obtain scan results, if available - try: - scans = vt_rep['data']['attributes']['last_analysis_results'] - md5 = vt_rep['data']['attributes']['md5'] - sha1 = vt_rep['data']['attributes']['sha1'] - sha256 = vt_rep['data']['attributes']['sha256'] - except KeyError: - return None - # Obtain labels from scan results - for av, res in scans.items(): - label = res['result'] - if label is not None: - clean_label = ''.join(filter( - lambda x: x in string.printable, - label)).strip() - label_pairs.append((av, clean_label)) - - return SampleInfo(md5, sha1, sha256, label_pairs) - - @staticmethod - def is_pup(av_label_pairs): - '''This function classifies the sample as PUP or not - using the AV labels as explained in the paper: - "Certified PUP: Abuse in Authenticode Code Signing" - (ACM CCS 2015) - It uses the AV labels of 11 specific AVs. - The function checks for 13 keywords used to indicate PUP. - Return: - True/False/None - ''' - # If no AV labels, nothing to do, return - if not av_label_pairs: - return None - # Initialize - pup = False - threshold = 0.5 - # Set with (AV name, Flagged/not flagged as PUP), for AVs in pup_av_set - bool_set = set([(pair[0], t.lower() in pair[1].lower()) - for t in pup_tokens - for pair in av_label_pairs - if pair[0] in pup_av_set]) - - # Number of AVs that had a label for the sample - av_detected = len([p[0] for p in av_label_pairs - if p[0] in pup_av_set]) - - # Number of AVs that flagged the sample as PUP - av_pup = list(map(lambda x: x[1], bool_set)).count(True) - - # Flag as PUP according to a threshold - if (float(av_pup) >= float(av_detected)*threshold) and av_pup != 0: - pup = True - return pup - - - @staticmethod - def _remove_suffixes(av_name, label): - '''Remove AV specific suffixes from given label - Returns updated label''' - - # Truncate after last '.' 
- if av_name in suffix_removal_av_set: - label = label.rsplit('.', 1)[0] - - # Truncate after last '.' - # if suffix only contains digits or uppercase (no lowercase) chars - if av_name == 'AVG': - tokens = label.rsplit('.', 1) - if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]): - label = tokens[0] - - # Truncate after last '!' - if av_name in set(['Agnitum','McAffee','McAffee-GW-Edition']): - label = label.rsplit('!', 1)[0] - - # Truncate after last '(' - if av_name in set(['K7AntiVirus', 'K7GW']): - label = label.rsplit('(', 1)[0] - - # Truncate after last '@' - # GData would belong here, but already trimmed earlier - if av_name in set(['Ad-Aware', 'BitDefender', 'Emsisoft', 'F-Secure', - 'Microworld-eScan']): - label = label.rsplit('(', 1)[0] - - return label - - - def _normalize(self, label, hashes): - '''Tokenize label, filter tokens, and replace aliases''' - - # If empty label, nothing to do - if not label: - return [] - - # Initialize list of tokens to return - ret = [] - - # Split label into tokens and process each token - for token in re.split("[^0-9a-zA-Z]", label): - # Convert to lowercase - token = token.lower() - - # Remove digits at the end - end_len = len(re.findall("\d*$", token)[0]) - if end_len: - token = token[:-end_len] - - # Ignore short token - if len(token) < 4: - continue - - # Remove generic tokens - if token in self.gen_set: - continue - - # Ignore token if prefix of a hash of the sample - # Most AVs use MD5 prefixes in labels, - # but we check SHA1 and SHA256 as well - hash_token = False - for hash_str in hashes: - if hash_str[0:len(token)] == token: - hash_token = True - break - if hash_token: - continue - - # Replace alias - token = self.aliases_map[token] if token in self.aliases_map \ - else token - - # Add token - ret.append(token) - return ret - - def get_family_ranking(self, sample_info): - ''' - Returns sorted dictionary of most likely family names for sample - ''' - # Extract info from named tuple - av_label_pairs = 
sample_info[3] - hashes = [ sample_info[0], sample_info[1], sample_info[2] ] - - # Whitelist the AVs to filter the ones with meaningful labels - av_whitelist = self.avs - - # Initialize auxiliary data structures - labels_seen = set() - token_map = {} - - # Process each AV label - for (av_name, label) in av_label_pairs: - # If empty label, nothing to do - if not label: - continue - - ################ - # AV selection # - ################ - if av_whitelist and av_name not in av_whitelist: - continue - - ##################### - # Duplicate removal # - ##################### - - # Emsisoft uses same label as - # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan, - # but suffixes ' (B)' to their label. Remove the suffix. - if label.endswith(' (B)'): - label = label[:-4] - - # F-Secure uses Avira's engine since Nov. 2018 - # but prefixes 'Malware.' to Avira's label. Remove the prefix. - if label.startswith('Malware.'): - label = label[8:] - - # Other engines often use exactly the same label, e.g., - # AVG/Avast - # K7Antivirus/K7GW - # Kaspersky/ZoneAlarm - - # If we have seen the exact same label before, skip - if label in labels_seen: - continue - # If not, we add it to the set of labels seen - else: - labels_seen.add(label) - - ################## - # Suffix removal # - ################## - label = self._remove_suffixes(av_name, label) - - ######################################################## - # Tokenization, token filtering, and alias replacement # - ######################################################## - tokens = self._normalize(label, hashes) - - # Increase token count in map - for t in tokens: - c = token_map[t] if t in token_map else 0 - token_map[t] = c + 1 - - ################################################################## - # Token ranking: sorts tokens by decreasing count and then token # - ################################################################## - sorted_tokens = sorted(token_map.items(), - key=itemgetter(1,0), - reverse=True) - - # 
Delete the tokens appearing only in one AV, add rest to output - sorted_dict = OrdDict() - for t, c in sorted_tokens: - if c > 1: - sorted_dict[t] = c - else: - break - - return sorted_dict - diff --git a/avclass2/avclass2_input_checker.py b/avclass/normalize.py similarity index 68% rename from avclass2/avclass2_input_checker.py rename to avclass/normalize.py index a700ca1..b4727df 100755 --- a/avclass2/avclass2_input_checker.py +++ b/avclass/normalize.py @@ -1,34 +1,32 @@ #!/usr/bin/env python3 -''' -AVClass2 input checker -''' +import argparse import os import sys -import argparse -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(1, os.path.join(script_dir, 'lib/')) -from avclass2_common import Taxonomy, Tagging, Expansion -default_tag_file = "data/default.tagging" -default_tax_file = "data/default.taxonomy" -default_exp_file = "data/default.expansion" +try: + from avclass import DEFAULT_TAX_PATH, DEFAULT_TAG_PATH, DEFAULT_EXP_PATH + from avclass.common import Taxonomy, Tagging, Expansion +except ModuleNotFoundError: + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from avclass import DEFAULT_TAX_PATH, DEFAULT_TAG_PATH, DEFAULT_EXP_PATH + from avclass.common import Taxonomy, Tagging, Expansion -if __name__ == '__main__': +def main(): argparser = argparse.ArgumentParser(prog='input_checker', description='Checks format of files Tagging, Expansion and Taxonomy.') argparser.add_argument('-tag', help='tagging file', - default=default_tag_file) + default=DEFAULT_TAG_PATH) argparser.add_argument('-tax', help='taxonomy file', - default=default_tax_file) + default=DEFAULT_TAX_PATH) argparser.add_argument('-exp', help='expansion file', - default=default_exp_file) + default=DEFAULT_EXP_PATH) # Parse arguments args = argparser.parse_args() @@ -54,3 +52,6 @@ sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % ( len(expansion), args.exp)) +if __name__ == "__main__": + main() + diff --git 
a/avclass2/avclass2_update_module.py b/avclass/update.py similarity index 93% rename from avclass2/avclass2_update_module.py rename to avclass/update.py index a3e70e6..93cf8e7 100755 --- a/avclass2/avclass2_update_module.py +++ b/avclass/update.py @@ -1,20 +1,23 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -''' -AVClass2 Update module -''' -import sys -import os + import argparse import logging -# Make sure paths are relative to execution path -script_dir = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(script_dir, 'lib/')) -from operator import itemgetter +import os +import sys + from collections import namedtuple -from avclass2_common import Taxonomy, Expansion, Tagging +from operator import itemgetter # from Levenshtein import ratio as levenshtein_ratio +try: + from avclass import DEFAULT_TAX_PATH, DEFAULT_TAG_PATH, DEFAULT_EXP_PATH + from avclass.common import Taxonomy, Tagging, Expansion +except ModuleNotFoundError: + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + from avclass import DEFAULT_TAX_PATH, DEFAULT_TAG_PATH, DEFAULT_EXP_PATH + from avclass.common import Taxonomy, Tagging, Expansion + # Set logging log = logging.getLogger(__name__) @@ -27,14 +30,6 @@ root.setLevel(logging.DEBUG) root.addHandler(handler_stderr) - -# Default tagging file -default_tagging_file = os.path.join(script_dir, "data/default.tagging") -# Default expansion file -default_expansion_file = os.path.join(script_dir, "data/default.expansion") -# Default taxonomy file -default_taxonomy_file = os.path.join(script_dir, "data/default.taxonomy") - # Threshold for string similarity # sim_threshold = 0.6 @@ -198,7 +193,7 @@ def process_relation(self, rel): log.debug("Processing %s\t%s" % (p1, p2)) # If both directions strong, then equivalent, i.e., alias - if (float(rel.tinv_alias_num) >= args.t): + if (float(rel.tinv_alias_num) >= self._t): if (c1 != "UNK") and (c2 == "UNK"): prefix = p1[0:p1.rfind(':')] elif (c1 == "UNK") 
and (c2 != "UNK"): @@ -390,26 +385,26 @@ def output_rule_stats(self, fd): def output(self, out_prefix): if (not out_prefix): - tax_filepath = default_taxonomy_file - tag_filepath = default_tagging_file - exp_filepath = default_expansion_file + tax_filepath = DEFAULT_TAX_PATH + tag_filepath = DEFAULT_TAG_PATH + exp_filepath = DEFAULT_EXP_PATH else: tax_filepath = out_prefix + ".taxonomy" tag_filepath = out_prefix + ".tagging" exp_filepath = out_prefix + ".expansion" - taxonomy.to_file(tax_filepath) + self._out_taxonomy.to_file(tax_filepath) log.info('[-] Output %d taxonomy tags to %s' % ( - len(taxonomy), tax_filepath)) - tagging.expand_all_destinations() - tagging.to_file(tag_filepath) + len(self._out_taxonomy), tax_filepath)) + self._out_tagging.expand_all_destinations() + self._out_tagging.to_file(tag_filepath) log.info('[-] Output %d tagging rules to %s' % ( - len(tagging), tag_filepath)) - expansion.to_file(exp_filepath) + len(self._out_tagging), tag_filepath)) + self._out_expansion.to_file(exp_filepath) log.info('[-] Output %d expansion rules to %s' % ( - len(expansion), exp_filepath)) + len(self._out_expansion), exp_filepath)) -if __name__ == '__main__': +def main(): argparser = argparse.ArgumentParser( description='''Given a .alias file from the labeler, generates updates for the taxonomy, tagging, and expansion files.''') @@ -438,15 +433,15 @@ def output(self, out_prefix): argparser.add_argument('-tag', help='file with tagging rules.', - default = default_tagging_file) + default = DEFAULT_TAG_PATH) argparser.add_argument('-tax', help='file with taxonomy.', - default = default_taxonomy_file) + default = DEFAULT_TAX_PATH) argparser.add_argument('-exp', help='file with expansion rules.', - default = default_expansion_file) + default = DEFAULT_EXP_PATH) argparser.add_argument('-v', '--verbose', action='store_true', @@ -509,3 +504,6 @@ def output(self, out_prefix): # Output final rules update.output_relations(out_prefix + ".final.rules") +if __name__ == 
"__main__": + main() + diff --git a/avclass2/README.md b/avclass2/README.md deleted file mode 100644 index b01e394..0000000 --- a/avclass2/README.md +++ /dev/null @@ -1,290 +0,0 @@ -# AVClass2 - -AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). - -You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) -and it outputs tags observed in the AV labels, ranked by decreasing popularity. - -The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper. - -> Silvia Sebastián, Juan Caballero. -AVClass2: Massive Malware Tag Extraction from AV Labels. -In proceedings of the Annual Computer Security Applications Conference, December 2020. - -In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module. - - -## Labeling - -The labeler takes as input a JSON file with the AV labels of malware samples -(-vt or -lb options), -a file with the taxonomy (-tax option), -a file with tagging rules (-tag option), and -a file with expansion rules (-exp option). -It outputs a set of ranked tags. -If you do not provide taxonomy, expansion or tagging files, -the default ones in the data folder are used. - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -``` - -The above command labels the samples whose AV labels are in -the ../examples/malheurReference_lb.json file. -It prints the results to stdout. 
-The output looks like this: - -``` -aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2 -67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2 -``` - -which means sample *aca2d12934935b070df8f50e06a20539* -was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, -8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, -3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. -Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them -consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. - -The -p option outputs the full path of each tag in the taxonomy: - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p -``` - -The above command line outputs: - -``` -aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 -67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 -``` - -where each tag has been replaced by its taxonomy path, which starts with the category in capitals, -followed by the path in the category (if any), and the tag itself, all separated by colons. -For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, -*CLASS:grayware* that *grayware* is a malware class, and -*CLASS:grayware:adware* that *adware* is a subclass of *grayware*. - -**Compatibility mode** - -The compatibility -c option makes AVClass2 output the same format as AVClass. 
- -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c -``` - -outputs: - -``` -bb23e1d296cf01bbaf32ed3938f9b0b8 allaple -cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349 -``` - -As in AVClass, the output contains only the family name, -which corresponds to the highest ranked family tag, all other tags are ignored. -Samples for which a family cannot be obtained are labeled as singletons with their hash. - -It is important to note that AVClass2 compatibility mode results can differ from AVClass results -on the same input file. -The differences in family names are due to differences between the generics and aliases files -used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2. -In the future, we may change AVClass to use the taxonomy and rules from AVClass2 -as input (instead of the generics and aliases files) -to minimize such differences and avoid maintaining different data files. - - -## Input JSON format - -AVClass2 supports three input JSON formats: - -1. VirusTotal v2 API JSON reports (*-vt file*), -where each line in the input *file* should be the full JSON of a -VirusTotal v2 API response to the */file/report* endpoint, -e.g., obtained by querying https://www.virustotal.com/vtapi/v2/file/report?apikey={apikey}&resource={hash} -There is an example VirusTotal v2 input file in examples/vtv2_sample.json - -2. VirusTotal v3 API JSON reports (*-vt file -vt3*), -where each line in the input *file* should be the full JSON of a VirusTotal API version 3 response with a *File* object report, -e.g., obtained by querying https://www.virustotal.com/api/v3/files/{hash} -There is an example VirusTotal v3 input file in examples/vtv3_sample.json - -3. Simplified JSON (*-lb file*), -where each line in *file* should be a JSON -with (at least) these fields: -{md5, sha1, sha256, av_labels}. 
-There is an example of such input file in *examples/malheurReference_lb.json* - - -**Multiple input files** - -AVClass2 can handle multiple input files putting the results in the same output files -(if you want results in separate files, process each input file separately). - -It is possible to provide the -vt and -lb input options multiple times. - -```shell -$./avclass2_labeler.py -vt <file1> -vt <file2> -``` -```shell -$./avclass2_labeler.py -lb <file1> -lb <file2> -``` - -There are also -vtdir and -lbdir options that can be used to provide -an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: - -```shell -$./avclass2_labeler.py -vtdir <directory> -``` - -It is also possible to combine -vt with -vtdir and -lb with -lbdir, -but you cannot combine input files of different format. Thus, this command works: - -```shell -$./avclass2_labeler.py -vt <file1> -vtdir <directory> -``` - -But, this one throws an error: - -```shell -$./avclass2_labeler.py -vt <file1> -lb <file2> -``` - -At this point you have read the most important information on how to use AVClass2. -The following sections describe steps that most users will not need. - -## Labeling: Using only Selected AV Engines - -By default, AVClass2 will use the labels of all AV engines that appear in -the input reports. -If you want to limit AVClass2 to use only the labels of certain AV engines, -you can use the -av option to pass it a file where each line has the name of -an AV engine (case-sensitive). - -For example, you could create a file engines.txt with three lines: -Agnitum -Symantec -TotalDefense - -```shell -$./avclass2_labeler.py -av engines.txt -vt ../examples/vtv2_sample.json > example.labels -``` - -would output into example.labels: -``` -602695c8f2ad76564bddcaf47b76edff 2 -f117cc1477513cb181cc2e9fcaab39b2 3 winwebsec|2 -``` - -where only the labels of Agnitum, Symantec, and TotalDefense have been used -to extract tags.
-Note that the number of detections is with respect to the provided engines, -i.e., even if the first sample has 52 detections, -only 2 of the 3 selected engines detected it. - -## Labeling: Ground Truth Evaluation - -If you have family ground truth for some malware samples, i.e., -you know the true family for those samples, you can evaluate the accuracy -of the family tags output by AVClass2 on those samples with respect to that ground truth. -The evaluation metrics used are precision, recall, and F1 measure. -See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. -Note that the ground truth evaluation does not apply to non-family tags, -i.e., it only evaluates the output of the compatibility mode. - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels -``` - -The output includes these lines: - -``` -Calculating precision and recall -3131 out of 3131 -Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 -``` - -Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns: - -``` -aca2d12934935b070df8f50e06a20539 ADROTATOR -``` - -which indicates that sample aca2d12934935b070df8f50e06a20539 is known -to be of the *ADROTATOR* family. -Each sample in the input file should also appear in the ground truth file. -Note that the particular label assigned to each family does not matter. -What matters is that all samples in the same family are assigned -the same family name (i.e., the same string in the second column) - -The ground truth can be obtained from publicly available malware datasets. -The one in *../examples/malheurReference_gt.tsv* comes from the -[Malheur](http://www.mlsec.org/malheur/) dataset. -There are other public datasets with ground truth such as -[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or -[Malicia](http://malicia-project.com/dataset.html). 
- -## Update Module - -The update module can be used to suggest additions and changes to the input -taxonomy, tagging rules, and expansion rules. -Using the update module comprises of two steps. -The first step is obtaining an alias file from the labeler: - -```shell -$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect -``` - -The above command will create a file named \<file\>.alias, -malheurReference_lb.alias in our example. This file has 7 columns: - -1. t1: token that is an alias -2. t2: tag for which t1 is an alias -3. |t1|: number of input samples where t1 was observed -4. |t2|: number of input samples where t2 was observed -5. |t1^t2|: number of input samples where both t1 and t2 were observed -6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. -7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. - - -The Update Module takes the above file as input with the -alias option, -as well as the default taxonomy, tagging, and expansion files in the data directory. -It outputs updated taxonomy, tagging, and expansion files that include the -suggested additions and changes. - -```shell -$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix -``` - -This will produce three files: -output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. -You can diff the output and input files to analyze the proposed changes. - -You can also modify the input taxonomy, tagging, and expansion rules in place, -rather than producing new files: - - -```shell -$./avclass2_update_module.py -alias malheurReference_lb.alias -update -``` - - -## Customizing AVClass2 - -AVClass2 is fully customizable: -Tagging, Expansion and Taxonomy files can be easily modified by the analyst -either manually or by running the update module.
- -If you change those files manually, we recommend running -afterwards the input checker script to keep them tidy. -It sorts the tags in the taxonomy and performs some basic cleaning like -removing redundant entries: - -```shell -$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file -``` - -If the modifications are in the default files in the data directory you can simply run: - -```shell -$./avclass2_input_checker.py -``` diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..c3d05b3 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "avclass-malicialab" +version = "2.5.0" +description = "AVClass is a Python package and command line tool to tag / label malware samples." +readme = "README.md" +authors = [{ name = "MaliciaLab" }] +license = { file = "LICENSE" } +classifiers = [ + "License :: OSI Approved :: MIT License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Topic :: Security", +] +keywords = ["malware", "malware family", "tag", "AV label"] +dependencies = [ +] + +[project.urls] +Homepage = "https://github.com/malicialab/avclass" + +[project.scripts] +avclass = "avclass.labeler:main" +avclass-update = "avclass.update:main" +avclass-normalize = "avclass.normalize:main"