From a78b60e086a55bd1fa2c1e74a17fb6a5bfe79c8d Mon Sep 17 00:00:00 2001 From: Juan Caballero Date: Tue, 1 Sep 2020 19:26:48 +0200 Subject: [PATCH] Added first version of AVClass2 --- LICENSE | 2 +- README.md | 552 ++----- avclass/README.md | 392 +++++ .../avclass_alias_detect.py | 0 .../avclass_generic_detect.py | 0 .../avclass_labeler.py | 0 {data => avclass/data}/default.aliases | 0 {data => avclass/data}/default.generics | 0 {lib => avclass/lib}/avclass_common.py | 0 {lib => avclass/lib}/evaluate_clustering.py | 0 avclass2/README.md | 252 ++++ avclass2/avclass2_input_checker.py | 51 + avclass2/avclass2_labeler.py | 469 ++++++ avclass2/avclass2_update_module.py | 480 ++++++ avclass2/data/expansion | 17 + avclass2/data/tagging | 1300 +++++++++++++++++ avclass2/data/taxonomy | 1138 +++++++++++++++ avclass2/lib/avclass2_common.py | 636 ++++++++ avclass2/lib/evaluate_clustering.py | 141 ++ {data => examples}/malheurReference_gt.tsv | 0 {data => examples}/malheurReference_lb.json | 0 examples/vtv3_sample.json | 1 + 22 files changed, 4999 insertions(+), 432 deletions(-) create mode 100644 avclass/README.md rename avclass_alias_detect.py => avclass/avclass_alias_detect.py (100%) rename avclass_generic_detect.py => avclass/avclass_generic_detect.py (100%) rename avclass_labeler.py => avclass/avclass_labeler.py (100%) rename {data => avclass/data}/default.aliases (100%) rename {data => avclass/data}/default.generics (100%) rename {lib => avclass/lib}/avclass_common.py (100%) rename {lib => avclass/lib}/evaluate_clustering.py (100%) create mode 100644 avclass2/README.md create mode 100755 avclass2/avclass2_input_checker.py create mode 100755 avclass2/avclass2_labeler.py create mode 100755 avclass2/avclass2_update_module.py create mode 100644 avclass2/data/expansion create mode 100644 avclass2/data/tagging create mode 100644 avclass2/data/taxonomy create mode 100755 avclass2/lib/avclass2_common.py create mode 100755 avclass2/lib/evaluate_clustering.py rename {data => 
examples}/malheurReference_gt.tsv (100%) rename {data => examples}/malheurReference_lb.json (100%) create mode 100644 examples/vtv3_sample.json diff --git a/LICENSE b/LICENSE index 7996e63..1b6a62b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2016 MaliciaLab @ IMDEA Software Institute +Copyright (c) 2016-2020 MaliciaLab @ IMDEA Software Institute Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index f7d1ee8..fcdd06f 100644 --- a/README.md +++ b/README.md @@ -1,484 +1,174 @@ -# AVClass +# AVClass and AVClass2 -[AVClass](https://github.com/malicialab/avclass) -is a malware labeling tool. +AVClass and AVClass2 are Python tools to tag / label malware samples. +You give them as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) +and they output tags extracted from the AV labels of each sample. +The original AVClass only outputs family names (i.e., family tags). +By default, it outputs the most likely family for each sample (e.g., *zbot*, *virut*). +It can also output a ranking of all alternative family names it found for each sample. +The newer AVClass2, in addition to family names, also outputs other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). -You give it as input the AV labels for a large number of -malware samples (e.g., VirusTotal JSON reports) and it outputs the most -likely family name for each sample that it can extract from the AV labels. -It can also output a ranking of all alternative names it found for each sample. +A quick example helps illustrate the differences. 
If you run AVClass2 on our example input file: -The design and evaluation of AVClass is detailed in our -[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): - -> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. -AVClass: A tool for Massive Malware Labeling. -In Proceedings of the International Symposium on Research in -Attacks, Intrusions and Defenses, -September 2016. - -In a nutshell, AVClass comprises two phases: -preparation (optional) and labeling. -Code for both is included, -but most users will be only interested in the labeling, which outputs the -family name for the samples. -The preparation produces a list of aliases and generic tokens -used by the labeling. -If you use our default aliases and generic tokens lists, -you do not need to run the preparation. - -**Why is AVClass useful?** - -Because a lot of times security researchers want to extract -family information from AV labels, -but this process is not as simple as it looks, -especially if you need to do it for large numbers (e.g., millions) of -samples. Some advantages of AVClass are: - -1. *Automatic.* - AVClass removes manual analysis limitations on the size of - the input dataset. - -2. *Vendor-agnostic.* - AVclass operates on the labels of any available set of AV engines, - which can vary from sample to sample. - -3. *Cross-platform.* - AVclass can be used for any platforms supported by AV engines, - e.g., Windows or Android malware. - -4. *Does not require executables.* - AV labels can be obtained from online services like VirusTotal - using a sample's hash, even when the executable is not available. - -5. *Quantified accuracy.* - We have evaluated AVClass on 5 publicly available malware datasets with - ground truth. Details are in the above RAID 2016 paper. - -6. *Open source.* - The code is available and we are happy to incorporate suggestions and - improvements so that the security community benefits from AVClass. 
- -**Limitations** - -The main limitation of AVClass is that its output depends on the input -AV labels. -It tries to compensate for the noise on those labels, but -cannot identify the family of a sample if AV engines do not provide -non-generic family names to that sample. -In particular, it cannot label samples if at least 2 AV engines -do not agree on a non-generic family name. -Results on 8 million samples showed that AVClass could label 81% of the -samples. -In other words, it could not label 19% of the -samples because their labels contained only generic tokens. - -Still, there are many samples that AVClass can label and thus we believe -you will find it a useful tool. -We recommend you to read the discussion section in our RAID 2016 paper for -more details. - -## Labeling - -The labeler takes as input -a JSON file with the AV labels of malware samples (-vt or -lb switches), -a file with generic tokens (-gen switch), -and a file with aliases (-alias switch). -It outputs the most likely family name for each sample. -If you do not provide alias or generic tokens files, -the default ones in the *data* folder are used. - -``` -$./avclass_labeler.py -lb data/malheurReference_lb.json -v > malheurReference.labels -``` - -The above command labels the samples whose AV labels are in the -*data/malheurReference_lb.json* file. -It prints the results to stdout, -which we redirect to the *malheurReference.labels* file. -The output looks like this: - -``` -aca2d12934935b070df8f50e06a20539 adrotator -67d15459e1f85898851148511c86d88d adultbrowser +```shell +$./avclass2/avclass2_labeler.py -lb examples/malheurReference_lb.json -p ``` -which means sample aca2d12934935b070df8f50e06a20539 is most likely -from the *adrotator* family and -67d15459e1f85898851148511c86d88d from the *adultbrowser* family. 
- -The verbose (-v) switch makes it output an extra -*malheurReference_lb.verbose* file -with all families extracted for each sample ranked by the number of AV -engines that use that family. -The file looks like this: +the output on stdout is: ``` -aca2d12934935b070df8f50e06a20539 [(u'adrotator', 8), (u'zlob', 2)] -ee90a64fcfaa54a314a7b5bfe9b57357 [(u'swizzor', 19)] -f465a2c1b852373c72a1ccd161fbe94c SINGLETON:f465a2c1b852373c72a1ccd161fbe94c +aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 +67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 ``` +which means sample *aca2d12934935b070df8f50e06a20539* +was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, +8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, +3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. +Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them +consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. -which means that for sample aca2d12934935b070df8f50e06a20539 -there are 8 AV engines assigning *adrotator* as the family and -another 2 assigning *zlob*. -Thus, *adrotator* is the most likely family. -On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV -engines assigning *swizzor* as family, -and no other family was found. -The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c -no family name was found in the AV labels. -Thus, the sample is placed by himself in a singleton cluster -with the name of the cluster being the sample's hash. 
- -Note that the sum of the number of AV engines may not equal the number -of AV engines with a label for that sample in the input file -because the labels of some AV engines may only include generic tokens -that are removed by AVClass. - - -## Input JSON format - -AVClass supports two input JSON formats: - -1. VirusTotal JSON reports (*-vt file*), -where each line in *file* should be the full JSON of a -VirusTotal report as fetched through the VirusTotal API. -By default, it assumes the VT reports are from VT API version 2. -If the VT reports are from VT API version 3, add the -vt3 command line option. +If you instead run AVClass on the same input file: -2. Simplified JSON (*-lb file*), -where each line in *file* should be a JSON -with (at least) these fields: -{md5, sha1, sha256, av_labels}. -There is an example of such input file in *data/malheurReference_lb.json* - -**Why have 2 different input formats?** - -We believe most users will get the AV labels using VirusTotal. -However, AVClass is IO-bound and a VirusTotal report -in addition to the AV labels and hashes includes -much other data that AVClass does not need. -Thus, when applying AVClass to millions of samples, -reducing the input file size by removing unnnecessary data -significantly improves efficiency. -Furthermore, users could obtain AV labels from other sources and -the simpler the input JSON format, -the easier to convert those AV labels into an input file. - -**Multiple input files** - -AVClass can handle multiple input files putting the results in the same -output files -(if you want results in separate files, process each input file separately). - -It is possible to provide the -vt and -lb input options multiple times. 
- -``` -$./avclass_labeler.py -vt -vt > all.labels -``` -``` -$./avclass_labeler.py -lb -lb > all.labels -``` - -There are also -vtdir and -lbdir options that can be used to provide -an input directory where all files are VT (-vtdir) -or simplified (-lbdir) JSON reports - - -``` -$./avclass_labeler.py -vtdir > all.labels -``` - -It is also possible to combine -vt with -vtdir and -lb with -lbdir, -but you cannot combine input files of different format. -Thus, this command works: - - -``` -$./avclass_labeler.py -vt -vtdir > all.labels +```shell +$./avclass/avclass_labeler.py -lb examples/malheurReference_lb.json ``` -But, this one throws an error: +the output looks like this: ``` -$./avclass_labeler.py -vt -lb > all.labels -``` - - -At this point you have read the most important information on how to use -AVClass. -The following sections describe steps that most users will not need. - -## Labeling: Family Ranking - -AVClass has a -fam switch to output a file with a ranking of the -families assigned to the input samples. +aca2d12934935b070df8f50e06a20539 adrotator +67d15459e1f85898851148511c86d88d adultbrowser +``` -``` -$./avclass_labeler.py -lb data/malheurReference_lb.json -v -fam > malheurReference.labels -``` +which simply reports the most common family name for each sample. -will produce a file called *malheurReference_lb.families* with two columns: +In a nutshell, that is the main difference between both tools. +Of course, there are more options for both tools, +which you can read about in their corresponding README files. -``` -virut 441 -allaple 301 -podnuha 300 -``` -indicating that 441 samples were classified in the virut family, -301 as allaple, and 300 as podnuha. +## Which one should I use? -This switch is very similar to using the following shell command: +AVClass2 is the newer tool and it extracts more information +from the input AV labels. +So, if you are new to AVClass and AVClass2, we recommend trying it out first. 
-``` -$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr -``` +However, there are several reasons to keep AVClass around. +First, it is more mature and used by many analysts, +so we want to preserve backwards compatibility. +Second, for some applications only family names are needed and +for that AVClass is enough. +Third, AVClass is faster than AVClass2 since it extracts less info. +The lower runtime is nice when processing millions of samples and +not requiring the extra tags AVClass2 provides. -The main difference is that using the -fam switch all SINGLETON samples, -i.e., those for which no label was found, -are grouped into a fake *SINGLETONS* family, -while the shell command would leave each singleton as a separate family. +## References +The design and evaluation of AVClass is detailed in our +[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): -## Labeling: PUP Classification +> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. +AVClass: A Tool for Massive Malware Labeling. +In Proceedings of the International Symposium on Research in +Attacks, Intrusions and Defenses, +September 2016. -AVClass also has a -pup switch to classify a sample as -Potentially Unwanted Program (PUP) or malware. -This classification looks for PUP-related keywords -(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our -[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf): +The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper. -> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero. -Certified PUP: Abuse in Authenticode Code Signing. -In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015 +> Silvia Sebastián, Juan Caballero. +AVClass2: Massive Malware Tag Extraction from AV Labels. +In proceedings of the Annual Computer Security Applications Conference, December 2020. 
-``` -$./avclass_labeler.py -lb data/malheurReference_lb.json -v -pup > malheurReference.labels -``` +## Why are AVClass and AVClass2 useful? -With the -pup switch the output of the *malheurReference.labels* file -looks like this: +Because a lot of times security researchers want to extract family and other +information from AV labels, but this process is not as simple as it looks, +especially if you need to do it for large numbers (e.g., millions) of samples. +Some advantages of AVClass and AVClass2 are: -``` -aca2d12934935b070df8f50e06a20539 adrotator 1 -67d15459e1f85898851148511c86d88d adultbrowser 0 -``` +1. *Automatic.* They remove manual analysis limitations on the size of the +input +dataset. -The digit at the end is a Boolean flag that -indicates sample aca2d12934935b070df8f50e06a20539 is -(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not. - -In our experience the PUP classification is conservative, -i.e., if it says the sample is PUP, it most likely is. -But, if it says that it is not PUP, it could still be PUP if the AV labels -do not contain PUP-related keywords. -Note that it is possible that some samples from a family get -the PUP flag while other samples from the same family do not -because the PUP-related keywords may not appear in the labels of -all samples from the same family. -To address this issue, you can combine the -pup switch with the -fam switch. -This combination will add into the families file the classification of the -family as malware or PUP, based on a majority vote among the samples in a -family. +2. *Vendor-agnostic.* They operate on the labels of any available set of AV +engines, which can vary from sample to sample. -``` -$./avclass_labeler.py -lb data/malheurReference_lb.json -v -pup -fam > malheurReference.labels -``` +3. *Cross-platform.* They can be used for any platforms supported by AV +engines, e.g., Windows or Android malware. 
-will produce a file called *malheurReference_lb.families* with five columns: +4. *Does not require executables.* AV labels can be obtained from online services + like VirusTotal using a sample's hash, even when the executable is not available. -``` -# Family Total Malware PUP FamType -virut 441 441 0 malware -magiccasino 173 0 173 pup -ejik 168 124 44 malware -``` +5. *Quantified accuracy.* We have evaluated AVClass and AVClass2 on millions of +samples and publicly available malware datasets with ground truth. +Evaluation details are in the RAID 2016 and ACSAC 2020 papers. -For virut, the numbers indicate all the 441 virut samples are classified -as malware, and thus the last column states that virut is a malware family. -For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP. -For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, -so the family is classified as malware. +6. *Open source.* The code is available and we are happy to incorporate +suggestions and improvements so that the security community benefits from +these tools. +## Limitations -## Labeling: Ground Truth Evaluation +The main limitation of AVClass and AVClass2 is that their output depends +on the input AV labels. +Both tools try to compensate for the noise on the AV labels, +but cannot identify tags if AV engines do not provide non-generic tokens +in the labels of a sample. +In particular, they cannot tag samples if at least 2 AV engines +do not agree on a tag. -If you have ground truth for some malware samples, -i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that -ground truth. -The evaluation metrics used are precision, recall, and F1 measure. -See our RAID 2016 paper above for their definition. +Still, there are many samples that both tools can tag +and thus we believe you will find them useful. 
+We recommend reading the RAID 2016 and ACSAC 2020 papers for more details. -``` -$./avclass_labeler.py -lb data/malheurReference_lb.json -v -gt data/malheurReference_gt.tsv -eval > data/malheurReference.labels -``` +## Input JSON format -The output includes these lines: +AVClass and AVClass2 support two input JSON formats: -``` -Calculating precision and recall -3131 out of 3131 -Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 -``` +1. VirusTotal JSON reports (*-vt file*), +where each line in *file* should be the full JSON of a +VirusTotal report as fetched through the VirusTotal API. +By default, it assumes the VT reports are from VT API version 2. +If the VT reports are from VT API version 3, add the -vt3 command line option. -The last line corresponds to the accuracy metrics obtained by -comparing AVClass results with the provided ground truth. +2. Simplified JSON (*-lb file*), +where each line in *file* should be a JSON +with (at least) these fields: +{md5, sha1, sha256, av_labels}. +There is an example of such input file in *examples/malheurReference_lb.json* -Each line in the *data/malheurReference_gt.tsv* file has -two **tab-separated** columns: +**Why have two different input formats?** -``` -0058780b175c3ce5e244f595951f611b8a24bee2 CASINO -``` +We believe most users will get the AV labels using VirusTotal. +However, AVClass and AVClass2 are IO-bound and a VirusTotal report +in addition to the AV labels and hashes includes +much other data that the tools do not need. +Thus, when applying AVClass or AVClass2 to millions of samples, +reducing the input file size by removing unnecessary data +significantly improves efficiency. +Furthermore, users could obtain AV labels from other sources and +the simpler the input JSON format, +the easier to convert those AV labels into an input file. -which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 -is known to be of the *CASINO* family. 
-Each sample in the input file should also appear in the ground truth file. -Note that the particular label assigned to each family does not matter. -What matters is that all samples in the same family are assigned the -same family name (i.e., the same string in the second column) - -The ground truth can be obtained from publicly available malware -datasets. -The one in *data/malheurReference_gt.tsv* comes from the -[Malheur](http://www.mlsec.org/malheur/) dataset. -There are other public datasets with ground truth such as -[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or -[Malicia](http://malicia-project.com/dataset.html). - - -## Preparation: Generic Token Detection - -The labeling takes as input a file with generic tokens that should be -ignored in the AV labels, e.g., trojan, virus, generic, linux. -By default, the labeling uses the *data/default.generics* -generic tokens file. -You can edit that file to add additional generic tokens you feel -we are missing. - -In our RAID 2016 paper we describe an automatic approach to -identify generic tokens, which **requires ground truth**, -i.e., it requires knowing the true family for each input sample. -Not only that, but **the ground truth should be large**, -i.e., contain at least one hundred thousand samples. -In our work we identified generic tokens using as ground truth -the concatenation of all datasets for which we had ground truth. -This requirement of a large ground truth dataset is why we expect most users -will skip this step and simply use our provided default file. - -If you want to test generic token detection you can do: +## Dependencies -``` - $./avclass_generic_detect.py -lb data/malheurReference_lb.json -gt data/malheurReference_gt.tsv -tgen 10 > malheurReference.gen -``` +AVClass and AVClass2 are both written in Python. +They should both run on Python versions above 2.7 and 3.0. 
-Each line in the *data/malheurReference_gt.tsv* file has -two **tab-separated** columns: +They do not require installing any dependencies. -``` -0058780b175c3ce5e244f595951f611b8a24bee2 CASINO -``` +## Support and Contributing -which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 -is known to be of the *CASINO* family. - -The *-tgen 10* switch is a threshold for the minimum number of families -where a token has to be observed to be considered generic. -If the switch is ommitted, the default threshold of 8 is used. - -The above command outputs two files: -*malheurReference.gen* and *malheurReference_lb.gen*. -Each of them has 2 columns: token and number of families where the token -was observed. -File *malheurReference.gen* is the final output with the detected -generic tokens for which the number of families is above -the given threshold. -The file *malheurReference_lb.gen* has this information for all tokens. -Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. - -However, note that in the above command you are trying to identify generic -tokens from a small dataset since Drebin only contains 3K labeled samples. -Thus, *malheurReference.gen* only contains 25 identified generic tokens. -Using those 25 generic tokens will produce significantly worse results -than using the generic tokens in *data/default.generics*. -For more details you can refer to our RAID 2016 paper. - - -## Preparation: Alias Detection - -Different vendors may assign different names (i.e., aliases) for the same -family. For example, some vendors may use *zeus* and others *zbot* -as aliases for the same malware family. -The labeling takes as input a file with aliases that should be merged. -By default, the labeling uses the *data/default.aliases* aliases file. -You can edit that file to add additional aliases you feel we are missing. - -In our RAID 2016 paper we describe an automatic approach -to identify aliases. 
-Our alias detection approach -**requires as input the AV labels for large set of samples**, -e.g., several million samples. -In contrast with the generic token detection, the input samples for -alias detection **do not need to be labeled**, -i.e., no need to know their family. -In our work we identified aliases using as input the largest of our -unlabeled datasets, which contained nearly 8M samples. -This requirement of a large input dataset is why we expect most users -will skip this step and simply use our provided default file. - -If you want to test alias detection you can do: +If you have issues or want to contribute, please file an issue or open a +pull request through GitHub. -``` -$./avclass_alias_detect.py -lb data/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases -``` +## License -The -nalias threshold provides the minimum number of samples two tokens -need to be observed in to be considered aliases. -If the switch is not provided the default is 20. - -The -talias threshold provides the minimum fraction of times that -the samples appear together. -If the switch is not provided the default is 0.94 (94%). - -The above command outputs two files: -*malheurReference.aliases* and *malheurReference_lb.alias*. -Each of them has 6 columns: -1. t1: token that is an alias -2. t2: family for which t1 is an alias -3. |t1|: number of input samples where t1 was observed -4. |t2|: number of input samples where t2 was observed -5. |t1^t2|: number of input samples where both t1 and t2 were observed -6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 -were observed over the number of input samples where t1 was observed. - -File *malheurReference.aliases* is the final output with the -detected aliases that satisfy the -nalias and -talias thresholds. -The file *malheurReference_lb.alias* has this information for all tokens. -Thus, *malheurReference.aliases* is a subset -of *malheurReference_lb.alias*. 
- -However, note that in the above command you are trying to identify aliases -from a small dataset since Drebin only contains 3K samples. -Thus, *malheurReference.aliases* only contains 6 identified aliases. -Using those 6 aliases will produce significantly worse results than using -the aliases in *data/default.aliases*. -As mentioned, to improve the identified aliases you should provide as -input several million samples. -For more details you can refer to our RAID 2016 paper. - - -## Support - -If you have issues or want to contribute generic tokens and/or aliases, -please file a bug report through GitHub. +AVClass and AVClass2 are both released under the MIT license. ## Contributors -Several members of the MaliciaLab at the -[IMDEA Software Institute](http://software.imdea.org) -have contributed code to AVClass including: -Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, and -Juan Caballero. +Several members of the MaliciaLab at the [IMDEA Software Institute](http://software.imdea.org) +have contributed code to AVClass and AVClass2: +Marcos Sebastián, Richard Rivera, Platon Kotzias, Srdjan Matic, Silvia Sebastián, and Juan Caballero. diff --git a/avclass/README.md b/avclass/README.md new file mode 100644 index 0000000..134fa87 --- /dev/null +++ b/avclass/README.md @@ -0,0 +1,392 @@ +# AVClass + +AVClass is a malware labeling tool. + +You give it as input the AV labels for a large number of +malware samples (e.g., VirusTotal JSON reports) and it outputs the most +likely family name for each sample that it can extract from the AV labels. +It can also output a ranking of all alternative names it found for each sample. + +The design and evaluation of AVClass is detailed in our +[RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf): + +> Marcos Sebastián, Richard Rivera, Platon Kotzias, and Juan Caballero. +AVClass: A Tool for Massive Malware Labeling. 
+In Proceedings of the International Symposium on Research in +Attacks, Intrusions and Defenses, +September 2016. + +In a nutshell, AVClass comprises two phases: +preparation (optional) and labeling. +Code for both is included, +but most users will be only interested in the labeling, which outputs the +family name for the samples. +The preparation produces a list of aliases and generic tokens +used by the labeling. +If you use our default aliases and generic tokens lists, +you do not need to run the preparation. + + +## Labeling + +The labeler takes as input +a JSON file with the AV labels of malware samples (-vt or -lb options), +a file with generic tokens (-gen option), +and a file with aliases (-alias option). +It outputs the most likely family name for each sample. +If you do not provide alias or generic tokens files, +the default ones in the *data* folder are used. + +```shell +$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v > malheurReference.labels +``` + +The above command labels the samples whose AV labels are in the +*../examples/malheurReference_lb.json* file. +It prints the results to stdout, +which we redirect to the *malheurReference.labels* file. +The output looks like this: + +``` +aca2d12934935b070df8f50e06a20539 adrotator +67d15459e1f85898851148511c86d88d adultbrowser +``` + +which means sample aca2d12934935b070df8f50e06a20539 is most likely +from the *adrotator* family and +67d15459e1f85898851148511c86d88d from the *adultbrowser* family. + +The verbose (-v) option makes it output an extra +*malheurReference_lb.verbose* file +with all families extracted for each sample ranked by the number of AV +engines that use that family. 
+The file looks like this: + +``` +aca2d12934935b070df8f50e06a20539 [(u'adrotator', 8), (u'zlob', 2)] +ee90a64fcfaa54a314a7b5bfe9b57357 [(u'swizzor', 19)] +f465a2c1b852373c72a1ccd161fbe94c SINGLETON:f465a2c1b852373c72a1ccd161fbe94c +``` + +which means that for sample aca2d12934935b070df8f50e06a20539 +there are 8 AV engines assigning *adrotator* as the family and +another 2 assigning *zlob*. +Thus, *adrotator* is the most likely family. +On the other hand, for ee90a64fcfaa54a314a7b5bfe9b57357 there are 19 AV +engines assigning *swizzor* as family, +and no other family was found. +The last line means that for sample f465a2c1b852373c72a1ccd161fbe94c +no family name was found in the AV labels. +Thus, the sample is placed by itself in a singleton cluster +with the name of the cluster being the sample's hash. + +Note that the sum of the number of AV engines may not equal the number +of AV engines with a label for that sample in the input file +because the labels of some AV engines may only include generic tokens +that are removed by AVClass. + +## Input JSON format + +AVClass supports two input JSON formats: + +1. VirusTotal JSON reports (**-vt** file), where each line in file should be + the full JSON of a VirusTotal report as fetched through the VirusTotal API. + By default, it assumes the VT reports are from VT API version 2. + If the VT reports are from VT API version 3, add the **-vt3** command line option. + +2. Simplified JSON (**-lb** file), where each line in file should be a JSON with + (at least) these fields: {md5, sha1, sha256, scan_date, av_labels}. + There is an example of such input file in ../examples/malheurReference_lb.json + +**Multiple input files** + +AVClass can handle multiple input files putting the results in the same output files +(if you want results in separate files, process each input file separately). + +It is possible to provide the -vt and -lb input options multiple times. 
+ +```shell +$./avclass_labeler.py -vt -vt +``` +```shell +$./avclass_labeler.py -lb -lb +``` + +There are also -vtdir and -lbdir options that can be used to provide +an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: + +```shell +$./avclass_labeler.py -vtdir +``` + +It is also possible to combine -vt with -vtdir and -lb with -lbdir, +but you cannot combine input files of different format. Thus, this command works: + +```shell +$./avclass_labeler.py -vt -vtdir +``` + +But, this one throws an error: + +```shell +$./avclass_labeler.py -vt -lb +``` + +## Labeling: Family Ranking + +AVClass has a -fam option to output a file with a ranking of the +families assigned to the input samples. + +```shell +$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -fam > malheurReference.labels +``` + +will produce a file called *malheurReference_lb.families* with two columns: + +``` +virut 441 +allaple 301 +podnuha 300 +``` + +indicating that 441 samples were classified in the virut family, +301 as allaple, and 300 as podnuha. + +This option is very similar to using the following shell command: + +```shell +$cut -f 2 malheurReference.labels | sort | uniq -c | sort -nr +``` + +The main difference is that using the -fam option all SINGLETON samples, +i.e., those for which no label was found, +are grouped into a fake *SINGLETONS* family, +while the shell command would leave each singleton as a separate family. + + +## Labeling: PUP Classification + +AVClass also has a -pup option to classify a sample as +Potentially Unwanted Program (PUP) or malware. +This classification looks for PUP-related keywords +(e.g., pup, pua, unwanted, adware) in the AV labels and was proposed in our +[CCS 2015 paper](https://software.imdea.org/~juanca/papers/malsign_ccs15.pdf): + +> Platon Kotzias, Srdjan Matic, Richard Rivera, and Juan Caballero. +Certified PUP: Abuse in Authenticode Code Signing. 
+In Proceedings of the 22nd ACM Conference on Computer and Communication Security, Denver, CO, October 2015 + +```shell +$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup > malheurReference.labels +``` + +With the -pup option the output of the *malheurReference.labels* file +looks like this: + +``` +aca2d12934935b070df8f50e06a20539 adrotator 1 +67d15459e1f85898851148511c86d88d adultbrowser 0 +``` + +The digit at the end is a Boolean flag that +indicates sample aca2d12934935b070df8f50e06a20539 is +(likely) PUP, but sample 67d15459e1f85898851148511c86d88d is (likely) not. + +In our experience the PUP classification is conservative, +i.e., if it says the sample is PUP, it most likely is. +But, if it says that it is not PUP, it could still be PUP if the AV labels +do not contain PUP-related keywords. +Note that it is possible that some samples from a family get +the PUP flag while other samples from the same family do not +because the PUP-related keywords may not appear in the labels of +all samples from the same family. +To address this issue, you can combine the -pup option with the -fam option. +This combination will add into the families file the classification of the +family as malware or PUP, based on a majority vote among the samples in a +family. + +```shell +$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -pup -fam > malheurReference.labels +``` + +will produce a file called *malheurReference_lb.families* with five columns: + +``` +# Family Total Malware PUP FamType +virut 441 441 0 malware +magiccasino 173 0 173 pup +ejik 168 124 44 malware +``` + +For virut, the numbers indicate all the 441 virut samples are classified +as malware, and thus the last column states that virut is a malware family. +For magiccasino, all 173 samples are labeled as PUP, thus the family is PUP. +For ejik, out of the 168 samples, 124 are labeled as malware and 44 as PUP, +so the family is classified as malware. 
+ + +## Labeling: Ground Truth Evaluation + +If you have ground truth for some malware samples, +i.e., you know the true family for those samples, you can evaluate the accuracy of the labeling output by AVClass on those samples with respect to that +ground truth. +The evaluation metrics used are precision, recall, and F1 measure. +See our RAID 2016 paper above for their definition. + +```shell +$./avclass_labeler.py -lb ../examples/malheurReference_lb.json -v -gt ../examples/malheurReference_gt.tsv -eval > malheurReference.labels +``` + +The output includes these lines: + +``` +Calculating precision and recall +3131 out of 3131 +Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 +``` + +The last line corresponds to the accuracy metrics obtained by +comparing AVClass results with the provided ground truth. + +Each line in the *../examples/malheurReference_gt.tsv* file has +two **tab-separated** columns: + +``` +0058780b175c3ce5e244f595951f611b8a24bee2 CASINO +``` + +which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 +is known to be of the *CASINO* family. +Each sample in the input file should also appear in the ground truth file. +Note that the particular label assigned to each family does not matter. +What matters is that all samples in the same family are assigned the +same family name (i.e., the same string in the second column) + +The ground truth can be obtained from publicly available malware +datasets. +The one in *../examples/malheurReference_gt.tsv* comes from the +[Malheur](http://www.mlsec.org/malheur/) dataset. +There are other public datasets with ground truth such as +[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or +[Malicia](http://malicia-project.com/dataset.html). + + +## Preparation: Generic Token Detection + +The labeling takes as input a file with generic tokens that should be +ignored in the AV labels, e.g., trojan, virus, generic, linux. +By default, the labeling uses the *data/default.generics* +generic tokens file. 
+You can edit that file to add additional generic tokens you feel +we are missing. + +In our RAID 2016 paper we describe an automatic approach to +identify generic tokens, which **requires ground truth**, +i.e., it requires knowing the true family for each input sample. +Not only that, but **the ground truth should be large**, +i.e., contain at least one hundred thousand samples. +In our work we identified generic tokens using as ground truth +the concatenation of all datasets for which we had ground truth. +This requirement of a large ground truth dataset is why we expect most users +will skip this step and simply use our provided default file. + +If you want to test generic token detection you can do: + +```shell + $./avclass_generic_detect.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv -tgen 10 > malheurReference.gen +``` + +Each line in the *../examples/malheurReference_gt.tsv* file has +two **tab-separated** columns: + +``` +0058780b175c3ce5e244f595951f611b8a24bee2 CASINO +``` + +which indicates that sample 0058780b175c3ce5e244f595951f611b8a24bee2 +is known to be of the *CASINO* family. + +The *-tgen 10* option is a threshold for the minimum number of families +where a token has to be observed to be considered generic. +If the option is omitted, the default threshold of 8 is used. + +The above command outputs two files: +*malheurReference.gen* and *malheurReference_lb.gen*. +Each of them has 2 columns: token and number of families where the token +was observed. +File *malheurReference.gen* is the final output with the detected +generic tokens for which the number of families is above +the given threshold. +The file *malheurReference_lb.gen* has this information for all tokens. +Thus, *malheurReference.gen* is a subset of *malheurReference_lb.gen*. + +However, note that in the above command you are trying to identify generic +tokens from a small dataset since the Malheur reference dataset only contains 3K labeled samples.
+Thus, *malheurReference.gen* only contains 25 identified generic tokens. +Using those 25 generic tokens will produce significantly worse results +than using the generic tokens in *data/default.generics*. +For more details you can refer to our RAID 2016 paper. + + +## Preparation: Alias Detection + +Different vendors may assign different names (i.e., aliases) for the same +family. For example, some vendors may use *zeus* and others *zbot* +as aliases for the same malware family. +The labeling takes as input a file with aliases that should be merged. +By default, the labeling uses the *data/default.aliases* aliases file. +You can edit that file to add additional aliases you feel we are missing. + +In our RAID 2016 paper we describe an automatic approach +to identify aliases. +Our alias detection approach +**requires as input the AV labels for a large set of samples**, +e.g., several million samples. +In contrast with the generic token detection, the input samples for +alias detection **do not need to be labeled**, +i.e., no need to know their family. +In our work we identified aliases using as input the largest of our +unlabeled datasets, which contained nearly 8M samples. +This requirement of a large input dataset is why we expect most users +will skip this step and simply use our provided default file. + +If you want to test alias detection you can do: + +```shell +$./avclass_alias_detect.py -lb ../examples/malheurReference_lb.json -nalias 100 -talias 0.98 > malheurReference.aliases +``` + +The -nalias threshold provides the minimum number of samples two tokens +need to be observed in to be considered aliases. +If the option is not provided the default is 20. + +The -talias threshold provides the minimum fraction of times that +the tokens appear together. +If the option is not provided the default is 0.94 (94%). + +The above command outputs two files: +*malheurReference.aliases* and *malheurReference_lb.alias*. +Each of them has 6 columns: +1.
t1: token that is an alias +2. t2: family for which t1 is an alias +3. |t1|: number of input samples where t1 was observed +4. |t2|: number of input samples where t2 was observed +5. |t1^t2|: number of input samples where both t1 and t2 were observed +6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 +were observed over the number of input samples where t1 was observed. + +File *malheurReference.aliases* is the final output with the +detected aliases that satisfy the -nalias and -talias thresholds. +The file *malheurReference_lb.alias* has this information for all tokens. +Thus, *malheurReference.aliases* is a subset +of *malheurReference_lb.alias*. + +However, note that in the above command you are trying to identify aliases +from a small dataset since the Malheur reference dataset only contains 3K samples. +Thus, *malheurReference.aliases* only contains 6 identified aliases. +Using those 6 aliases will produce significantly worse results than using +the aliases in *data/default.aliases*. +As mentioned, to improve the identified aliases you should provide as +input several million samples. +For more details you can refer to our RAID 2016 paper.
+ diff --git a/avclass_alias_detect.py b/avclass/avclass_alias_detect.py similarity index 100% rename from avclass_alias_detect.py rename to avclass/avclass_alias_detect.py diff --git a/avclass_generic_detect.py b/avclass/avclass_generic_detect.py similarity index 100% rename from avclass_generic_detect.py rename to avclass/avclass_generic_detect.py diff --git a/avclass_labeler.py b/avclass/avclass_labeler.py similarity index 100% rename from avclass_labeler.py rename to avclass/avclass_labeler.py diff --git a/data/default.aliases b/avclass/data/default.aliases similarity index 100% rename from data/default.aliases rename to avclass/data/default.aliases diff --git a/data/default.generics b/avclass/data/default.generics similarity index 100% rename from data/default.generics rename to avclass/data/default.generics diff --git a/lib/avclass_common.py b/avclass/lib/avclass_common.py similarity index 100% rename from lib/avclass_common.py rename to avclass/lib/avclass_common.py diff --git a/lib/evaluate_clustering.py b/avclass/lib/evaluate_clustering.py similarity index 100% rename from lib/evaluate_clustering.py rename to avclass/lib/evaluate_clustering.py diff --git a/avclass2/README.md b/avclass2/README.md new file mode 100644 index 0000000..fb5ca17 --- /dev/null +++ b/avclass2/README.md @@ -0,0 +1,252 @@ +# AVClass2 + +AVClass2 is a malware tagging tool. It extends AVClass to extract from AV labels not only family name tags, but other tags capturing the malware class (e.g., *worm*, *ransomware*, *grayware*), behaviors (e.g., *spam*, *ddos*), and file properties (e.g., *packed*, *themida*, *bundle*, *nsis*). + +You give it as input the AV labels for a large number of malware samples (e.g., VirusTotal JSON reports) +and it outputs tags observed in the AV labels, ranked by decreasing popularity. + +The design and evaluation of AVClass2 is detailed in our ACSAC 2020 paper. + +> Silvia Sebastián, Juan Caballero. +AVClass2: Massive Malware Tag Extraction from AV Labels. 
+In proceedings of the Annual Computer Security Applications Conference, December 2020. + +In a nutshell, AVClass2 comprises two modules: labeling and update. Code for both is included, but most users will be only interested in the labeling, which outputs the tags for the samples. The update module is used to update the input taxonomy, tagging rules, and expansion rules. If you use our default taxonomy, tagging, and expansion files, you do not need to run the update module. + + +## Labeling + +The labeler takes as input a JSON file with the AV labels of malware samples +(-vt or -lb options), +a file with the taxonomy (-tax option), +a file with tagging rules (-tag option), and +a file with expansion rules (-exp option). +It outputs a set of ranked tags. +If you do not provide taxonomy, expansion or tagging files, +the default ones in the data folder are used. + +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json +``` + +The above command labels the samples whose AV labels are in +the ../examples/malheurReference_lb.json file. +It prints the results to stdout. +The output looks like this: + +``` +aca2d12934935b070df8f50e06a20539 33 grayware|10,adware|9,windows|8,adrotator|8,downloader|3,zlob|2 +67d15459e1f85898851148511c86d88d 37 dialer|23,windows|9,adultbrowser|8,porndialer|7,grayware|6,tool|3,target|2 +``` + +which means sample *aca2d12934935b070df8f50e06a20539* +was flagged by 33 AV engines and 10 of them agree it is *grayware*, 9 that it is more specifically *adware*, +8 mention that it runs on *windows*, another 8 that it is the *adrotator* family, +3 that it is a *downloader*, and 2 that it belongs instead to the *zlob* family. +Sample *67d15459e1f85898851148511c86d88d* is flagged by 37 AV engines and 23 of them +consider it a *dialer*, 8 that it belongs to the *adultbrowser* family, and so on. 
+ +The -p option outputs the full path of each tag in the taxonomy: + +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -p +``` + +The above command line outputs: + +``` +aca2d12934935b070df8f50e06a20539 33 CLASS:grayware|10,CLASS:grayware:adware|9,FILE:os:windows|8,FAM:adrotator|8,CLASS:downloader|3,FAM:zlob|2 +67d15459e1f85898851148511c86d88d 37 CLASS:dialer|23,FILE:os:windows|9,FAM:adultbrowser|8,CLASS:dialer:porndialer|7,CLASS:grayware|6,CLASS:grayware:tool|3,FAM:target|2 +``` + +where each tag has been replaced by its taxonomy path, which starts with the category in capitals, +followed by the path in the category (if any), and the tag itself, all separated by colons. +For example, *FAM:adrotator* makes explicit that *adrotator* is a malware family, +*CLASS:grayware* that *grayware* is a malware class, and +*CLASS:grayware:adware* that *adware* is a subclass of *grayware*. + +**Compatibility mode** + +The compatibility -c option makes AVClass2 output the same format as AVClass. + +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -c +``` + +outputs: + +``` +bb23e1d296cf01bbaf32ed3938f9b0b8 allaple +cc4521ea738e8ba17139f86b3def5349 SINGLETON:cc4521ea738e8ba17139f86b3def5349 +``` + +As in AVClass, the output contains only the family name, +which corresponds to the highest ranked family tag; all other tags are ignored. +Samples for which a family cannot be obtained are labeled as singletons with their hash. + +It is important to note that AVClass2 compatibility mode results can differ from AVClass results +on the same input file. +The differences in family names are due to differences between the generics and aliases files +used by AVClass and the taxonomy, tagging rules, and expansion rules used by AVClass2.
+In the future, we may change AVClass to use the taxonomy and rules from AVClass2 +as input (instead of the generics and aliases files) +to minimize such differences and avoid maintaining different data files. + + +## Input JSON format + +AVClass2 supports three input JSON formats: + +1. VirusTotal JSON reports (**-vt** file), where each line in file should be + the full JSON of a VirusTotal report as fetched through the VirusTotal API. + By default, it assumes the VT reports are from VT API version 2. + If the VT reports are from VT API version 3, add the **-vt3** command line option. + +2. Simplified JSON (**-lb** file), where each line in file should be a JSON with + (at least) these fields: {md5, sha1, sha256, scan_date, av_labels}. + There is an example of such input file in ../examples/malheurReference_lb.json + +**Multiple input files** + +AVClass2 can handle multiple input files putting the results in the same output files +(if you want results in separate files, process each input file separately). + +It is possible to provide the -vt and -lb input options multiple times. + +```shell +$./avclass2_labeler.py -vt -vt +``` +```shell +$./avclass2_labeler.py -lb -lb +``` + +There are also -vtdir and -lbdir options that can be used to provide +an input directory where all files are VT (-vtdir) or simplified (-lbdir) JSON reports: + +```shell +$./avclass2_labeler.py -vtdir +``` + +It is also possible to combine -vt with -vtdir and -lb with -lbdir, +but you cannot combine input files of different format. Thus, this command works: + +```shell +$./avclass2_labeler.py -vt -vtdir +``` + +But, this one throws an error: + +```shell +$./avclass2_labeler.py -vt -lb +``` + +At this point you have read the most important information on how to use AVClass2. +The following sections describe steps that most users will not need.
+ +## Labeling: Ground Truth Evaluation + +If you have family ground truth for some malware samples, i.e., +you know the true family for those samples, you can evaluate the accuracy +of the family tags output by AVClass2 on those samples with respect to that ground truth. +The evaluation metrics used are precision, recall, and F1 measure. +See our [RAID 2016 paper](https://software.imdea.org/~juanca/papers/avclass_raid16.pdf) for their definition. +Note that the ground truth evaluation does not apply to non-family tags, +i.e., it only evaluates the output of the compatibility mode. + +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -gt ../examples/malheurReference_gt.tsv > malheurReference.labels +``` + +The output includes these lines: + +``` +Calculating precision and recall +3131 out of 3131 +Precision: 90.81 Recall: 94.05 F1-Measure: 92.40 +``` + +Each line in the *../examples/malheurReference_gt.tsv* file has two **tab-separated** columns: + +``` +aca2d12934935b070df8f50e06a20539 ADROTATOR +``` + +which indicates that sample aca2d12934935b070df8f50e06a20539 is known +to be of the *ADROTATOR* family. +Each sample in the input file should also appear in the ground truth file. +Note that the particular label assigned to each family does not matter. +What matters is that all samples in the same family are assigned +the same family name (i.e., the same string in the second column). + +The ground truth can be obtained from publicly available malware datasets. +The one in *../examples/malheurReference_gt.tsv* comes from the +[Malheur](http://www.mlsec.org/malheur/) dataset. +There are other public datasets with ground truth such as +[Drebin](https://www.sec.cs.tu-bs.de/~danarp/drebin/) or +[Malicia](http://malicia-project.com/dataset.html). + +## Update Module + +The update module can be used to suggest additions and changes to the input +taxonomy, tagging rules, and expansion rules. +Using the update module comprises two steps.
+The first step is obtaining an alias file from the labeler: + +```shell +$./avclass2_labeler.py -lb ../examples/malheurReference_lb.json -aliasdetect +``` + +The above command will create a file named after the first input file with the `.alias` extension, +malheurReference_lb.alias in our example. This file has 7 columns: + +1. t1: token that is an alias +2. t2: tag for which t1 is an alias +3. |t1|: number of input samples where t1 was observed +4. |t2|: number of input samples where t2 was observed +5. |t1^t2|: number of input samples where both t1 and t2 were observed +6. |t1^t2|/|t1|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t1 was observed. +7. |t1^t2|/|t2|: ratio of input samples where both t1 and t2 were observed over the number of input samples where t2 was observed. + + +The Update Module takes the above file as input with the -alias option, +as well as the default taxonomy, tagging, and expansion files in the data directory. +It outputs updated taxonomy, tagging, and expansion files that include the +suggested additions and changes. + +```shell +$./avclass2_update_module.py -alias malheurReference_lb.alias -o output_prefix +``` + +This will produce three files: +output_prefix.taxonomy, output_prefix.tagging, output_prefix.expansion. +You can diff the output and input files to analyze the proposed changes. + +You can also modify the input taxonomy, tagging, and expansion rules in place, +rather than producing new files: + + +```shell +$./avclass2_update_module.py -alias malheurReference_lb.alias -update +``` + + +## Customizing AVClass2 + +AVClass2 is fully customizable: +Tagging, Expansion and Taxonomy files can be easily modified by the analyst +either manually or by running the update module. + +If you change those files manually, we recommend running +afterwards the input checker script to keep them tidy.
+It sorts the tags in the taxonomy and performs some basic cleaning like +removing redundant entries: + +```shell +$./avclass2_input_checker.py -tax taxonomy_file -tag tagging_file -exp expansio_file +``` + +If the modifications are in the default files in the data directory you can simply run: + +```shell +$./avclass2_input_checker.py +``` diff --git a/avclass2/avclass2_input_checker.py b/avclass2/avclass2_input_checker.py new file mode 100755 index 0000000..adbf8e5 --- /dev/null +++ b/avclass2/avclass2_input_checker.py @@ -0,0 +1,51 @@ +import os +import sys +import argparse +script_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(script_dir, 'lib/')) +from avclass2_common import Taxonomy, Tagging, Expansion + +default_tag_file = "data/tagging" +default_tax_file = "data/taxonomy" +default_exp_file = "data/expansion" + +if __name__ == '__main__': + argparser = argparse.ArgumentParser(prog='input_checker', + description='Checks format of files Tagging, Expansion and Taxonomy.') + + argparser.add_argument('-tag', + help='tagging file', + default=default_tag_file) + + argparser.add_argument('-tax', + help='taxonomy file', + default=default_tax_file) + + argparser.add_argument('-exp', + help='expansion file', + default=default_exp_file) + + # Parse arguments + args = argparser.parse_args() + + # Normalize taxonomy + taxonomy = Taxonomy(args.tax) + taxonomy.to_file(args.tax) + sys.stdout.write('[-] Normalized %d tags in taxonomy %s\n' % ( + len(taxonomy), args.tax)) + + # Normalize tagging rules + tagging = Tagging(args.tag) + tagging.validate(taxonomy) + # tagging.expand_all_destinations() + tagging.to_file(args.tag) + sys.stdout.write('[-] Normalized %d tagging rules in %s\n' % ( + len(tagging), args.tag)) + + # Normalize expansion rules + expansion = Expansion(args.exp) + expansion.validate(taxonomy) + expansion.to_file(args.exp) + sys.stdout.write('[-] Normalized %d expansion rules in %s\n' % ( + len(expansion), args.exp)) + diff 
--git a/avclass2/avclass2_labeler.py b/avclass2/avclass2_labeler.py new file mode 100755 index 0000000..cf872dd --- /dev/null +++ b/avclass2/avclass2_labeler.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python2 +''' +AVClass2 labeler +''' + +import os +import sys +script_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(script_dir, 'lib/')) +import argparse +from avclass2_common import AvLabels +from operator import itemgetter +import evaluate_clustering as ec +import json +import traceback + +# Default tagging file +default_tag_file = os.path.join(script_dir, "data/tagging") +# Default expansion file +default_exp_file = os.path.join(script_dir, "data/expansion") +# Default taxonomy file +default_tax_file = os.path.join(script_dir, "data/taxonomy") + +def guess_hash(h): + ''' Given a hash string, guess the hash type based on the string length ''' + hlen = len(h) + if hlen == 32: + return 'md5' + elif hlen == 40: + return 'sha1' + elif hlen == 64: + return 'sha256' + else: + return None + +def format_tag_pairs(l, taxonomy=None): + ''' Return ranked tags as string ''' + if not l: + return "" + p = taxonomy.get_path(l[0][0]) if taxonomy else l[0][0] + out = "%s|%d" % (p, l[0][1]) + for (t,s) in l[1:]: + p = taxonomy.get_path(t) if taxonomy else t + out += ",%s|%d" % (p, s) + return out + +def list_str(l, sep=", ", prefix=""): + ''' Return list as a string ''' + if not l: + return "" + out = prefix + l[0] + for s in l[1:]: + out = out + sep + s + return out + +def main(args): + # Select hash used to identify sample, by default MD5 + hash_type = args.hash if args.hash else 'md5' + + # If ground truth provided, read it from file + gt_dict = {} + if args.gt: + with open(args.gt, 'r') as gt_fd: + for line in gt_fd: + gt_hash, family = map(str, line.strip().split('\t', 1)) + gt_dict[gt_hash] = family + + # Guess type of hash in ground truth file + hash_type = guess_hash(list(gt_dict.keys())[0]) + + # Create AvLabels object + av_labels = 
AvLabels(args.tag, args.exp, args.tax, + args.av, args.aliasdetect) + + # Build list of input files + # NOTE: duplicate input files are not removed + ifile_l = [] + if (args.vt): + ifile_l += args.vt + ifile_are_vt = True + if (args.lb): + ifile_l += args.lb + ifile_are_vt = False + if (args.vtdir): + ifile_l += [os.path.join(args.vtdir, + f) for f in os.listdir(args.vtdir)] + ifile_are_vt = True + if (args.lbdir): + ifile_l += [os.path.join(args.lbdir, + f) for f in os.listdir(args.lbdir)] + ifile_are_vt = False + + # Select correct sample info extraction function + if not ifile_are_vt: + get_sample_info = av_labels.get_sample_info_lb + elif args.vt3: + get_sample_info = av_labels.get_sample_info_vt_v3 + else: + get_sample_info = av_labels.get_sample_info_vt_v2 + + # Select output prefix + out_prefix = os.path.basename(os.path.splitext(ifile_l[0])[0]) + + # Initialize state + first_token_dict = {} + token_count_map = {} + pair_count_map = {} + vt_all = 0 + avtags_dict = {} + stats = {'samples': 0, 'noscans': 0, 'tagged': 0, 'maltagged': 0, + 'FAM': 0, 'CLASS': 0, 'BEH': 0, 'FILE': 0, 'UNK': 0} + + # Process each input file + for ifile in ifile_l: + # Open file + fd = open(ifile, 'r') + + # Debug info, file processed + sys.stderr.write('[-] Processing input file %s\n' % ifile) + + # Process all lines in file + for line in fd: + + # If blank line, skip + if line == '\n': + continue + + # Debug info + if vt_all % 100 == 0: + sys.stderr.write('\r[-] %d JSON read' % vt_all) + sys.stderr.flush() + vt_all += 1 + + # Read JSON line + vt_rep = json.loads(line) + + # Extract sample info + sample_info = get_sample_info(vt_rep) + + # If no sample info, log error and continue + if sample_info is None: + try: + name = vt_rep['md5'] + sys.stderr.write('\nNo scans for %s\n' % name) + except KeyError: + sys.stderr.write('\nCould not process: %s\n' % line) + sys.stderr.flush() + stats['noscans'] += 1 + continue + + # Sample's name is selected hash type (md5 by default) + name = 
getattr(sample_info, hash_type) + + # If the VT report has no AV labels, output and continue + if not sample_info.labels: + sys.stdout.write('%s\t-\t[]\n' % (name)) + # sys.stderr.write('\nNo AV labels for %s\n' % name) + # sys.stderr.flush() + continue + + # Compute VT_Count + vt_count = len(sample_info.labels) + + # Get the distinct tokens from all the av labels in the report + # And print them. + try: + av_tmp = av_labels.get_sample_tags(sample_info) + tags = av_labels.rank_tags(av_tmp) + + # AV VENDORS PER TOKEN + if args.avtags: + for t in av_tmp: + tmap = avtags_dict.get(t, {}) + for av in av_tmp[t]: + ctr = tmap.get(av, 0) + tmap[av] = ctr + 1 + avtags_dict[t] = tmap + + if args.aliasdetect: + prev_tokens = set() + for entry in tags: + curr_tok = entry[0] + curr_count = token_count_map.get(curr_tok, 0) + token_count_map[curr_tok] = curr_count + 1 + for prev_tok in prev_tokens: + if prev_tok < curr_tok: + pair = (prev_tok,curr_tok) + else: + pair = (curr_tok,prev_tok) + pair_count = pair_count_map.get(pair, 0) + pair_count_map[pair] = pair_count + 1 + prev_tokens.add(curr_tok) + + # Collect stats + # FIX: should iterate once over tags, + # for both stats and aliasdetect + if tags: + stats["tagged"] += 1 + if args.stats: + if (vt_count > 3): + stats["maltagged"] += 1 + cat_map = {'FAM': False, 'CLASS': False, + 'BEH': False, 'FILE': False, 'UNK': + False} + for t in tags: + path, cat = av_labels.taxonomy.get_info(t[0]) + cat_map[cat] = True + for c in cat_map: + if cat_map[c]: + stats[c] += 1 + + # Check if sample is PUP, if requested + if args.pup: + if av_labels.is_pup(tags, av_labels.taxonomy): + is_pup_str = "\t1" + else: + is_pup_str = "\t0" + else: + is_pup_str = "" + + # Select family for sample if needed, + # i.e., for compatibility mode or for ground truth + if args.c or args.gt: + fam = "SINGLETON:" + name + # fam = '' + for (t,s) in tags: + cat = av_labels.taxonomy.get_category(t) + if (cat == "UNK") or (cat == "FAM"): + fam = t + break + + # Get 
ground truth family, if available + if args.gt: + first_token_dict[name] = fam + gt_family = '\t' + gt_dict.get(name, "") + else: + gt_family = "" + + # Get VT tags as string + if args.vtt: + vtt = list_str(sample_info.vt_tags, prefix="\t") + else: + vtt = "" + + # Print family (and ground truth if available) to stdout + if not args.c: + if args.path: + tag_str = format_tag_pairs(tags, av_labels.taxonomy) + else: + tag_str = format_tag_pairs(tags) + sys.stdout.write('%s\t%d\t%s%s%s%s\n' % + (name, vt_count, tag_str, gt_family, + is_pup_str, vtt)) + else: + sys.stdout.write('%s\t%s%s%s\n' % + (name, fam, gt_family, is_pup_str)) + except: + traceback.print_exc(file=sys.stderr) + continue + + # Debug info + sys.stderr.write('\r[-] %d JSON read' % vt_all) + sys.stderr.flush() + sys.stderr.write('\n') + + # Close file + fd.close() + + # Print statistics + sys.stderr.write( + "[-] Samples: %d NoScans: %d NoTags: %d GroundTruth: %d\n" % ( + vt_all, stats['noscans'], vt_all - stats['tagged'], + len(gt_dict))) + + # If ground truth, print precision, recall, and F1-measure + if args.gt: + precision, recall, fmeasure = \ + ec.eval_precision_recall_fmeasure(gt_dict, + first_token_dict) + sys.stderr.write( + "Precision: %.2f\tRecall: %.2f\tF1-Measure: %.2f\n" % \ + (precision, recall, fmeasure)) + + # Output stats + if args.stats: + stats_fd = open("%s.stats" % out_prefix, 'w') + num_samples = vt_all + stats_fd.write('Samples: %d\n' % num_samples) + num_tagged = stats['tagged'] + frac = float(num_tagged) / float(num_samples) * 100 + stats_fd.write('Tagged (all): %d (%.01f%%)\n' % (num_tagged, frac)) + num_maltagged = stats['maltagged'] + frac = float(num_maltagged) / float(num_samples) * 100 + stats_fd.write('Tagged (VT>3): %d (%.01f%%)\n' % (num_maltagged, frac)) + for c in ['FILE','CLASS','BEH','FAM','UNK']: + count = stats[c] + frac = float(count) / float(num_maltagged) * 100 + stats_fd.write('%s: %d (%.01f%%)\n' % (c, stats[c], frac)) + stats_fd.close() + + # Output vendor 
info + if args.avtags: + avtags_fd = open("%s.avtags" % out_prefix, 'w') + for t in sorted(avtags_dict.keys()): + avtags_fd.write('%s\t' % t) + pairs = sorted(avtags_dict[t].items(), + key=lambda pair : pair[1], + reverse=True) + for pair in pairs: + avtags_fd.write('%s|%d,' % (pair[0], pair[1])) + avtags_fd.write('\n') + avtags_fd.close() + + # If alias detection, print map + if args.aliasdetect: + # Open alias file + alias_filename = out_prefix + '.alias' + alias_fd = open(alias_filename, 'w+') + # Sort token pairs by number of times they appear together + sorted_pairs = sorted( + pair_count_map.items(), key=itemgetter(1)) + # sorted_pairs = sorted( + # pair_count_map.items()) + + # Output header line + alias_fd.write("# t1\tt2\t|t1|\t|t2|\t" + "|t1^t2|\t|t1^t2|/|t1|\t|t1^t2|/|t2|\n") + # Compute token pair statistic and output to alias file + for (t1, t2), c in sorted_pairs: + n1 = token_count_map[t1] + n2 = token_count_map[t2] + if (n1 < n2): + x = t1 + y = t2 + xn = n1 + yn = n2 + else: + x = t2 + y = t1 + xn = n2 + yn = n1 + f = float(c) / float(xn) + finv = float(c) / float(yn) + alias_fd.write("%s\t%s\t%d\t%d\t%d\t%0.2f\t%0.2f\n" % ( + x, y, xn, yn, c, f, finv)) + # Close alias file + alias_fd.close() + sys.stderr.write('[-] Alias data in %s\n' % (alias_filename)) + + +if __name__=='__main__': + argparser = argparse.ArgumentParser(prog='avclass2_labeler', + description='''Extracts tags for a set of samples. 
+ Also calculates precision and recall if ground truth available''') + + argparser.add_argument('-vt', action='append', + help='file with VT reports ' + '(Can be provided multiple times)') + + argparser.add_argument('-lb', action='append', + help='file with simplified JSON reports' + '{md5,sha1,sha256,scan_date,av_labels} ' + '(Can be provided multiple times)') + + argparser.add_argument('-vtdir', + help='existing directory with VT reports') + + argparser.add_argument('-lbdir', + help='existing directory with simplified JSON reports') + + argparser.add_argument('-vt3', action='store_true', + help='input are VT v3 files') + + argparser.add_argument('-gt', + help='file with ground truth. ' + 'If provided it evaluates clustering accuracy. ' + 'Prints precision, recall, F1-measure.') + + argparser.add_argument('-vtt', + help='Include VT tags in the output.', + action='store_true') + + argparser.add_argument('-tag', + help='file with tagging rules.', + default = default_tag_file) + + argparser.add_argument('-tax', + help='file with taxonomy.', + default = default_tax_file) + + argparser.add_argument('-exp', + help='file with expansion rules.', + default = default_exp_file) + + argparser.add_argument('-av', + help='file with list of AVs to use') + + argparser.add_argument('-avtags', + help='extracts tags per av vendor', + action='store_true') + + argparser.add_argument('-pup', + action='store_true', + help='if used each sample is classified as PUP or not') + + argparser.add_argument('-p', '--path', + help='output.full path for tags', + action='store_true') + + argparser.add_argument('-hash', + help='hash used to name samples. Should match ground truth', + choices=['md5', 'sha1', 'sha256']) + + argparser.add_argument('-c', + help='Compatibility mode. 
Outputs results in AVClass format.', + action='store_true') + + argparser.add_argument('-aliasdetect', + action='store_true', + help='if used produce aliases file at end') + + argparser.add_argument('-stats', + action='store_true', + help='if used produce 1 file ' + 'with stats per category ' + '(File, Class, ' + 'Behavior, Family, Unclassified)') + + args = argparser.parse_args() + + if not args.vt and not args.lb: + sys.stderr.write('Argument -vt or -lb is required\n') + exit(1) + + if args.vt and args.lb: + sys.stderr.write('Use either -vt or -lb argument, not both.\n') + exit(1) + + if args.tag: + if args.tag == '/dev/null': + sys.stderr.write('[-] Using no tagging rules\n') + else: + sys.stderr.write('[-] Using tagging rules in %s\n' % ( + args.tag)) + else: + sys.stderr.write('[-] Using default tagging rules in %s\n' % ( + default_tag_file)) + + if args.tax: + if args.tax == '/dev/null': + sys.stderr.write('[-] Using no taxonomy\n') + else: + sys.stderr.write('[-] Using taxonomy in %s\n' % ( + args.tax)) + else: + sys.stderr.write('[-] Using default taxonomy in %s\n' % ( + default_tax_file)) + + if args.exp: + if args.exp == '/dev/null': + sys.stderr.write('[-] Using no expansion tags\n') + else: + sys.stderr.write('[-] Using expansion tags in %s\n' % ( + args.exp)) + else: + sys.stderr.write('[-] Using default expansion tags in %s\n' % ( + default_exp_file)) + + main(args) diff --git a/avclass2/avclass2_update_module.py b/avclass2/avclass2_update_module.py new file mode 100755 index 0000000..0dd2209 --- /dev/null +++ b/avclass2/avclass2_update_module.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +''' +AVClass2 Update module +''' +import sys +import os +import argparse +import logging +# Make sure paths are relative to execution path +script_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(script_dir, 'lib/')) +from operator import itemgetter +from collections import namedtuple +from avclass2_common 
# Set logging
log = logging.getLogger(__name__)

# Send INFO and above to stderr (the root logger itself accepts DEBUG)
formatter = logging.Formatter(u'%(message)s')
handler_stderr = logging.StreamHandler(sys.stderr)
handler_stderr.setLevel(logging.INFO)
handler_stderr.setFormatter(formatter)
root = logging.getLogger()
root.setLevel(logging.DEBUG)
root.addHandler(handler_stderr)

# Directory of this script; the default data files live under it
script_dir = os.path.dirname(os.path.abspath(__file__))

# Default tagging file
default_tagging_file = os.path.join(script_dir, "data/tagging")
# Default expansion file
default_expansion_file = os.path.join(script_dir, "data/expansion")
# Default taxonomy file
default_taxonomy_file = os.path.join(script_dir, "data/taxonomy")

# Threshold for string similarity
# NOTE: currently unused, the Levenshtein-based alias check is disabled
sim_threshold = 0.6

# Relation between two tags, one per line of the .alias file.
# Columns: t1, t2, |t1|, |t2|, |t1^t2|, |t1^t2|/|t1|, |t1^t2|/|t2|
# All fields are kept as the strings read from the file.
Rel = namedtuple('Rel', ['t1', 't2', 't1_num', 't2_num',
                         'nalias_num', 'talias_num', 'tinv_alias_num'])


def _parent_path(path):
    ''' Return the taxonomy path without its last ':'-separated element '''
    return path[0:path.rfind(':')]


class Update:
    ''' Update Module

        Reads tag relations from an alias file and uses them to update
        the taxonomy, tagging, and expansion objects given at construction.
    '''
    def __init__(self, rel_filepath, in_taxonomy, in_tagging, in_expansion,
                 n, t):
        ''' Store the objects to update and load the relations

            rel_filepath: path to the tab-separated alias file
            in_taxonomy, in_tagging, in_expansion: objects updated in place
            n: minimum |t1^t2| for a relation to be kept
            t: minimum |t1^t2|/|t1| for a relation to be kept
        '''
        # Initialize inputs
        self.__out_taxonomy = in_taxonomy
        self.__out_tagging = in_tagging
        self.__out_expansion = in_expansion
        self.__n = n
        self.__t = t
        # Initialize blacklist
        self.blist = in_taxonomy.platform_tags()
        log.info(self.blist)
        # Maps tag -> number of times the tag was seen (int)
        self.src_map = {}
        # Read relations from file
        self.rel_set = self.read_relations(rel_filepath)

    def num_rules(self):
        ''' Return the number of relations still pending '''
        return len(self.rel_set)

    def is_weak_rel(self, rel):
        ''' Return true if relationship is weak,
            i.e., does not meet thresholds '''
        return ((int(rel.nalias_num) < self.__n) or
                (float(rel.talias_num) < self.__t))

    def is_blacklisted_rel(self, rel):
        ''' Return true if relationship is blacklisted '''
        return (rel.t1 in self.blist) or (rel.t2 in self.blist)

    def is_known_rel(self, rel):
        ''' Return true if relationship is known '''
        t1 = rel.t1
        t2 = rel.t2
        # Known taxonomy relation
        if self.__out_taxonomy.overlaps(t1, t2):
            return True
        # Known expansion rule
        t1_dst = self.__out_expansion.get_dst(t1)
        t2_dst = self.__out_expansion.get_dst(t2)
        if (t2 in t1_dst) or (t1 in t2_dst):
            return True
        # Known tagging rule
        t1_dst = sorted(self.__out_tagging.get_dst(t1))
        t2_dst = sorted(self.__out_tagging.get_dst(t2))
        if (t2 in t1_dst) or (t1 in t2_dst):
            return True
        # Known alias in tagging: both tags map to the same destinations
        if t1_dst and (t1_dst == t2_dst):
            return True
        return False

    def add_tag(self, name, path):
        ''' Add tag to taxonomy if not in tagging '''
        if not self.__out_tagging.get_dst(name):
            self.__out_taxonomy.add_tag(path)

    def add_expansion(self, src, dst_l):
        ''' Add expansion rule fixing destination if src in tagging '''
        # Select source handling aliases
        alias_l = self.__out_tagging.get_dst(src)
        new_src = alias_l[0] if alias_l else src
        # Select destinations removing overlaps with existing rule
        l = self.__out_expansion.get_dst(src)
        if l:
            l.extend(dst_l)
            target_l = self.__out_taxonomy.remove_overlaps(l)
            self.__out_expansion.add_rule(new_src, target_l, True)
        else:
            self.__out_expansion.add_rule(new_src, dst_l, True)

    def add_alias(self, src, dst, dst_prefix):
        ''' Add alias relation to taxonomy, tagging '''
        # If src in tagging, use most popular target
        target = dst
        known_l = self.__out_tagging.get_dst(src)
        if known_l:
            # Counts are ints, track the running maximum so that the
            # target with the highest count wins
            cnt_max = self.src_map.get(dst, 0)
            for e in known_l:
                cnt = self.src_map.get(e, 0)
                if cnt > cnt_max:
                    target = e
                    cnt_max = cnt
        # If dst is in tagging, update tagging rule destination,
        l = self.__out_tagging.get_dst(dst)
        if l:
            target_l = l
        # else add dst to taxonomy
        else:
            target_l = [target]
            self.__out_taxonomy.add_tag('%s:%s' % (dst_prefix, dst))
        # Remove src from taxonomy
        self.__out_taxonomy.remove_tag(src)
        # Replace tagging rule
        self.__out_tagging.add_rule(src, target_l, True)

    def is_expansion_rel(self, rel):
        ''' Return true if relation implies expansion rule '''
        c1 = self.__out_taxonomy.get_category(rel.t1)
        c2 = self.__out_taxonomy.get_category(rel.t2)
        return (((c1 == "FAM") and (c2 != c1) and (c2 != "UNK")) or
                ((c1 == "CLASS") and ((c2 == "FILE") or (c2 == "BEH"))) or
                ((c1 == "UNK") and ((c2 == "BEH") or (c2 == "CLASS"))))

    def find_expansions(self):
        ''' Find expansions among relations

            Relations that yield an expansion rule are removed
            from the pending set.
        '''
        acc = []
        for rel in self.rel_set:
            p1 = self.__out_taxonomy.get_path(rel.t1)
            p2 = self.__out_taxonomy.get_path(rel.t2)
            log.info("Processing %s\t%s" % (p1, p2))
            if self.is_expansion_rel(rel):
                self.add_expansion(rel.t1, [rel.t2])
                acc.append(rel)
        # Remove after the loop, the set cannot change while iterated
        for rel in acc:
            self.rel_set.remove(rel)

    def process_relation(self, rel):
        ''' Process relation and update taxonomy/tagging correspondingly

            Returns 1 if the relation was applied, 0 if it has to be
            reviewed again later, and -1 if it is an equivalence between
            tags of different categories (logged, nothing is updated).
            Note that run() treats any non-zero result as processed.
        '''
        # Obtain tag info
        t1 = rel.t1
        t2 = rel.t2
        p1, c1 = self.__out_taxonomy.get_info(t1)
        p2, c2 = self.__out_taxonomy.get_info(t2)

        log.info("Processing %s\t%s" % (p1, p2))

        # If both directions strong, then equivalent, i.e., alias
        # Uses the threshold given to the constructor, not a global
        if float(rel.tinv_alias_num) >= self.__t:
            if (c1 != "UNK") and (c2 == "UNK"):
                prefix = _parent_path(p1)
            elif (c1 == "UNK") and (c2 != "UNK"):
                prefix = _parent_path(p2)
            elif (c1 == "UNK") and (c2 == "UNK"):
                prefix = "FAM"
            elif c1 == c2:
                prefix = _parent_path(p1)
            else:
                log.warning(
                    "Equivalent rule with different categories: %s\t%s" %
                    (p1, p2))
                return -1
            self.add_alias(t1, t2, prefix)
            return 1

        if c1 == "UNK":
            # UNK -> FAM : alias-family
            # UNK -> UNK : alias-family
            if c2 in ("FAM", "UNK"):
                self.add_alias(t1, t2, "FAM")
                return 1
            # UNK -> CLASS, UNK -> BEH : taxonomy-family
            # Return 0 so that expansion handled at end
            if c2 in ("CLASS", "BEH"):
                self.add_tag(t1, 'FAM:%s' % t1)
                return 0
            # UNK -> FILE : taxonomy-file
            if c2 == "FILE":
                self.add_tag(t1, '%s:%s' % (p2, t1))
                return 1
            return 0

        # FAM -> UNK : alias-family
        if (c1 == "FAM") and (c2 == "UNK"):
            self.add_alias(t1, t2, "FAM")
            return 1

        # FILE -> UNK : alias-file
        if (c1 == "FILE") and (c2 == "UNK"):
            self.add_alias(t1, t2, _parent_path(p1))
            return 1

        # Same category : alias (only applied to families)
        if (c1 == "FAM") and (c2 == "FAM"):
            self.add_alias(t1, t2, _parent_path(p2))
            return 1

        # Remaining cases (including any other source with unknown
        # target): leave the relation for manual taxonomy review
        return 0

    def run(self):
        ''' Process pending relations until a pass makes no progress,
            then derive expansion rules from what is left '''
        num_iter = 0
        while self.rel_set:
            # Do a pass in remaining relations
            cnt = 0
            new_set = set()
            log.info("[-] %03d Processing relations" % num_iter)
            while self.rel_set:
                rel = self.rel_set.pop()
                # If known relation, continue
                if self.is_known_rel(rel):
                    continue

                # Process relation
                result = self.process_relation(rel)

                if result:
                    cnt += 1
                else:
                    new_set.add(rel)

            # Update relation set
            self.rel_set = new_set

            # If no relations processed, finish
            if cnt == 0:
                break
            num_iter += 1

        # Find expansions
        log.info("[-] Finding expansions")
        self.find_expansions()

    def read_relations(self, filepath):
        ''' Returns relations in file as a set
            Filters weak and blacklisted relations

            Also records in src_map the count of every tag that
            appears in a kept relation.
        '''
        rel_set = set()
        with open(filepath, 'r') as fd:
            for line in fd:
                # Ignore comments
                if line.startswith('#'):
                    continue
                line = line.strip()
                # Ignore blank lines, they cannot be unpacked below
                if not line:
                    continue
                # Parse line
                t1, t2, t1_num, t2_num, nalias_num, talias_num, \
                    tinv_alias_num = line.split('\t')
                # Build relation
                rel = Rel(t1, t2, t1_num, t2_num, nalias_num,
                          talias_num, tinv_alias_num)
                # Ignore weak relations
                if self.is_weak_rel(rel):
                    continue
                # Ignore blacklisted relations
                if self.is_blacklisted_rel(rel):
                    continue
                # NOTE: known relations are not filtered here since
                # run() checks them right before processing
                # Add relation to set
                rel_set.add(rel)
                # Add to src_map as ints so counts compare numerically
                self.src_map[rel.t1] = int(rel.t1_num)
                self.src_map[rel.t2] = int(rel.t2_num)

        return rel_set

    def output_relations(self, filepath):
        ''' Write pending relations to filepath, sorted by the
            categories of both tags, using taxonomy paths for the tags '''
        sorted_rules = sorted(self.rel_set,
                              key=(lambda r: (
                                  self.__out_taxonomy.get_category(r.t1),
                                  self.__out_taxonomy.get_category(r.t2))),
                              reverse=False)
        with open(filepath, 'w') as fd:
            fd.write("# t1\tt2\t|t1|\t|t2|\t|t1^t2|\t|t1^t2|/|t1|\t"
                     "|t1^t2|/|t2|\n")
            for rel in sorted_rules:
                p1, c1 = self.__out_taxonomy.get_info(rel.t1)
                p2, c2 = self.__out_taxonomy.get_info(rel.t2)
                fd.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (
                    p1, p2, rel.t1_num, rel.t2_num, rel.nalias_num,
                    rel.talias_num, rel.tinv_alias_num))

    def output_rule_stats(self, fd):
        ''' Write to fd the number of pending relations per category
            pair, followed by the number of relations per destination '''
        # Initialize maps for statistics
        self.dst_map = {}
        self.cat_pairs_map = {}
        # Compute rule statistics
        for rel in self.rel_set:
            c1 = self.__out_taxonomy.get_category(rel.t1)
            c2 = self.__out_taxonomy.get_category(rel.t2)
            self.cat_pairs_map[(c1, c2)] = \
                self.cat_pairs_map.get((c1, c2), 0) + 1
            self.dst_map[rel.t2] = self.dst_map.get(rel.t2, 0) + 1
        # Output statistics
        cat_pairs = sorted(self.cat_pairs_map.items(), key=itemgetter(1, 0),
                           reverse=True)
        for (c1, c2), cnt in cat_pairs:
            fd.write("%s\t%s\t%03d\n" % (c1, c2, cnt))

        # Print dst statistics
        dst_pairs = sorted(self.dst_map.items(), key=itemgetter(1, 0),
                           reverse=False)
        for dst, cnt in dst_pairs:
            fd.write("%s\t%03d\n" % (self.__out_taxonomy.get_path(dst), cnt))

    def output(self, out_prefix):
        ''' Write taxonomy, tagging, and expansion to file

            If out_prefix is empty the default data files are overwritten,
            otherwise <out_prefix>.taxonomy/.tagging/.expansion are used.
        '''
        if not out_prefix:
            tax_filepath = default_taxonomy_file
            tag_filepath = default_tagging_file
            exp_filepath = default_expansion_file
        else:
            tax_filepath = out_prefix + ".taxonomy"
            tag_filepath = out_prefix + ".tagging"
            exp_filepath = out_prefix + ".expansion"
        self.__out_taxonomy.to_file(tax_filepath)
        self.__out_tagging.expand_all_destinations()
        self.__out_tagging.to_file(tag_filepath)
        self.__out_expansion.to_file(exp_filepath)


if __name__ == '__main__':
    # Only the command line entry point builds these objects
    from avclass2_common import Taxonomy, Expansion, Tagging

    argparser = argparse.ArgumentParser(
        description='''Given a .alias file from the labeler,
        generates updates for the taxonomy, tagging, and expansion files.''')

    argparser.add_argument('-alias',
        help='file with alias data produced by the labeler (required)')

    argparser.add_argument('-n',
        help='Minimum number of times that a pair of tokens have been seen. '
             'Default: 20',
        type=int,
        default=20)

    argparser.add_argument('-t',
        help='Minimum percentage of times two tokens appear together. '
             'Default: 0.94',
        type=float,
        default=0.94)

    argparser.add_argument('-o',
        help='output prefix for files')

    argparser.add_argument('-update',
        action='store_true',
        help='update default taxonomy,tagging,expansion files in place')

    # Parse arguments
    args = argparser.parse_args()

    # Check we have the input
    if not args.alias:
        log.error('[-] Please provide an alias file with -alias')
        sys.exit(1)

    # Set output prefix
    if args.o:
        out_prefix = args.o
    else:
        out_prefix = os.path.splitext(args.alias)[0]

    # Read taxonomy
    taxonomy = Taxonomy(default_taxonomy_file)

    # Read expansion rules
    expansion = Expansion(default_expansion_file)

    # Read tagging rules
    tagging = Tagging(default_tagging_file)

    # Build update object
    update = Update(args.alias, taxonomy, tagging, expansion, args.n, args.t)

    log.info('[-] Read %d relations satisfying t>=%.2f n>=%d\n' % (
        update.num_rules(), args.t, args.n))

    # Output initial rules
    update.output_relations(out_prefix + ".orig.rules")

    # Output initial rules statistics
    update.output_rule_stats(sys.stderr)

    # Process relations
    update.run()

    # Output updated taxonomy,tagging,expansion
    if args.update:
        update.output(None)
    else:
        update.output(out_prefix)

    # Output final rules
    update.output_relations(out_prefix + ".final.rules")
a/avclass2/data/tagging b/avclass2/data/tagging new file mode 100644 index 0000000..dbd54a9 --- /dev/null +++ b/avclass2/data/tagging @@ -0,0 +1,1300 @@ +0052f0b gappusin +0053284d plankton +4share 4shared +6a53ba64ab smsreg +aacf dnotua +achros cova +actehc gomanag +activshop activshopper +acute pullupdate +adanalysis winkad +adclicker clicker +addisplay adware +addrop adware +adfltnet amonetize +adgazele adgazelle +adiwky airpush +adknowledge adware +adload adware +admin downloadadmin +adop fakeapp +adplugin adware +adpoooh poohad +adspy hotbar +adswo adwo +adtrafficanalysis winkad +adwareeorezo eorezo +afoynq ksapp +agemt domob +agewap opfake +agile biige +agilebinary biige +agnsmit infectionads +airad airinstaller +airadinstaller airinstaller +airinstall airinstaller +akan winwebsec +allad airpush +almanahe alman +alureon tdss +amab mobidash +amorba ipamor +andef fkdefend +andr android +androidos android +androm gamarue +andromeda gamarue +androsmscontrol ansmcon +androways badnews +andup fakeangry +angel virut +angryangel virut +anserver basebridge +ansver basebridge +antiav killsectool +antifw killsectool +antimalwaredefender defmid +anudow anydown +anways badnews +anxin lovetrap +anzhi dowgin +apke8bd dowgin +apperhand plankton +appinventor steek +appleservice coogos +applicunsaf grayware +applicunwnt grayware +applovin plankton +appquanta wkload +apprisk grayware +appsgeyser fakeflash +aque beebone +arcadeparlor gamevance +arcadeweb gamevance +archsms smshoax +arcparlor gamevance +armour androidarmour +arto renos +artro renos +aservicea kuguo +autokms winactivator +autoruner autorun vobfus +autorunerent autorun palevo +avalod sinowal +aveasms smskey +avkill killsectool +bacteraloh sality +badao smsspy +badday badda +badmacro macro +badnew badnews +banach hotbar +bandito unruy +banito unruy +banker infosteal +bankrypt bancos +banloader rimod +basebrid basebridge +batteryd fakedoc +batterydoctor fakedoc +bbridge basebridge +bckdr backdoor +bean nandrobox 
+bearshare bandoo +bergat xtrat +bertlea bertle +bespal netins +betterinstaller somoto +bflient palevo +bibean faketimer +biez loadmoney +bitcoin bitcoinminer +bitminer bitcoinminer +bjlog zegost +bkdr backdoor +blackice whiteice +blic whiteice +blocal vmvol +blocker killsectool +bobic bobax +botnet gidix +bototer wapomi +boxer fakeinst +boxersms fakeinst +braininst installbrain +brantall installbrain +brappware multiplug +browsepulse browsefox +browsermodifier multiplug +browserplugin multiplug +bsihai kabun +bsurf bettersurf +btapk smsreg +btcmine bitcoinminer +bulknet webprefix +bundl bundlore +bundleapp bundle +bundled bundle +bundleinstaller bundle +bundler bundle +bundpil gamarue +buzb bzub +bxib softonic +c2lop swizzor +cabby dalexis +caphaw shylock +casonline casino +cawitt smsbot +ceeinject inject +cellphonetrack mytrackp +cellspy mobilespy +ceshark cellshark +changeup vobfus +chard hiddad +checks-gps locationcheck +cheval detroie +chinesehacker chir +chinky vobfus +chydo pykspa +cidox vundo +cimag hiloti +cinmeng cinmus +citirevo vundo +clemag cleaman +click clicker +clickfraud clicker +clickpotato hotbar +clickrun installcore +clickrunsoftware installcore +clickspring purityscan +clientconnect opencandy +climap androrat +clkpotato hotbar +clspring purityscan +cobbler focobers +cobblerone focobers +cobbleronea focobers +codecpa renos +codecpack renos +codepack renos +coee cooee +coinmine miner +coinminer miner +collector autoins +comet darkkomet +cometsys darkkomet +cometsystems darkkomet +condestil firseria +conduit opencandy +contrand sckeylog +controlrandom sckeylog +coolpaperleak coolwall +copycat airpush +corrupt corrupted +cosha lovetrap +counterclank plankton +crack tool +cracktool tool +crisis morcut +crori crossrider +crosate svpeng +crwind crusewind +cryp packed +crypt packed +cryptdomaiq domaiq +crypted packed +crypter packed +cryptic packed +cryptinno installcore +cryptodefense cryptodef +cryptominer miner +cryptor packed +cson simbot 
+ctblocker dalexis +cudos fosniw +cupi smssend +cybota cycbot +cycler unruy +dadmin downloadadmin +dailer dialer +dalamodo cossta +damaged corrupted +darksnow whiteice +datasetaler infosteal +daytre upatre +ddlight droiddreamlight +dealcabby adpeak +debris gamarue +delf delphi +delfiles filedelete +delfinject delphi inject +delfloader delphi downloader +delfsnif delphi infosteal +delpbanc delphi infosteal +delpdldr delphi downloader +derdroi simbad +desktoplightning cashon +detroi detroie +detroia detroie +dial dialer +dialers dialer +dialpass egroupdial +dialplatform dialer +didat dabom +diple vobfus +directdown directdownloader +dldr downloader +dldrop downloader +dload downloader +dloade downloader +dloader downloader +dloadr downloader +dloadware adware +dnschanger dnsmodify +docdl downloader msoffice +docdrop downloader msoffice +docdrp downloader msoffice +dogbite dogowar +dogwar dogowar +doidroot rooter +domainiq domaiq +domalq domaiq +domlq domaiq +dontlback fakeinst +doods loic +dordae droiddreamlight +dordrae droiddreamlight +dotdo multiplug +dotdoads multiplug +douga dougalek +dougaleaker dougalek +dowcen centim +dowins inservice +downad adware +downagent downloader +downldexe downloader +downldr downloader +download downloader +downloadasist downloadassistant +downloaderguide downloadguide +downloadmin downloadadmin +downloadmr firseria +downloadnsave megasearch +downloadware adware +downsms dropdialer +downware downloader +dracur rebhip +dragonball vietsms +dragonbranch browsefox +drddream droiddream +drdelux droiddeluxe +dreamexploid droiddream +dridexdownloader dridex +dridld dridex +driverupd softpulse +drixed dridex +droidap smssend +droidapp smssend +droiddelux droiddeluxe +droidkrungfu droidkungfu +droidlive rootsmart +droidrooter rooter +drokole lockscreen +dromedan gamarue +drop downloader +dropped downloader +dropper downloader +droppr downloader +dropr downloader +duel loveletter +dumobove hiddad +duptwux lolbot +dwnldr downloader +dwonk 
pykspa +easydl amonetize +echiui invis +ecsys mailcab +egbii biige +egroup egroupdial +eicar testvirus +electron sytro +elenoocka dalexis +elephant dowgin +elkern klez +elpso vidro +emagsoftware smsreg +email spam +emailspy maistealer +emerleox fujacks +emud emudbot +encoder filecrypt ransomware +encpk packed +engwings cardserv +epicgames gamevance +epicplay gamevance +eqdrug equationdrug +equation equationdrug +erop smssend +escape laroux +escop laroux +evitanf hiddenapp +ewalls imlog +excel msexcel +exedial egroupdial +exedown downloader +exedrop downloader +expl exploit +expressfind browsefox +extens damon +extension damon +extrat xtrat +eydrop dinwod +fakapp styricka +fakealert rogueware +fakeav rogueware +fakebattscar fakedoc +fakebrows fakeinst +fakecodec renos +fakedefend fkdefend +fakedefender fkdefend +fakefldr fakefolder +fakeicq fakeinst +fakeinstall fakeinst +fakeinsthw fakeinst +fakeinstsms fakeinst +fakejoboffer fakejob +fakelogosms fakelogo +fakelt elite +fakemini opfake +fakemms fakeplayer +fakems fakepublisher signed +fakengry fakeangry +fakenotify opfake +fakeplay fakeplayer +fakeqou styricka +fakerecovery fakesysdef +fakerun airpush +fakesecsuit spyeye +fakesite perkel +fakeumg gumen +fakeupdates gamex +fakmod fakeapp +fakromup soft32downloader +faktvx fakeangry +farex fearso +fastsave megasearch +fastsaveapp megasearch +fatakr steek +fech wroba +fenomen fenomengame +fenomengamet fenomengame +fenservice fengvi +fidgo opfake +filecoder filecrypt ransomware +filehunter winpump +fileinfector infector +filesearch amonetize +finfisher finspy +finloski darkkomet +finlosky darkkomet +fipp morto +firser firseria +firseriainstaller firseria +fiseria firseria +fixflo pioneer +fkangry fakeangry +fkclip smssend +fkealrt rogueware +fksite perkel +fktime faketimer +flofix pioneer +flooder ddos +floxif pioneer +floxlib pioneer +flyagent flystudio +flystud flystudio +fodeg fakeinst +fokonge droidkungfu +foncysms foncy +foran anforen +fraud rogueware +fraudload 
downloader rogueware +fraudtool tool +freepds hotclip +frogonal ginmaster +fujack fujacks +funclub smssend +funweb mywebsearch +fynloski darkkomet +gaba gabpath +gael tenga +gaelicum tenga +gallm nandrobox +game grayware +gamehack onlinegames +gamevancecs gamevance +gampass gamethief +ganelp griptolo +gaobot agobot +gasms gambler +gastab gabas +gavir viking +gbot cycbot +gdjowa joye +gdream golddream +gectams smsspy +geimini geinimi +geinim geinimi +geksone crytex +gemest smishing +genericab wroba +genericgb basebridge +genpack packed +gentroj trojan +gepat airpush +getextension eorezo +getfaster 4shared +geyser plankton +ggsmart rootsmart +ggtracker ggtrack +ghostbot gobot +ghostpush ztorg +ginermaster kuguo +gingermaster ginmaster +glassbottle browsefox +gldct loadmoney +gletan ganlet +glodream golddream +glogo fakeapp +gmaster ginmaster +gmasterb kuguo +gmastere kuguo +gmeil gamex +gnurbulf rungbu +goidu oveead +goldclick hiddad +gonca gonesixty +gone gonesixty +gonfu droidkungfu +gongfu droidkungfu +goolbot cycbot +gopf uupay +gploader ewind +gprice gorillaprice +gray grayware +greatfind browsefox +guarder virut +gugespy qplus +gulpix plugx +gunpoder dowgin +gupboot urelas +gvance gamevance +h5games hiddad +habey elite +hackav kiser +hackkms winactivator +hacktool tool +hacyayu winwebsec +hamob fakeflash +hdusafe wapron +helldoor hilldoor +hellospy spyoo +hiddenad hiddad +hiddeninstall jsmshider +hidrag jeefo +hippo hipposms +hipsmser hipposms +hispo hipposms +hktl tool +hllp virus +hllw worm +hlux kelihos +homepage browsermodify +hongtoutou adrd +horse trojan +hosts-modifier hostsmodify +hublo crytex +huigezi hupigon +hype loadmoney +hyteod kovter +iadpush dowgin +ibank shiz +ibashade drolnux +ibrain installbrain +iceboy icekboy +ickboy icekboy +iconos iconosys +iconosis iconosys +idapk opfake +ihouse spyagent +ikangoo smssend +ilivid bandoo +imestartup cyfin +imonetize amonetize +inboxtoolbar inbox +indirect directdownloader +infdas infectionads +inffinity 
toggle +inffinityinternet toggle +infostealer infosteal +injcrypt inject +injected inject +injecter inject +injection inject +injector inject +inoco zdtad +inservc inservice +install installer +installcloud installerex +installco installcore +installcube icloader +installmat installmate +installmet installmetrix +installmon installmonster +installmonst installmonster +installmonstr installmonster +installq installiq +installrex installerex +installvibe bundlore +instantaccess egroupdial +instmonetizer installmonetizer +intex intexdial +intexus intexdial +invader daws +ipatre upatre +ircbot bot irc +ispyoo spyoo +j2me java +jackpos jinupd +jadtre wapomi +javak suggestor +jedan kuguo +jelbrus techsnab +joke hoax +joleee tedroo +juched griptolo +kaka telman +kanav alyak +kasandra sandr +kashu sality +kazaa benjamin +keepmusic hiddad +keji basebridge +kelvin smssend +kernelpatch geral +keygen tool +keylog keylogger +kgbkeylogger kgbspy +kibi ksapp +kichhoat smsreg +killav killsectool +killfiles files +kituri placms +klevate webprefix +klezer beebone +kmsauto winactivator +koceg socks +koler svpeng +kometa rukometa +kongfu droidkungfu +kouto koutodoor +koyotelab bandoo +krademok darkkomet +kranxpay mmarketpay +krypt packed +kryptik packed +kryptk packed +kucirc cosmu +kuku sality +kungfu droidkungfu +kusasesms hipposms +lacon laconic +langya lien +lanucher bgserv +lavandos vidro +ldmon loadmoney +lebag ramnit +legana droidkungfu +legendmir lmir +legmir lmir +lemir lmir +letang ganlet +licat murofet +licum tenga +liezar rasteal +lightdd droiddreamlight +lijo smssend +lilu gamarue +limpopo loadmoney +lineage gamania +linkun linkular +liteweb browsefox +livesecurity winwebsec +livesoft getnow +livesoftaction getnow +llond lardlond +loadmoneyent loadmoney +locker lockscreen +locm locmg +lohmys midia +looked viking +loorp wapomi +lootor exploit +lotoor exploit +lower airpush +lozfoon loozfon +macosx mac +macrodown downloader macro +madanf virut +madang virut +madangel virut 
+magania gamania +magmedia mediamagnet +mailer spam +mailstealer maistealer +mainservice pjapps +maklt renos +malcrypt packed +malhome updtkiller +maliciousmacro macro +mallocker lockscreen +malob packed +malpack packed +malpe corrupted +manalo laroux +mandaph socks +marketpay mmarketpay +massmailer spam +masterkey master +maxplus zeroaccess +maxplusent zeroaccess +mayachok vundo +mazel somoto +mazig fakeinst +mbro winwebsec +mdropper downloader +meredrop vobfus +meterpreter metasploit +mfinder mediafinder +midgare bifrose +midhos medfos +mikcer wapomi +milicenso pirminay +mimobsms minimob +mindspark mywebsearch +miscosms gidix +misosms gidix +mixor loveletter +mketpay mmarketpay +mmag mediamagnet +mmarket mmarketpay +mmarketp mmarketpay +mmob minimob +mo97 macro +mobcore airpush +mobi fakeinst +mobigapp gamex +mobilehotdog nandrobox +mobinauten smsspy +mobistealth stealthcell +mobkong smssend +mobspy trackplus +mobsqueeze fakedoc +mofksys swisyn +monad damon +monderb vundo +monitor infosteal +monocle monokle +monstruos installmonster +montiera delbar +morefi memery +morepak pushad +morstar firseria +morstars firseria +mosky skymobi +mostofate softomate +mplug multiplug +msilobfuscator msil packed +mspyonline mspy +msteal maistealer +mswdm ipamor +mufanom hiloti +mulad kuguo +muldrop downloader +multibardown multibar +multibardownloader multibar +multiinstall vilsel +multipluggen multiplug +musictoolbar bandoo +mutibar multibar +mutopy rodecap +mvlove vmvol +mw97 macro +mytrack mytrackp +nabucur virlock +najin feejar +nandrob nandrobox +nemucod smsreg +neshuta neshta +netboxserver netbox +neteyes ipamor +netfilter network +netweird netwiredrc +networm worm +newyearl plankton +nextup verti +nickibot nickyspy +nickispy nickyspy +nickspy nickyspy +nicky nickyspy +nidb spyoo +nimefas mseg +nimnul wapomi +ninebox kuguo +nioserv nocoma +nisev nocoma +nofear fearso +nofer fearso +noico zdtad +noiconads zdtad +nopoc smforw +not-a-virus grayware +notcom nocoma 
+notcompatible nocoma +noticemob ginmaster +nsanti packed +nuwar tibs +nyearleaker airpush +nyleaker airpush +o97m macro +obfus packed +obfusc packed +obfuscate packed +obfuscated packed +obfuscator packed +odyssey loadmoney +offerad appoffer +office msoffice +ogimant loadmoney +olmarik tdss +onbsms smssend +oneclick oneclickfraud +oneclickdownload 1clickdownload +onestep zwangi +onlineg onlinegames +onlinega onlinegames +onlinegam onlinegames +onlinegame onlinegames +onlinegamehack onlinegames +ooqqxx boqx +opclose sillyfdc +opfakesms fakeinst +optimizerpro speedingupmypc +optimum ibryte +optimuminstall ibryte +optimuminstaller ibryte +optinstall ibryte +optiuminstaller ibryte +optixp optix +optixpro optix +osx mac +osx32 mac +otran vobfus +otwycal wapomi +overdoom cosmu +overt sadenav +overtls sadenav +ozotshielder kmin +pace socks +padobot korgo +padodor berbew +pakes packed +panda zbot +pandaent zbot +pandora nandrobox +parnian smssend +patch filemodify +patched filemodify +patcher filemodify +patchfile filemodify +pate parite +payint domaiq +payment basebridge +pazetus brontok +pe windows +peacomm tibs +pemalform corrupted +pemask maskpe +penetrata penetho +penetrate penetho +pepatch filemodify +perfectkeylogger perflogger +perfkey perflogger +perfloger perflogger +perkele perkel +petrolan petrolin +philis viking +pigeon hupigon +pigetrl lockscreen +pikor wapomi +pikorms wapomi +pilleuz palevo +pinball hotbar +pinfi parite +pinny shiz +pirater walkinwat +pirrit tirrip +pirritsuggestor tirrip +placsms placms +plangton plankton +plite urelas +plocust loadmoney +plosa karagany +plugin multiplug +pmax zeroaccess +podec fobus +podnuha boaxxe +poisonivy poison +polip cardserv +polipos cardserv +polycryptt polycrypt +polyransom virlock +popeler firseria +popov fakeinst +popuppers soft32downloader +porn porndialer +porndial porndialer +pornlocker lockscreen +portscan network +positivefinds browsefox +positmob fakeinst +potentially grayware +poweliks wowlik +powerliks 
wowlik +powerpack linkular +powessere wowlik +pp97m macro +preloader megasearch +premiumsms smskey +premiumsmsscam smshoax +privacyrisk grayware +privitize techsnab +prockill killproc +prodatect fakesysdef +pronny vobfus +protexor ramnit +protil wapomi +provar fakeinst +pswtool infosteal +pua grayware +pup grayware +pupil plemood +purity purityscan +purora vobfus +purple plemood +purplemood plemood +pushdo cutwail +putalol couponmarvel +pwsonlinegames onlinegames +pwsteal infosteal +pwstealer infosteal +pwszbot zbot +pykse pykspa +qakbot qbot +qhost hostsmodify +qhosts hostsmodify +qqrobber qqrob +qukart berbew +qvod wapomi +rabbhome fjcon +rabidog dogowar +rahack allaple +rahiwi brontok +raideloz vobfus +ramdo redyms +ranck ranky +ransom ransomware +ransomcrypt filecrypt ransomware +ransomlock lockscreen ransomware +rapiddown firseria +ratab mamianune +razel rasteal +raziel rasteal +recal mogap +recordpage browsefox +redirector network +reefwal kalfere +refogkeylogger refog +regie fosniw +relevant relevantknowledge +relik updtkiller +remtasu xtrat +renamer files +reptilic reptilicus +revmob plankton +revtcp metasploit +rimecud palevo +risk grayware +risktool grayware tool +riskware grayware +rivalgame gamevance +rkdoor koutodoor +rknowledge relevantknowledge +rlemon lemon +rmnet ramnit +rodricter simda +rogue rogueware +roguesppush shastrosms +rollaround browsefox +rontokbr brontok +rontokbro brontok +roop svpeng +rootcager droiddream +ropin leadbolt +rorpian zeroaccess +ruftar usteal +rugo hotbar +runitslf looper +runonce chir +runouce chir +safekidzone sakezon +sahagent sahat +saho wroba +saiva smammer +saldrop sality +salecharger browsefox +salicode sality +salitystub sality +salload sality +salpack sality +salrenmetie sality +sambamedia softpulse +sancmed sanctionedmedia +sandrorat sandr +saveshare megasearch +scareware rogueware +scavir fakeinst +sckeylogger sckeylog +sclog sckeylog +screenblaze prosti +screensaver hotbar +script jswebinject +searchprotect 
opencandy +searchsuite bandoo +seasuite bandoo +seaweed seaweth +secretspy smforw +secshieldfraud securityshield +securitydefender defmid +securitytool tool +secxplod securityxploded +secxploded securityxploded +selfdel beebone +sendpay shastrosms +serbg bgserv +serpip morto +sethom hiddad +sexxoo redmobile +sexyclip smssend +sharestar gappusin +shell shellcode +shellkode shellcode +sheriff sheridroid +shifu shiz +shohdi shodi +shopathome sahat +signalbooster fakedoc +signalboosterb fakedoc +silentcaller dialer +simfect wapomi +simplock simplocker +sinodo sinowal +sintal plankton +sirefef zeroaccess +skanik smssend +slybdb blohi +smabo adialer +smadow zeroaccess +smbot fakeins +smbox fakeinst +smfrow dowgin +smokeloader dofoil +smsarch smshoax +smsbank smsreg +smsbox fakeinst +smsboxer fakeinst +smscc smcc +smser smssend +smsfakesky opfake +smsforward smforw +smsfraud smshoax +smsfwd smforw +smshider jsmshider +smsilence smscatch +smskute smsagent +smsseaw seaweth +smssilence smscatch +smsstealer smsspy +smthief smsthief +smtp spam +snadapps typstu +sndapps typstu +sneakytrail installerex +sniffer network +sobot clientor +soft32down soft32downloader +soft32download soft32downloader +softbase softobase +softcentral sckeylog +softonicdownloader softonic +softpules softpulse +softwarebundler bundle +sohand sohanad +sohaned sohanad +solimba firseria +soltern sytro +somato somoto +somotobetterinstaller somoto +somotoltd somoto +soobek lockscreen +spacer unruy +spakrab vidro +spambot spam +spammer spam +spamtool spam tool +spatet rebhip +spdupmypc speedingupmypc +speedupmypc uniblue +spez spyzie +spyagnt piom +spybubb spybubble +spyeyes spyeye +spygold golddream +spymob trackplus +spyphone phonespy +spyrat rebhip +spysat spyset +spysheriff harnig +spytomobile gpspy +spytrack spyset +spyweep spyeye +square squarenet +ssam guerrilla +starman allaple +starsys plankton +startapp plankton +startpage browsermodify +statblaster winfetcher +stealer infosteal +steekt steek 
+stesec smssend +stmp spam +stration warezov +strongsignal browsefox +stubofsality sality +stufik tufik +sunnet smsreg +superoptimizer speedingupmypc +superpctools speedingupmypc +suspiciouspacker packed +susppack packed +sventore firseria +swiftbrowse browsefox +system droidkungfu +systemfix fakesysdef +systemsecurity winwebsec +systex daws +systro sytro +sysvenfak loadmoney +talklog talkw +taojin taojinstar +tapsnake gpspy +tattoohack exploid +tatus tetus +tazebama mabezat +tdownloader installerex +tdssrt tdss +tedro tedroo +temai ksapp +tepfer fareit +test testvirus +testfile testvirus +tibspak tibs +tibspk tibs +tibsys tibser +tibsystems tibser +ticno multibar +tidserv tdss +tiger tigerbot +tigrbot tigerbot +timpdoor clientor +tinbakd tinba +tinbelog nandrobox +tiny small +tklocker lockscreen +tonclank plankton +toorch rootnik +tophos stegvob +torchmedia bandoo +torpump winpump +tovkater installmonster +towelexploit towel +trj trojan +trjdown downloader trojan +trjndwnlder downloader trojan +troj trojan +trojanapt apt trojan +trojanbanker infosteal trojan +trojanclicker adware clicker trojan +trojandldr downloader trojan +trojandownloader downloader trojan +trojandropper downloader trojan +trojandwnldr downloader trojan +trojanfakeav alertuser rogueware trojan +trojanhorse trojan +trojanproxy proxy trojan +trojanpsw infosteal trojan +trojanransom filecrypt ransomware trojan +trojansms sms trojan +trojanspy spyware trojan +trojware trojan +truedown truedownloader +tsuploader installerex +tufei tufik +tugspay domaiq +tunkoo silentboot +turk alyak +turko turkojan +tuto4pc eorezo +tweetbot smsbot +twetty twetti +txmob mobiletx +typnotify typstu +ucont spyagent +ultradownload vilsel +ultradownloads vilsel +umeng gumen +unix linux +unsafe grayware +unwanted grayware +unwnt grayware +updatekiller updtkiller +updtkill updtkiller +uracto maistealer +uuser uuserv +uxipp yzhc +valhalla xorala +valla xorala +vbccrypt vobfus +vbcrypt packed visualbasic +vbinject inject 
visualbasic +vbkrypt packed visualbasic +vbna vobfus +vbobf vobfus +vbobfus vobfus +vbpack packed visualbasic +vernet dusvext +vertex dusvext +vertexb dusvext +vertexbot dusvext +vetor virut +vflood vtflooder +vflooder vtflooder +vils vilsel +virransom virlock +virtob virut +virtool tool +vitallia vittalia +vjadtre wapomi +vmdetector vmdetect +vmpbad vmprotect +vnfraye dusvext +vsaas vsas +vserv viser +vxidl tibs +w2km macro +w32 windows +w64 windows +w97m macro +wakeful cardserv +wali wapomi +walkfree kalfere +walksteal walkinwat +wanacry wannacry +wanderburst browsefox +wanna wannacry +wannacrypt wannacry +wannacryptor wannacry +wapnor shedun +waps gappusin +wapsx gappusin +wapz gappusin +waren qumi +waski upatre +wauchos gamarue +wbna vobfus +webalt webalta +webatla webalta +webpick installerex +websearch search +webtoolbar toolbar +wedownload soft32downloader +weecnaw netwiredrc +weiyi smforw +whboy fujacks +whistle whistlesoftware +whistles whistlesoftware +widoman bmmedia +win windows +win32 windows +win64 windows +winge cardserv +winnt windows +winsoft fosniw +winsxsbot sfone +wipelock elite +wipelocker elite +wirenet netwiredrc +wohis dowgin +wondertek tekwon +word msword +wpay smsreg +wplug slugin +wplugin slugin +wukong yzhc +x2km macro +x97m macro +ximad plankton +xloader wroba +xpack packed +xpiro expiro +xsider jsmshider +xtoober karagany +xtreme xtrat +xworm loveletter +yangamon pirates +yarwi upatre +yontoo browsefox +yoof picsys +yotoon browsefox +yourfiledownloader expressdownloader +yusttohq trackplus +yzhcsms yzhc +zaccess zeroaccess +zadved dlhelper +zango hotbar +zangosearch hotbar +zawet masplot +zbocheman zbot +zbomber zombbomber +zbotk zbot +zebt hiddenapp +zeno zenosearch +zeus zbot +zhelatin tibs +zombie smszombie +zona zvuzona +zpack packed +zsone raden +zwunzi zwangi +zybut shiz diff --git a/avclass2/data/taxonomy b/avclass2/data/taxonomy new file mode 100644 index 0000000..fa6f7ad --- /dev/null +++ b/avclass2/data/taxonomy @@ -0,0 
+1,1138 @@ +BEH:alertuser +BEH:autorun +BEH:browsermodify +BEH:browsermodify:toolbar +BEH:click +BEH:ddos +BEH:defaulttab +BEH:dial +BEH:dnsmodify +BEH:execdownload +BEH:facebook +BEH:filecrypt +BEH:filedelete +BEH:filemodify +BEH:files +BEH:hostsmodify +BEH:infosteal +BEH:infosteal:gamethief +BEH:inject +BEH:irc +BEH:jswebinject +BEH:keylog +BEH:killproc +BEH:killproc:killsectool +BEH:locationcheck +BEH:lockscreen +BEH:mining +BEH:mining:bitcoinmining +BEH:network +BEH:osmodify +BEH:phishing +BEH:proxy +BEH:proxychanger +BEH:search +BEH:selfpropagate +BEH:server +BEH:servstart +BEH:sms +BEH:sms:readssms +BEH:sms:sendssms +BEH:spam +BEH:tor +BEH:vmdetect +CLASS:apt +CLASS:backdoor +CLASS:bot +CLASS:bot:bankbot +CLASS:clicker +CLASS:ddoser +CLASS:dialer +CLASS:dialer:porndialer +CLASS:downloader +CLASS:fakeantivirus +CLASS:grayware +CLASS:grayware:adware +CLASS:grayware:adware:adlibrary +CLASS:grayware:adware:adlibrary:adpush +CLASS:grayware:adware:multiplug +CLASS:grayware:casino +CLASS:grayware:tool +CLASS:grayware:tool:remoteadmin +CLASS:hoax +CLASS:hoax:smshoax +CLASS:infector +CLASS:keylogger +CLASS:miner +CLASS:miner:bitcoinminer +CLASS:ransomware +CLASS:rogueware +CLASS:rooter +CLASS:rootkit +CLASS:searcher +CLASS:spyware +CLASS:virus +CLASS:virus:prepender +CLASS:worm +CLASS:worm:emailworm +FAM:0052b +FAM:154b2720de +FAM:1clickdownload +FAM:4shared +FAM:560de1fe9de +FAM:abeciv +FAM:accutrack +FAM:acecard +FAM:activeinject +FAM:activshopper +FAM:adcolony +FAM:adend +FAM:adflex +FAM:adgazelle +FAM:adialer +FAM:adinject +FAM:adir +FAM:adlock +FAM:admogo +FAM:adpeak +FAM:adpooh +FAM:adrd +FAM:adrotator +FAM:adrotoob +FAM:adultbrowser +FAM:adviator +FAM:adwk +FAM:adwo +FAM:aesads +FAM:agobot +FAM:agvd +FAM:ahmyth +FAM:ahopc +FAM:airinstaller +FAM:airpush +FAM:aiwan +FAM:aliyuncs +FAM:allaple +FAM:alman +FAM:alyak +FAM:amonetize +FAM:androidarmour +FAM:androidlost +FAM:androrat +FAM:anforen +FAM:angupsh +FAM:anlost +FAM:ansmcon +FAM:anti +FAM:anubis +FAM:anydown 
+FAM:anzhu +FAM:aplog +FAM:apofer +FAM:appoffer +FAM:appsad +FAM:appwiz +FAM:appwizz +FAM:aqplay +FAM:asacub +FAM:asprox +FAM:autoins +FAM:autosus +FAM:axespy +FAM:badda +FAM:badnews +FAM:badpac +FAM:baiduprotect +FAM:bajaspy +FAM:bamital +FAM:bancos +FAM:bandoo +FAM:banload +FAM:basbanke +FAM:basebridge +FAM:basepay +FAM:bauts +FAM:bebeg +FAM:becou +FAM:beebone +FAM:beita +FAM:beitaad +FAM:belesak +FAM:benjamin +FAM:berbew +FAM:bertle +FAM:betterad +FAM:bettersurf +FAM:bgserv +FAM:bicololo +FAM:bifrose +FAM:biige +FAM:binka +FAM:bips +FAM:birele +FAM:bitrep +FAM:blacklister +FAM:bladabindi +FAM:blohi +FAM:blueguard +FAM:bmmedia +FAM:boaxxe +FAM:bobax +FAM:bobic +FAM:boogrdex +FAM:boomp +FAM:boqx +FAM:boyad +FAM:bredolab +FAM:brontok +FAM:browsefox +FAM:bruad +FAM:bublik +FAM:bundlore +FAM:buzus +FAM:buzztouch +FAM:bzub +FAM:callflakes +FAM:callpay +FAM:callrecorder +FAM:campys +FAM:carberp +FAM:cardserv +FAM:cashon +FAM:cellshark +FAM:centim +FAM:cerekv +FAM:cheica +FAM:chir +FAM:chyapo +FAM:cinmus +FAM:cleaman +FAM:clevernet +FAM:clientor +FAM:clinator +FAM:cmccwm +FAM:cnbtech +FAM:cnzz +FAM:coinhive +FAM:commplat +FAM:conduit +FAM:conficker +FAM:contactscollector +FAM:cooee +FAM:coogos +FAM:coolmirage +FAM:coolwall +FAM:cosmu +FAM:cossta +FAM:couponmarvel +FAM:cova +FAM:cridex +FAM:crossrider +FAM:crusewind +FAM:cryptodef +FAM:cryptolocker +FAM:cryptowall +FAM:crytex +FAM:cryxos +FAM:ctchm +FAM:cutwail +FAM:cycbot +FAM:cyfin +FAM:dabom +FAM:dalexis +FAM:damon +FAM:dangbei +FAM:darkkomet +FAM:darop +FAM:dasu +FAM:daws +FAM:dbtes +FAM:deblio +FAM:defmid +FAM:delbar +FAM:deshacop +FAM:detroie +FAM:dianle +FAM:dianru +FAM:dilidi +FAM:dinwod +FAM:directdownloader +FAM:dlhelper +FAM:dnotua +FAM:dofoil +FAM:dogowar +FAM:domaiq +FAM:domob +FAM:dorfdo +FAM:dorifel +FAM:dorkbot +FAM:dougalek +FAM:dowgin +FAM:downloadadmin +FAM:downloadassistant +FAM:downloadguide +FAM:dqshell +FAM:dridex +FAM:droidcoupon +FAM:droiddeluxe +FAM:droiddream +FAM:droiddreamlight 
+FAM:droidkungfu +FAM:droidsheep +FAM:drolnux +FAM:drolock +FAM:dropdialer +FAM:drosel +FAM:drstwex +FAM:dsploit +FAM:dusvext +FAM:dynamer +FAM:easyroot +FAM:egame +FAM:egroupdial +FAM:ejik +FAM:elite +FAM:emudbot +FAM:eorezo +FAM:equationdrug +FAM:esfury +FAM:etooe +FAM:ewind +FAM:expiro +FAM:expressdownloader +FAM:f7fa48878f6c +FAM:faceniff +FAM:fakeangry +FAM:fakeapp +FAM:fakebank +FAM:fakebanker +FAM:fakebkupt +FAM:fakedep +FAM:fakedoc +FAM:fakeflash +FAM:fakefolder +FAM:fakeins +FAM:fakeinst +FAM:fakejob +FAM:fakekrb +FAM:fakelogo +FAM:fakepay +FAM:fakeplayer +FAM:fakerateapp +FAM:fakerean +FAM:fakesysdef +FAM:fakesysui +FAM:faketimer +FAM:fakevalidation +FAM:fakgram +FAM:fareit +FAM:farfli +FAM:farmap +FAM:fateon +FAM:fearso +FAM:feejar +FAM:feiad +FAM:feiwo +FAM:fengvi +FAM:fenomengame +FAM:fictus +FAM:finspy +FAM:firseria +FAM:fjcon +FAM:fkav +FAM:fkdefend +FAM:float +FAM:flystudio +FAM:fobus +FAM:focobers +FAM:fogo +FAM:foncy +FAM:forav +FAM:fosniw +FAM:framaroot +FAM:freeandroidspy +FAM:freeandspy +FAM:freespy +FAM:frupi +FAM:fujacks +FAM:gabas +FAM:gabpath +FAM:gamania +FAM:gamarue +FAM:gambler +FAM:gamclk +FAM:gameguardian +FAM:gamevance +FAM:gamex +FAM:ganga +FAM:ganlet +FAM:gapev +FAM:gappusin +FAM:gato +FAM:gbdialer +FAM:gbqal +FAM:geinimi +FAM:general +FAM:gepew +FAM:geral +FAM:getnow +FAM:gexin +FAM:ggtrack +FAM:gibdy +FAM:gidby +FAM:gidix +FAM:ginamster +FAM:ginko +FAM:ginmaster +FAM:gizmo +FAM:gobot +FAM:golddream +FAM:goldentouch +FAM:gomanag +FAM:gomunc +FAM:gonesixty +FAM:goodnews +FAM:gorillaprice +FAM:gpspy +FAM:grabos +FAM:graybird +FAM:griptolo +FAM:guerrilla +FAM:gugi +FAM:gumen +FAM:gupay +FAM:gysad +FAM:hahad +FAM:hamad +FAM:harnig +FAM:hasdk +FAM:haynu +FAM:hero +FAM:hiddad +FAM:hiddenapp +FAM:hiddnad +FAM:highster +FAM:hilldoor +FAM:hiloti +FAM:hipposms +FAM:honli +FAM:hotbar +FAM:hotclip +FAM:hoverwatch +FAM:hqowdo +FAM:hqwar +FAM:htmlapp +FAM:humanspy +FAM:hupigon +FAM:hypay +FAM:ibryte +FAM:icekboy +FAM:icloader +FAM:iconhider 
+FAM:iconosys +FAM:icqbomber +FAM:imali +FAM:imaut +FAM:imlog +FAM:inbox +FAM:infectionads +FAM:inor +FAM:inservice +FAM:installbrain +FAM:installcore +FAM:installerex +FAM:installiq +FAM:installmetrix +FAM:installmonetizer +FAM:installmonster +FAM:intersad +FAM:intexdial +FAM:invent +FAM:invis +FAM:ipamor +FAM:iqiad +FAM:iresearch +FAM:irtard +FAM:itracker +FAM:jayqa +FAM:jeefo +FAM:jfpush +FAM:jiagu +FAM:jiead +FAM:jifake +FAM:jinupd +FAM:jisut +FAM:joye +FAM:joynow +FAM:jsmshider +FAM:jssms +FAM:judy +FAM:juzi +FAM:kabun +FAM:kalfere +FAM:kapratect +FAM:karagany +FAM:kasidet +FAM:katrep +FAM:kelihos +FAM:kgbspy +FAM:kidlogger +FAM:kimia +FAM:kingroot +FAM:kirko +FAM:kiser +FAM:klez +FAM:kmin +FAM:kolab +FAM:koobface +FAM:korgo +FAM:koutodoor +FAM:kovter +FAM:krefel +FAM:ksapp +FAM:kuguo +FAM:kurash +FAM:kyhub +FAM:kyview +FAM:laconic +FAM:lardlond +FAM:laroux +FAM:ldpinch +FAM:leadbolt +FAM:leapp +FAM:lemon +FAM:letv +FAM:lien +FAM:linkular +FAM:lirose +FAM:lmir +FAM:lmmob +FAM:loadmoney +FAM:loapi +FAM:lockactivity +FAM:locmg +FAM:loic +FAM:lolbot +FAM:lollipop +FAM:loodos +FAM:looper +FAM:loozfon +FAM:lotuseed +FAM:lotusid +FAM:lovefraud +FAM:loveletter +FAM:lovetrack +FAM:lovetrap +FAM:lucky +FAM:lxasj +FAM:lynep +FAM:mabezat +FAM:magiccasino +FAM:mailcab +FAM:maistealer +FAM:malwarescope +FAM:mamianune +FAM:mankess +FAM:marcher +FAM:mars +FAM:marsdaemon +FAM:mart +FAM:masplot +FAM:masspr +FAM:maxapp +FAM:mazarbot +FAM:mecor +FAM:medfos +FAM:mediafinder +FAM:mediamagnet +FAM:meftadon +FAM:megasearch +FAM:memery +FAM:menti +FAM:metasploit +FAM:mgyun +FAM:midia +FAM:migun +FAM:milipnot +FAM:minimob +FAM:mirai +FAM:mmarketpay +FAM:mmaro +FAM:mobby +FAM:mobcent +FAM:mobclick +FAM:mobeleader +FAM:mobhey +FAM:mobiad +FAM:mobidash +FAM:mobifence +FAM:mobikok +FAM:mobile +FAM:mobilepay +FAM:mobilespy +FAM:mobiletracker +FAM:mobiletx +FAM:mobwin +FAM:mocpiad +FAM:mogap +FAM:mogosec +FAM:monitorminor +FAM:monokle +FAM:moplus +FAM:morcut +FAM:morix +FAM:morto +FAM:mprt 
+FAM:mseg +FAM:mspy +FAM:mtracker +FAM:multibar +FAM:murofet +FAM:mwiam +FAM:mydoom +FAM:myfolder +FAM:myteam +FAM:mytrackp +FAM:mywebsearch +FAM:nandrobox +FAM:navbar +FAM:nawiaiad +FAM:necro +FAM:necurs +FAM:neospy +FAM:neshta +FAM:netbox +FAM:netins +FAM:netwiredrc +FAM:ngrbot +FAM:nickyspy +FAM:nitol +FAM:nivdort +FAM:nocoma +FAM:notifyer +FAM:nqshield +FAM:obtes +FAM:ocikq +FAM:odpa +FAM:oimobi +FAM:oivim +FAM:oixal +FAM:omsysd +FAM:oneclickfraud +FAM:onexuan +FAM:onlinegames +FAM:opencandy +FAM:openinstall +FAM:opfake +FAM:optix +FAM:outbrowse +FAM:oveead +FAM:paccy +FAM:palevo +FAM:pandaad +FAM:parite +FAM:patacore +FAM:paycall +FAM:pcclient +FAM:penetho +FAM:penguin +FAM:perflogger +FAM:perkel +FAM:petrolin +FAM:phonespy +FAM:picsys +FAM:piom +FAM:pioneer +FAM:pirates +FAM:pirminay +FAM:pjapps +FAM:placms +FAM:plankton +FAM:plemood +FAM:plugx +FAM:poison +FAM:pokotus +FAM:ponmocup +FAM:poohad +FAM:pornapp +FAM:pornoasset +FAM:pornoblocker +FAM:pornpay +FAM:pornvideo +FAM:presenoker +FAM:prorat +FAM:prosti +FAM:pullupdate +FAM:pupy +FAM:purityscan +FAM:pushad +FAM:pushe +FAM:puxis +FAM:pykspa +FAM:qbot +FAM:qexma +FAM:qplus +FAM:qqrob +FAM:qumi +FAM:quozha +FAM:qushu +FAM:raden +FAM:ramnit +FAM:ranky +FAM:rasteal +FAM:razam +FAM:rbot +FAM:rebhip +FAM:recmads +FAM:redalert +FAM:rediassi +FAM:redmobile +FAM:redyms +FAM:reflod +FAM:refog +FAM:regon +FAM:relevantknowledge +FAM:renocide +FAM:renos +FAM:reporo +FAM:reptilicus +FAM:resharer +FAM:reveton +FAM:riltok +FAM:rimod +FAM:robtes +FAM:rodecap +FAM:rogueurl +FAM:root +FAM:rootagent +FAM:rootmaster +FAM:rootnik +FAM:rootsmart +FAM:rotexy +FAM:rufraud +FAM:rukometa +FAM:rungbu +FAM:ruskill +FAM:rusms +FAM:sacti +FAM:sacto +FAM:sadenav +FAM:sadpor +FAM:sahat +FAM:sakezon +FAM:sality +FAM:sanctionedmedia +FAM:sandr +FAM:savemy +FAM:scam +FAM:sckeylog +FAM:sdbot +FAM:seaweth +FAM:secapk +FAM:securityshield +FAM:securityxploded +FAM:senddroid +FAM:severs +FAM:sfone +FAM:shastrosms +FAM:shedun +FAM:sheridroid 
+FAM:shixot +FAM:shiz +FAM:shodi +FAM:shuame +FAM:shylock +FAM:silentboot +FAM:silentinst +FAM:silentinstaller +FAM:sillyfdc +FAM:silverpush +FAM:simbad +FAM:simbot +FAM:simda +FAM:simpatchy +FAM:simplocker +FAM:sinowal +FAM:skeeyah +FAM:skplanet +FAM:skymobi +FAM:slic +FAM:slocker +FAM:slugin +FAM:smammer +FAM:smartfortress +FAM:smcc +FAM:smforw +FAM:smishing +FAM:smsagent +FAM:smsbomber +FAM:smsbot +FAM:smscatch +FAM:smscmd +FAM:smsfakeinstall +FAM:smsgol +FAM:smskey +FAM:smspay +FAM:smsreg +FAM:smssend +FAM:smsspy +FAM:smsthief +FAM:smszombie +FAM:snowfox +FAM:socks +FAM:soft32downloader +FAM:softcnapp +FAM:softobase +FAM:softomate +FAM:softonic +FAM:softpulse +FAM:sohanad +FAM:sokmi +FAM:somoto +FAM:sopes +FAM:sosceo +FAM:soundy +FAM:spbot +FAM:speedingupmypc +FAM:spigot +FAM:spitmo +FAM:spotad +FAM:sprovider +FAM:spyagent +FAM:spyapp +FAM:spybubble +FAM:spydealer +FAM:spyeye +FAM:spyhasb +FAM:spynote +FAM:spyoo +FAM:spyset +FAM:spyzie +FAM:squarenet +FAM:stalk +FAM:stealthcell +FAM:steek +FAM:stegvob +FAM:stopsmsc +FAM:stoqx +FAM:strarpay +FAM:styricka +FAM:suaban +FAM:suggestor +FAM:supking +FAM:svpeng +FAM:swisyn +FAM:swizzor +FAM:systemmonitor +FAM:systush +FAM:sytro +FAM:tachi +FAM:talkw +FAM:taojinstar +FAM:tapcore +FAM:target +FAM:tdss +FAM:tebak +FAM:techsnab +FAM:tedroo +FAM:teebik +FAM:tekwon +FAM:telman +FAM:tenga +FAM:terkcop +FAM:tescrypt +FAM:teslacrypt +FAM:tetus +FAM:tgapp +FAM:tgpotato +FAM:tgpush +FAM:tibs +FAM:tibser +FAM:tifamily +FAM:tigerbot +FAM:tinba +FAM:tirrip +FAM:tispy +FAM:tocrenu +FAM:toga +FAM:toggle +FAM:toofan +FAM:tordow +FAM:toreoc +FAM:torjok +FAM:totap +FAM:towel +FAM:tracer +FAM:tracker +FAM:trackerfree +FAM:trackplus +FAM:trclick +FAM:tridrongo +FAM:troom +FAM:truedownloader +FAM:tufik +FAM:turkojan +FAM:tuyoopay +FAM:twetti +FAM:txing +FAM:typstu +FAM:ultima +FAM:umpay +FAM:uniblue +FAM:unruy +FAM:upatre +FAM:updtkiller +FAM:urelas +FAM:usatek +FAM:ussder +FAM:usteal +FAM:utchi +FAM:uupay +FAM:uuserv +FAM:vapsup 
+FAM:vdloader +FAM:verti +FAM:vidro +FAM:vietsms +FAM:viking +FAM:vilsel +FAM:virlock +FAM:virusdoctor +FAM:virut +FAM:viser +FAM:vittalia +FAM:vkemag +FAM:vktihs +FAM:vmvol +FAM:vnapstore +FAM:vobfus +FAM:vpsdrop +FAM:vsas +FAM:vtflooder +FAM:vundo +FAM:wabot +FAM:wajar +FAM:waledac +FAM:walex +FAM:walien +FAM:walkinwat +FAM:wallad +FAM:wannacry +FAM:wannalocker +FAM:wapomi +FAM:wapron +FAM:warezov +FAM:webalta +FAM:webkey +FAM:webprefix +FAM:whatsapp +FAM:whistlesoftware +FAM:whiteice +FAM:whitesmoke +FAM:wifikill +FAM:winactivator +FAM:winfetcher +FAM:winkad +FAM:winpump +FAM:winwebsec +FAM:wkload +FAM:wooboo +FAM:wowlik +FAM:wqmobile +FAM:wroba +FAM:wtaspin +FAM:xavierad +FAM:xinhua +FAM:xolco +FAM:xorala +FAM:xtrat +FAM:xynyin +FAM:yeahmobi +FAM:yekrand +FAM:yoga +FAM:youku +FAM:youmi +FAM:yuchanglou +FAM:yzhc +FAM:zadmo +FAM:zbot +FAM:zdtad +FAM:zegost +FAM:zenosearch +FAM:zeroaccess +FAM:zhash +FAM:zhidian +FAM:zhui +FAM:zitmo +FAM:zlob +FAM:zniu +FAM:zombbomber +FAM:ztorg +FAM:zusy +FAM:zvuzona +FAM:zwangi +FILE:bundle +FILE:corrupted +FILE:exploit +FILE:exploit:asroot +FILE:exploit:doidroot +FILE:exploit:droidrt +FILE:exploit:enoket +FILE:exploit:exploid +FILE:exploit:exynos +FILE:exploit:fakeroot +FILE:exploit:gingerbreak +FILE:exploit:gxox +FILE:exploit:master +FILE:exploit:masterkey +FILE:exploit:ratc +FILE:exploit:rootor +FILE:exploit:signaturebypass +FILE:exploit:stagefright +FILE:exploit:towelroot +FILE:fakepdf +FILE:fakepublisher +FILE:filetype:flash +FILE:filetype:html +FILE:filetype:jpeg +FILE:filetype:msoffice +FILE:filetype:msoffice:msexcel +FILE:filetype:msoffice:msword +FILE:filetype:pdf +FILE:filetype:text +FILE:iframe +FILE:iframe:iframeref +FILE:installer +FILE:installer:installmate +FILE:installer:nsis +FILE:installer:smartinstaller +FILE:installer:wiseinstaller +FILE:macro +FILE:msil +FILE:os:android +FILE:os:linux +FILE:os:mac +FILE:os:windows +FILE:packed +FILE:packed:asprotect +FILE:packed:decrypter +FILE:packed:execryptor 
+FILE:packed:expressor +FILE:packed:krunchy +FILE:packed:maskpe +FILE:packed:molebox +FILE:packed:nakedpack +FILE:packed:nspack +FILE:packed:pearmor +FILE:packed:pecompact +FILE:packed:polycrypt +FILE:packed:rcryptor +FILE:packed:themida +FILE:packed:upack +FILE:packed:vmprotect +FILE:proglang:autoit +FILE:proglang:delphi +FILE:proglang:java +FILE:proglang:java:genericgba +FILE:proglang:perl +FILE:proglang:powershell +FILE:proglang:python +FILE:proglang:visualbasic +FILE:shellcode +FILE:signed +FILE:small +FILE:testvirus +FILE:webpage +GEN:abuse +GEN:access +GEN:advml +GEN:agen +GEN:apk +GEN:appl +GEN:application +GEN:attribute +GEN:based +GEN:behav +GEN:behaveslike +GEN:bloodhound +GEN:cloud +GEN:confidence +GEN:dangerousobject +GEN:deepscan +GEN:eheur +GEN:encodefeature +GEN:file +GEN:gen +GEN:gena +GEN:generic +GEN:generickd +GEN:genericr +GEN:generik +GEN:genetic +GEN:genfamily:agent +GEN:genfamily:artemis +GEN:genfamily:badur +GEN:genfamily:barys +GEN:genfamily:dapato +GEN:genfamily:delf +GEN:genfamily:eldorado +GEN:genfamily:foreign +GEN:genfamily:graftor +GEN:genfamily:jorik +GEN:genfamily:katusha +GEN:genfamily:kazy +GEN:genfamily:krap +GEN:genfamily:mikey +GEN:genfamily:scar +GEN:genfamily:strictor +GEN:genfamily:symmi +GEN:genfamily:yakes +GEN:genmalicious +GEN:genome +GEN:hack +GEN:heur +GEN:heuristic +GEN:high +GEN:highconfidence +GEN:igeneric +GEN:kcloud +GEN:lookslike +GEN:malagent +GEN:maldroid +GEN:malicious +GEN:maltrec +GEN:malware +GEN:memscan +GEN:multi +GEN:normal +GEN:onion +GEN:optional +GEN:other +GEN:password +GEN:posible +GEN:possible +GEN:probably +GEN:program +GEN:reputation +GEN:sape +GEN:score +GEN:securityrisk +GEN:siggen +GEN:software +GEN:static +GEN:susp +GEN:suspect +GEN:suspectcrc +GEN:suspected +GEN:suspic +GEN:suspicious +GEN:symvt +GEN:threat +GEN:trojan +GEN:tsgeneric +GEN:unclassifiedmalware +GEN:undef +GEN:undefined +GEN:unknown +GEN:variant +GEN:website diff --git a/avclass2/lib/avclass2_common.py 
#!/usr/bin/env python
'''
Main AVClass class
'''

import sys
import re
import string
import logging
from collections import OrderedDict as OrdDict
from collections import namedtuple
from operator import itemgetter, attrgetter

# Set logging
log = logging.getLogger(__name__)

# Prefix to identify platform tags
platform_prefix = "FILE:os:"

# Default category for tags in taxonomy with no category
uncategorized_cat = "UNC"

SampleInfo = namedtuple('SampleInfo',
                        ['md5', 'sha1', 'sha256', 'labels', 'vt_tags'])

Tag = namedtuple('Tag', ['name', 'cat', 'path', 'prefix_l'])

# AVs to use in suffix removal
suffix_removal_av_set = {'Norman', 'Avast', 'Avira', 'Kaspersky',
                         'ESET-NOD32', 'Fortinet', 'Jiangmin', 'Comodo',
                         'GData', 'Avast', 'Sophos',
                         'TrendMicro-HouseCall', 'TrendMicro',
                         'NANO-Antivirus', 'Microsoft'}


def create_tag(s):
    ''' Create a Tag from its string representation.

        A string with at least one ':' (e.g. "FILE:os:windows") yields an
        upper-cased category (first field), a lower-cased name (last field)
        and a lower-cased prefix list (the fields in between).
        A string without ':' yields an uncategorized tag whose path is
        its lower-cased name. '''
    word_list = s.strip().split(":")
    if len(word_list) > 1:
        name = word_list[-1].lower()
        cat = word_list[0].upper()
        prefix_l = [x.lower() for x in word_list[1:-1]]
        path = ':'.join([cat] + prefix_l + [name])
    else:
        name = word_list[0].lower()
        cat = uncategorized_cat
        prefix_l = []
        path = name
    return Tag(name, cat, path, prefix_l)


class Taxonomy:
    '''
    A taxonomy of tags and generic tokens read from file
    '''
    def __init__(self, filepath):
        ''' Map tag.name | tag.path -> Tag '''
        self.__tag_map = {}
        if filepath:
            self.read_taxonomy(filepath)

    def __len__(self):
        ''' Taxonomy length is the number of tags it contains '''
        # Every tag is indexed under both its name and its path, except
        # uncategorized tags where both keys coincide. Counting distinct
        # paths is exact in both cases and always returns an int, which
        # len() and truth testing require.
        return len(set(tag.path for tag in self.__tag_map.values()))

    def is_generic(self, t):
        ''' Return true if input is generic, false otherwise '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return tag.cat == "GEN"
        return False

    def is_tag(self, t):
        ''' Return true if input is tag, false otherwise '''
        return t in self.__tag_map

    def add_tag(self, s, override=False):
        ''' Add tag to taxonomy
            If tag already exists with different path,
            only replaces if override True '''
        tag = create_tag(s)
        t = self.__tag_map.get(tag.name, None)
        if t and (t.path != tag.path):
            if not override:
                return
            log.warning("[Taxonomy] Replacing %s with %s\n" % (
                        t.path, tag.path))
            # Drop the stale path key; the name key is overwritten below
            self.__tag_map.pop(t.path, None)
        log.info("[Taxonomy] Adding tag %s" % s)
        self.__tag_map[tag.name] = tag
        self.__tag_map[tag.path] = tag
        return

    def remove_tag(self, t):
        ''' Remove tag from taxonomy. Returns 1 if removed, zero if unknown '''
        tag = self.__tag_map.get(t, None)
        if tag:
            log.info("[Taxonomy] Removing tag: %s" % tag.path)
            # pop() instead of del: for uncategorized tags name == path,
            # so the second removal would otherwise raise KeyError
            self.__tag_map.pop(tag.name, None)
            self.__tag_map.pop(tag.path, None)
            return 1
        return 0

    def get_category(self, t):
        ''' Return category of input tag, UNK if not a tag '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return tag.cat
        return "UNK"

    def get_path(self, t):
        ''' Return full path for given tag, or UNK:t if not a tag '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return tag.path
        return "UNK:" + t

    def get_prefix_l(self, t):
        ''' Return prefix list for given tag, or empty list if not a tag '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return tag.prefix_l
        return []

    def get_prefix(self, t):
        ''' Return prefix for given tag,
            or empty string if not a tag.
            NOTE: for a known tag this returns the prefix list (the same
            value as get_prefix_l); kept as is for backward compatibility '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return tag.prefix_l
        # t is a plain string here, it has no path to derive a prefix from
        return ""

    def get_depth(self, t):
        ''' Return depth of tag in taxonomy.
            Returns zero if tag not in taxonomy.
            A normal tag CAT:name has depth two '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return len(tag.prefix_l) + 2
        return 0

    def get_info(self, t):
        ''' Return (path,category) for given tag, or UNK:t if not a tag '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return tag.path, tag.cat
        return "UNK:" + t, "UNK"

    def expand(self, t):
        ''' Return list of entries in the prefix list of the given tag
            that are themselves tags in the taxonomy '''
        tag = self.__tag_map.get(t, None)
        if tag:
            return [p for p in tag.prefix_l if p in self.__tag_map]
        return []

    def platform_tags(self):
        ''' Returns set with names of platform tags in taxonomy '''
        return set(tag.name for tag in self.__tag_map.values()
                   if tag.path.startswith(platform_prefix))

    def overlaps(self, t1, t2):
        ''' Returns true if the path of the given tags overlaps,
            i.e., one tag appears in the prefix list of the other '''
        m1 = self.get_prefix_l(t1)
        m2 = self.get_prefix_l(t2)
        return (t1 in m2) or (t2 in m1)

    def remove_overlaps(self, l):
        ''' Returns list with overlapping tags removed.
            Deeper tags are kept in preference to their ancestors '''
        if not l:
            return l
        # Ascending (depth, tag) order; pop() then yields deepest first
        pair_l = sorted([(self.get_depth(t), t) for t in l])
        out_l = [pair_l.pop()[1]]
        while pair_l:
            t = pair_l.pop()[1]
            if not any(self.overlaps(t, e) for e in out_l):
                out_l.append(t)
        return out_l

    def read_taxonomy(self, filepath):
        ''' Read taxonomy from given file.
            Blank lines and lines starting with # are ignored '''
        with open(filepath, 'r') as fd:
            for line in fd:
                entry = line.strip()
                # Skip whitespace-only lines too, they are not tags
                if (not entry) or entry.startswith('#'):
                    continue
                self.add_tag(entry)
        return

    def to_file(self, filepath):
        ''' Output sorted taxonomy to given file, one tag path per line '''
        # Each tag is stored under up to two keys; emit each path once
        path_l = sorted(set(tag.path for tag in self.__tag_map.values()))
        with open(filepath, 'w') as fd:
            for path in path_l:
                fd.write(path + "\n")


class Rules:
    '''
    Rules are src -> dst1, dst2, ... relations
    '''
    def __init__(self, filepath):
        ''' Map src -> set(dst) '''
        self._rmap = {}
        if filepath:
            self.read_rules(filepath)

    def __len__(self):
        ''' Length is number of rules, i.e., number of src '''
        return len(self._rmap)

    def add_rule(self, src, dst_l, overwrite=False):
        ''' Add rule. If rule exists:
            if overwrite==True, replace destination list
            else append dst_l to current target set.
            Sources and destinations are stored by tag name '''
        src_name = create_tag(src).name
        # Remove src from dst_l if it exists (also after normalization,
        # so that FAM:x -> x does not create a self-referencing rule).
        # A real list is built so that the emptiness check below works.
        name_l = []
        for dst in dst_l:
            if dst == src:
                continue
            dst_name = create_tag(dst).name
            if dst_name != src_name:
                name_l.append(dst_name)
        # If no destinations, nothing to do
        if not name_l:
            return
        log.info("[Rules] Adding %s -> %s" % (src, name_l))
        if overwrite:
            self._rmap[src_name] = set(name_l)
        else:
            self._rmap.setdefault(src_name, set()).update(name_l)
        return

    def remove_rule(self, src):
        ''' Remove rule for given src.
            Returns 1 if removed, zero if there was no such rule '''
        if src in self._rmap:
            log.info("[Rules] Removing rule: %s -> %s" % (
                     src, self._rmap[src]))
            del self._rmap[src]
            return 1
        return 0

    def get_dst(self, src):
        ''' Returns dst list for given src, or empty list if no expansion '''
        return list(self._rmap.get(src, []))

    def read_rules(self, filepath):
        ''' Read rules from given file.
            Each rule line is: src dst1 [dst2 ...], whitespace separated.
            Comment lines (#), blank lines and lines with a single
            token are ignored '''
        with open(filepath, 'r') as fd:
            for line in fd:
                if line.startswith('#') or line == '\n':
                    continue
                word_list = line.strip().split()
                if len(word_list) > 1:
                    self.add_rule(word_list[0], word_list[1:])
        return

    def to_file(self, filepath, taxonomy=None):
        ''' Output sorted rules to given file
            If taxonomy is provided, it outputs full tag path '''
        with open(filepath, 'w') as fd:
            for src, dst_set in sorted(self._rmap.items()):
                dst_l = sorted(dst_set)
                if taxonomy:
                    src = taxonomy.get_path(src)
                    dst_l = [taxonomy.get_path(t) for t in dst_l]
                fd.write("%s\t%s\n" % (src, '\t'.join(dst_l)))

    def expand_src_destinations(self, src):
        ''' Return destination set for given src after recursively
            following any rules for destinations.
            Only destinations without their own rule are returned.
            The stored rules are not modified '''
        # Work on a copy: popping from the stored set would erase the rule
        pending = set(self._rmap.get(src, ()))
        # Visited set guarantees termination if rules contain a cycle
        seen = set(pending)
        seen.add(src)
        out = set()
        while pending:
            dst = pending.pop()
            next_set = self._rmap.get(dst)
            if next_set:
                for e in next_set:
                    if e not in seen:
                        seen.add(e)
                        pending.add(e)
            else:
                out.add(dst)
        return out

    def expand_all_destinations(self):
        ''' Replace the destinations of every rule with the result of
            recursively following any rules for those destinations '''
        # Snapshot the keys since the map is updated inside the loop
        for src in list(self._rmap):
            self._rmap[src] = self.expand_src_destinations(src)


class Tagging(Rules):
    '''
    Tagging rules have src UNK and dst in taxonomy
    '''
    def __init__(self, filepath):
        Rules.__init__(self, filepath)

    def validate(self, taxonomy):
        ''' Check that tags in tagging rules are in given taxonomy.
            Destinations missing from the taxonomy are reported on stdout '''
        for tok, tag_l in self._rmap.items():
            for t in tag_l:
                if not taxonomy.is_tag(t):
                    sys.stdout.write("[Tagging] %s not in taxonomy\n" % t)


class Expansion(Rules):
    '''
    Expansion rules have src and dst in taxonomy and
    src.category != dst.category
    '''
    def __init__(self, filepath):
        Rules.__init__(self, filepath)

    def validate(self, taxonomy):
        ''' Check that tags in expansion rules are in given taxonomy.
            Sources and destinations missing from the taxonomy are
            reported on stdout '''
        for src, dst_set in self._rmap.items():
            if not taxonomy.is_tag(src):
                sys.stdout.write("[Expansion] %s not in taxonomy\n" % src)
            for dst in dst_set:
                if not taxonomy.is_tag(dst):
                    sys.stdout.write("[Expansion] %s not in taxonomy\n" % dst)
class AvLabels:
    '''
    Class to operate on AV labels,
    such as extracting the most likely family name.
    '''
    def __init__(self, tag_file, exp_file = None, tax_file = None,
                 av_file = None, aliasdetect=False):
        ''' Load the taxonomy, tagging rules and expansion rules from the
            given files and, if av_file is given, the set of AV engines
            whose labels should be considered '''
        # Read taxonomy
        self.taxonomy = Taxonomy(tax_file)
        # Read tag rules
        self.tagging = Tagging(tag_file)
        # Read expansion rules
        self.expansions = Expansion(exp_file)
        # Read AV engines
        self.avs = self.read_avs(av_file) if av_file else None
        # Alias statistics initialization
        self.aliasdetect = aliasdetect

    @staticmethod
    def read_avs(avs_file):
        '''Read AV engine set from given file (one engine name per line)'''
        with open(avs_file) as fd:
            return set(line.strip() for line in fd)

    @staticmethod
    def _clean_label(label):
        ''' Return given label without non-printable characters and
            without leading or trailing whitespace '''
        return ''.join(c for c in label if c in string.printable).strip()

    @staticmethod
    def get_sample_info_lb(vt_rep):
        '''Parse and extract sample information from JSON line
           Returns a SampleInfo named tuple
           Raises KeyError if a hash or the av_labels field is missing
        '''
        return SampleInfo(vt_rep['md5'], vt_rep['sha1'], vt_rep['sha256'],
                          vt_rep['av_labels'], [])

    @staticmethod
    def get_sample_info_vt_v2(vt_rep):
        '''Parse and extract sample information from JSON line
           Returns a SampleInfo named tuple, or None if the report
           has no scan results or lacks any of the hashes
        '''
        # Obtain scan results, if available
        try:
            scans = vt_rep['scans']
            md5 = vt_rep['md5']
            sha1 = vt_rep['sha1']
            sha256 = vt_rep['sha256']
        except KeyError:
            return None
        # Obtain labels from scan results
        label_pairs = []
        for av, res in scans.items():
            if not res['detected']:
                continue
            label = res['result']
            # Skip missing labels, as done for v3 reports
            if label is None:
                continue
            label_pairs.append((av, AvLabels._clean_label(label)))
        # Obtain VT tags, if available
        vt_tags = vt_rep.get('tags', [])

        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)

    @staticmethod
    def get_sample_info_vt_v3(vt_rep):
        '''Parse and extract sample information from JSON line
           Returns a SampleInfo named tuple, or None if the report
           has no scan results or lacks any of the hashes
        '''
        # Obtain scan results, if available
        try:
            attributes = vt_rep['data']['attributes']
            scans = attributes['last_analysis_results']
            md5 = attributes['md5']
            sha1 = attributes['sha1']
            sha256 = attributes['sha256']
        except KeyError:
            return None
        # Obtain labels from scan results
        label_pairs = []
        for av, res in scans.items():
            label = res['result']
            # Engines without a detection report a null result
            if label is not None:
                label_pairs.append((av, AvLabels._clean_label(label)))
        # Obtain VT tags, if available
        vt_tags = attributes.get('tags', [])

        return SampleInfo(md5, sha1, sha256, label_pairs, vt_tags)


    @staticmethod
    def is_pup(tag_pairs, taxonomy, threshold=0.5):
        '''This function classifies the sample as PUP or not
           by checking if highest ranked CLASS tag contains "grayware"
           and is above a threshold.
           tag_pairs is a list of (tag, count) pairs; the count of the
           first pair is taken as the maximum count, so the list is
           expected to be ranked by decreasing count.
           threshold is the minimum fraction of the maximum count that
           the grayware tag needs.
           Return:
              None if there are no tags
              True if the first CLASS tag is grayware with enough count
              False otherwise
        '''
        # If no tags, the sample cannot be classified
        if len(tag_pairs) < 1:
            return None
        max_ctr = tag_pairs[0][1]
        for (tag, ctr) in tag_pairs:
            (path, cat) = taxonomy.get_info(tag)
            if cat != "CLASS":
                continue
            # Only the first CLASS tag found decides the result
            if "grayware" in path:
                return (float(ctr) >= float(max_ctr)*threshold)
            return False
        return False

    @staticmethod
    def __remove_suffixes(av_name, label):
        '''Remove AV specific suffixes from given label
           Returns updated label'''

        # Truncate after last '.'
        if av_name in suffix_removal_av_set:
            label = label.rsplit('.', 1)[0]

        # Truncate after last '.'
        # if suffix only contains digits or uppercase (no lowercase) chars
        if av_name == 'AVG':
            tokens = label.rsplit('.', 1)
            if len(tokens) > 1 and re.match("^[A-Z0-9]+$", tokens[1]):
                label = tokens[0]

        # Truncate after last '!'
        if av_name == 'Agnitum':
            label = label.rsplit('!', 1)[0]

        return label


    def get_label_tags(self, label, hashes):
        ''' Return set of tags in given label
            Tokenizes label, filters unneeded tokens, and
            applies tagging rules.
            hashes is the list of hashes of the sample; missing hashes
            (None or empty) are ignored '''

        # Initialize set of tags to return
        # We use a set to avoid duplicate tokens in the same AV label
        # This avoids "potentially unwanted" contributing twice BEH:pup
        tags = set()

        # If empty label, nothing to do
        if not label:
            return tags

        # Tokens are lowercased below, so compare against lowercase hashes
        hash_l = [h.lower() for h in hashes if h]

        # Split label into tokens and process each token
        for token in re.split("[^0-9a-zA-Z]", label):
            # Convert token to lowercase and remove digits at the end
            # (tokens only contain ASCII letters and digits at this point)
            token = token.lower().rstrip(string.digits)

            # Nothing left to tag
            if not token:
                continue

            # Ignore token if prefix of a hash of the sample
            # Most AVs use MD5 prefixes in labels,
            # but we check SHA1 and SHA256 as well
            if any(h.startswith(token) for h in hash_l):
                continue

            # Ignore generic tokens
            if self.taxonomy.is_generic(token):
                continue

            # Apply tagging rule
            dst_l = self.tagging.get_dst(token)
            if dst_l:
                # Ignore generic tokens
                for t in dst_l:
                    if not self.taxonomy.is_generic(t):
                        tags.add(t)
            # Add token if longer than 3 characters and no tagging rule
            elif len(token) > 3:
                tags.add(token)

        # Return tags
        return tags


    def __expand(self, tag_set):
        ''' Return expanded set of tags: the given tags plus the targets of
            their expansion rules and their implicit taxonomy expansions '''
        ret = set()
        for t in tag_set:
            # Include tag
            ret.add(t)

            # Include target of expansion rule in output
            ret.update(self.expansions.get_dst(t))

            # Include implicit expansions in taxonomy
            ret.update(self.taxonomy.expand(t))

        return ret

    def get_sample_tags(self, sample_info):
        ''' Returns dictionary tag -> AV list of tags for the given sample '''

        # Whitelist the AVs to filter the ones with meaningful labels
        av_whitelist = self.avs
        # Hashes of the sample, used to discard tokens that are hash prefixes
        hashes = [ sample_info.md5, sample_info.sha1, sample_info.sha256 ]
        # Initialize auxiliary data structures
        duplicates = set()
        av_dict = {}

        # Process each AV label
        for (av_name, label) in sample_info.labels:
            # If empty label, nothing to do
            if not label:
                continue

            ################
            # AV selection #
            ################
            if av_whitelist and av_name not in av_whitelist:
                continue

            #####################
            # Duplicate removal #
            #####################

            # Emsisoft uses same label as
            # GData/ESET-NOD32/BitDefender/Ad-Aware/MicroWorld-eScan,
            # but suffixes ' (B)' to their label. Remove the suffix.
            if label.endswith(' (B)'):
                label = label[:-4]

            # F-Secure uses Avira's engine since Nov. 2018
            # but prefixes 'Malware.' to Avira's label. Remove the prefix.
            if label.startswith('Malware.'):
                label = label[8:]

            # Other engines often use exactly the same label, e.g.,
            #   AVG/Avast
            #   K7Antivirus/K7GW
            #   Kaspersky/ZoneAlarm

            # If we have seen the exact same label before, skip
            if label in duplicates:
                continue
            # If not, we add it to duplicates
            duplicates.add(label)

            ##################
            # Suffix removal #
            ##################
            label = self.__remove_suffixes(av_name, label)

            ########################################################
            # Tokenization and tagging                             #
            ########################################################
            tags = self.get_label_tags(label, hashes)

            ########################################################
            # Expansions                                           #
            ########################################################
            # NOTE: Avoiding to do expansion when aliases
            if self.aliasdetect:
                expanded_tags = tags
            else:
                expanded_tags = self.__expand(tags)

            ########################################################
            # Stores information that relates AV vendors with tags #
            ########################################################
            for t in expanded_tags:
                av_dict.setdefault(t, []).append(av_name)

        return av_dict

    def rank_tags(self, av_dict, threshold=1):
        ''' Return list of (tag, confidence) ranked by decreasing confidence
            and filter tags with less or equal threshold confidence.
            The confidence of a tag is the number of AVs in its list;
            ties are ordered by decreasing tag name '''

        pairs = ((t, len(avs)) for (t,avs) in av_dict.items()
                    if len(avs) > threshold)
        return sorted(pairs, key=itemgetter(1,0), reverse=True)
import sys


def tp_fp_fn(CORRECT_SET, GUESS_SET):
    """
    INPUT: collection with the elements in the cluster from the ground truth
    (CORRECT_SET) and collection with the elements from the estimated cluster
    (GUESS_SET). Both must support iteration and membership tests
    (e.g., sets or dictionaries).

    OUTPUT: number of True Positives (elements in both clusters), False
    Positives (elements only in the GUESS_SET), False Negatives (elements
    only in the CORRECT_SET).
    """
    # True Positives (elements in both clusters)
    tp = sum(1 for elem in GUESS_SET if elem in CORRECT_SET)
    # False Positives (elements only in the "estimated cluster")
    fp = sum(1 for elem in GUESS_SET if elem not in CORRECT_SET)
    # False Negatives (elements only in the "correct cluster")
    fn = sum(1 for elem in CORRECT_SET if elem not in GUESS_SET)
    return tp, fp, fn


def _reverse_clustering(clustering):
    """Return dictionary "cluster_id:set of elements" for the given
    "element:cluster_id" dictionary."""
    rev_dict = {}
    for element, cluster_id in clustering.items():
        rev_dict.setdefault(cluster_id, set()).add(element)
    return rev_dict


def eval_precision_recall_fmeasure(GROUNDTRUTH_DICT, ESTIMATED_DICT):
    """
    INPUT: dictionary with the mapping "element:cluster_id" for both the ground
    truth and the ESTIMATED_DICT clustering. Every element of ESTIMATED_DICT
    must also be in GROUNDTRUTH_DICT, otherwise KeyError is raised.

    OUTPUT: average values of Precision, Recall and F-Measure, as percentages,
    computed over the elements of ESTIMATED_DICT. Returns (0.0, 0.0, 0.0) if
    ESTIMATED_DICT is empty. Progress is reported on standard error.
    """
    total = len(ESTIMATED_DICT)
    # Nothing to evaluate: avoid dividing by zero below
    if total == 0:
        return 0.0, 0.0, 0.0

    # eval: precision, recall, f-measure
    tmp_precision = 0.0
    tmp_recall = 0.0

    # build reverse dictionary of ESTIMATED_DICT
    rev_est_dict = _reverse_clustering(ESTIMATED_DICT)

    # build reverse dictionary of GROUNDTRUTH_DICT
    gt_rev_dict = _reverse_clustering(GROUNDTRUTH_DICT)

    # Precision and recall only depend on the pair of clusters of an element,
    # so they are computed once per (correct cluster, guessed cluster) pair
    pair_scores = {}

    sys.stderr.write('Calculating precision and recall\n')

    # For each element
    for counter, element in enumerate(ESTIMATED_DICT):

        # Print progress
        if counter % 1000 == 0:
            sys.stderr.write('\r%d out of %d' % (counter, total))
            sys.stderr.flush()

        # Get elements in the same cluster (for "ESTIMATED_DICT cluster")
        guess_cluster_id = ESTIMATED_DICT[element]

        # Get the list of elements in the same cluster ("correct cluster")
        correct_cluster_id = GROUNDTRUTH_DICT[element]

        pair = (correct_cluster_id, guess_cluster_id)
        if pair not in pair_scores:
            # Calculate TP, FP, FN
            tp, fp, fn = tp_fp_fn(gt_rev_dict[correct_cluster_id],
                                  rev_est_dict[guess_cluster_id])
            # The element itself is in both clusters, so tp >= 1
            pair_scores[pair] = (1.0*tp/(tp+fp), 1.0*tp/(tp+fn))
        p, r = pair_scores[pair]

        tmp_precision += p
        tmp_recall += r
    sys.stderr.write('\r%d out of %d' % (total, total))
    sys.stderr.write('\n')
    precision = 100.0*tmp_precision/total
    recall = 100.0*tmp_recall/total
    if precision + recall == 0:
        return precision, recall, 0.0
    fmeasure = (2*precision*recall)/(precision+recall)
    return precision, recall, fmeasure


if __name__ == "__main__":

    # The ground truth.
    # Dictionary with mapping: "element : cluster_id".
    diz_grth = {
        "a": 1,
        "b": 1,
        "c": 2,
        "d": 3
    }

    # An example of an "estimated cluster".
    # Dictionary with mapping: "element : cluster_id".
    diz_estim = {
        "a": 66,
        "b": 'malware',
        "c": 'goodware',
        "d": 'trojan'
    }

    # An example of an "estimated cluster": same partitioning as for the ground
    # truth, but just different cluster labels. Precision == Recall ==
    # F-Measure == 100%.
    # Dictionary with mapping: "element : cluster_id".
    diz_estim_grth = {
        "a": 2,
        "b": 2,
        "c": 66,
        "d": 9
    }

    # a sample where estimated != ground truth
    sys.stdout.write("Ground truth\n")
    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
    for k, v in diz_grth.items():
        sys.stdout.write("%8s --> %10s\n" % (k, v))
    sys.stdout.write("\nEstimated clustering\n")
    sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID"))
    for k, v in diz_estim.items():
        sys.stdout.write("%8s --> %10s\n" % (k, v))
    # precision, recall, f-measure
    p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim)
    sys.stdout.write("\nPrecison: %s%%\n" % p)
    sys.stdout.write("Recall: %s%%\n" % r)
    sys.stdout.write("F-Measure: %s%%\n" % f)
Precision == Recall == + # F-Measure == 100%. + # Dictionary with mapping: "element : cluster_id". + diz_estim_grth = { + "a": 2, + "b": 2, + "c": 66, + "d": 9 + } + + # a sample where estimated != ground truth + sys.stdout.write("Ground truth\n") + sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID")) + for k, v in diz_grth.items(): + sys.stdout.write("%8s --> %10s\n" % (k, v)) + sys.stdout.write("\nEstimated clustering\n") + sys.stdout.write("%8s --> %10s\n" % ("Element", "Cluster_ID")) + for k, v in diz_estim.items(): + sys.stdout.write("%8s --> %10s\n" % (k, v)) + # precision, recall, f-measure + p, r, f = eval_precision_recall_fmeasure(diz_grth, diz_estim) + sys.stdout.write("\nPrecison: %s%%\n" % p) + sys.stdout.write("Recall: %s%%\n" % r) + sys.stdout.write("F-Measure: %s%%\n" % f) + diff --git a/data/malheurReference_gt.tsv b/examples/malheurReference_gt.tsv similarity index 100% rename from data/malheurReference_gt.tsv rename to examples/malheurReference_gt.tsv diff --git a/data/malheurReference_lb.json b/examples/malheurReference_lb.json similarity index 100% rename from data/malheurReference_lb.json rename to examples/malheurReference_lb.json diff --git a/examples/vtv3_sample.json b/examples/vtv3_sample.json new file mode 100644 index 0000000..08dfe77 --- /dev/null +++ b/examples/vtv3_sample.json @@ -0,0 +1 @@ +{ "data": { "attributes": { "creation_date": 1584397860, "exiftool": { "Author": "Tatyana", "Characters": 1896, "CharactersWithSpaces": 2224, "CreateDate": "2020:03:16 22:31:00", "FileType": "RTF", "FileTypeExtension": "rtf", "InternalVersionNumber": "57433", "LastModifiedBy": "apcach E", "MIMEType": "text/rtf", "ModifyDate": "2020:03:16 22:31:00", "Pages": 1, "RevisionNumber": "2", "TotalEditTime": "1 minute", "Warning": "Unsupported RTF encoding cp936. 
Will assume Latin.", "Words": 332 }, "first_submission_date": 1584418873, "last_analysis_date": 1584939766, "last_analysis_results": { "ALYac": { "category": "malicious", "engine_name": "ALYac", "engine_update": "20200323", "engine_version": "1.1.1.5", "method": "blacklist", "result": "Exploit.CVE-2017-11882" }, "APEX": { "category": "type-unsupported", "engine_name": "APEX", "engine_update": "20200322", "engine_version": "6.3", "method": "blacklist", "result": null }, "AVG": { "category": "malicious", "engine_name": "AVG", "engine_update": "20200323", "engine_version": "18.4.3895.0", "method": "blacklist", "result": "Other:Malware-gen [Trj]" }, "Acronis": { "category": "type-unsupported", "engine_name": "Acronis", "engine_update": "20200315", "engine_version": "1.1.1.73", "method": "blacklist", "result": null }, "Ad-Aware": { "category": "malicious", "engine_name": "Ad-Aware", "engine_update": "20200323", "engine_version": "3.0.5.370", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "AegisLab": { "category": "malicious", "engine_name": "AegisLab", "engine_update": "20200323", "engine_version": "4.2", "method": "blacklist", "result": "Hacktool.RTF.CVE-2018-0802.3!c" }, "AhnLab-V3": { "category": "undetected", "engine_name": "AhnLab-V3", "engine_update": "20200323", "engine_version": "3.17.3.26870", "method": "blacklist", "result": null }, "Alibaba": { "category": "type-unsupported", "engine_name": "Alibaba", "engine_update": "20190527", "engine_version": "0.3.0.5", "method": "blacklist", "result": null }, "Antiy-AVL": { "category": "malicious", "engine_name": "Antiy-AVL", "engine_update": "20200323", "engine_version": "3.0.0.1", "method": "blacklist", "result": "Trojan[Exploit]/RTF.Obscure.Gen" }, "Arcabit": { "category": "malicious", "engine_name": "Arcabit", "engine_update": "20200323", "engine_version": "1.0.0.870", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "Avast": { "category": "malicious", "engine_name": "Avast", "engine_update": 
"20200323", "engine_version": "18.4.3895.0", "method": "blacklist", "result": "Other:Malware-gen [Trj]" }, "Avast-Mobile": { "category": "undetected", "engine_name": "Avast-Mobile", "engine_update": "20200319", "engine_version": "200319-00", "method": "blacklist", "result": null }, "Avira": { "category": "malicious", "engine_name": "Avira", "engine_update": "20200323", "engine_version": "8.3.3.8", "method": "blacklist", "result": "EXP/CVE-2017-11882.zfknn" }, "Baidu": { "category": "undetected", "engine_name": "Baidu", "engine_update": "20190318", "engine_version": "1.0.0.2", "method": "blacklist", "result": null }, "BitDefender": { "category": "malicious", "engine_name": "BitDefender", "engine_update": "20200323", "engine_version": "7.2", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "BitDefenderTheta": { "category": "undetected", "engine_name": "BitDefenderTheta", "engine_update": "20200311", "engine_version": "7.2.37796.0", "method": "blacklist", "result": null }, "Bkav": { "category": "undetected", "engine_name": "Bkav", "engine_update": "20200321", "engine_version": "1.3.0.9899", "method": "blacklist", "result": null }, "CAT-QuickHeal": { "category": "malicious", "engine_name": "CAT-QuickHeal", "engine_update": "20200323", "engine_version": "14.00", "method": "blacklist", "result": "RTF.Agent.37108" }, "CMC": { "category": "undetected", "engine_name": "CMC", "engine_update": "20190321", "engine_version": "1.1.0.977", "method": "blacklist", "result": null }, "ClamAV": { "category": "malicious", "engine_name": "ClamAV", "engine_update": "20200322", "engine_version": "0.102.2.0", "method": "blacklist", "result": "Rtf.Dropper.Agent-7624526-0" }, "Comodo": { "category": "undetected", "engine_name": "Comodo", "engine_update": "20200323", "engine_version": "32234", "method": "blacklist", "result": null }, "CrowdStrike": { "category": "type-unsupported", "engine_name": "CrowdStrike", "engine_update": "20180202", "engine_version": "1.0", "method": 
"blacklist", "result": null }, "Cybereason": { "category": "type-unsupported", "engine_name": "Cybereason", "engine_update": "20180308", "engine_version": null, "method": "blacklist", "result": null }, "Cylance": { "category": "type-unsupported", "engine_name": "Cylance", "engine_update": "20200323", "engine_version": "2.3.1.101", "method": "blacklist", "result": null }, "Cyren": { "category": "malicious", "engine_name": "Cyren", "engine_update": "20200323", "engine_version": "6.2.2.2", "method": "blacklist", "result": "RTF/CVE1711882" }, "DrWeb": { "category": "malicious", "engine_name": "DrWeb", "engine_update": "20200323", "engine_version": "7.0.44.12030", "method": "blacklist", "result": "Exploit.Rtf.CVE2012-0158" }, "ESET-NOD32": { "category": "malicious", "engine_name": "ESET-NOD32", "engine_update": "20200323", "engine_version": "21042", "method": "blacklist", "result": "Win32/Exploit.CVE-2017-11882.AWP" }, "Emsisoft": { "category": "malicious", "engine_name": "Emsisoft", "engine_update": "20200323", "engine_version": "2018.12.0.1641", "method": "blacklist", "result": "Trojan.Agent.ENJC (B)" }, "Endgame": { "category": "type-unsupported", "engine_name": "Endgame", "engine_update": "20200226", "engine_version": "3.0.17", "method": "blacklist", "result": null }, "F-Prot": { "category": "malicious", "engine_name": "F-Prot", "engine_update": "20200323", "engine_version": "4.7.1.166", "method": "blacklist", "result": "RTF/CVE1711882" }, "F-Secure": { "category": "malicious", "engine_name": "F-Secure", "engine_update": "20200323", "engine_version": "12.0.86.52", "method": "blacklist", "result": "Exploit.EXP/CVE-2017-11882.zfknn" }, "FireEye": { "category": "undetected", "engine_name": "FireEye", "engine_update": "20200316", "engine_version": "32.31.0.0", "method": "blacklist", "result": null }, "Fortinet": { "category": "malicious", "engine_name": "Fortinet", "engine_update": "20200323", "engine_version": "6.2.142.0", "method": "blacklist", "result": 
"RTF/Dropper.A879!tr" }, "GData": { "category": "malicious", "engine_name": "GData", "engine_update": "20200323", "engine_version": "A:25.25222B:26.18109", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "Ikarus": { "category": "malicious", "engine_name": "Ikarus", "engine_update": "20200322", "engine_version": "0.1.5.2", "method": "blacklist", "result": "Exploit.CVE-2017-11882" }, "Invincea": { "category": "type-unsupported", "engine_name": "Invincea", "engine_update": "20200219", "engine_version": "6.3.6.26157", "method": "blacklist", "result": null }, "Jiangmin": { "category": "undetected", "engine_name": "Jiangmin", "engine_update": "20200322", "engine_version": "16.0.100", "method": "blacklist", "result": null }, "K7AntiVirus": { "category": "undetected", "engine_name": "K7AntiVirus", "engine_update": "20200323", "engine_version": "11.100.33608", "method": "blacklist", "result": null }, "K7GW": { "category": "undetected", "engine_name": "K7GW", "engine_update": "20200322", "engine_version": "11.100.33607", "method": "blacklist", "result": null }, "Kaspersky": { "category": "malicious", "engine_name": "Kaspersky", "engine_update": "20200323", "engine_version": "15.0.1.13", "method": "blacklist", "result": "HEUR:Exploit.RTF.CVE-2018-0802.gen" }, "Kingsoft": { "category": "undetected", "engine_name": "Kingsoft", "engine_update": "20200323", "engine_version": "2013.8.14.323", "method": "blacklist", "result": null }, "MAX": { "category": "undetected", "engine_name": "MAX", "engine_update": "20200323", "engine_version": "2019.9.16.1", "method": "blacklist", "result": null }, "Malwarebytes": { "category": "undetected", "engine_name": "Malwarebytes", "engine_update": "20200323", "engine_version": "3.6.4.335", "method": "blacklist", "result": null }, "MaxSecure": { "category": "undetected", "engine_name": "MaxSecure", "engine_update": "20200320", "engine_version": "1.0.0.1", "method": "blacklist", "result": null }, "McAfee": { "category": "malicious", 
"engine_name": "McAfee", "engine_update": "20200322", "engine_version": "6.0.6.653", "method": "blacklist", "result": "RTFObfustream.a!5E31D16D6BF3" }, "McAfee-GW-Edition": { "category": "malicious", "engine_name": "McAfee-GW-Edition", "engine_update": "20200322", "engine_version": "v2017.3010", "method": "blacklist", "result": "RTFObfustream.a!5E31D16D6BF3" }, "MicroWorld-eScan": { "category": "malicious", "engine_name": "MicroWorld-eScan", "engine_update": "20200323", "engine_version": "14.0.409.0", "method": "blacklist", "result": "Trojan.Agent.ENJC" }, "Microsoft": { "category": "malicious", "engine_name": "Microsoft", "engine_update": "20200323", "engine_version": "1.1.16800.2", "method": "blacklist", "result": "Exploit:O97M/CVE-2017-11882.G!MTB" }, "NANO-Antivirus": { "category": "malicious", "engine_name": "NANO-Antivirus", "engine_update": "20200323", "engine_version": "1.0.134.25032", "method": "blacklist", "result": "Exploit.Rtf.Heuristic-rtf.dinbqn" }, "Paloalto": { "category": "type-unsupported", "engine_name": "Paloalto", "engine_update": "20200323", "engine_version": "1.0", "method": "blacklist", "result": null }, "Panda": { "category": "undetected", "engine_name": "Panda", "engine_update": "20200322", "engine_version": "4.6.4.2", "method": "blacklist", "result": null }, "Qihoo-360": { "category": "malicious", "engine_name": "Qihoo-360", "engine_update": "20200323", "engine_version": "1.0.0.1120", "method": "blacklist", "result": "heur.rtf.obfuscated.1" }, "Rising": { "category": "undetected", "engine_name": "Rising", "engine_update": "20200322", "engine_version": "25.0.0.24", "method": "blacklist", "result": null }, "SUPERAntiSpyware": { "category": "undetected", "engine_name": "SUPERAntiSpyware", "engine_update": "20200317", "engine_version": "5.6.0.1032", "method": "blacklist", "result": null }, "Sangfor": { "category": "undetected", "engine_name": "Sangfor", "engine_update": "20200320", "engine_version": "1.0", "method": "blacklist", "result": 
null }, "SentinelOne": { "category": "type-unsupported", "engine_name": "SentinelOne", "engine_update": "20200220", "engine_version": "2.0.0.2603", "method": "blacklist", "result": null }, "Sophos": { "category": "undetected", "engine_name": "Sophos", "engine_update": "20200323", "engine_version": "4.98.0", "method": "blacklist", "result": null }, "Symantec": { "category": "malicious", "engine_name": "Symantec", "engine_update": "20200322", "engine_version": "1.11.0.0", "method": "blacklist", "result": "Trojan.Mdropper" }, "SymantecMobileInsight": { "category": "type-unsupported", "engine_name": "SymantecMobileInsight", "engine_update": "20200210", "engine_version": "2.0", "method": "blacklist", "result": null }, "TACHYON": { "category": "malicious", "engine_name": "TACHYON", "engine_update": "20200323", "engine_version": "2020-03-23.01", "method": "blacklist", "result": "Trojan-Exploit/RTF.CVE-2018-0798" }, "Tencent": { "category": "malicious", "engine_name": "Tencent", "engine_update": "20200323", "engine_version": "1.0.0.1", "method": "blacklist", "result": "Win32.Exploit.Cve-2018-0802.Sxen" }, "Trapmine": { "category": "type-unsupported", "engine_name": "Trapmine", "engine_update": "20200123", "engine_version": "3.2.22.914", "method": "blacklist", "result": null }, "TrendMicro": { "category": "malicious", "engine_name": "TrendMicro", "engine_update": "20200323", "engine_version": "11.0.0.1006", "method": "blacklist", "result": "TROJ_FRS.VSNTCH20" }, "TrendMicro-HouseCall": { "category": "malicious", "engine_name": "TrendMicro-HouseCall", "engine_update": "20200323", "engine_version": "10.0.0.1040", "method": "blacklist", "result": "TROJ_FRS.VSNTCH20" }, "Trustlook": { "category": "type-unsupported", "engine_name": "Trustlook", "engine_update": "20200323", "engine_version": "1.0", "method": "blacklist", "result": null }, "VBA32": { "category": "undetected", "engine_name": "VBA32", "engine_update": "20200320", "engine_version": "4.3.0", "method": "blacklist", 
"result": null }, "VIPRE": { "category": "undetected", "engine_name": "VIPRE", "engine_update": "20200323", "engine_version": "82430", "method": "blacklist", "result": null }, "ViRobot": { "category": "undetected", "engine_name": "ViRobot", "engine_update": "20200323", "engine_version": "2014.3.20.0", "method": "blacklist", "result": null }, "Webroot": { "category": "type-unsupported", "engine_name": "Webroot", "engine_update": "20200323", "engine_version": "1.0.0.403", "method": "blacklist", "result": null }, "Yandex": { "category": "malicious", "engine_name": "Yandex", "engine_update": "20200320", "engine_version": "5.5.2.24", "method": "blacklist", "result": "Trojan.ARicher.bSxJ5m" }, "Zillya": { "category": "undetected", "engine_name": "Zillya", "engine_update": "20200320", "engine_version": "2.0.0.4051", "method": "blacklist", "result": null }, "ZoneAlarm": { "category": "malicious", "engine_name": "ZoneAlarm", "engine_update": "20200323", "engine_version": "1.0", "method": "blacklist", "result": "HEUR:Exploit.RTF.CVE-2018-0802.gen" }, "Zoner": { "category": "malicious", "engine_name": "Zoner", "engine_update": "20200323", "engine_version": "1.0.0.1", "method": "blacklist", "result": "Probably RTFObfuscationD" }, "eGambit": { "category": "type-unsupported", "engine_name": "eGambit", "engine_update": "20200323", "engine_version": null, "method": "blacklist", "result": null } }, "last_analysis_stats": { "confirmed-timeout": 0, "failure": 0, "harmless": 0, "malicious": 35, "suspicious": 0, "timeout": 0, "type-unsupported": 15, "undetected": 24 }, "last_modification_date": 1584939782, "last_submission_date": 1584418873, "magic": "Rich Text Format data, version 1, unknown character set", "md5": "5e31d16d6bf35ea117d6d2c4d42ea879", "meaningful_name": "President discusses budget savings due to coronavirus with Finance Minister.rtf", "names": [ "President discusses budget savings due to coronavirus with Finance Minister.rtf" ], "reputation": 0, "rtf_info": { 
"document_properties": { "custom_xml_data_properties": 1, "default_ansi_codepage": "Simplified Chinese", "default_character_set": "ANSI", "default_languages": [ "English - United States", "Arabic - Saudi Arabia", "Chinese - People's Republic of China" ], "dos_stubs": 0, "embedded_drawings": 0, "embedded_pictures": 0, "longest_hex_string": 508408, "non_ascii_characters": 0, "objects": [ { "class": null, "type": "OLE embedded" }, { "class": null, "type": "OLE control" } ], "read_only_protection": false, "rtf_header": "rtf1", "user_protection": false }, "summary_info": { "author": "Tatyana", "creation_time": "2020-03-16 22:31:00", "editing_time": 1, "number_of_characters": 1896, "number_of_non_whitespace_characters": 2224, "number_of_pages": 1, "number_of_words": 332, "operator": "apcach E", "revision_time": "2020-03-16 22:31:00", "version": 2, "version_number": 57433 } }, "sha1": "f8fb81d0a0acf5815190e1c85d937e49bc1dfec7", "sha256": "1527f7b9bdea7752f72ffcd8b0a97e9f05092fed2cb9909a463e5775e12bd2d6", "size": 574379, "ssdeep": "12288:v2D2vD2k+tSycQFfJyLhWr95EWV9kFyTDDpRosvcVdwA0:OD2vD2k+tcQFfyhWr95EFF+LosvOwF", "tags": [ "ole-embedded", "rtf", "cve-2017-11882", "cve-2012-0158", "ole-control", "exploit", "cve-2018-0802", "cve-2018-0798" ], "times_submitted": 1, "total_votes": { "harmless": 0, "malicious": 0 }, "trid": [ { "file_type": "file seems to be plain text/ASCII", "probability": 0.0 } ], "type_description": "Rich Text Format", "type_tag": "rtf", "unique_sources": 1, "vhash": "8596f9f7a194270fb5b3a2677abd4de52" }, "id": "1527f7b9bdea7752f72ffcd8b0a97e9f05092fed2cb9909a463e5775e12bd2d6", "links": { "self": "https://www.virustotal.com/api/v3/files/1527f7b9bdea7752f72ffcd8b0a97e9f05092fed2cb9909a463e5775e12bd2d6" }, "type": "file" } }