From 66c5bb9113cf6c3b7c98c8e9cf9e849ec855a607 Mon Sep 17 00:00:00 2001 From: Roman Babenko Date: Fri, 9 Aug 2024 11:26:02 +0300 Subject: [PATCH] UUID pattern (#577) * UUID pattern added * BM scores fix --- .github/workflows/benchmark.yml | 9 +- cicd/benchmark.txt | 150 ++++++++++++++-------------- credsweeper/rules/config.yaml | 17 ++++ tests/__init__.py | 10 +- tests/data/depth_3.json | 27 +++++ tests/data/doc.json | 27 +++++ tests/data/ml_threshold.json | 27 +++++ tests/data/output.json | 27 +++++ tests/ml_model/test_ml_validator.py | 26 +++++ tests/samples/uuid | 2 + tests/test_main.py | 5 +- 11 files changed, 240 insertions(+), 87 deletions(-) create mode 100644 tests/samples/uuid diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 8b418ecf1..90dfa8f8d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -23,7 +23,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: auxiliary + ref: uuid - name: Markup hashing run: | @@ -74,7 +74,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: auxiliary + ref: uuid - name: Markup hashing run: | @@ -172,7 +172,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: auxiliary + ref: uuid - name: Markup hashing run: | @@ -342,6 +342,7 @@ jobs: exit ${exit_code} # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # + experiment: # the ml train test is placed here to use cached data set needs: [ download_data ] @@ -354,7 +355,7 @@ jobs: uses: actions/checkout@v4 with: repository: babenek/CredData - ref: auxiliary + ref: uuid - name: Markup hashing run: | diff --git a/cicd/benchmark.txt b/cicd/benchmark.txt index 1626253d5..72246106f 100644 --- a/cicd/benchmark.txt +++ b/cicd/benchmark.txt @@ -1,22 +1,22 @@ -DATA: 16978521 interested lines. MARKUP: 61855 items +DATA: 16348035 interested lines. MARKUP: 62567 items FileType FileNumber ValidLines Positives Negatives Templates --------------- ------------ ------------ ----------- ----------- ----------- - 194 28318 64 429 87 + 194 28318 66 427 87 .1 2 641 2 5 .admx 1 26 1 -.adoc 1 158 11 6 1 +.adoc 1 158 13 6 1 .api 2 118 4 -.asciidoc 96 14471 53 348 27 +.asciidoc 96 14471 51 348 27 .axaml 5 286 5 -.backup 1 62 1 1 +.backup 1 62 2 1 .bash 2 2158 2 1 -.bat 4 233 1 13 2 +.bat 4 233 14 2 .bats 15 2804 14 50 9 .bazel 3 424 8 .build 2 40 3 .bundle 4 1512 570 .bzl 3 2503 11 -.c 179 284009 9 942 5 +.c 179 284009 9 943 5 .cc 29 30562 622 1 .cf 3 126 2 1 .cfg 1 385 1 1 @@ -27,13 +27,13 @@ FileType FileNumber ValidLines Positives Negatives Templat .cmd 4 401 2 3 .cnf 8 858 18 45 18 .coffee 1 585 2 -.conf 60 4945 54 69 54 +.conf 60 4945 54 71 53 .config 20 492 16 33 1 -.cpp 15 5688 1 61 +.cpp 15 5688 2 61 .creds 1 10 1 1 .crlf 1 27 1 .crt 2 4979 253 -.cs 268 82410 120 908 99 +.cs 268 82410 158 907 94 .cshtml 5 180 12 .csp 3 379 11 .csproj 1 14 1 @@ -52,44 +52,44 @@ FileType FileNumber ValidLines Positives Negatives Templat .env 10 136 11 3 17 .erb 13 323 27 .erl 4 96 8 -.ex 25 4968 3 105 5 -.example 17 1838 74 38 54 -.exs 24 4842 3 190 4 +.ex 25 4968 5 105 5 +.example 17 1838 75 38 54 +.exs 24 4842 8 190 4 .ext 5 211 1 4 2 -.fsproj 1 75 1 +.fsproj 1 75 1 1 .g4 2 201 2 .gd 1 37 1 .gml 3 3075 26 .gni 3 5017 18 -.go 1079 566327 623 4329 742 +.go 1080 566476 673 4319 741 .golden 5 1168 1 14 29 .gradle 45 3265 4 91 100 .graphql 7 420 13 .graphqls 1 30 1 -.groovy 22 4986 20 215 1 -.h 11 2038 38 +.groovy 22 4986 24 215 1 +.h 9 1958 36 .haml 9 191 16 .hbs 2 54 3 .hs 14 4140 31 72 5 -.html 53 15327 14 115 18 -.idl 2 777 4 +.html 53 15327 22 115 18 +.idl 2 777 1 4 .iml 6 699 36 .in 6 2130 3 78 12 .inc 2 56 2 1 -.ini 11 1437 24 12 18 +.ini 11 1437 25 12 18 .ipynb 1 134 5 .j 1 241 2 2 .j2 30 5530 6 213 10 -.java 621 134132 314 1361 170 -.jenkinsfile 1 58 1 7 +.java 621 134132 359 1360 170 +.jenkinsfile 1 58 2 7 .jinja2 1 64 2 -.js 659 536413 526 2638 336 -.json 860 13670669 624 10946 140 +.js 659 536413 536 2635 330 +.json 850 13046270 1074 10778 140 .jsp 13 3202 1 42 .jsx 7 857 19 .jwt 1 1 2 .key 83 2737 70 14 -.kt 123 20774 51 384 3 +.kt 123 20774 67 384 3 .l 1 982 1 .las 1 6656 46 .lasso 1 230 6 @@ -106,33 +106,33 @@ FileType FileNumber ValidLines Positives Negatives Templat .log 2 199 38 52 .lua 10 1924 37 3 .m 16 13358 11 152 3 -.manifest 3 102 3 +.manifest 3 102 9 3 .markdown 3 139 3 1 .markerb 3 12 3 .marko 1 21 2 -.md 673 149294 658 2362 664 +.md 674 149399 722 2365 662 .mdx 3 549 7 .mjml 1 18 1 -.mjs 22 4424 50 343 +.mjs 22 4424 78 343 .mk 1 5878 16 .ml 1 1856 24 .mlir 2 1596 19 .mod 2 96 4 .moo 1 1404 26 .mqh 1 1023 2 -.msg 1 26644 1 +.msg 1 26644 1 1 .mysql 1 36 2 -.ndjson 2 5006 37 266 2 +.ndjson 2 5006 70 266 2 .nix 4 211 12 .nolint 1 2 1 .odd 1 1281 57 .oracle 1 9 1 .p8 4 64 4 .pan 2 48 4 -.patch 4 109405 27 +.patch 4 109405 4 27 .pbxproj 1 941 1 .pem 48 1169 47 8 -.php 371 75710 130 1769 80 +.php 371 75710 131 1769 80 .pl 16 14727 6 47 .pm 3 744 8 .po 3 2994 15 @@ -143,20 +143,20 @@ FileType FileNumber ValidLines Positives Negatives Templat .ppk 1 45 37 .private 1 15 1 .proj 1 85 3 -.properties 48 1621 51 28 34 -.proto 5 5768 58 -.ps1 16 8509 68 2 +.properties 48 1621 53 28 33 +.proto 5 5768 2 58 +.ps1 16 8509 15 67 2 .ps1xml 1 5022 1 .pug 2 193 2 .purs 1 69 4 .pxd 1 150 5 2 -.py 890 291553 627 3460 744 +.py 890 291553 685 3456 729 .pyi 4 1361 9 .pyp 1 167 1 .pyx 2 1094 21 .r 4 62 6 3 1 .rake 2 51 2 -.rb 861 131867 237 3458 615 +.rb 860 131838 259 3451 612 .re 1 31 1 .red 1 159 1 .release 1 13 4 @@ -169,100 +169,98 @@ FileType FileNumber ValidLines Positives Negatives Templat .rs 31 9855 2 238 11 .rsc 1 691 1 .rsp 16 7101 19 10 28 -.rst 86 33980 61 358 68 +.rst 86 33980 69 358 68 .rules 1 6 2 -.sample 2 25 1 5 4 +.sample 2 25 3 4 4 .sbt 3 570 6 2 -.scala 40 5071 13 102 +.scala 40 5071 22 102 .scss 16 8553 32 1 .secrets 1 11 1 -.sh 143 21525 50 474 30 +.sh 143 21525 51 474 30 .slim 1 153 2 2 -.sln 1 306 2 .smali 1 775 12 -.snap 3 1708 1 34 2 +.snap 3 1708 9 30 2 .spec 2 332 2 .spin 1 565 1 -.sql 27 6606 29 62 4 +.sql 27 6606 126 62 4 .storyboard 20 1802 401 .strings 20 1240 184 .stub 3 84 6 .sublime-keymap 1 3 1 .sum 37 22854 283 .svg 1 638 12 -.swift 6 278 16 .t 9 1767 28 56 14 .td 2 14002 6 .template 19 1633 4 42 11 .test 2 24 25 4 -.testsettings 1 21 5 +.testsettings 1 21 1 5 .tf 21 1377 3 32 2 -.tfstate 4 307 18 11 4 +.tfstate 4 307 22 11 4 .tfvars 1 31 3 3 .tl 2 2161 165 2 .tmpl 5 336 3 9 .token 1 1 3 -.toml 83 2379 54 72 172 +.toml 83 2379 54 73 172 .tpl 1 43 1 .travis 1 34 4 3 1 .ts 583 106730 158 1935 203 .tsx 54 7914 1 124 5 -.ttar 2 6050 3 -.txt 443 78152 1775 14282 50 +.ttar 1 452 1 +.txt 440 78102 1861 14251 50 .utf8 1 77 2 .vsixmanifest 1 36 1 -.vsmdi 1 6 1 +.vsmdi 1 6 2 .vue 50 8736 1 183 1 -.xaml 21 8103 174 +.xaml 21 8103 175 .xcscheme 1 109 6 .xib 11 503 174 .xml 9 689 9 .xsl 1 311 1 -.yaml 137 19004 116 356 44 -.yml 418 36162 460 916 384 +.yaml 137 19004 128 356 44 +.yml 418 36162 515 910 384 .zsh 6 872 12 .zsh-theme 1 97 1 -TOTAL: 10281 16978521 7546 59932 5216 -credsweeper result_cnt : 6585, lost_cnt : 0, true_cnt : 6367, false_cnt : 218 +TOTAL: 10259 16348035 8706 59679 5182 +credsweeper result_cnt : 7664, lost_cnt : 0, true_cnt : 7472, false_cnt : 192 Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1 ------------------------------ ----------- ----------- ----------- ---------- ---- ---- ----- ---- -------- -------- -------- -------- -------- -------- -API 124 3162 185 112 109 3 3344 15 0.000896 0.120968 0.994814 0.973214 0.879032 0.923729 -AWS Client ID 168 13 0 160 160 0 13 8 0.000000 0.047619 0.955801 1.000000 0.952381 0.975610 -AWS Multi 75 12 0 87 75 11 1 0 0.916667 0.000000 0.873563 0.872093 1.000000 0.931677 -AWS S3 Bucket 61 25 0 92 61 25 0 0 1.000000 0.000000 0.709302 0.709302 1.000000 0.829932 -Atlassian Old PAT token 27 212 3 12 3 8 207 24 0.037209 0.888889 0.867769 0.272727 0.111111 0.157895 -Auth 408 2727 77 372 351 21 2783 57 0.007489 0.139706 0.975716 0.943548 0.860294 0.900000 +API 131 3126 185 111 109 2 3309 22 0.000604 0.167939 0.993027 0.981982 0.832061 0.900826 +AWS Client ID 167 18 0 160 160 0 18 7 0.000000 0.041916 0.962162 1.000000 0.958084 0.978593 +AWS Multi 75 14 0 87 75 11 3 0 0.785714 0.000000 0.876404 0.872093 1.000000 0.931677 +AWS S3 Bucket 66 24 0 92 66 24 0 0 1.000000 0.000000 0.733333 0.733333 1.000000 0.846154 +Atlassian Old PAT token 27 208 3 12 3 8 203 24 0.037915 0.888889 0.865546 0.272727 0.111111 0.157895 +Auth 412 2723 76 371 353 18 2781 59 0.006431 0.143204 0.976020 0.951482 0.856796 0.901660 Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194 BASE64 Private Key 7 2 0 7 7 0 2 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333 -Bitbucket Client ID 142 1813 9 46 27 18 1804 115 0.009879 0.809859 0.932281 0.600000 0.190141 0.288770 -Bitbucket Client Secret 230 535 10 44 33 11 534 197 0.020183 0.856522 0.731613 0.750000 0.143478 0.240876 -Certificate 25 459 1 21 20 1 459 5 0.002174 0.200000 0.987629 0.952381 0.800000 0.869565 -Credential 91 155 74 90 87 3 226 4 0.013100 0.043956 0.978125 0.966667 0.956044 0.961326 +Bitbucket Client ID 142 1807 9 46 27 18 1798 115 0.009912 0.809859 0.932074 0.600000 0.190141 0.288770 +Bitbucket Client Secret 230 527 10 44 33 11 526 197 0.020484 0.856522 0.728814 0.750000 0.143478 0.240876 +Certificate 25 460 1 21 20 1 460 5 0.002169 0.200000 0.987654 0.952381 0.800000 0.869565 +Credential 94 154 74 90 90 0 228 4 0.000000 0.042553 0.987578 1.000000 0.957447 0.978261 Docker Swarm Token 2 0 0 2 2 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Dropbox App secret 62 114 0 46 36 9 105 26 0.078947 0.419355 0.801136 0.800000 0.580645 0.672897 Facebook Access Token 0 1 0 0 0 1 0 0.000000 1.000000 Firebase Domain 6 1 0 7 6 1 0 0 1.000000 0.000000 0.857143 0.857143 1.000000 0.923077 Github Old Token 1 0 0 1 1 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 -Gitlab Feed Token 188 460 88 60 47 12 536 141 0.021898 0.750000 0.792120 0.796610 0.250000 0.380567 +Gitlab Feed Token 188 451 87 60 47 12 526 141 0.022305 0.750000 0.789256 0.796610 0.250000 0.380567 Gitlab Incoming Email Token 37 3 0 21 19 2 1 18 0.666667 0.486486 0.500000 0.904762 0.513514 0.655172 Google API Key 12 0 0 12 12 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Google Multi 10 2 0 11 10 1 1 0 0.500000 0.000000 0.916667 0.909091 1.000000 0.952381 Google OAuth Access Token 3 0 0 3 3 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000 Grafana Provisioned API Key 22 1 0 5 5 0 1 17 0.000000 0.772727 0.260870 1.000000 0.227273 0.370370 -JSON Web Token 169 61 0 130 130 0 61 39 0.000000 0.230769 0.830435 1.000000 0.769231 0.869565 +JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432 Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000 Jira 2FA 14 6 0 10 10 0 6 4 0.000000 0.285714 0.800000 1.000000 0.714286 0.833333 -Key 497 8483 464 448 439 9 8938 58 0.001006 0.116700 0.992906 0.979911 0.883300 0.929101 -Nonce 90 47 0 84 83 1 46 7 0.021277 0.077778 0.941606 0.988095 0.922222 0.954023 -Other 0 0 5 0 0 5 0 0.000000 1.000000 +Key 522 8453 464 452 447 5 8912 75 0.000561 0.143678 0.991525 0.988938 0.856322 0.917864 +Nonce 91 47 0 84 83 1 46 8 0.021277 0.087912 0.934783 0.988095 0.912088 0.948571 PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041 -Password 1834 7473 2741 1691 1627 64 10150 207 0.006266 0.112868 0.977507 0.962153 0.887132 0.923121 +Password 1841 7468 2724 1691 1637 54 10138 204 0.005298 0.110809 0.978559 0.968066 0.889191 0.926954 Salt 45 73 2 39 39 0 75 6 0.000000 0.133333 0.950000 1.000000 0.866667 0.928571 -Secret 1362 28492 868 1236 1231 5 29355 131 0.000170 0.096182 0.995573 0.995955 0.903818 0.947652 +Secret 1365 28359 868 1237 1233 4 29223 132 0.000137 0.096703 0.995554 0.996766 0.903297 0.947733 Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000 Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000 -Token 586 3974 438 513 504 9 4403 82 0.002040 0.139932 0.981793 0.982456 0.860068 0.917197 +Token 612 3949 437 516 511 5 4381 101 0.001140 0.165033 0.978792 0.990310 0.834967 0.906028 Twilio API Key 0 5 2 0 0 7 0 0.000000 1.000000 -URL Credentials 198 127 249 190 190 0 376 8 0.000000 0.040404 0.986063 1.000000 0.959596 0.979381 - 7546 59932 5216 6596 6367 218 59714 1179 0.003637 0.156242 0.979297 0.966894 0.843758 0.901139 +URL Credentials 209 127 240 200 200 0 367 9 0.000000 0.043062 0.984375 1.000000 0.956938 0.977995 +UUID 1068 1 0 1058 1057 1 0 11 1.000000 0.010300 0.988775 0.999055 0.989700 0.994356 + 8706 59679 5182 7671 7472 192 59487 1234 0.003217 0.141741 0.979147 0.974948 0.858259 0.912889 diff --git a/credsweeper/rules/config.yaml b/credsweeper/rules/config.yaml index afe66d028..29c266f8e 100644 --- a/credsweeper/rules/config.yaml +++ b/credsweeper/rules/config.yaml @@ -126,6 +126,23 @@ target: - code +- name: UUID + severity: info + confidence: strong + type: pattern + values: + - (?[0-9A-Fa-f]{8}(-[0-9A-Fa-f]{4}){3}-[0-9A-Fa-f]{12})(?![=0-9A-Za-z_+-]) + min_line_len: 36 + required_substrings: + - "-" + required_regex: "[0-9A-Za-z_/+-]{15}" + filter_type: + - ValuePatternCheck + use_ml: false + target: + - code + - doc + - name: AWS Client ID severity: high confidence: moderate diff --git a/tests/__init__.py b/tests/__init__.py index a501eae08..85a275175 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,20 +1,20 @@ from pathlib import Path # total number of files in test samples -SAMPLES_FILES_COUNT: int = 129 +SAMPLES_FILES_COUNT: int = 130 # the lowest value of ML threshold is used to display possible lowest values NEGLIGIBLE_ML_THRESHOLD = 0.0001 # credentials count after scan -SAMPLES_CRED_COUNT: int = 362 -SAMPLES_CRED_LINE_COUNT: int = 379 +SAMPLES_CRED_COUNT: int = 363 +SAMPLES_CRED_LINE_COUNT: int = 380 # credentials count after post-processing -SAMPLES_POST_CRED_COUNT: int = 321 +SAMPLES_POST_CRED_COUNT: int = 322 # with option --doc -SAMPLES_IN_DOC = 415 +SAMPLES_IN_DOC = 416 # archived credentials that are not found without --depth SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 24 diff --git a/tests/data/depth_3.json b/tests/data/depth_3.json index 01cdc64ff..8fd52ab2c 100644 --- a/tests/data/depth_3.json +++ b/tests/data/depth_3.json @@ -10919,6 +10919,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "line_data_list": [ + { + "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp", + "line_num": 1, + "path": "tests/samples/uuid", + "info": "tests/samples/uuid|RAW", + "value": "bace4d19-fa7e-beef-cafe-9129474bcd81", + "value_start": 0, + "value_end": 36, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2373263071270246, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/doc.json b/tests/data/doc.json index 46c8cdb7d..0311797b5 100644 --- a/tests/data/doc.json +++ b/tests/data/doc.json @@ -13064,6 +13064,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "line_data_list": [ + { + "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp", + "line_num": 1, + "path": "tests/samples/uuid", + "info": "tests/samples/uuid|RAW", + "value": "bace4d19-fa7e-beef-cafe-9129474bcd81", + "value_start": 0, + "value_end": 36, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2373263071270246, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/ml_threshold.json b/tests/data/ml_threshold.json index d905cd4b5..01c1a0929 100644 --- a/tests/data/ml_threshold.json +++ b/tests/data/ml_threshold.json @@ -9981,6 +9981,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "line_data_list": [ + { + "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp", + "line_num": 1, + "path": "tests/samples/uuid", + "info": "", + "value": "bace4d19-fa7e-beef-cafe-9129474bcd81", + "value_start": 0, + "value_end": 36, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2373263071270246, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/data/output.json b/tests/data/output.json index 1d75227d5..f6dcde344 100644 --- a/tests/data/output.json +++ b/tests/data/output.json @@ -8901,6 +8901,33 @@ } ] }, + { + "api_validation": "NOT_AVAILABLE", + "ml_validation": "NOT_AVAILABLE", + "ml_probability": null, + "rule": "UUID", + "severity": "info", + "confidence": "strong", + "line_data_list": [ + { + "line": "bace4d19-fa7e-beef-cafe-9129474bcd81 # tp", + "line_num": 1, + "path": "tests/samples/uuid", + "info": "", + "value": "bace4d19-fa7e-beef-cafe-9129474bcd81", + "value_start": 0, + "value_end": 36, + "variable": null, + "variable_start": -2, + "variable_end": -2, + "entropy_validation": { + "iterator": "BASE36_CHARS", + "entropy": 3.2373263071270246, + "valid": true + } + } + ] + }, { "api_validation": "NOT_AVAILABLE", "ml_validation": "NOT_AVAILABLE", diff --git a/tests/ml_model/test_ml_validator.py b/tests/ml_model/test_ml_validator.py index ee7083ae9..3e61d2e1a 100644 --- a/tests/ml_model/test_ml_validator.py +++ b/tests/ml_model/test_ml_validator.py @@ -65,6 +65,32 @@ def validate(_candidate: Candidate) -> Tuple[bool, float]: decision, probability = validate(candidate) self.assertAlmostEqual(0.9980608820915222, probability, delta=NEGLIGIBLE_ML_THRESHOLD) + def test_ml_validator_auxiliary_p(self): + candidate = Candidate.get_dummy_candidate(self.config, "secret", "", "") + candidate.rule_name = "Secret" + candidate.line_data_list[0].line = "secret=bace4d19-dead-beef-cafe-9129474bcd81" + candidate.line_data_list[0].variable = "secret" + candidate.line_data_list[0].value_start = 7 + candidate.line_data_list[0].value_end = 43 + candidate.line_data_list[0].value = "bace4d19-dead-beef-cafe-9129474bcd81" + # auxiliary candidate for a pattern rule - without variable + aux_candidate = copy.deepcopy(candidate) + aux_candidate.line_data_list[0].variable = None + + # todo: the scores are low for current ML model - will be changed after train + + candidate_key = CandidateKey(candidate.line_data_list[0]) + sample_as_batch = [(candidate_key, [candidate])] + is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2) + self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD) + + # auxiliary rule in train does not increase ML probability yet - will be used after next train + + aux_candidate.rule_name = "UUID" + sample_as_batch = [(candidate_key, [candidate, aux_candidate])] + is_cred_batch, probability_batch = self.ml_validator.validate_groups(sample_as_batch, 2) + self.assertAlmostEqual(0.16333681344985962, probability_batch[0], delta=NEGLIGIBLE_ML_THRESHOLD) + def test_extract_features_p(self): candidate1 = Candidate.get_dummy_candidate(self.config, "main.py", ".py", "info") candidate1.line_data_list[0].line = 'ABC123' diff --git a/tests/samples/uuid b/tests/samples/uuid new file mode 100644 index 000000000..0ce05f451 --- /dev/null +++ b/tests/samples/uuid @@ -0,0 +1,2 @@ +bace4d19-fa7e-beef-cafe-9129474bcd81 # tp +12345678-1234-1234-1234-1234567890ab # fp diff --git a/tests/test_main.py b/tests/test_main.py index 6c774c4f9..7e664004b 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -484,7 +484,7 @@ def test_pdf_p(self) -> None: # may be tested with # https://www.dcc.edu/documents/administration/offices/information-technology/password-examples.pdf content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "sample.pdf"]) - cred_sweeper = CredSweeper(depth=33) + cred_sweeper = CredSweeper(depth=7) cred_sweeper.run(content_provider=content_provider) found_credentials = cred_sweeper.credential_manager.get_credentials() self.assertSetEqual({"AWS Client ID", "Password", "Github Classic Token", "Key"}, @@ -786,6 +786,7 @@ def test_param_n(self) -> None: ("pager.rs", b"token: impl AsRef,"), # ("pager.rs", b" let tokens = quote::quote! {"), # ("pager.rs", b" let cert_chain = x509_rx"), # + ("my.kt", b'val password: String? = null'), # ] content_provider: AbstractProvider = FilesProvider([(file_name, io.BytesIO(data_line)) for file_name, data_line in items]) @@ -819,7 +820,7 @@ def test_param_p(self) -> None: ("accept.py", b"password='Ahga%$FiQ@Ei8'", "password", "Ahga%$FiQ@Ei8"), # ("test.template", b" NAMED_API_KEY=qii7t1m6423127xto389xc914l34451qz5135865564sg ", "NAMED_API_KEY", "qii7t1m6423127xto389xc914l34451qz5135865564sg"), # - ("my.kt", b'val password: String? = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"), # + ("my.kt", b'val password: String = "Ahga%$FiQ@Ei8"', "password", "Ahga%$FiQ@Ei8"), # ] for file_name, data_line, variable, value in items: content_provider: AbstractProvider = FilesProvider([