From c07a187b8e4651b95374afe179534d006438740a Mon Sep 17 00:00:00 2001 From: Josh Feather <142008135+josh-feather@users.noreply.github.com> Date: Wed, 29 Jan 2025 12:17:03 +0000 Subject: [PATCH] Ignore non-JSON output in detect-it-easy output The `diec` binary outputs non-JSON output in JSON mode, causing the parsing to fail. For example: ```text [!] Heuristic scan is disabled. Use --heuristicscan to enable { "detects": [ { "filetype": "PE64", "info": "", "offset": "0", "parentfilepart": "Header", "size": "3488048", "values": [ { "info": "", "name": "Microsoft Linker", "string": "Linker: Microsoft Linker(14.22.27905)", "type": "Linker", "version": "14.22.27905" }, { "info": "C++", "name": "Microsoft Visual C/C++", "string": "Compiler: Microsoft Visual C/C++(19.22.27905)[C++]", "type": "Compiler", "version": "19.22.27905" }, { "info": "", "name": "Visual Studio", "string": "Tool: Visual Studio(2019, v16.2)", "type": "Tool", "version": "2019, v16.2" } ] } ] }``` There is a related issue noted against the DIE project: https://github.com/horsicq/Detect-It-Easy/issues/242 --- .../common/integrations/file_extra_info.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/lib/cuckoo/common/integrations/file_extra_info.py b/lib/cuckoo/common/integrations/file_extra_info.py index 868cf666327..951dd17026d 100644 --- a/lib/cuckoo/common/integrations/file_extra_info.py +++ b/lib/cuckoo/common/integrations/file_extra_info.py @@ -4,13 +4,14 @@ import json import logging import os +import re import shlex import shutil import signal import subprocess # from contextlib import suppress -from typing import DefaultDict, List, Optional, Set, Union +from typing import Any, DefaultDict, List, Optional, Set, Union import pebble @@ -262,27 +263,37 @@ def static_file_info( def detect_it_easy_info(file_path: str): if not path_exists(processing_conf.die.binary): + log.warning("detect-it-easy binary not found at path %s", processing_conf.die.binary) return [] try: - result_json = subprocess.check_output( + die_output = subprocess.check_output( [processing_conf.die.binary, "-j", file_path], stderr=subprocess.STDOUT, universal_newlines=True, ) - if "detects" not in result_json: - return [] + def get_json() -> dict[str, Any]: + """Get the JSON element from the detect it easy output. - if "Invalid signature" in result_json and "{" in result_json: - start = result_json.find("{") - if start != -1: - result_json = result_json[start:] + This is required due to non-JSON output in JSON mode. + https://github.com/horsicq/Detect-It-Easy/issues/242 + """ + matches = re.findall(r"\{.*\}", die_output, re.S) + return json.loads(matches[0]) if matches else {} - strings = [sub["string"] for block in json.loads(result_json).get("detects", []) for sub in block.get("values", [])] + def get_matches() -> list[str]: + """Get the string values from the detect it easy output.""" + return [sub["string"] for block in get_json().get("detects", []) for sub in block.get("values", [])] - if strings: - return strings + return [] if "detects" not in die_output else get_matches() + except subprocess.CalledProcessError as err: + log.error( + "Detect-It-Easy: Failed to execute cmd=`%s`, stdout=`%s`, stderr=`%s`", + shlex.join(err.cmd), + err.stdout, + err.stderr, + ) except json.decoder.JSONDecodeError as e: log.debug("DIE results are not in json format: %s", str(e)) except Exception as e: