Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

converted ini config to toml #152

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions fact_extractor/config/main.cfg

This file was deleted.

43 changes: 43 additions & 0 deletions fact_extractor/config/main.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
[unpack]
blacklist = [
"application/json",
"application/pdf",
"application/x-java-applet",
"application/x-object",
"application/x-sharedlib",
"application/x-shockwave-flash",
"application/x-terminfo",
"audio/mpeg",
"font/sfnt",
"image/bmp",
"image/gif",
"image/jpeg",
"image/png",
"image/x-tga",
"image/x-win-bitmap",
"inode/symlink",
"text/csv",
"text/plain",
"video/mp4",
"video/mpeg",
"video/ogg",
"video/quicktime",
"video/x-msvideo",
]
data_folder = "/tmp/extractor"
exclude = []

[expert_settings]
statistics = true
unpack_threshold = 0.8
header_overhead = 256
compressed_file_types = [
"application/x-shockwave-flash",
"audio/mpeg",
"audio/ogg",
"image/gif",
"image/jpeg",
"image/png",
"video/mp4",
"video/ogg",
]
11 changes: 6 additions & 5 deletions fact_extractor/docker_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
from pathlib import Path
import sys

from helperFunctions.config import get_config_dir
from helperFunctions.config import load_config
from helperFunctions.file_system import change_owner_of_output_files
from helperFunctions.program_setup import check_ulimits, load_config, setup_logging
from helperFunctions.program_setup import check_ulimits, setup_logging
from unpacker.unpack import unpack


Expand All @@ -41,17 +41,18 @@ def _parse_args():


def main(args):
config = load_config(f'{get_config_dir()}/main.cfg')
config = load_config()
setup_logging(debug=False)
check_ulimits()

input_dir = Path(config.get('unpack', 'data_folder'), 'input')
data_folder = Path(config.unpack.data_folder)
input_dir = data_folder / 'input'
input_file = list(input_dir.iterdir())[0]

unpack(input_file, config, args.extract_everything)

if args.chown:
output_dir = Path(config.get('unpack', 'data_folder'), 'files')
output_dir = data_folder / 'files'
return change_owner_of_output_files(output_dir, args.chown)

return 0
Expand Down
5 changes: 3 additions & 2 deletions fact_extractor/fact_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
import sys
from pathlib import Path

from helperFunctions.program_setup import setup_argparser, setup_logging, load_config
from helperFunctions.config import load_config
from helperFunctions.program_setup import setup_argparser, setup_logging
from unpacker.unpack import unpack


Expand All @@ -30,7 +31,7 @@ def main():
setup_logging(arguments.debug, log_file=arguments.log_file, log_level=arguments.log_level)

# Make sure the report folder exists so meta.json can be written
report_folder = Path(config.get('unpack', 'data_folder'), 'reports')
report_folder = Path(config.unpack.data_folder, 'reports')
report_folder.mkdir(parents=True, exist_ok=True)
unpack(arguments.FILE_PATH, config)

Expand Down
60 changes: 30 additions & 30 deletions fact_extractor/helperFunctions/config.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,41 @@
import os
from configparser import ConfigParser, NoOptionError, NoSectionError
from pathlib import Path
from typing import List

import toml
from pydantic import BaseModel, Field

from helperFunctions.file_system import get_src_dir


def load_config(config_file_name):
    '''
    Load the INI config file ``CONFIG_DIR/config_file_name``.

    :param config_file_name: file name relative to the config directory
    :return: a populated :class:`ConfigParser` instance
    :raises RuntimeError: if the file does not exist
    '''
    config = ConfigParser()
    config_path = '{}/{}'.format(get_config_dir(), config_file_name)
    if os.path.exists(config_path):
        config.read(config_path)
        return config
    # only reached when the file is missing from the config directory
    raise RuntimeError('Cannot load config')
class ExtractorUnpackConfig(BaseModel):
    """Settings for the ``[unpack]`` section of the TOML config."""

    # MIME types that are skipped during unpacking
    blacklist: List[str] = Field(default_factory=list)
    # base directory holding the input files and extraction output
    data_folder: str = "/tmp/extractor"
    # entries excluded from unpacking — presumably file/path patterns;
    # TODO(review): confirm the expected format against the unpacker
    exclude: List[str] = Field(default_factory=list)


def get_config_dir():
    """Return the absolute path of the config directory as a string."""
    src_dir = get_src_dir()
    return f'{src_dir}/config'
class ExpertSettings(BaseModel):
    """Settings for the ``[expert_settings]`` section of the TOML config."""

    # toggles statistics collection — consumed elsewhere; TODO(review): confirm
    statistics: bool = True
    # entropy threshold above which data counts as compressed
    unpack_threshold: float = 0.8
    # byte count assumed lost to container/header overhead when
    # comparing input size with the size of the extracted files
    header_overhead: int = 256
    # MIME types treated as already-compressed data
    compressed_file_types: List[str] = Field(default_factory=list)


class FactExtractorConfig(BaseModel):
    """Top-level fact_extractor configuration parsed from ``main.toml``."""

    # corresponds to the [unpack] section
    unpack: ExtractorUnpackConfig
    # corresponds to the [expert_settings] section
    expert_settings: ExpertSettings

def read_list_from_config(config_file: ConfigParser, section: str, key: str, default=None):
if default is None:
default = []

if not config_file:
return default
def load_config(config_file_name: str = 'main.toml') -> FactExtractorConfig:
    '''
    Load and validate ``CONFIG_DIR/config_file_name`` (TOML format).

    :param config_file_name: file name relative to the config directory
    :return: the parsed configuration model
    :raises RuntimeError: if the file does not exist
    '''
    config_path = get_config_dir() / config_file_name
    if not config_path.is_file():
        raise RuntimeError('Cannot load config')
    raw_config = toml.loads(config_path.read_text())
    return FactExtractorConfig(**raw_config)

try:
config_entry = config_file.get(section, key)
except (NoOptionError, NoSectionError):
return default

if not config_entry:
return default
return [item.strip() for item in config_entry.split(',') if item]
def get_config_dir() -> Path:
    """Return the config directory inside the source tree as a :class:`Path`."""
    return Path(get_src_dir(), 'config')
2 changes: 1 addition & 1 deletion fact_extractor/helperFunctions/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def _checkout_github_project(github_path, folder_name):

def load_main_config():
    # NOTE(review): ConfigParser implements INI syntax only — it cannot parse
    # the TOML arrays in main.toml and config.read() will raise a ParsingError.
    # This should load the file with a TOML parser instead — confirm and fix.
    config = configparser.ConfigParser()
    config_path = Path(Path(__file__).parent.parent, 'config', 'main.cfg')
    config_path = Path(Path(__file__).parent.parent, 'config', 'main.toml')
    if not config_path.is_file():
        raise InstallationError(f'Could not load config at path {config_path}')
    config.read(str(config_path))
Expand Down
57 changes: 42 additions & 15 deletions fact_extractor/helperFunctions/program_setup.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import argparse
import configparser
import logging
import resource

Expand All @@ -10,19 +9,53 @@


def setup_argparser(name, description, command_line_options, version=__VERSION__):
    """Parse the command line options of an extraction program.

    :param name: program name used in the help and ``--version`` strings
    :param description: short program description for the help text
    :param command_line_options: the raw argument vector (argv); element 0 is skipped
    :param version: version string shown by ``--version``
    :return: the parsed argument namespace
    """
    parser = argparse.ArgumentParser(description=f'{name} - {description}')
    parser.add_argument(
        '-V',
        '--version',
        action='version',
        version=f'{name} {version}',
    )
    parser.add_argument(
        '-l',
        '--log_file',
        help='path to log file',
        default=None,
    )
    parser.add_argument(
        '-L',
        '--log_level',
        help='define the log level',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'],
        default=None,
    )
    parser.add_argument(
        '-d',
        '--debug',
        action='store_true',
        default=False,
        help='print debug messages',
    )
    parser.add_argument(
        '-C',
        '--config_file',
        help='set path to config File',
        # main.cfg was deleted in the INI->TOML migration; the default must
        # point at the config file that actually exists now
        default=f'{get_config_dir()}/main.toml',
    )
    parser.add_argument(
        'FILE_PATH',
        type=str,
        help='Path to file that should be extracted',
    )
    return parser.parse_args(command_line_options[1:])


def setup_logging(debug, log_file=None, log_level=None):
log_level = log_level if log_level else logging.WARNING
log_format = logging.Formatter(fmt='[%(asctime)s][%(module)s][%(levelname)s]: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
log_format = logging.Formatter(
fmt='[%(asctime)s][%(module)s][%(levelname)s]: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S',
)
logger = logging.getLogger('')
logger.setLevel(logging.DEBUG)

Expand All @@ -48,9 +81,3 @@ def check_ulimits():
logging.info(f'The number of openable files has been raised from {soft} to {min(1024, hard)}.')
elif soft == resource.RLIM_INFINITY or soft > 100000:
logging.warning('Warning: A very high (or no) nofile limit will slow down fakeroot and cause other problems.')


def load_config(config_file):
config = configparser.ConfigParser()
config.read(config_file)
return config
40 changes: 26 additions & 14 deletions fact_extractor/helperFunctions/statistics.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,19 @@
from configparser import ConfigParser
from __future__ import annotations

from contextlib import suppress
from pathlib import Path
from typing import Dict, List
from typing import TYPE_CHECKING

from common_helper_files import safe_rglob
from common_helper_unpacking_classifier import (
avg_entropy, get_binary_size_without_padding, is_compressed
)
from common_helper_unpacking_classifier import avg_entropy, get_binary_size_without_padding, is_compressed
from fact_helper_file import get_file_type_from_path
from helperFunctions.config import read_list_from_config

from helperFunctions.config import FactExtractorConfig

if TYPE_CHECKING:
from pathlib import Path


def add_unpack_statistics(extraction_dir: Path, meta_data: Dict):
def add_unpack_statistics(extraction_dir: Path, meta_data: dict):
unpacked_files, unpacked_directories = 0, 0
for extracted_item in safe_rglob(extraction_dir):
if extracted_item.is_file():
Expand All @@ -23,21 +25,31 @@ def add_unpack_statistics(extraction_dir: Path, meta_data: Dict):
meta_data['number_of_unpacked_directories'] = unpacked_directories


def get_unpack_status(
    file_path: str, binary: bytes, extracted_files: list[Path], meta_data: dict, config: FactExtractorConfig
):
    """Record the unpack result in ``meta_data['summary']`` and ``meta_data['entropy']``.

    When nothing was extracted (and nothing was excluded), the file counts as
    'unpacked' if its MIME type is a configured compressed type or its entropy
    is below the configured threshold, otherwise as 'packed'. When files were
    extracted, a data-loss check is performed instead.

    :param file_path: path of the original file (used for MIME detection)
    :param binary: raw content of the original file
    :param extracted_files: files produced by the unpacker
    :param meta_data: result dict that is updated in place
    :param config: the loaded fact_extractor configuration
    """
    # Note: this span previously contained stale ConfigParser-based lines
    # (read_list_from_config/getfloat) duplicating the logic below; they were
    # superseded by the typed config access and have been removed.
    meta_data['summary'] = []
    meta_data['entropy'] = avg_entropy(binary)

    if not extracted_files and meta_data.get('number_of_excluded_files', 0) == 0:
        mime = get_file_type_from_path(file_path)['mime']
        if mime in config.expert_settings.compressed_file_types or not _is_compressed(binary, config):
            meta_data['summary'] = ['unpacked']
        else:
            meta_data['summary'] = ['packed']
    else:
        _detect_unpack_loss(binary, extracted_files, meta_data, config.expert_settings.header_overhead)


def _is_compressed(binary, config: FactExtractorConfig):
    """Check whether *binary* exceeds the configured compression entropy threshold."""
    threshold = config.expert_settings.unpack_threshold
    return is_compressed(binary, compress_entropy_threshold=threshold, classifier=avg_entropy)


def _detect_unpack_loss(binary: bytes, extracted_files: List[Path], meta_data: Dict, header_overhead: int):
def _detect_unpack_loss(binary: bytes, extracted_files: list[Path], meta_data: dict, header_overhead: int):
decoding_overhead = 1 - meta_data.get('encoding_overhead', 0)
cleaned_size = get_binary_size_without_padding(binary) * decoding_overhead - header_overhead
size_of_extracted_files = _total_size_of_extracted_files(extracted_files)
Expand All @@ -46,7 +58,7 @@ def _detect_unpack_loss(binary: bytes, extracted_files: List[Path], meta_data: D
meta_data['summary'] = ['data lost'] if cleaned_size > size_of_extracted_files else ['no data lost']


def _total_size_of_extracted_files(extracted_files: List[Path]) -> int:
def _total_size_of_extracted_files(extracted_files: list[Path]) -> int:
total_size = 0
for item in extracted_files:
with suppress(OSError):
Expand Down
8 changes: 4 additions & 4 deletions fact_extractor/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
from flask import Flask
from flask_restful import Api, Resource

from helperFunctions.config import load_config
from helperFunctions.config import load_config, FactExtractorConfig
from helperFunctions.file_system import change_owner_of_output_files
from helperFunctions.program_setup import setup_logging
from unpacker.unpack import unpack

app = Flask(__name__)
api = Api(app)
config = load_config('main.cfg')
setup_logging(False, log_level=int(os.getenv('LOG_LEVEL', logging.WARNING))) # pylint: disable=invalid-envvar-default
config: FactExtractorConfig = load_config('main.toml')
setup_logging(False, log_level=int(os.getenv('LOG_LEVEL', logging.WARNING)))


@api.resource('/start/<folder>', methods=['GET'])
Expand All @@ -23,7 +23,7 @@ def __init__(self):
self.owner = os.getenv('CHMOD_OWNER', None)

def get(self, folder):
input_dir = Path(config.get('unpack', 'data_folder'), folder, 'input')
input_dir = Path(config.unpack.data_folder, folder, 'input')
try:
input_file = list(input_dir.iterdir())[0]
unpack(file_path=str(input_file), config=config, folder=folder)
Expand Down
26 changes: 24 additions & 2 deletions fact_extractor/test/data/helperFunctions/test.cfg
Original file line number Diff line number Diff line change
@@ -1,2 +1,24 @@
[test]
test = test_config
[unpack]
blacklist = [
"image/bmp",
"image/gif",
]
data_folder = "/tmp/extractor"
exclude = [
"foobar",
]

[expert_settings]
statistics = false
unpack_threshold = 0.8
header_overhead = 256
compressed_file_types = [
"application/x-shockwave-flash",
"audio/mpeg",
"audio/ogg",
"image/gif",
"image/jpeg",
"image/png",
"video/mp4",
"video/ogg",
]
Loading