# ban.py

# ban - backup analyzer
# Given two files in which every line has the format
#   <checksum>  <file path>
# The two spaces between the checksum and the file path are inserted by sha256sum as a marker that the file was read in text mode.
# Will output:
#  1. Checksums and their file paths that are present in the first file but not in the second one
#  2. Bash commands to copy the changed files to a different location

# Glossary
# Snapshot - a copy of data. When you copy your data to a backup device, the resulting files are called a snapshot.
# Snapshot hashfile - a file generated by running sha256sum on every file in a snapshot.
#   Contains a hash and a filepath in every line.
# Snapshot entry - a representation of a file from a snapshot.
#   The snapshot entry has a hash and a path inside of the snapshot.
# Snapshot diff - a list of snapshot entries that exist in one snapshot,
#   but are missing from another one.
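# Missing snapshot entry - a snapshot entry whose file, which existed at a path and had a hash, now either:
#   - has the same hash but a different path (was moved)
#   - has the same path but a different hash (was modified)
#   Note: this tool compares snapshots by hash only, so a moved file whose hash still
#   appears in the other snapshot is not reported as missing.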
# Hash index - a dictionary representation, where key is the hash and values are the paths


import sys
import os
import logging
import argparse
import os.path


class SpaceNotFound(Exception):
    """Error type the main script catches around its read_entries() calls.

    The main script converts it into a RuntimeError that reports a bad line
    in the corresponding hashfile.
    """


class Entry:
    """A snapshot entry read from an input file: a hash plus a file path."""

    def __init__(self, sha, path):
        # Both values are stored untouched as plain attributes.
        self.sha, self.path = sha, path

    def get_sha(self):
        """Return the hash this entry was created with."""
        return self.sha

    def get_path(self):
        """Return the file path this entry was created with."""
        return self.path

    def __repr__(self):
        # Hash and path separated by a single space.
        return '{} {}'.format(self.sha, self.path)


def is_valid_hash(word):
    """Tell whether word has the shape of a SHA-256 hex digest.

    A digest is accepted only when it is exactly 64 characters long and every
    character is a hexadecimal digit (upper or lower case). Checking the
    length alone would let any 64-character token through as a "hash".

    Parameters:
        word: the candidate hash string taken from a hashfile line.

    Returns:
        True when word is 64 hexadecimal characters, False otherwise.
    """
    hex_digits = '0123456789abcdefABCDEF'
    return len(word) == 64 and all(ch in hex_digits for ch in word)


def parse_hash_and_path(line):
    """Split one hashfile line into its hash and its file path.

    The separator is looked for at the FIRST space of the line, because the
    hash itself never contains spaces. Searching the whole line for two
    spaces would split a single-space-separated line in the middle of a
    path that happens to contain two consecutive spaces.

    Parameters:
        line: one line of a hashfile, as read (a trailing newline is kept).

    Returns:
        A (hash, path) tuple. The path is everything after the separator,
        including any trailing newline. When the first space is followed by
        a second one, both are treated as the separator; otherwise an error
        is logged and the single space is used as the separator.
        (None, None) is returned, after logging two errors, when the line
        contains no space at all.
    """
    file_hash, separator, remainder = line.partition(' ')

    if separator and remainder.startswith(' '):
        # Normal sha256sum layout: drop the second space of the separator.
        return file_hash, remainder[1:]

    logging.error(f'Input line does not contain a normal two-space separator: "{line}"')

    if separator:
        # Fallback: a single space separates the hash from the path.
        return file_hash, remainder

    logging.error(f'Input line also does not contain a single-space separator: "{line}"')
    return None, None


def read_entries(stream):
    """Collect the snapshot entries found in a file handle stream.

    Lines are fetched with stream.readline() until it returns an empty
    value. A line is skipped, with an error logged, when no hash or no path
    could be parsed from it, or when the parsed hash is rejected by
    is_valid_hash(). Returns the list of Entry objects in reading order.
    """
    collected = []

    line = stream.readline()
    while line:
        file_hash, file_path = parse_hash_and_path(line)

        if not (file_hash and file_path):
            logging.error('continuing to next line...')
        elif not is_valid_hash(file_hash):
            logging.error(f'found a weird line without hash: "{line}"')
        else:
            # readline() keeps the line terminator; drop a trailing \n from the path.
            clean_path = file_path[:-1] if file_path[-1] == '\n' else file_path
            collected.append(Entry(file_hash, clean_path))

        line = stream.readline()

    return collected


def list_to_dict(entries):
    """Build a hash index from a list of entries.

    Returns a dictionary with the hash as the key and, as the value, the list
    of paths sharing that hash (in the order the entries were given).
    """
    index = {}
    for item in entries:
        # setdefault creates the path list the first time a hash is seen.
        index.setdefault(item.get_sha(), []).append(item.get_path())
    return index


def get_early_missing_from_late(early, late):
    """Collect the early entries whose hash does not occur in late.

    Both arguments are hash indexes (hash -> list of paths). For every hash of
    early that is not a key of late, one Entry per path is produced. Returns
    the list of those entries.
    """
    return [
        Entry(sha, path)
        for sha in early
        if sha not in late
        for path in early[sha]
    ]


def keep_path(path, paths_to_skip):
    """Return False when path starts with any of paths_to_skip, True otherwise."""
    return not any(path.startswith(prefix) for prefix in paths_to_skip)

def apple_double(path):
    """Tell whether the file name of path starts with '._' or is '.DS_Store'."""
    file_name = os.path.basename(path)
    return file_name.startswith('._') or file_name == ".DS_Store"


def filter_entries(entries, paths_to_skip):
    """Return the entries whose path is accepted by keep_path() for paths_to_skip."""
    kept = []
    for entry in entries:
        if keep_path(entry.get_path(), paths_to_skip):
            kept.append(entry)
    return kept

def filter_out_apple_doubles(entries):
    """Return the entries whose path is not flagged by apple_double().

    The AppleDouble files are the dot underscore ._ files created for each
    file by MacOS. They mess up the whole diffing mechanism.
    """
    remaining = []
    for entry in entries:
        if apple_double(entry.get_path()):
            continue
        remaining.append(entry)
    return remaining


def bash_print_missing(entry, target_path):
    '''Build a ready for execution bash command to copy a missing file.

    Despite the name, nothing is printed: the command is returned as a string.

    Parameters:
        entry: an object whose get_path() returns the path of the file to copy.
        target_path: the folder under which the directory of the file is
            recreated.

    Returns:
        A string of the form "mkdir -p <folder> && cp -a <source> <target>".
        Every path is shell-quoted, so spaces and shell metacharacters in
        file names cannot break or alter the command. Paths made only of
        safe characters are left unquoted.
    '''
    # Local import: shlex is not among the imports at the top of the file.
    import shlex

    source_path = entry.get_path()

    # os.path.join() discards everything before an absolute component, which
    # would silently drop target_path and make the file be copied onto itself.
    # Stripping the leading separators keeps the directory below target_path.
    relative_dir = os.path.dirname(source_path).lstrip(os.sep)

    target_folder = os.path.join(target_path, relative_dir)
    target_file = os.path.join(target_folder, os.path.basename(source_path))

    return (f'mkdir -p {shlex.quote(target_folder)} && '
            f'cp -a {shlex.quote(source_path)} {shlex.quote(target_file)}')


def get_parsed_arguments():
    """Define the command line interface and parse the process arguments.

    Options:
        -e / --early-hashes  required, opened for reading
        -l / --late-hashes   required, opened for reading
        -s / --skip          repeatable, collected into a list (default: [])

    Returns the namespace produced by argparse.
    """
    cli = argparse.ArgumentParser(
        prog='Snapshot hashfile analyzer',
        description='Compares the two hashfiles of snapshots, prints missing entries',
        epilog='by Oleg Krasnianskyi',
    )

    # The two hashfile options differ only in their flags and help text.
    hashfile_options = (
        (('-e', '--early-hashes'), 'hash file for the first snapshot'),
        (('-l', '--late-hashes'), 'hash file for the late snapshot'),
    )
    for flags, help_text in hashfile_options:
        cli.add_argument(*flags,
                         type=argparse.FileType('r'),
                         required=True,
                         help=help_text)

    cli.add_argument(
        '-s', '--skip',
        action='append',
        default=[],
        help='paths from early snapshot to ingnore when comparing with late snapshot, can be specified multiple times',
    )

    return cli.parse_args()


if __name__ == '__main__':
    # Script entry point: read both hashfiles, report the entry counts, then
    # list the early entries whose hash is absent from the late snapshot after
    # dropping skipped paths and AppleDouble files.
    cli_args = get_parsed_arguments()

    try:
        early_list = read_entries(cli_args.early_hashes)
    except SpaceNotFound as err:
        raise RuntimeError(f'Early hashes file has a bad line: {str(err)}')

    try:
        late_list = read_entries(cli_args.late_hashes)
    except SpaceNotFound as err:
        raise RuntimeError(f'Late hashes file has a bad line: {str(err)}')

    early_total = len(early_list)
    late_total = len(late_list)
    print(f'Number of early entries: {early_total} {cli_args.early_hashes.name}')
    print(f'Number of late entries: {late_total} {cli_args.late_hashes.name}')
    print(f'Difference in hashes number: {late_total - early_total}')

    # Hash indexes: hash -> list of paths having that hash.
    early_index = list_to_dict(early_list)
    late_index = list_to_dict(late_list)

    # Early entries whose hash does not occur in the late index.
    missing = get_early_missing_from_late(early_index, late_index)
    print(f'Hashes present in early but missing from late: {len(missing)}')

    # Drop the entries located under any of the --skip paths.
    after_skip = filter_entries(missing, cli_args.skip)
    print(f'Number of Hashes that were filtered out: {len(missing) - len(after_skip)}')

    # Drop the AppleDouble files.
    actually_missing = filter_out_apple_doubles(after_skip)
    print(f'Number of apple doubles: {len(after_skip) - len(actually_missing)}')

    print(f'Number of actually missing entries: {len(actually_missing)}')

    # Alternative outputs, kept for reference:
    #for e in after_skip: print(bash_print_missing(e, '/node/save/'))
    #for e in actually_missing: print(bash_print_missing(e, ''))
    for missing_entry in actually_missing:
        print(missing_entry)


def findDupeHashes(dic):
    """Print the paths that share a hash inside one hash index.

    For every hash of dic that has more than one path, each path is printed
    on its own line prefixed with "+ ", followed by an empty line.
    """
    for paths in dic.values():
        if len(paths) <= 1:
            # A hash with a single path is not a duplicate.
            continue
        for path in paths:
            print("+ " + path)
        print("")