-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathban.py
238 lines (165 loc) · 7.63 KB
/
ban.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
# ban - backup analyzer
# Given two files with a format
# <checksum>  <file path>
# The two spaces between the checksum and the file path are inserted by sha256sum as a marker that the file was read in a text mode.
# Will output:
# 1. Checksums and their file paths that are present in the first file but not in the second one
# 2. Bash commands to copy the changed files to a different location
# Glossary
# Snapshot - a copy of data. When you copy your data to a backup device, the resulting files are called a snapshot.
# Snapshot hashfile - a file generated by running sha256sum on every file in a snapshot.
# Contains a hash and a filepath in every line.
# Snapshot entry - a representation of a file from a snapshot.
# The snapshot entry has a hash and a path inside of the snapshot.
# Snapshot diff - a list of snapshot entries that exist in one snapshot,
# but are missing from another one.
# Missing snapshot entry - means that the file that existed at a path and had a hash now either:
# - has the same hash but a different path (was moved)
# - has the same path but a different hash (was modified)
# Hash index - a dictionary representation, where key is the hash and values are the paths
import argparse
import logging
import os
import os.path
import shlex
import sys
class SpaceNotFound(Exception):
    """Error the main script catches while reading a hashfile.

    The main script reports it as a bad line in the corresponding hashfile.
    """
class Entry:
    """A single record of an input hashfile: a hash and the path it belongs to."""

    def __init__(self, sha, path):
        # Both values are stored exactly as given; no validation happens here.
        self.sha, self.path = sha, path

    def __repr__(self):
        # Rendered as "<sha> <path>".
        return '{} {}'.format(self.sha, self.path)

    def get_sha(self):
        """Return the hash stored in this entry."""
        return self.sha

    def get_path(self):
        """Return the file path stored in this entry."""
        return self.path
# Characters allowed in a hexadecimal digest (both letter cases accepted).
_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def is_valid_hash(word):
    """Return True when word looks like a SHA-256 hex digest.

    A valid hash is exactly 64 characters long and consists only of
    hexadecimal digits. Checking the characters as well as the length keeps
    arbitrary 64-character words from being accepted as hashes.
    """
    return len(word) == 64 and all(ch in _HEX_DIGITS for ch in word)
def parse_hash_and_path(line):
    """Split a hashfile line into its hash and its path.

    The normal separator is two spaces. When a line has no two-space
    separator an error is logged and a single space is tried instead.

    Returns a (hash, path) tuple, or (None, None) when the line contains no
    space at all. The path keeps whatever follows the separator, including
    a trailing newline if the line has one.
    """
    # Look for the real two-space separator. Searching for a single space
    # here would make the "+2" below swallow the first character of the path
    # on single-space lines and would leave the fallback unreachable.
    two_spaces = line.find('  ')
    if two_spaces != -1:
        # +2 because we want the first char after the two spaces
        return line[:two_spaces], line[two_spaces + 2:]
    logging.error(f'Input line does not contain a normal two-space separator: "{line}"')

    # Fallback: accept a single space as the separator.
    hash_part, space, path_part = line.partition(' ')
    if space:
        return hash_part, path_part
    logging.error(f'Input line also does not contain a single-space separator: "{line}"')
    return None, None
def read_entries(stream):
    """Read entries from a file handle stream.

    Every line is split into a hash and a path with parse_hash_and_path().
    Lines that cannot be split, that have an empty hash or path, or whose
    hash fails is_valid_hash() are logged and skipped.

    Returns a list of Entry objects in file order.
    """
    entries = []
    for raw_line in stream:
        # Remove the line terminator before parsing, so that a hash followed
        # by an empty path is rejected below instead of producing an entry
        # whose path is the empty string.
        line = raw_line.rstrip('\n')
        file_hash, file_path = parse_hash_and_path(line)
        if not file_hash or not file_path:
            logging.error('continuing to next line...')
            continue
        if not is_valid_hash(file_hash):
            logging.error(f'found a weird line without hash: "{line}"')
            continue
        entries.append(Entry(file_hash, file_path))
    return entries
def list_to_dict(entries):
    """Build a hash index from a list of entries.

    Returns a dictionary with the hash as the key and, as the value, the list
    of paths sharing that hash (in the order the entries were given).
    """
    paths_by_sha = {}
    for item in entries:
        paths_by_sha.setdefault(item.get_sha(), []).append(item.get_path())
    return paths_by_sha
# Iterates over the early hash index and collects the entries whose hash is missing from the late one
def get_early_missing_from_late(early, late):
    """Collect the entries of the early hash index that are absent from late.

    Both arguments map a hash to a list of paths. For every hash of early
    that is not a key of late, one Entry is created per path of that hash.
    """
    return [
        Entry(sha, path)
        for sha, paths in early.items()
        if sha not in late
        for path in paths
    ]
def keep_path(path, paths_to_skip):
    """Return False when path starts with any prefix in paths_to_skip, True otherwise."""
    return not any(path.startswith(prefix) for prefix in paths_to_skip)
def apple_double(path):
    """Tell whether the file name of path is a '._' file or '.DS_Store'.

    Only the last path component is inspected.
    """
    file_name = os.path.basename(path)
    if file_name == ".DS_Store":
        return True
    return file_name.startswith('._')
def filter_entries(entries, paths_to_skip):
    """Return the entries whose path is accepted by keep_path() for paths_to_skip."""
    return list(filter(lambda entry: keep_path(entry.get_path(), paths_to_skip), entries))
# AppleDouble files are the dot-underscore (._) files created for each file by macOS.
# They mess up the whole diffing mechanism.
def filter_out_apple_doubles(entries):
    """Return the entries whose path is not flagged by apple_double()."""
    return list(filter(lambda entry: not apple_double(entry.get_path()), entries))
def bash_print_missing(entry, target_path):
    """Return a ready-for-execution bash command that copies a missing file.

    The command creates the entry's directory underneath target_path and
    then copies the file there with `cp -a`. The command is returned, not
    printed; the caller decides what to do with it.

    Every path is passed through shlex.quote() so that names containing
    spaces or shell metacharacters are treated as a single literal argument.
    """
    source_path = entry.get_path()
    target_folder = os.path.join(target_path, os.path.dirname(source_path))
    target_file = os.path.join(target_folder, os.path.basename(source_path))
    return (f'mkdir -p {shlex.quote(target_folder)} && '
            f'cp -a {shlex.quote(source_path)} {shlex.quote(target_file)}')
def get_parsed_arguments():
    """Parse the command line and return the resulting argparse namespace.

    The namespace has three attributes:
      early_hashes - file object opened for reading (required, -e)
      late_hashes  - file object opened for reading (required, -l)
      skip         - list of path prefixes given with -s, empty by default

    argparse opens both hashfiles itself; closing them is up to the caller.
    """
    parser = argparse.ArgumentParser(
        prog='Snapshot hashfile analyzer',
        description='Compares the two hashfiles of snapshots, prints missing entries',
        epilog='by Oleg Krasnianskyi')

    parser.add_argument('-e', '--early-hashes',
                        type=argparse.FileType('r'),
                        required=True,
                        help='hash file for the first snapshot')

    parser.add_argument('-l', '--late-hashes',
                        type=argparse.FileType('r'),
                        required=True,
                        help='hash file for the late snapshot')

    # action='append' collects every occurrence of -s into one list.
    parser.add_argument('-s', '--skip',
                        action='append',
                        default=[],
                        help='paths from early snapshot to ignore when comparing with late snapshot, can be specified multiple times')

    return parser.parse_args()
if __name__ == '__main__':
    # Script entry point: compare two snapshot hashfiles and print the entries
    # of the early one whose hash does not occur in the late one.
    args = get_parsed_arguments()

    # Both hashfiles were opened by argparse. The with-statement closes them
    # once they have been read; their .name is still used for the report below.
    with args.early_hashes, args.late_hashes:
        try:
            early_entries = read_entries(args.early_hashes)
        except SpaceNotFound as e:
            raise RuntimeError(f'Early hashes file has a bad line: {str(e)}') from e

        try:
            late_entries = read_entries(args.late_hashes)
        except SpaceNotFound as e:
            raise RuntimeError(f'Late hashes file has a bad line: {str(e)}') from e

    early_entries_count = len(early_entries)
    late_entries_count = len(late_entries)

    print(f'Number of early entries: {early_entries_count} {args.early_hashes.name}')
    print(f'Number of late entries: {late_entries_count} {args.late_hashes.name}')
    print(f'Difference in hashes number: {late_entries_count - early_entries_count}')

    # Create a hash index (hash -> list of paths) for each snapshot
    early_hash_map = list_to_dict(early_entries)
    late_hash_map = list_to_dict(late_entries)

    # Find entries that are present in the early index, but are missing in the late one:
    missing_entries = get_early_missing_from_late(early_hash_map, late_hash_map)
    print(f'Hashes present in early but missing from late: {len(missing_entries)}')

    # Drop entries under the path prefixes given with -s/--skip
    filtered_entries = filter_entries(missing_entries, args.skip)
    print(f'Number of Hashes that were filtered out: {len(missing_entries) - len(filtered_entries)}')

    # Drop the '._*' and '.DS_Store' files
    filtered_without_apple_doubles = filter_out_apple_doubles(filtered_entries)
    print(f'Number of apple doubles: {len(filtered_entries) - len(filtered_without_apple_doubles)}')

    print(f'Number of actually missing entries: {len(filtered_without_apple_doubles)}')
    #for e in filtered_entries: print(bash_print_missing(e, '/node/save/'))
    #for e in filtered_without_apple_doubles: print(bash_print_missing(e, ''))
    for entry in filtered_without_apple_doubles:
        print(entry)
# Find hashes that occur under more than one path within a hash index and print those paths
def findDupeHashes(dic):
    """Print the paths of every hash that is listed with more than one path.

    dic maps a hash to a list of paths. Each path of a duplicated hash is
    printed on its own line prefixed with "+ ", and every group of
    duplicates is followed by an empty line.
    """
    for paths in dic.values():
        if len(paths) <= 1:
            continue
        for path in paths:
            print("+ " + path)
        print("")