duplicates.py
# -*- coding: utf-8 -*-
#
# hashtool - hashing of entire trees of files
# Copyright (C) 2015 - Felipe Machado
#
#
# Python future imports
from __future__ import print_function
# Python standard library
import os.path
# Local imports
from utils import loadHashDBFile, formatDataSize, shouldIgnore

###############################################################################
# Possible states of a folder: it holds only duplicate files, only original
# files, or a mix of both.
FOLDER_HAS_DUPLICATES = 0
FOLDER_HAS_ORIGINALS = 1
FOLDER_HAS_DUPLICATES_AND_ORIGINALS = 2

def updateFoldersStates( folders_data, file_path, is_original ):
  """Walks up every parent folder of file_path and records in folders_data
  whether that folder contains duplicate files, original files, or both."""
  current_path = file_path
  while os.path.split( current_path )[1]:
    current_path = os.path.split( current_path )[0]
    if current_path in folders_data:
      # A folder already marked with the opposite kind of file now holds both.
      if ( ( folders_data[ current_path ] == FOLDER_HAS_ORIGINALS
             and not is_original )
           or ( folders_data[ current_path ] == FOLDER_HAS_DUPLICATES
                and is_original ) ):
        folders_data[ current_path ] = FOLDER_HAS_DUPLICATES_AND_ORIGINALS
    else:
      folders_data[ current_path ] = ( FOLDER_HAS_ORIGINALS if is_original
                                       else FOLDER_HAS_DUPLICATES )

###############################################################################
###############################################################################
# Public function #
###################
def findDuplicates( config ):
  """Scans the hash database, reports duplicate files and their total size,
  and classifies folders as holding duplicates, originals, or both."""
  file_db = loadHashDBFile( config.hash_db )
  hashes_map = {}
  count = 0
  acc_size = 0
  duplicate_folders = {}
  for ( file_path, file_metadata ) in sorted( file_db.items(),
                                              key=lambda x: x[0] ):
    # Skip empty files and ignored paths.
    if file_metadata["size"] == 0 or shouldIgnore( file_path, config.prefix ):
      continue
    # Collect the file's hashes, supporting both the legacy single "hash"
    # field (SHA-1) and the newer "hashes" dictionary.
    file_hashes = []
    if "hash" in file_metadata:
      file_hashes = [ "sha1_" + file_metadata["hash"] ]
    elif "hashes" in file_metadata:
      for hash_type in file_metadata[ "hashes" ]:
        file_hashes += [ hash_type + "_" + file_metadata[ "hashes" ][hash_type] ]
    for file_hash in file_hashes:
      if file_hash in hashes_map:
        # A hash seen before means the current file is a duplicate of the
        # first file recorded with that hash.
        if config.print_single_files:
          print( "'%s' is a duplicate of '%s'"
                 % ( file_path, hashes_map[ file_hash ] ) )
        updateFoldersStates( duplicate_folders, file_path, False )
        count += 1
        acc_size += file_metadata["size"]
      else:
        hashes_map[ file_hash ] = file_path
        updateFoldersStates( duplicate_folders, file_path, True )
  print( str( count ) + " duplicates found" )
  print( "Total duplicates size: " + formatDataSize( acc_size ) )
  # Split folders into per-state lists, with paths sorted inside each list.
  folders_lists = [ [], [], [] ]
  sorted_duplicate_folders = \
    sorted( [ ( s, d ) for ( d, s ) in duplicate_folders.items() ] )
  for ( folder_state, folder_path ) in sorted_duplicate_folders:
    folders_lists[ folder_state ].append( folder_path )
  if not config.dont_print_folders_with_duplicates:
    folders_list = folders_lists[ FOLDER_HAS_DUPLICATES ]
    print( "Folders with only duplicate files (%d):" % len( folders_list ) )
    for path in folders_list:
      print( path )
  if config.print_folders_with_originals:
    folders_list = folders_lists[ FOLDER_HAS_ORIGINALS ]
    print( "Folders with only original files (%d):" % len( folders_list ) )
    for path in folders_list:
      print( path )
  if config.print_folders_with_both:
    folders_list = folders_lists[ FOLDER_HAS_DUPLICATES_AND_ORIGINALS ]
    print( "Folders with both duplicate and original files (%d):"
           % len( folders_list ) )
    for path in folders_list:
      print( path )
###############################################################################
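
# Example usage: a minimal sketch only, not part of the original module. The
# project's real command-line entry point is not shown in this file, but
# findDuplicates() only needs an object exposing the attributes it reads
# (hash_db, prefix, print_single_files, dont_print_folders_with_duplicates,
# print_folders_with_originals, print_folders_with_both), so a plain
# argparse.Namespace is enough to exercise it. Argument positions and flag
# values below are hypothetical.
if __name__ == "__main__":
  import argparse
  import sys

  config = argparse.Namespace(
      hash_db=sys.argv[1],  # path to an existing hash DB file (hypothetical)
      prefix=sys.argv[2] if len( sys.argv ) > 2 else "",  # forwarded to shouldIgnore()
      print_single_files=True,
      dont_print_folders_with_duplicates=False,
      print_folders_with_originals=False,
      print_folders_with_both=False )
  findDuplicates( config )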