py_clean.py
#!/usr/bin/env python3
import argparse
import os
import sys
import hashlib
import time
#
__author__ = 'Pir00t'
__date__ = 20180828
__version__ = 0.1
__description__ = 'Tool to hash files in specified directory and remove duplicates'
#
'''
Built upon the script outlined here: https://www.pythoncentral.io/finding-duplicate-files-with-python/
'''
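# Example invocation (a minimal sketch; -f/--folder accepts one or more directory
# paths, and the paths below are placeholders):
#   python py_clean.py -f /path/to/dir1 /path/to/dir2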
def find_dup(working_dir):
    # empty dictionary for duplicates: {hash: [list of paths]}
    duplicates = {}
    for topdir, subdirs, files in os.walk(working_dir):
        print('Scanning {} ...'.format(topdir))
        paths = [os.path.join(topdir, fname) for fname in files]
        for path in paths:
            # Calculate hash
            file_hash = hashfile(path)
            # Add or append the file path
            if file_hash in duplicates:
                duplicates[file_hash].append(path)
            else:
                duplicates[file_hash] = [path]
    return duplicates
#
def joinDicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]
#
def hashfile(path, blocksize=65536):
    with open(path, 'rb') as f:
        hasher = hashlib.sha256()
        buf = f.read(blocksize)
        while len(buf) > 0:
            hasher.update(buf)
            buf = f.read(blocksize)
    return hasher.hexdigest()
#
def dup_delete(dict1):
    results = dict1.values()
    with open("log.txt", "w") as logfile:
        for res in results:
            if len(res) > 1:
                print('\n[+] Duplicates found:\n\n{}'.format(" | ".join(res)))
                logfile.write(" | ".join(res) + '\n')
                for r in res:
                    delete = input('\t[!] Delete {} - y/n: '.format(r))
                    if delete == 'y' or delete == 'Y':
                        os.remove(r)
                        print('\t[-] Deleted\n')
                    else:
                        continue
            else:
                print('No duplicate file found: {}'.format("".join(res)))
#
def main():
    print("\nPy_Clean")
    print("\nScript by %s" % __author__)
    print("Current version %s\n" % __version__)
    # Add in argument options
    parser = argparse.ArgumentParser(description="Specify folder(s) to scan")
    parser.add_argument("-f", "--folder", help="Folder(s) to parse", nargs='*')
    # show help and exit if no arguments were supplied
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    if args.folder:
        dirs = args.folder
        # primary dictionary that will contain combined results from all folders provided
        duplicates = {}
        for d in dirs:
            # check path valid
            if os.path.exists(d):
                dups = find_dup(d)
                joinDicts(duplicates, dups)
            else:
                print('[!] Error. Invalid directory: {}'.format(d))
        dup_delete(duplicates)
if __name__ == '__main__':
    main()