-
Notifications
You must be signed in to change notification settings - Fork 1
/
spacesavers2_grubbers
executable file
·115 lines (99 loc) · 2.98 KB
/
spacesavers2_grubbers
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
import sys
import os
import gzip
import textwrap
import time
from src.VersionCheck import version_check
from src.VersionCheck import __version__
version_check()
# from src.FileDetails import FileDetails
from src.dfUnit import fgz
# from src.Summary import Summary
from src.utils import *
from datetime import date
import argparse
def main():
    """Report the largest duplicate sets ("grubbers") from a files.gz listing.

    Steps performed:
      1. Parse the command line (``-f/--filesgz``, ``-l/--limit``,
         ``-o/--outfile``, ``-v/--version``).
      2. Read the gzipped input line by line, turning each line into an
         ``fgz`` record; lines that ``fgz.set`` rejects, or records whose
         ``ndup_uid_inode`` is 0, are skipped.
      3. Sort the records and write every record whose ``totalsize`` is
         strictly greater than the ``--limit`` threshold (given in GiB) to
         the output file, or to STDOUT when no output file was requested.
      4. Report, via ``print_with_timestamp``, the cumulative ``totalsize``
         of the records that passed the threshold.

    Raises:
        OSError: if the input file cannot be opened/read or the output file
            cannot be opened/written.
    """
    start = time.time()
    scriptname = os.path.basename(__file__)

    # NOTE(review): the example below mentions "spacesavers2_finddup" while
    # the -f help text mentions "spacesavers2_mimeo" -- confirm which name is
    # current. The strings are intentionally left untouched here.
    elog = textwrap.dedent(
        """\
        Version:
        {}
        Example:
        > spacesavers2_grubbers -f /output/from/spacesavers2_finddup/prefix.files.gz
        """.format(
            __version__
        )
    )
    parser = argparse.ArgumentParser(
        description="spacesavers2_grubbers: get list of large duplicates sorted by total size",
        epilog=elog,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    # The option is required, so it carries no default (a default could never
    # be used, and the previous sys.stdin default was not a valid path anyway).
    parser.add_argument(
        "-f",
        "--filesgz",
        dest="filesgz",
        required=True,
        type=str,
        help="spacesavers2_mimeo prefix.<user>.mimeo.files.gz file",
    )
    parser.add_argument(
        "-l",
        "--limit",
        dest="limit",
        required=False,
        type=float,
        default=5,
        help="stop showing duplicates with total size smaller than (5 default) GiB. Set 0 for unlimited.",
    )
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        required=False,
        type=str,
        help="output tab-delimited file (default STDOUT)",
    )
    parser.add_argument("-v", "--version", action="version", version=__version__)

    # The version banner is emitted before argument parsing, so it is shown
    # even when parsing exits early (e.g. --help or a usage error).
    print_with_timestamp(
        start=start, scriptname=scriptname, string="version: {}".format(__version__)
    )

    args = parser.parse_args()

    dups = []
    with gzip.open(args.filesgz, "rt") as filesgz:
        for line in filesgz:
            dfu = fgz()
            # set() returns a falsy value when the line could not be read
            # properly or when there are no duplicates.
            if not dfu.set(line):
                continue
            # in case mimeo was run without -z
            if dfu.ndup_uid_inode == 0:
                continue
            dups.append(dfu)

    # Per the original author's note, fgz.__lt__ orders records from highest
    # to lowest totalsize; the early `break` below depends on that ordering.
    dups.sort()

    saved = 0
    top_limit = args.limit * 1024 * 1024 * 1024  # GiB -> bytes (5 GiB default)

    out = open(args.outfile, "w") if args.outfile else sys.stdout
    try:
        for fgitem in dups:
            # Everything after the first record at or below the threshold is
            # at or below it too (sorted order), so stop here.
            if fgitem.totalsize <= top_limit:
                break
            # The size counts toward the total even if the record renders to
            # an empty string and is therefore not written.
            saved += fgitem.totalsize
            outstr = str(fgitem)
            if outstr == "":
                continue
            out.write("%s\n" % outstr)
    finally:
        # Close only a file we opened ourselves; never close STDOUT. Doing it
        # in `finally` prevents a leaked handle when writing fails midway.
        if out is not sys.stdout:
            out.close()

    hrsaved = get_human_readable_size(saved)
    print_with_timestamp(
        start=start,
        scriptname=scriptname,
        string="Deleting top grubbers will save {} [ {} Bytes ] !".format(hrsaved, saved),
    )
    print_with_timestamp(start=start, scriptname=scriptname, string="Done!")
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()