-
Notifications
You must be signed in to change notification settings - Fork 3
/
dups.go
277 lines (257 loc) · 6.75 KB
/
dups.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
package dups
import (
"crypto/md5"
"crypto/sha256"
"fmt"
"github.com/cespare/xxhash"
"github.com/cheggaaa/pb/v3"
"io"
"io/ioutil"
"os"
"path/filepath"
"sync"
)
// Names of the hash algorithms understood by GetFileHash.
const (
// XXHash selects the xxHash algorithm.
XXHash = "xxhash"
// MD5 selects the MD5 algorithm.
MD5 = "md5"
// SHA256 selects the SHA-256 algorithm.
SHA256 = "sha256"
)
// FileInfo pairs a file's path with its os.FileInfo metadata.
type FileInfo struct {
// Path is the path to the file.
Path string
// Info is the os.FileInfo describing the file; its Size is used for
// grouping files and for reporting how many bytes were removed.
Info os.FileInfo
}
// getXXHash streams the file at path through xxHash and returns the 64-bit
// digest formatted as lowercase hexadecimal (no zero padding).
// Any error from opening or reading the file is returned unchanged.
func getXXHash(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	hasher := xxhash.New()
	_, copyErr := io.Copy(hasher, src)
	if copyErr != nil {
		return "", copyErr
	}

	sum := hasher.Sum64()
	return fmt.Sprintf("%x", sum), nil
}
// getMD5 streams the file at path through MD5 and returns the digest as a
// lowercase hexadecimal string.
// Any error from opening or reading the file is returned unchanged.
func getMD5(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	hasher := md5.New()
	_, copyErr := io.Copy(hasher, src)
	if copyErr != nil {
		return "", copyErr
	}

	digest := hasher.Sum(nil)
	return fmt.Sprintf("%x", digest), nil
}
// getSHA256 streams the file at path through SHA-256 and returns the digest
// as a lowercase hexadecimal string.
// Any error from opening or reading the file is returned unchanged.
func getSHA256(path string) (string, error) {
	src, openErr := os.Open(path)
	if openErr != nil {
		return "", openErr
	}
	defer src.Close()

	hasher := sha256.New()
	_, copyErr := io.Copy(hasher, src)
	if copyErr != nil {
		return "", copyErr
	}

	digest := hasher.Sum(nil)
	return fmt.Sprintf("%x", digest), nil
}
// GetFileHash returns given file hash using the provided algorithm
// Default: md5
func GetFileHash(path, algorithm string) (string, error) {
switch algorithm {
case MD5:
return getMD5(path)
case XXHash:
return getXXHash(path)
case SHA256:
return getSHA256(path)
default:
return getMD5(path)
}
}
// GetFiles returns every regular entry (anything that is not a directory)
// found under root, after passing root through CleanPath.
//
// With full=false only the direct children of root are listed. With full=true
// the whole tree below root is walked and files in sub-directories are
// included as well.
//
// On failure the files collected so far are returned together with the error.
func GetFiles(root string, full bool) ([]FileInfo, error) {
	var filesInfos []FileInfo
	cleanedPath := CleanPath(root)

	if !full {
		entries, err := ioutil.ReadDir(cleanedPath)
		if err != nil {
			return filesInfos, err
		}
		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}
			filesInfos = append(filesInfos, FileInfo{
				Path: filepath.Join(cleanedPath, entry.Name()),
				Info: entry,
			})
		}
		return filesInfos, nil
	}

	err := filepath.Walk(cleanedPath, func(path string, info os.FileInfo, err error) error {
		if info == nil {
			// Walk could not stat this entry (for example root does not
			// exist), so there is no FileInfo to inspect. Report the error
			// instead of dereferencing a nil info.
			return err
		}
		// A directory that cannot be listed arrives here with a non-nil err
		// and a valid info; it is skipped like any other directory.
		if !info.IsDir() {
			filesInfos = append(filesInfos, FileInfo{
				Path: path,
				Info: info,
			})
		}
		return nil
	})
	if err != nil {
		return filesInfos, err
	}
	return filesInfos, nil
}
// GroupFiles buckets files by their size in bytes so that hashes only need to
// be computed for files that share a size: files of different sizes can never
// be duplicates.
//
// Only files strictly larger than minSize are kept. The returned map is keyed
// by file size, and the returned int is the number of files that were kept.
func GroupFiles(files []FileInfo, minSize int) (map[int][]FileInfo, int) {
	bySize := map[int][]FileInfo{}
	kept := 0
	for i := range files {
		n := int(files[i].Info.Size())
		if n <= minSize {
			// Too small to be considered.
			continue
		}
		bySize[n] = append(bySize[n], files[i])
		kept++
	}
	return bySize, kept
}
// CollectHashes hashes every file that shares its size with at least one
// other file and returns the files keyed by their hash. Each map value lists
// all files that produced that hash.
//
// fileGroups holds the files grouped by size (see GroupFiles); groups with
// fewer than two files are not hashed.
// singleThread=true hashes the files one after another on the calling
// goroutine; singleThread=false hashes them concurrently.
// algorithm is the name of the hash algorithm handed to GetFileHash.
// flat=true disables the progress bar; otherwise a bar sized by fileCount is
// created and advanced once per hashed file and once per skipped group.
// fileCount is only used to size the progress bar.
//
// Files whose hash cannot be computed are left out of the result.
func CollectHashes(fileGroups map[int][]FileInfo, singleThread bool, algorithm string, flat bool, fileCount int) map[string][]FileInfo {
	// Upper bound on files hashed at the same time. Every in-flight hash
	// holds an open file, so an unbounded fan-out can exhaust file
	// descriptors, and the resulting open errors would silently drop files
	// from the result.
	const maxParallelHashes = 32

	hashes := make(map[string][]FileInfo)

	// Progress bar, only shown when flat is false.
	var bar *pb.ProgressBar
	if !flat {
		bar = createBar(fileCount)
	}
	tick := func() {
		if bar != nil {
			bar.Increment()
		}
	}

	// mu guards hashes. Reading the current slice, appending to it and
	// storing it back must happen under one lock; otherwise two goroutines
	// that computed the same hash can overwrite each other's entry.
	var mu sync.Mutex
	hashFile := func(file FileInfo) {
		defer tick()
		hash, err := GetFileHash(file.Path, algorithm)
		if err != nil {
			// Best effort: a file that cannot be hashed is skipped.
			return
		}
		mu.Lock()
		hashes[hash] = append(hashes[hash], file)
		mu.Unlock()
	}

	var wg sync.WaitGroup
	slots := make(chan struct{}, maxParallelHashes)
	for _, group := range fileGroups {
		// A file with a unique size cannot have a duplicate.
		if len(group) < 2 {
			tick()
			continue
		}
		for _, file := range group {
			if singleThread {
				hashFile(file)
				continue
			}
			// Count each goroutine as it is started rather than trusting
			// fileCount, so a mismatched count can neither deadlock Wait
			// nor drive the counter negative.
			wg.Add(1)
			slots <- struct{}{}
			go func(f FileInfo) {
				defer wg.Done()
				defer func() { <-slots }()
				hashFile(f)
			}(file)
		}
	}
	wg.Wait()

	if bar != nil {
		// Tell the progress bar that all the work is finished.
		bar.Finish()
	}
	return hashes
}
// GetDuplicates picks out the hashes that are shared by more than one file.
//
// It returns, in this order:
//   - one slice per duplicate set, each holding a copy of the files that
//     share a hash;
//   - the number of duplicate sets;
//   - the number of redundant files, i.e. every file in a set except one.
func GetDuplicates(hashes map[string][]FileInfo) ([][]FileInfo, int, int) {
	var sets [][]FileInfo
	setCount := 0
	redundant := 0
	for _, sameHash := range hashes {
		if len(sameHash) < 2 {
			continue
		}
		setCount++
		// One file of the set counts as the original, the rest are extras.
		redundant += len(sameHash) - 1

		set := make([]FileInfo, len(sameHash))
		copy(set, sameHash)
		sets = append(sets, set)
	}
	return sets, setCount, redundant
}
// RemoveDuplicates deletes the redundant files of every duplicate set.
// The first file of each set is kept and every other file in the set is
// removed from disk.
//
// It returns the summed size in bytes of the files that were actually
// removed and how many files were removed. It stops at the first file that
// cannot be removed and returns that error together with the totals
// accumulated up to that point; the file that failed is not counted.
func RemoveDuplicates(fileSets [][]FileInfo) (int, int, error) {
	totalSize := 0
	totalDeleted := 0
	for _, files := range fileSets {
		if len(files) < 2 {
			// Nothing to remove: the set is empty or only holds the file to keep.
			continue
		}
		for _, file := range files[1:] {
			size := int(file.Info.Size())
			if err := os.Remove(file.Path); err != nil {
				return totalSize, totalDeleted, err
			}
			// Count the file only once it is really gone.
			totalSize += size
			totalDeleted++
		}
	}
	return totalSize, totalDeleted, nil
}