-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.go
122 lines (108 loc) · 3.36 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Unique is a command-line utility which ingests string values and outputs the unique ones.
// This is achieved by keeping track of the encountered values, which means that the consumed memory will grow with
// incoming unique values.
package main
import (
"bufio"
"bytes"
"flag"
"fmt"
"io"
"log"
"os"
"github.com/cespare/xxhash/v2"
)
// defaultLineLengthLimit is the default number of bytes of a single line that
// are taken into account when lines are compared. Capping it keeps memory
// bounded when the input contains an extremely long (or endless) line.
const defaultLineLengthLimit = 100 << 20 // 100 MiB
// readFile returns a buffered reader over the input together with a function
// that releases it.
//
// When path is empty the reader wraps standard input and the returned closing
// function does nothing. Otherwise the file at path is opened and the closing
// function closes it. lineLimit is used as the size of the reader's buffer.
//
// If the file cannot be opened, the reader and the closing function are nil
// and the error from os.Open is returned as-is.
func readFile(path string, lineLimit int) (r *bufio.Reader, closeFn func() error, err error) {
	if path != "" {
		f, openErr := os.Open(path)
		if openErr != nil {
			return nil, nil, openErr
		}
		return bufio.NewReaderSize(f, lineLimit), f.Close, nil
	}

	// No path given: read from standard input, which we do not own and
	// therefore never close.
	noop := func() error { return nil }
	return bufio.NewReaderSize(os.Stdin, lineLimit), noop, nil
}
// outputUnique reads lines from r and writes every line that has not been
// seen before to standard output, each followed by a newline.
//
// Uniqueness is decided by the 64-bit xxhash of the first chunk that
// r.ReadLine returns for a line, i.e. at most one reader-buffer's worth of
// bytes. Only the hashes are remembered, so two different lines that share a
// hash (or the same leading chunk) are treated as duplicates. When trim is
// true, leading and trailing whitespace is stripped from that first chunk
// before it is hashed and written.
//
// Lines longer than the reader's buffer arrive in several chunks. The
// remaining chunks of a unique line are written out verbatim, directly after
// the first one; the remaining chunks of a duplicate line are discarded.
//
// It returns nil once the input is exhausted, or a wrapped error if reading
// the input or writing the output fails.
func outputUnique(r *bufio.Reader, trim bool) error {
	// seen holds the hash of every line that has already been written.
	seen := make(map[uint64]struct{})
	newline := []byte{'\n'}

	// write sends b to standard output as raw bytes. Writing the slice
	// directly avoids copying a potentially huge chunk into a string, and
	// because chunks are emitted back-to-back and unmodified, a multi-byte
	// UTF-8 sequence that happens to be split across two chunks is output
	// intact.
	write := func(b []byte) error {
		if _, err := os.Stdout.Write(b); err != nil {
			return fmt.Errorf("failed to write output: %w", err)
		}
		return nil
	}

	for {
		line, partial, err := r.ReadLine()
		if err == io.EOF {
			return nil
		}
		if err != nil {
			return fmt.Errorf("failed to read from file: %w", err)
		}
		if trim {
			line = bytes.TrimSpace(line)
		}

		hash := xxhash.Sum64(line)
		_, duplicate := seen[hash]
		if !duplicate {
			// It's unique! Remember it and start writing it out.
			seen[hash] = struct{}{}
			if err = write(line); err != nil {
				return err
			}
		}

		// The line did not fit into the buffer, so only its first chunk has
		// been handled. Consume the rest: forward it for a unique line, drop
		// it for a duplicate.
		reachedEOF := false
		for partial {
			line, partial, err = r.ReadLine()
			if err == io.EOF {
				// The input ended right after a chunk that exactly filled
				// the buffer. Leave the loop instead of returning so that a
				// unique line still gets its terminating newline below.
				reachedEOF = true
				break
			}
			if err != nil {
				return fmt.Errorf("failed to read from file: %w", err)
			}
			if duplicate {
				continue
			}
			if err = write(line); err != nil {
				return err
			}
		}

		if !duplicate {
			if err = write(newline); err != nil {
				return err
			}
		}
		if reachedEOF {
			return nil
		}
	}
}
// main parses the command-line flags, opens the requested input (standard
// input when no file is given) and prints its unique lines. Any failure to
// open or process the input is logged and terminates the program via
// log.Fatalf.
func main() {
	flag.Usage = func() {
		out := flag.CommandLine.Output()
		_, _ = fmt.Fprintf(out, "Usage of unique:\n\n")
		_, _ = fmt.Fprintf(out, "When no arguments are given %s reads from the standard in.\n\n", os.Args[0])
		flag.PrintDefaults()
	}

	filePath := flag.String("f", "", "path to the file to process")
	trim := flag.Bool("t", false, "trim whitespace from each line (default false)")
	lineLengthLimit := flag.Int("ll", defaultLineLengthLimit, "limit the length of each line being processed, ignoring any data beyond that length (values under 16 are ignored)")
	flag.Parse()

	reader, closeFn, err := readFile(*filePath, *lineLengthLimit)
	if err != nil {
		log.Fatalf("Failed to read from file '%s'. Error: %s\n", *filePath, err.Error())
	}
	// Closing is best effort: the input was only read, so an error from
	// closing it is deliberately ignored.
	defer func() { _ = closeFn() }()

	if err := outputUnique(reader, *trim); err != nil {
		log.Fatalf("Failed to process all data. Error: %s\n", err)
	}
}