From b26640ed031cd1eb94a2e43c4a6c43324f66b449 Mon Sep 17 00:00:00 2001
From: syrmel <104119569+syrmel@users.noreply.github.com>
Date: Fri, 26 Jan 2024 15:51:37 +0100
Subject: [PATCH] Adding the uniq utility (#123)

* Initial uniq

* Added uniq tests

* Fixed documentation on read_line_until()

* Replace CustomBufferedReader with new vlib version
---
Review notes (ignored by git am): compare() now applies -f, -s and -w
cumulatively; uniq() always starts a new run on the first line; the first
zero-terminated test now passes -z. Hunk lengths were updated for the added
comments; blob ids on the index lines were not regenerated.

 src/uniq/delete.me   |   0
 src/uniq/settings.v  |  60 +++++++++++++++++
 src/uniq/uniq.v      | 155 +++++++++++++++++++++++++++++++++++++++++++
 src/uniq/uniq_test.v |  81 ++++++++++++++++++++++
 4 files changed, 296 insertions(+)
 delete mode 100644 src/uniq/delete.me
 create mode 100644 src/uniq/settings.v
 create mode 100644 src/uniq/uniq.v
 create mode 100644 src/uniq/uniq_test.v

diff --git a/src/uniq/delete.me b/src/uniq/delete.me
deleted file mode 100644
index e69de29b..00000000
diff --git a/src/uniq/settings.v b/src/uniq/settings.v
new file mode 100644
index 00000000..063b2936
--- /dev/null
+++ b/src/uniq/settings.v
@@ -0,0 +1,60 @@
+import common
+import os
+
+// Settings holds the parsed command-line configuration for uniq.
+// The integer options default to -1, which compare() treats as "not given".
+struct Settings {
+mut:
+	count            bool
+	repeated         bool
+	unique           bool
+	case_insensitive bool
+	check_chars      int
+	help             bool
+	version          bool
+	skip_fields      int
+	skip_chars       int
+	line_delimiter   u8
+	input_file       string
+	output_file      string
+}
+
+// args parses os.args into a Settings value.
+// At most two positional operands are accepted: the input file and then the
+// output file. Both default to '-'; more than two operands calls fail().
+fn args() Settings {
+	mut fp := common.flag_parser(os.args)
+	fp.application(app_name)
+	fp.description(app_description)
+	fp.footer("\nA field is a run of blanks (usually spaces and/or TABs), then non-blank\ncharacters. Fields are skipped before chars.\n\nNote: 'uniq' does not detect repeated lines unless they are adjacent.\nYou may want to sort the input first, or use 'sort -u' without 'uniq'.")
+
+	mut st := Settings{}
+	st.count = fp.bool('count', `c`, false, 'prefix lines by the number of occurrences')
+	st.repeated = fp.bool('repeated', `d`, false, 'only print duplicate lines, one for each group')
+	st.unique = fp.bool('unique', `u`, false, 'only print unique lines')
+	st.case_insensitive = fp.bool('ignore-case', `i`, false, 'ignore differences in case when comparing')
+	st.check_chars = fp.int('check-chars', `w`, -1, 'compare no more than N characters in lines')
+	st.skip_fields = fp.int('skip-fields', `f`, -1, 'avoid comparing the first N fields')
+	st.skip_chars = fp.int('skip-chars', `s`, -1, 'avoid comparing the first N characters')
+	st.input_file = '-'
+	st.output_file = '-'
+	zero_terminated := fp.bool('zero-terminated', `z`, false, 'line delimiter is NUL, not newline')
+	if zero_terminated {
+		st.line_delimiter = `\0`
+	} else {
+		st.line_delimiter = `\n`
+	}
+	fnames := fp.remaining_parameters()
+
+	// Validation
+	if fnames.len > 2 {
+		// Exits the program
+		fail('Too many arguments specified')
+	} else if fnames.len == 2 {
+		st.output_file = fnames[1]
+	}
+	if fnames.len > 0 {
+		st.input_file = fnames[0]
+	}
+	return st
+}
diff --git a/src/uniq/uniq.v b/src/uniq/uniq.v
new file mode 100644
index 00000000..29e80422
--- /dev/null
+++ b/src/uniq/uniq.v
@@ -0,0 +1,155 @@
+module main
+
+import io
+import math
+import os
+
+// POSIX Spec: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/uniq.html
+const app_name = 'uniq'
+const app_description = 'report or omit repeated lines'
+
+// Buffer tracks the current run of matching lines: the first line of the
+// run and how many lines the run contains so far (0 means no line read yet).
+struct Buffer {
+mut:
+	seen  string
+	count int
+}
+
+// fail prints `message` to stderr prefixed with the application name and
+// terminates the process with exit status 1.
+@[noreturn]
+fn fail(message string) {
+	eprintln('${app_name}: ${message}')
+	exit(1)
+}
+
+// output_line writes the finished run `s` to `outfile`, honouring the
+// unique / repeated filters and the count prefix. An empty run (count == 0)
+// writes nothing. Always returns true; write errors are propagated.
+fn output_line(s Buffer, settings Settings, mut outfile os.File) !bool {
+	if s.count > 0 {
+		if (!settings.unique && !settings.repeated)
+			|| (settings.unique && s.count == 1)
+			|| (settings.repeated && s.count > 1) {
+			if settings.count {
+				outfile.write('${s.count:7} '.bytes())!
+			}
+			outfile.write('${s.seen}'.bytes())!
+			outfile.write(rune(settings.line_delimiter).bytes())!
+		}
+	}
+	return true
+}
+
+// get_start_of_field returns the byte index at which the 1-based field
+// `target_field` begins in `source`, where a field is a run of non-space
+// bytes. Returns source.len when `source` has fewer fields than that.
+fn get_start_of_field(source string, target_field int) int {
+	mut field := 0
+	mut interstitial := true
+	for i in 0 .. source.len {
+		if source[i].is_space() {
+			interstitial = true
+		} else {
+			if interstitial {
+				field += 1
+				interstitial = false
+				// If we skip n fields, we want the start of field n+1
+				if field == target_field {
+					return i
+				}
+			}
+		}
+	}
+	return source.len
+}
+
+// compare reports whether `source` and `target` are equal under the
+// comparison options in `settings`. The options are applied cumulatively
+// and in order: skip fields, then skip chars, then limit to check_chars,
+// and finally (optionally) ignore case.
+fn compare(source string, target string, settings Settings) bool {
+	mut s1 := source
+	mut s2 := target
+
+	if settings.skip_fields > -1 {
+		s1 = s1[get_start_of_field(s1, settings.skip_fields + 1)..]
+		s2 = s2[get_start_of_field(s2, settings.skip_fields + 1)..]
+	}
+
+	// Each later step must slice the already-trimmed strings, not the
+	// original lines, otherwise the earlier options are silently discarded.
+	if settings.skip_chars > -1 {
+		s1 = s1[math.min(s1.len, settings.skip_chars)..]
+		s2 = s2[math.min(s2.len, settings.skip_chars)..]
+	}
+
+	if settings.check_chars > -1 {
+		s1 = s1[0..math.min(s1.len, settings.check_chars)]
+		s2 = s2[0..math.min(s2.len, settings.check_chars)]
+	}
+
+	if settings.case_insensitive {
+		return s1.to_lower() == s2.to_lower()
+	} else {
+		return s1 == s2
+	}
+}
+
+// uniq reads delimiter-terminated lines from the input named in `settings`
+// and passes each run of adjacent matching lines to output_line, which
+// decides whether and how the run is written to the output.
+fn uniq(settings Settings) {
+	mut file := os.File{}
+	mut outfile := os.File{}
+	if settings.input_file == '-' {
+		file = os.stdin()
+	} else {
+		file = os.open(settings.input_file) or {
+			fail('${settings.input_file}: No such file or directory')
+		}
+	}
+	defer {
+		file.close()
+	}
+
+	if settings.output_file == '-' {
+		outfile = os.stdout()
+	} else {
+		outfile = os.create(settings.output_file) or {
+			fail('${settings.output_file}: No such file or directory')
+		}
+	}
+	defer {
+		outfile.close()
+	}
+
+	mut br := io.new_buffered_reader(io.BufferedReaderConfig{ reader: file })
+	defer {
+		br.free()
+	}
+
+	mut s := Buffer{
+		seen:  ''
+		count: 0
+	}
+	for {
+		line := br.read_line(delim: settings.line_delimiter) or { break }
+		// The first line always starts a new run. Comparing it against the
+		// empty initial buffer would wrongly merge it whenever its compared
+		// portion is empty (e.g. skipping one field of a one-field line).
+		if s.count == 0 || !compare(line, s.seen, settings) {
+			output_line(s, settings, mut &outfile) or { panic(err) }
+			s.seen = line
+			s.count = 1
+		} else {
+			s.count += 1
+		}
+	}
+	output_line(s, settings, mut &outfile) or { panic(err) }
+}
+
+fn main() {
+	uniq(args())
+}
diff --git a/src/uniq/uniq_test.v b/src/uniq/uniq_test.v
new file mode 100644
index 00000000..be62c9af
--- /dev/null
+++ b/src/uniq/uniq_test.v
@@ -0,0 +1,81 @@
+import common.testing
+import os
+
+const util = 'uniq'
+const platform_util = $if !windows {
+	util
+} $else {
+	'coreutils ${util}'
+}
+
+const cmd = testing.new_paired_command(platform_util, executable_under_test)
+const executable_under_test = testing.prepare_executable(util)
+const temp_dir = testing.temp_folder
+
+const posix_test_data = [
+	'#01 foo0 bar0 foo1 bar1',
+	'#02 bar0 foo1 bar1 foo1',
+	'#03 foo0 bar0 foo1 bar1',
+	'#04',
+	'#05 foo0 bar0 foo1 bar1',
+	'#06 foo0 bar0 foo1 bar1',
+	'#07 bar0 foo1 bar1 foo0',
+]
+const posix_test_path_newline = 'posix_nl.txt'
+const posix_test_path_zeroterm = 'posix_zt.txt'
+
+// call_for_test runs the executable under test with `args`, asserts that it
+// exited with status 0 and returns the execution result.
+fn call_for_test(args string) os.Result {
+	res := os.execute('${executable_under_test} ${args}')
+	assert res.exit_code == 0
+	return res
+}
+
+fn test_posix_spec_case_1() {
+	assert cmd.same_results('-c -f 1 posix_nl.txt')
+}
+
+fn test_posix_spec_case_2() {
+	assert cmd.same_results('-d -f 1 posix_nl.txt')
+}
+
+fn test_posix_spec_case_3() {
+	assert cmd.same_results('-u -f 1 posix_nl.txt')
+}
+
+fn test_posix_spec_case_4() {
+	assert cmd.same_results('-d -s 2 posix_nl.txt')
+}
+
+fn test_posix_spec_case_1_zero_term() {
+	assert call_for_test('-c -f 1 -z posix_zt.txt').output.split('\0').len == 7
+}
+
+fn test_posix_spec_case_2_zero_term() {
+	assert call_for_test('-d -f 1 -z posix_zt.txt').output.split('\0').len == 2
+}
+
+fn test_posix_spec_case_3_zero_term() {
+	assert call_for_test('-u -f 1 -z posix_zt.txt').output.split('\0').len == 6
+}
+
+fn test_posix_spec_case_4_zero_term() {
+	assert call_for_test('-d -s 2 -z posix_zt.txt').output.split('\0').len == 1
+}
+
+// testsuite_begin writes the newline- and NUL-separated fixture files.
+fn testsuite_begin() {
+	os.write_file(posix_test_path_newline, posix_test_data.join('\n'))!
+	os.write_file(posix_test_path_zeroterm, posix_test_data.join('\0'))!
+}
+
+// testsuite_end removes the fixture files written by testsuite_begin.
+fn testsuite_end() {
+	os.rm(posix_test_path_newline)!
+	os.rm(posix_test_path_zeroterm)!
+}
+
+fn test_help_and_version() {
+	cmd.ensure_help_and_version_options_work()!
+}