From d9060220b6da3d12aab80c3ff804cbb12c668013 Mon Sep 17 00:00:00 2001 From: mike-ward Date: Tue, 2 Jul 2024 08:38:16 -0500 Subject: [PATCH] sort implementation --- README.md | 4 +- src/sort/delete.me | 0 src/sort/options.v | 129 ++++++++++++++++++++++++ src/sort/sort.v | 177 ++++++++++++++++++++++++++++++++ src/sort/sort_key.v | 194 +++++++++++++++++++++++++++++++++++ src/sort/sort_keys_test.v | 143 ++++++++++++++++++++++++++ src/sort/sort_test.v | 207 ++++++++++++++++++++++++++++++++++++++ src/sort/test.txt | 4 + 8 files changed, 856 insertions(+), 2 deletions(-) delete mode 100644 src/sort/delete.me create mode 100644 src/sort/options.v create mode 100644 src/sort/sort.v create mode 100644 src/sort/sort_key.v create mode 100644 src/sort/sort_keys_test.v create mode 100644 src/sort/sort_test.v create mode 100644 src/sort/test.txt diff --git a/README.md b/README.md index 28f82910..fbaeb647 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ compare against the true GNU coreutils version on the Linux-based tests first. | | link | Make a hard link via the link syscall | | ✓ | ln | Make links between files | | ✓ | logname | Print current login name | -| ✓ | ls | List directory contents | +| | ls | List directory contents | | ✓ | md5sum | Print or check MD5 digests | | ✓ | mkdir | Make directories | | | mkfifo | Make FIFOs (named pipes) | @@ -130,7 +130,7 @@ compare against the true GNU coreutils version on the Linux-based tests first. 
| | shred | Remove files more securely | | ✓ | shuf | Shuffling text | | ✓ | sleep | Delay for a specified time | -| | sort | Sort text files | +| ✓ | sort | Sort text files | | | split | Split a file into pieces | | ✓ | stat | Report file or file system status | | | stdbuf | Run a command with modified I/O stream buffering | diff --git a/src/sort/delete.me b/src/sort/delete.me deleted file mode 100644 index e69de29b..00000000 diff --git a/src/sort/options.v b/src/sort/options.v new file mode 100644 index 00000000..d7dba601 --- /dev/null +++ b/src/sort/options.v @@ -0,0 +1,129 @@ +import common +import flag +import os +import time + +const app_name = 'sort' + +struct Options { + ignore_leading_blanks bool + dictionary_order bool + ignore_case bool + ignore_non_printing bool + numeric bool + reverse bool + // other optoins + check_diagnose bool + check_quiet bool + sort_keys []string + field_separator string = ' ' + merge bool + output_file string + unique bool + files []string +} + +fn get_options() Options { + mut fp := flag.new_flag_parser(os.args) + fp.application(app_name) + fp.version(common.coreutils_version()) + fp.skip_executable() + fp.arguments_description('[FILE]') + fp.description('\nWrite sorted concatenation of all FILE(s) to standard output.' 
+ + '\nWith no FILE, or when FILE is -, read standard input.') + + ignore_leading_blanks := fp.bool('ignore-leading-blanks', `b`, false, 'ignore leading blanks') + dictionary_order := fp.bool('dictionary-order', `d`, false, 'consider only blanks and alphanumeric characters') + ignore_case := fp.bool('ignore-case', `f`, false, 'fold lower case to upper case characters') + ignore_non_printing := fp.bool('ignore-non-printing', `i`, false, 'consider only printable characters') + numeric := fp.bool('numeric-sort', `n`, false, + 'Restrict the sort key to an initial numeric\n${flag.space}' + + 'string, consisting of optional characters,\n${flag.space}' + + 'optional character, and zero or\n${flag.space}' + + 'more digits, which shall be sorted by arithmetic\n${flag.space}' + + 'value. An empty digit string shall be treated as\n${flag.space}' + + 'zero. Leading zeros shall not affect ordering.') + reverse := fp.bool('reverse', `r`, false, 'reverse the result of comparisons\n\nOther options:') + + check_diagnose := fp.bool('', `c`, false, 'check for sorted input; do not sort') + check_quiet := fp.bool('', `C`, false, 'like -c, but do not report first bad line') + sort_keys := fp.string_multi('key', `k`, 'sort via a key(s); gives location and type') + merge := fp.bool('merge', `m`, false, 'merge already sorted files; do not sort') + field_separator := fp.string('', `t`, ' ', 'use as field separator') + output_file := fp.string('output', `o`, '', 'write result to FILE instead of standard output') + unique := fp.bool('unique', `u`, false, 'with -c, check for strict ordering;\n${flag.space}' + + 'without -c, output only the first of an equal run') + + fp.footer(" + + KEYDEF is F[.C][OPTS][,F[.C][OPTS]] for start and stop position, + where F is a field number and C a character position in the + field; both are origin 1, and the stop position defaults to the + line's end. 
If neither -t nor -b is in effect, characters in a
+		field are counted from the beginning of the preceding whitespace.
+		OPTS is one or more single-letter ordering options [bdfir], which
+		override global ordering options for that key. If no key is
+		given, use the entire line as the key.".trim_indent())
+
+	fp.footer(common.coreutils_footer())
+	files := fp.finalize() or { exit_error(err.msg()) }
+
+	return Options{
+		ignore_leading_blanks: ignore_leading_blanks
+		dictionary_order: dictionary_order
+		ignore_case: ignore_case
+		ignore_non_printing: ignore_non_printing
+		numeric: numeric
+		reverse: reverse
+		// other options
+		check_diagnose: check_diagnose
+		check_quiet: check_quiet
+		sort_keys: sort_keys
+		field_separator: field_separator
+		merge: merge
+		output_file: output_file
+		unique: unique
+		files: scan_files_arg(files)
+	}
+}
+
+// scan_files_arg turns the FILE operands into paths that can be read.
+// Every `-` operand is replaced by the path of a temporary copy of stdin,
+// and when no operand is given at all stdin is copied once.
+fn scan_files_arg(files_arg []string) []string {
+	mut files := []string{}
+	for file in files_arg {
+		if file == '-' {
+			files << stdin_to_tmp()
+			continue
+		}
+		files << file
+	}
+	if files.len == 0 {
+		files << stdin_to_tmp()
+	}
+	return files
+}
+
+const tmp_pattern = '/${app_name}-tmp-'
+
+// stdin_to_tmp copies stdin, line by line, into a new file under
+// os.temp_dir() and returns that file's path. The file name ends in
+// time.ticks(). Any I/O failure ends the program through exit_error.
+// This function does not delete the file it creates.
+fn stdin_to_tmp() string {
+	tmp := '${os.temp_dir()}/${tmp_pattern}${time.ticks()}'
+	// os.create already hands back the file opened for writing; write through
+	// that handle instead of discarding it (leaving it open) and reopening.
+	mut f := os.create(tmp) or { exit_error(err.msg()) }
+	defer { f.close() }
+	for {
+		s := os.get_raw_line()
+		// an empty read is treated as end of input
+		if s.len == 0 {
+			break
+		}
+		f.write_string(s) or { exit_error(err.msg()) }
+	}
+	return tmp
+}
+
+// exit_error reports a fatal error and terminates the program.
+// When `msg` is not empty it is written to stderr prefixed with the
+// application name; a hint to run `--help` always follows.
+@[noreturn]
+fn exit_error(msg string) {
+	if msg.len > 0 {
+		// print the caller's message (was `${error}`, which is not the parameter)
+		eprintln('${app_name}: ${msg}')
+	}
+	eprintln("Try '${app_name} --help' for more information.")
+	exit(2) // exit(1) is used with the -c option
+}
diff --git a/src/sort/sort.v b/src/sort/sort.v
new file mode 100644
index 00000000..0ffdc488
--- /dev/null
+++ b/src/sort/sort.v
@@ -0,0 +1,177 @@
+import os
+import arrays
+import strconv
+
+const space = ` `
+const tab = `\t`
+
+fn main() {
+	options := get_options()
+	results := 
sort(options) + if options.output_file == '' { + for result in results { + println(result) + } + } else { + os.write_lines(options.output_file, results) or { exit_error(err.msg()) } + } +} + +fn sort(options Options) []string { + mut results := []string{} + for file in options.files { + results << do_sort(file, options) + } + return results +} + +fn do_sort(file string, options Options) []string { + mut lines := os.read_lines(file) or { exit_error(err.msg()) } + original := if options.check_diagnose || options.check_quiet { + lines.clone() + } else { + []string{} + } + match true { + // order matters here + options.sort_keys.len > 0 { sort_key(mut lines, options) } + options.numeric { sort_general_numeric(mut lines, options) } + options.ignore_case { sort_ignore_case(mut lines, options) } + options.dictionary_order { sort_dictionary_order(mut lines, options) } + options.ignore_non_printing { sort_ignore_non_printing(mut lines, options) } + options.ignore_leading_blanks { sort_ignore_leading_blanks(mut lines, options) } + else { sort_lines(mut lines, options) } + } + if options.unique { + lines = arrays.distinct(lines) + } + if original.len > 0 { + if lines != original { + if options.check_diagnose { + println('sort: not sorted') + } + exit(1) + } else { + if options.check_diagnose { + println('sort: already sorted') + } + exit(0) + } + } + return lines +} + +fn sort_lines(mut lines []string, options Options) { + cmp := if options.reverse { compare_strings_reverse } else { compare_strings } + lines.sort_with_compare(fn [cmp] (a &string, b &string) int { + return cmp(a, b) + }) +} + +fn compare_strings_reverse(a &string, b &string) int { + return compare_strings(b, a) +} + +fn sort_ignore_case(mut lines []string, options Options) { + lines.sort_ignore_case() + if options.reverse { + lines.reverse_in_place() + } +} + +// Ignore leading blanks when finding sort keys in each line. 
+// By default a blank is a space or a tab
+fn sort_ignore_leading_blanks(mut lines []string, options Options) {
+	compare := if options.reverse { compare_strings_reverse } else { compare_strings }
+	lines.sort_with_compare(fn [compare] (a &string, b &string) int {
+		left := trim_leading_spaces(a)
+		right := trim_leading_spaces(b)
+		return compare(left, right)
+	})
+}
+
+// trim_leading_spaces returns `s` without its leading spaces, newlines,
+// tabs, vertical tabs, form feeds and carriage returns.
+fn trim_leading_spaces(s string) string {
+	leading := ' \n\t\v\f\r'
+	return s.trim_left(leading)
+}
+
+// Sort in phone directory order: only letters, digits and blanks take part
+// in the comparison. Every other byte is compared as if it were a space
+// (it is replaced, not removed). Letters and digits are the ASCII ones.
+fn sort_dictionary_order(mut lines []string, options Options) {
+	compare := if options.reverse { compare_strings_reverse } else { compare_strings }
+	lines.sort_with_compare(fn [compare] (a &string, b &string) int {
+		key_a := a.bytes().map(is_dictionary_char).bytestr()
+		key_b := b.bytes().map(is_dictionary_char).bytestr()
+		return compare(key_a, key_b)
+	})
+}
+
+// is_dictionary_char returns `e` unchanged when it is a digit, a letter,
+// a space or a tab, and a space for any other byte.
+fn is_dictionary_char(e u8) u8 {
+	return if e.is_digit() || e.is_letter() || e == space || e == tab { e } else { space }
+}
+
+// Sort numerically, converting a prefix of each line to a long double-precision
+// floating point number. See Floating point numbers. Do not report overflow,
+// underflow, or conversion errors. Use the following collating sequence:
+// Lines that do not start with numbers (all considered to be equal).
+// - NaNs (“Not a Number” values, in IEEE floating point arithmetic) in a
+// consistent but machine-dependent order.
+// - Minus infinity.
+// - Finite numbers in ascending numeric order (with -0 and +0 equal).
+// - Plus infinity
+//
+// Lines whose numeric values compare equal are ordered by the text that
+// follows the number. With options.reverse both comparisons are reversed.
+fn sort_general_numeric(mut lines []string, options Options) {
+	cmp := if options.reverse { compare_strings_reverse } else { compare_strings }
+	lines.sort_with_compare(fn [cmp, options] (a &string, b &string) int {
+		numeric_a, rest_a := numeric_rest(a)
+		numeric_b, rest_b := numeric_rest(b)
+		numeric_diff := if options.reverse { numeric_b - numeric_a } else { numeric_a - numeric_b }
+		return if numeric_diff != 0 {
+			if numeric_diff > 0 { 1 } else { -1 }
+		} else {
+			cmp(rest_a, rest_b)
+		}
+	})
+}
+
+// Stand-in value for a prefix that strconv.atof64 rejects. It is a large
+// negative finite number, not IEEE minus infinity.
+const minus_infinity = f64(-0xFFFFFFFFFFFFFFF)
+
+// numeric_rest splits `s` into a leading number and the text after it.
+//
+// The numeric prefix is made of optional leading spaces, at most one `+` or
+// `-` sign, then digits and decimal points. It ends at the first byte that
+// fits none of these.
+//
+// Returns:
+// - (value, rest), where `rest` starts at the first byte after the prefix
+//   and `value` is what strconv.atof64 makes of the prefix, or
+//   minus_infinity when atof64 rejects it (e.g. the line starts with a letter);
+// - (value, '') when the whole line belongs to the prefix, e.g. `42`;
+// - (0.0, s) when the whole line belongs to the prefix but atof64 rejects
+//   it, e.g. an empty line.
+fn numeric_rest(s string) (f64, string) {
+	mut allow_blanks := true
+	mut allow_sign := true
+	mut end := s.len // stays s.len when no byte ends the prefix
+	for i := 0; i < s.len; i++ {
+		c := s[i]
+		if allow_blanks && c == space {
+			continue
+		}
+		if allow_sign && (c == `-` || c == `+`) {
+			allow_sign = false
+			allow_blanks = false
+			continue
+		}
+		if c.is_digit() || c == strconv.c_dpoint {
+			allow_sign = false
+			allow_blanks = false
+			continue
+		}
+		// First byte outside the prefix: stop here. Scanning on would
+		// recompute the split at every later non-numeric byte.
+		end = i
+		break
+	}
+	if end == s.len {
+		// The line is numeric to its last byte; parse all of it instead of
+		// reporting 0.0 for every such line.
+		whole := strconv.atof64(s) or { return 0.0, s }
+		return whole, ''
+	}
+	num := strconv.atof64(s[0..end]) or { minus_infinity }
+	return num, s[end..].clone()
+}
+
+// This option has no effect if the stronger --dictionary-order (-d) option
+// is also given.
+fn sort_ignore_non_printing(mut lines []string, options Options) { + cmp := if options.reverse { compare_strings_reverse } else { compare_strings } + lines.sort_with_compare(fn [cmp] (a &string, b &string) int { + aa := a.bytes().map(is_printable).bytestr() + bb := b.bytes().map(is_printable).bytestr() + return cmp(aa, bb) + }) +} + +fn is_printable(e u8) u8 { + return if e >= u8(` `) && e <= u8(`~`) { e } else { space } +} diff --git a/src/sort/sort_key.v b/src/sort/sort_key.v new file mode 100644 index 00000000..af574216 --- /dev/null +++ b/src/sort/sort_key.v @@ -0,0 +1,194 @@ +import strconv + +enum SortType { + ascii + numeric + leading + dictionary + ignore_case + ignore_non_printing + reverse +} + +struct SortKey { + f1 int + c1 int + f2 int + c2 int + sort_type SortType +} + +fn sort_key(mut lines []string, options Options) { + mut sort_keys := []SortKey{} + for sort_key in options.sort_keys { + sort_keys << parse_sort_key(sort_key) + } + + lines.sort_with_compare(fn [sort_keys, options] (a &string, b &string) int { + for key in sort_keys { + aa := find_field(a, key, options) + bb := find_field(b, key, options) + // println('${aa}, ${bb}') + result := match key.sort_type { + .numeric { compare_numeric(aa, bb) } + .leading { compare_leading(aa, bb) } + .dictionary { compare_dictionary(aa, bb) } + .ignore_case { compare_ignore_case(aa, bb) } + .ignore_non_printing { compare_ignore_non_printing(aa, bb) } + .reverse { compare_strings(bb, aa) } + else { compare_strings(aa, bb) } + } + if result != 0 { + return result + } + } + return compare_strings(a, b) + }) +} + +fn compare_numeric(a &string, b &string) int { + af, ar := numeric_rest(a) + bf, br := numeric_rest(b) + diff := af - bf + return if diff != 0 { + match diff > 0 { + true { 1 } + else { -1 } + } + } else { + compare_strings(ar, br) + } +} + +fn compare_leading(a &string, b &string) int { + aa := trim_leading_spaces(a) + bb := trim_leading_spaces(b) + return compare_strings(aa, bb) +} + +fn 
compare_dictionary(a &string, b &string) int { + aa := a.bytes().map(is_dictionary_char).bytestr() + bb := b.bytes().map(is_dictionary_char).bytestr() + return compare_strings(aa, bb) +} + +fn compare_ignore_case(a &string, b &string) int { + return compare_strings(a.to_upper(), b.to_upper()) +} + +fn compare_ignore_non_printing(a &string, b &string) int { + aa := a.bytes().map(is_printable).bytestr() + bb := b.bytes().map(is_printable).bytestr() + return compare_strings(aa, bb) +} + +fn find_field(s string, key SortKey, options Options) string { + parts := s.split(options.field_separator) + f1 := key.f1 - 1 + c1 := if key.c1 > 0 { key.c1 - 1 } else { 0 } + f2 := key.f2 // from the end, don't subtrace 1 + c2 := key.c2 // from the end, don't subtrace 1 + start := if f1 < parts.len { f1 } else { 0 } + end := if f2 >= f1 && f2 < parts.len { f2 } else { parts.len } + join := parts[start..end].join('') + begin := join[c1..] + field := if c2 > 0 { + c := begin.len - c2 + begin[..c] + } else { + begin + } + return field +} + +fn parse_sort_key(k string) SortKey { + mut i := 0 + mut f1 := 0 + mut c1 := 0 + mut f2 := 0 + mut c2 := 0 + mut start := 0 + + // field + for ; i < k.len; i++ { + if !k[i].is_digit() { + f1 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + break + } + } + + if f1 == 0 { + f1 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + } + + // column + if i < k.len && k[i] == `.` { + i += 1 + start = i + for ; i < k.len; i++ { + if !k[i].is_digit() { + c1 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + break + } + } + + if c1 == 0 { + c1 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + } + } + + // sort option + sort_t := if i < k.len { k[i] } else { space } + + sort_type := match sort_t { + `b` { SortType.leading } + `d` { SortType.dictionary } + `f` { SortType.ignore_case } + `i` { SortType.ignore_non_printing } + `n` { SortType.numeric } + `r` { SortType.reverse } + else { SortType.ascii } + } + + if sort_type != 
.ascii { + i += 1 + } + + if i < k.len && k[i] == `,` { + i += 1 + start = i + for ; i < k.len; i++ { + if !k[i].is_digit() { + f2 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + break + } + } + + if f2 == 0 { + f2 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + } + + if i < k.len && k[i] == `.` { + i += 1 + start = i + for ; i < k.len; i++ { + if !k[i].is_digit() { + c2 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + break + } + } + + if c2 == 0 { + c2 = strconv.atoi(k[start..i]) or { exit_error(err.msg()) } + } + } + } + + return SortKey{ + f1: f1 + c1: c1 + f2: f2 + c2: c2 + sort_type: sort_type + } +} diff --git a/src/sort/sort_keys_test.v b/src/sort/sort_keys_test.v new file mode 100644 index 00000000..344614be --- /dev/null +++ b/src/sort/sort_keys_test.v @@ -0,0 +1,143 @@ +module main + +import os + +const test_aa = os.temp_dir() + '/test_aa.txt' +const test_bb = os.temp_dir() + '/test_bb.txt' + +fn testsuite_begin() { + create_test_data() +} + +fn create_test_data() { + os.write_lines(test_aa, [ + 'Now is the time', + 'for all good men', + 'to come to the aid', + 'of their country', + ]) or {} + os.write_lines(test_bb, [ + ' 4.0 Now is the time', + ' 3.0 for all good men', + ' 2.0 to come to the aid', + ' 01. 
of their country', + ]) or {} +} + +// parse field tests + +fn test_parse_simple_field() { + assert parse_sort_key('2') == SortKey{ + f1: 2 + c1: 0 + f2: 0 + c2: 0 + sort_type: .ascii + } +} + +fn test_parse_field_column() { + assert parse_sort_key('2.1') == SortKey{ + f1: 2 + c1: 1 + f2: 0 + c2: 0 + sort_type: .ascii + } +} + +fn test_parse_field_column_sort_field() { + assert parse_sort_key('2.1b,3') == SortKey{ + f1: 2 + c1: 1 + f2: 3 + c2: 0 + sort_type: .leading + } +} + +fn test_parse_field_column_sort_field_column() { + assert parse_sort_key('2.1i,3.3') == SortKey{ + f1: 2 + c1: 1 + f2: 3 + c2: 3 + sort_type: .ignore_non_printing + } +} + +// find field tests +// +fn test_find_field_simple() { + key := SortKey{ + f1: 2 + c1: 0 + f2: 2 + c2: 0 + sort_type: .ascii + } + assert find_field('Now is the time', key, Options{}) == 'is' +} + +fn test_find_field_no_f2() { + key := SortKey{ + f1: 2 + c1: 0 + f2: 0 + c2: 0 + sort_type: .ascii + } + assert find_field('Now is the time', key, Options{}) == 'isthetime' +} + +fn test_find_field_full_spec() { + key := SortKey{ + f1: 1 + c1: 2 + f2: 4 + c2: 1 + sort_type: .ascii + } + assert find_field('Now is the time', key, Options{}) == 'owisthetim' +} + +// sorting + +fn test_sort_simple_column() { + options := Options{ + sort_keys: ['2'] + files: [test_aa] + } + assert sort(options) == [ + 'for all good men', + 'to come to the aid', + 'Now is the time', + 'of their country', + ] +} + +fn test_sort_full_spec() { + options := Options{ + sort_keys: ['1.2,4.1'] + files: [test_aa] + } + assert sort(options) == [ + 'of their country', + 'to come to the aid', + 'for all good men', + 'Now is the time', + ] +} + +fn test_sort_numeric_simple() { + options := Options{ + sort_keys: ['1n'] + files: [test_bb] + } + assert sort(options) == [ + ' 01. 
of their country', + ' 2.0 to come to the aid', + ' 3.0 for all good men', + ' 4.0 Now is the time', + ] +} diff --git a/src/sort/sort_test.v b/src/sort/sort_test.v new file mode 100644 index 00000000..e5310934 --- /dev/null +++ b/src/sort/sort_test.v @@ -0,0 +1,207 @@ +module main + +import os + +const test_a = os.temp_dir() + '/test_a.txt' +const test_b = os.temp_dir() + '/test_b.txt' +const test_c = os.temp_dir() + '/test_c.txt' +const test_d = os.temp_dir() + '/test_d.txt' +const test_e = os.temp_dir() + '/test_e.txt' + +fn testsuite_begin() { + create_test_data() +} + +fn create_test_data() { + os.write_lines(test_a, [ + 'Now is the time', + 'for all good men', + 'to come to the aid', + 'of their country', + ]) or {} + + os.write_lines(test_b, [ + ' Now is the time', + ' for all good men', + ' to come to the aid', + ' of their country', + ]) or {} + os.write_lines(test_c, [ + '% to come to the aid', + '* for all good men', + '# of their country', + '! Now is the time', + ]) or {} + os.write_lines(test_d, [ + '\xf1 Now is the time', + '\xf2 for all good men', + '\xf3 to come to the aid', + '\xf4 of their country', + ]) or {} + os.write_lines(test_e, [ + '100.1 Now is the time', + '50.2 for all good men', + 'to come to the aid', + '-24.3 of their country', + ]) or {} +} + +fn test_no_options() { + options := Options{ + files: [test_a] + } + assert sort(options) == [ + 'Now is the time', + 'for all good men', + 'of their country', + 'to come to the aid', + ] +} + +fn test_reverse() { + options := Options{ + reverse: true + files: [test_a] + } + assert sort(options) == [ + 'to come to the aid', + 'of their country', + 'for all good men', + 'Now is the time', + ] +} + +fn test_ignore_case() { + options := Options{ + ignore_case: true + files: [test_a] + } + assert sort(options) == [ + 'for all good men', + 'Now is the time', + 'of their country', + 'to come to the aid', + ] +} + +fn test_ignore_case_reverse() { + options := Options{ + reverse: true + ignore_case: 
true + files: [test_a] + } + assert sort(options) == [ + 'to come to the aid', + 'of their country', + 'Now is the time', + 'for all good men', + ] +} + +fn test_ignore_leading_blanks() { + options := Options{ + ignore_leading_blanks: true + files: [test_b] + } + assert sort(options) == [ + ' Now is the time', + ' for all good men', + ' of their country', + ' to come to the aid', + ] +} + +fn test_ignore_leading_blanks_reverse() { + options := Options{ + reverse: true + ignore_leading_blanks: true + files: [test_b] + } + assert sort(options) == [ + ' to come to the aid', + ' of their country', + ' for all good men', + ' Now is the time', + ] +} + +fn test_dictionary_order() { + options := Options{ + dictionary_order: true + files: [test_c] + } + assert sort(options) == [ + '! Now is the time', + '* for all good men', + '# of their country', + '% to come to the aid', + ] +} + +fn test_dictionary_order_everse() { + options := Options{ + reverse: true + dictionary_order: true + files: [test_c] + } + assert sort(options) == [ + '% to come to the aid', + '# of their country', + '* for all good men', + '! 
Now is the time', + ] +} + +fn test_non_printing() { + options := Options{ + ignore_non_printing: true + files: [test_d] + } + assert sort(options) == [ + '\xf1 Now is the time', + '\xf2 for all good men', + '\xf4 of their country', + '\xf3 to come to the aid', + ] +} + +fn test_non_printing_reverse() { + options := Options{ + reverse: true + ignore_non_printing: true + files: [test_d] + } + assert sort(options) == [ + '\xf3 to come to the aid', + '\xf4 of their country', + '\xf2 for all good men', + '\xf1 Now is the time', + ] +} + +fn test_numeric() { + options := Options{ + numeric: true + files: [test_e] + } + assert sort(options) == [ + 'to come to the aid', + '-24.3 of their country', + '50.2 for all good men', + '100.1 Now is the time', + ] +} + +fn test_numeric_reverse() { + options := Options{ + numeric: true + reverse: true + files: [test_e] + } + assert sort(options) == [ + '100.1 Now is the time', + '50.2 for all good men', + '-24.3 of their country', + 'to come to the aid', + ] +} diff --git a/src/sort/test.txt b/src/sort/test.txt new file mode 100644 index 00000000..73794140 --- /dev/null +++ b/src/sort/test.txt @@ -0,0 +1,4 @@ +Now is the time +for all good men +to come to the aid +of their country \ No newline at end of file