diff --git a/Cargo.lock b/Cargo.lock index c8134384..22b573b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -611,7 +611,7 @@ checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" [[package]] name = "encstr" -version = "0.29.3-alpha.1" +version = "0.29.3" [[package]] name = "enum-map" @@ -768,7 +768,7 @@ checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" [[package]] name = "hl" -version = "0.29.3-alpha.1" +version = "0.29.3" dependencies = [ "atoi", "bincode", diff --git a/Cargo.toml b/Cargo.toml index 0351683d..8d2a73ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ members = [".", "crate/encstr"] [workspace.package] repository = "https://github.com/pamburus/hl" authors = ["Pavel Ivanov "] -version = "0.29.3-alpha.1" +version = "0.29.3" edition = "2021" license = "MIT" @@ -135,6 +135,10 @@ harness = false name = "json" harness = false +[[bench]] +name = "mem" +harness = false + [[bench]] name = "encstr" path = "benches/encstr/benches.rs" diff --git a/Makefile b/Makefile index e3dd2adb..072ae217 100644 --- a/Makefile +++ b/Makefile @@ -35,12 +35,12 @@ install-versioned: contrib-build ## Run tests test: contrib-build - @cargo test + @cargo test --workspace .PHONY: test ## Run benchmarks bench: contrib-build - @cargo bench + @cargo bench --workspace .PHONY: bench ## Show usage of the binary diff --git a/benches/mem.rs b/benches/mem.rs new file mode 100644 index 00000000..631d99f8 --- /dev/null +++ b/benches/mem.rs @@ -0,0 +1,60 @@ +// std imports +use std::alloc::System; + +// third-party imports +use criterion::{criterion_group, criterion_main, Criterion}; +use stats_alloc::{StatsAlloc, INSTRUMENTED_SYSTEM}; +use std::hint::black_box; + +#[global_allocator] +static GLOBAL: &StatsAlloc = &INSTRUMENTED_SYSTEM; + +fn benchmark(c: &mut Criterion) { + let mut c = c.benchmark_group("mem"); + + let bufs = |size| { + let vi: Vec = (0..size).into_iter().map(|x| x as u8).collect(); + let ve: Vec = Vec::with_capacity(size); + (vi, ve) + }; + + for n in [512, 4096] { + c.bench_function(format!("mem-rotate-{}", n), |b| { + let (mut vi, _) = bufs(n); + b.iter(|| { + black_box(&mut vi).rotate_right(1); + }); + }); + c.bench_function(format!("mem-copy-{}", n), |b| { + let (vi, mut ve) = bufs(n); + b.iter(|| { + ve.clear(); + black_box(&mut ve).extend_from_slice(black_box(&vi).as_slice()); + }); + }); + } + + c.bench_function("mem-find-single-value-4096", |b| { + let vi: Vec = (0..4096).into_iter().map(|x| (x / 16) as u8).collect(); + b.iter(|| { + black_box(vi.iter().position(|&x| x == 128)); + }); + }); + + c.bench_function("mem-find-one-of-two-values-4096", |b| { + let vi: Vec = (0..4096).into_iter().map(|x| (x / 16) as u8).collect(); + b.iter(|| { + black_box(vi.iter().position(|&x| matches!(x, 128 | 192))); + }); + }); + + c.bench_function("mem-find-one-of-four-values-4096", |b| { + let vi: Vec = (0..4096).into_iter().map(|x| (x / 16) as u8).collect(); + b.iter(|| { + black_box(vi.iter().position(|&x| matches!(x, 128 | 192 | 224 | 240))); + }); + }); +} + +criterion_group!(benches, benchmark); +criterion_main!(benches); diff --git a/build/ci/coverage.sh b/build/ci/coverage.sh index e5e43fc9..a8c47442 100755 --- a/build/ci/coverage.sh +++ b/build/ci/coverage.sh @@ -21,7 +21,7 @@ IGNORE=( function executables() { echo ${MAIN_EXECUTABLE:?} - cargo test --tests --no-run --message-format=json \ + cargo test --workspace --tests --no-run --message-format=json \ | jq -r 'select(.profile.test == true) | .filenames[]' \ | grep -v dSYM - } @@ -33,11 +33,13 @@ LLVM_COV_FLAGS=( ) function clean() { - rm -f ${LLVM_PROFILE_PATTERN:?} + rm -f \ + ${LLVM_PROFILE_PATTERN:?} \ + crate/encstr/${LLVM_PROFILE_PATTERN:?} } function test() { - cargo test --tests + cargo test --tests --workspace cargo build ${MAIN_EXECUTABLE:?} > /dev/null ${MAIN_EXECUTABLE:?} --config= --help > /dev/null @@ -50,8 +52,10 @@ function test() { function merge() { ${LLVM_BIN:?}/llvm-profdata merge \ - -sparse ${LLVM_PROFILE_PATTERN:?} \ - -o ${PROFDATA_FILE:?} + -o ${PROFDATA_FILE:?} \ + -sparse \ + ${LLVM_PROFILE_PATTERN:?} \ + crate/encstr/${LLVM_PROFILE_PATTERN:?} } function report() { diff --git a/crate/encstr/.gitignore b/crate/encstr/.gitignore new file mode 100644 index 00000000..c41cc9e3 --- /dev/null +++ b/crate/encstr/.gitignore @@ -0,0 +1 @@ +/target \ No newline at end of file diff --git a/crate/encstr/src/encstr.rs b/crate/encstr/src/encstr.rs index ef4be69a..c78ed0d1 100644 --- a/crate/encstr/src/encstr.rs +++ b/crate/encstr/src/encstr.rs @@ -233,7 +233,7 @@ where impl Handler for &mut Vec { #[inline(always)] fn handle(&mut self, token: Token<'_>) -> Option<()> { - Appender::new(self).handle(token) + RawAppender::new(self).handle(token) } } @@ -314,38 +314,7 @@ impl Builder { impl Handler for Builder { #[inline(always)] fn handle(&mut self, token: Token<'_>) -> Option<()> { - Appender::new(&mut self.buffer).handle(token) - } -} - -// --- - -pub struct Appender<'a> { - buffer: &'a mut Vec, -} - -impl<'a> Appender<'a> { - #[inline(always)] - pub fn new(buffer: &'a mut Vec) -> Self { - Self { buffer } - } -} - -impl<'a> Handler for Appender<'a> { - #[inline(always)] - fn handle(&mut self, token: Token<'_>) -> Option<()> { - match token { - Token::Char(ch) => match ch { - ..='\x7F' => self.buffer.push(ch as u8), - _ => { - let mut buf = [0; 4]; - let s = ch.encode_utf8(&mut buf); - self.buffer.extend(s.as_bytes()); - } - }, - Token::Sequence(s) => self.buffer.extend(s.as_bytes()), - } - Some(()) + RawAppender::new(&mut self.buffer).handle(token) } } diff --git a/crate/encstr/src/json.rs b/crate/encstr/src/json.rs index 5b0066c6..bda5f3b4 100644 --- a/crate/encstr/src/json.rs +++ b/crate/encstr/src/json.rs @@ -49,6 +49,68 @@ impl<'a> From<&'a str> for JsonEncodedString<'a> { // --- +// --- + +pub struct Appender<'a> { + buffer: &'a mut Vec, +} + +impl<'a> Appender<'a> { + #[inline(always)] + pub fn new(buffer: &'a mut Vec) -> Self { + Self { buffer } + } +} + +impl<'a> Handler for Appender<'a> { + #[inline(always)] + fn handle(&mut self, token: Token<'_>) -> Option<()> { + match token { + Token::Char(ch) => match ch { + ..='\x7f' => { + let ch = ch as u8; + if !ESCAPE[ch as usize] { + self.buffer.push(ch); + } else { + self.buffer.push(b'\\'); + match ch { + b'\x08' => self.buffer.push(b'b'), + b'\x0c' => self.buffer.push(b'f'), + b'\n' => self.buffer.push(b'n'), + b'\r' => self.buffer.push(b'r'), + b'\t' => self.buffer.push(b't'), + b'\\' | b'"' => self.buffer.push(ch), + _ => { + self.buffer.extend(b"u00"); + self.buffer.push(HEX[((ch & 0xf0) >> 4) as usize]); + self.buffer.push(HEX[(ch & 0x0f) as usize]); + } + } + } + } + _ => { + let mut buf = [0; 4]; + let s = ch.encode_utf8(&mut buf); + self.buffer.extend(s.as_bytes()); + } + }, + Token::Sequence(s) => { + let mut ss = s.as_bytes(); + while let Some(pos) = ss.iter().position(|x| matches!(x, b'"' | b'\\')) { + self.buffer.extend(&ss[..pos]); + self.buffer.push(b'\\'); + self.buffer.push(ss[pos]); + ss = &ss[pos + 1..]; + } + self.buffer.extend(ss); + } + } + Some(()) + } +} + +// --- + struct Parser<'a> { input: &'a str, index: usize, @@ -287,7 +349,7 @@ impl<'a> Iterator for Tokens<'a> { #[inline(always)] fn decode_hex_val(val: u8) -> Option { - let n = HEX[val as usize] as u16; + let n = UNHEX[val as usize] as u16; if n == 255 { None } else { @@ -325,7 +387,11 @@ static ESCAPE: [bool; 256] = { ] }; -static HEX: [u8; 256] = { +static HEX: [u8; 16] = [ + b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'a', b'b', b'c', b'd', b'e', b'f', +]; + +static UNHEX: [u8; 256] = { const __: u8 = 255; // not a hex digit [ // 1 2 3 4 5 6 7 8 9 A B C D E F @@ -411,6 +477,15 @@ mod tests { assert_eq!(tokens.next(), None); } + #[test] + fn test_tokens_escape_b() { + let mut tokens = Tokens::new(&r#""00 \b""#); + assert_eq!(tokens.next(), Some(Ok(Token::Sequence("00 ")))); + assert_eq!(tokens.next(), Some(Ok(Token::Char('\x08')))); + assert_eq!(tokens.next(), None); + assert_eq!(tokens.next(), None); + } + #[test] fn test_tokens_control() { let mut tokens = Tokens::new(&r#""hello, \x00world""#); @@ -440,4 +515,45 @@ mod tests { assert_eq!(tokens.next(), Some(Ok(Token::Sequence("hello, ")))); assert_eq!(tokens.next(), Some(Err(Error::UnexpectedEndOfHexEscape))); } + + #[test] + fn test_append_esc_q() { + let mut tokens = Tokens::new(&r#""hello\u002c \"world\"""#); + let mut buffer = Vec::new(); + let mut appender = Appender::new(&mut buffer); + while let Some(Ok(token)) = tokens.next() { + appender.handle(token); + } + assert_eq!(buffer, "hello, \\\"world\\\"".as_bytes()); + } + + #[test] + fn test_append_esc_bfnrt() { + let mut tokens = Tokens::new(r#""00 \b\f\n\r\t""#); + let mut buffer = Vec::new(); + let mut appender = Appender::new(&mut buffer); + while let Some(Ok(token)) = tokens.next() { + appender.handle(token); + } + assert_eq!(buffer, r#"00 \b\f\n\r\t"#.as_bytes()); + } + + #[test] + fn test_append_esc_unicode() { + let mut tokens = Tokens::new(r#""00 ∞ \u2023""#); + let mut buffer = Vec::new(); + let mut appender = Appender::new(&mut buffer); + while let Some(Ok(token)) = tokens.next() { + appender.handle(token); + } + assert_eq!(buffer, r#"00 ∞ ‣"#.as_bytes(), "{:?}", String::from_utf8_lossy(&buffer)); + } + + #[test] + fn test_append_sequence_with_quotes() { + let mut buffer = Vec::new(); + let mut appender = Appender::new(&mut buffer); + appender.handle(Token::Sequence(r#"hello, "world""#)); + assert_eq!(buffer, r#"hello, \"world\""#.as_bytes()); + } } diff --git a/crate/encstr/src/lib.rs b/crate/encstr/src/lib.rs index 634c5a98..ab217bdd 100644 --- a/crate/encstr/src/lib.rs +++ b/crate/encstr/src/lib.rs @@ -6,3 +6,6 @@ mod encstr; pub use encstr::*; pub use error::*; + +pub type JsonAppender<'a> = json::Appender<'a>; +pub type RawAppender<'a> = raw::Appender<'a>; diff --git a/crate/encstr/src/raw.rs b/crate/encstr/src/raw.rs index 505940c4..9959cf3b 100644 --- a/crate/encstr/src/raw.rs +++ b/crate/encstr/src/raw.rs @@ -74,15 +74,56 @@ impl<'a> Iterator for Tokens<'a> { // --- +pub struct Appender<'a> { + buffer: &'a mut Vec, +} + +impl<'a> Appender<'a> { + #[inline(always)] + pub fn new(buffer: &'a mut Vec) -> Self { + Self { buffer } + } +} + +impl<'a> Handler for Appender<'a> { + #[inline(always)] + fn handle(&mut self, token: Token<'_>) -> Option<()> { + match token { + Token::Char(ch) => match ch { + ..='\x7f' => self.buffer.push(ch as u8), + _ => { + let mut buf = [0; 4]; + let s = ch.encode_utf8(&mut buf); + self.buffer.extend(s.as_bytes()); + } + }, + Token::Sequence(s) => self.buffer.extend(s.as_bytes()), + } + Some(()) + } +} + +// --- + #[cfg(test)] mod tests { use super::*; #[test] - fn raw_string() { + fn test_raw_string() { let mut result = Builder::new(); - let string = RawString::new("hello, world!"); + let string = RawString::new("hello, world!¡"); string.decode(&mut result).unwrap(); - assert_eq!(result.as_str(), "hello, world!"); + assert_eq!(result.as_str(), "hello, world!¡"); + } + + #[test] + fn test_appender() { + let mut buffer = Vec::new(); + let mut appender = Appender::new(&mut buffer); + appender.handle(Token::Sequence("hello ")).unwrap(); + appender.handle(Token::Char('•')).unwrap(); + appender.handle(Token::Sequence(" world")).unwrap(); + assert_eq!(std::str::from_utf8(&buffer).unwrap(), "hello • world"); } } diff --git a/src/formatting.rs b/src/formatting.rs index ead6c440..14293819 100644 --- a/src/formatting.rs +++ b/src/formatting.rs @@ -568,7 +568,7 @@ enum FormattedFieldVariant { pub mod string { // workspace imports - use encstr::{AnyEncodedString, Appender, Result}; + use encstr::{AnyEncodedString, JsonAppender, Result}; // third-party imports use bitmask_enum::bitmask; @@ -617,27 +617,31 @@ pub mod string { return Ok(()); } - buf.truncate(begin); + if !mask.intersects(Mask::DoubleQuote | Mask::Control | Mask::Backslash) { + buf.push(b'"'); + buf.push(b'"'); + buf[begin..].rotate_right(1); + return Ok(()); + } - match mask & (Mask::DoubleQuote | Mask::SingleQuote | Mask::ExtendedSpace | Mask::Control | Mask::Backslash) - { - Mask::DoubleQuote => { - return ValueFormatSingleQuoted::new(self.string).format(buf); - } - Mask::SingleQuote => { - return ValueFormatDoubleQuoted::new(self.string).format(buf); - } - _ => (), - }; + if !mask.intersects(Mask::SingleQuote | Mask::Control | Mask::Backslash) { + buf.push(b'\''); + buf.push(b'\''); + buf[begin..].rotate_right(1); + return Ok(()); + } + + const Z: Mask = Mask::none(); + const XS: Mask = Mask::Control.or(Mask::ExtendedSpace); - let mask = - mask & (Mask::DoubleQuote | Mask::SingleQuote | Mask::ExtendedSpace | Mask::Control | Mask::Backtick); - if mask.intersects(Mask::DoubleQuote | Mask::SingleQuote | Mask::ExtendedSpace) - && !mask.intersects(Mask::Control | Mask::Backtick) - { - return ValueFormatBacktickQuoted::new(self.string).format(buf); + if matches!(mask.and(Mask::Backtick.or(XS)), Z | XS) { + buf.push(b'`'); + buf.push(b'`'); + buf[begin..].rotate_right(1); + return Ok(()); } + buf.truncate(begin); ValueFormatDoubleQuoted::new(self.string).format(buf) } } @@ -684,68 +688,7 @@ pub mod string { { #[inline] fn format(&self, buf: &mut Vec) -> Result<()> { - if self.string.source().len() == 0 { - buf.push(b'"'); - } else if self.string.source().as_bytes()[0] == b'"' { - buf.extend(self.string.source().as_bytes()); - } else { - buf.push(b'"'); - self.string.decode(Appender::new(buf))?; - buf.push(b'"'); - } - Ok(()) - } - } - - // --- - - pub struct ValueFormatSingleQuoted { - string: S, - } - - impl ValueFormatSingleQuoted { - #[inline(always)] - pub fn new(string: S) -> Self { - Self { string } - } - } - - impl<'a, S> Format for ValueFormatSingleQuoted - where - S: AnyEncodedString<'a>, - { - #[inline(always)] - fn format(&self, buf: &mut Vec) -> Result<()> { - buf.push(b'\''); - self.string.decode(Appender::new(buf))?; - buf.push(b'\''); - Ok(()) - } - } - - // --- - - pub struct ValueFormatBacktickQuoted { - string: S, - } - - impl ValueFormatBacktickQuoted { - #[inline(always)] - pub fn new(string: S) -> Self { - Self { string } - } - } - - impl<'a, S> Format for ValueFormatBacktickQuoted - where - S: AnyEncodedString<'a>, - { - #[inline(always)] - fn format(&self, buf: &mut Vec) -> Result<()> { - buf.push(b'`'); - self.string.decode(Appender::new(buf))?; - buf.push(b'`'); - Ok(()) + self.string.format_json(buf) } } @@ -824,15 +767,25 @@ pub mod string { { #[inline] fn format(&self, buf: &mut Vec) -> Result<()> { - if self.string.source().len() == 0 { - buf.push(b'"'); - } else if self.string.source().as_bytes()[0] == b'"' { - buf.extend(self.string.source().as_bytes()); - } else { - buf.push(b'"'); - self.string.decode(Appender::new(buf))?; - buf.push(b'"'); - } + self.string.format_json(buf) + } + } + + // --- + + trait EncodedStringExt { + fn format_json(&self, buf: &mut Vec) -> Result<()>; + } + + impl<'a, S> EncodedStringExt for S + where + S: AnyEncodedString<'a>, + { + #[inline] + fn format_json(&self, buf: &mut Vec) -> Result<()> { + buf.push(b'"'); + self.decode(JsonAppender::new(buf))?; + buf.push(b'"'); Ok(()) } } @@ -846,7 +799,7 @@ pub mod string { const BS: Mask = Mask::Backslash; // 0x5C const BT: Mask = Mask::Backtick; // 0x60 const SP: Mask = Mask::Space; // 0x20 - const XS: Mask = Mask::ExtendedSpace; // 0x09, 0x0A, 0x0D + const XS: Mask = Mask::Control.or(Mask::ExtendedSpace); // 0x09, 0x0A, 0x0D const EQ: Mask = Mask::EqualSign; // 0x3D const __: Mask = Mask::none(); [ @@ -1190,4 +1143,32 @@ mod tests { assert_eq!(&format_no_color(&rec), r#"k="some-\u001b[1mvalue\u001b[0m""#); } + + #[test] + fn test_string_value_json_with_control_characters_and_quotes() { + let rec = Record { + fields: RecordFields { + head: heapless::Vec::from_slice(&[( + "k", + RawValue::String(EncodedString::json(r#""some-\u001b[1m\"value\"\u001b[0m""#)), + )]) + .unwrap(), + tail: Default::default(), + }, + ..Default::default() + }; + + assert_eq!(&format_no_color(&rec), r#"k="some-\u001b[1m\"value\"\u001b[0m""#); + } + + #[test] + fn test_message_double_quoted() { + let rec = Record { + message: Some(RawValue::String(EncodedString::raw(r#""hello, world""#))), + ..Default::default() + }; + + let result = format_no_color(&rec); + assert_eq!(&result, r#""\"hello, world\"""#, "{}", result); + } }