Skip to content

Commit

Permalink
fix: remove unneeded escaping in strings if possible (#253)
Browse files Browse the repository at this point in the history
  • Loading branch information
pamburus authored May 9, 2024
1 parent 62c1f72 commit f756756
Show file tree
Hide file tree
Showing 11 changed files with 317 additions and 138 deletions.
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ members = [".", "crate/encstr"]
[workspace.package]
repository = "https://github.com/pamburus/hl"
authors = ["Pavel Ivanov <[email protected]>"]
version = "0.29.3-alpha.1"
version = "0.29.3"
edition = "2021"
license = "MIT"

Expand Down Expand Up @@ -135,6 +135,10 @@ harness = false
name = "json"
harness = false

[[bench]]
name = "mem"
harness = false

[[bench]]
name = "encstr"
path = "benches/encstr/benches.rs"
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ install-versioned: contrib-build

## Run tests
test: contrib-build
@cargo test
@cargo test --workspace
.PHONY: test

## Run benchmarks
bench: contrib-build
@cargo bench
@cargo bench --workspace
.PHONY: bench

## Show usage of the binary
Expand Down
60 changes: 60 additions & 0 deletions benches/mem.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// std imports
use std::alloc::System;

// third-party imports
use criterion::{criterion_group, criterion_main, Criterion};
use stats_alloc::{StatsAlloc, INSTRUMENTED_SYSTEM};
use std::hint::black_box;

// Route every allocation made by this benchmark binary through the
// `stats_alloc` instrumented wrapper around the system allocator.
// NOTE(review): nothing in this file reads the collected statistics — confirm
// the instrumentation is intentional, since it is in place while timing runs.
#[global_allocator]
static GLOBAL: &StatsAlloc<System> = &INSTRUMENTED_SYSTEM;

/// Registers the `mem` benchmark group with Criterion.
///
/// The group measures a few primitive byte-buffer operations:
///
/// * `mem-rotate-{n}` – rotating an `n`-byte vector right by one position;
/// * `mem-copy-{n}` – clearing a vector and refilling it from an `n`-byte slice;
/// * `mem-find-*-4096` – locating the first byte equal to one, two or four
///   candidate values in a 4096-byte haystack.
fn benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("mem");

    // `n` bytes holding 0, 1, 2, …; the counter is deliberately truncated to `u8`.
    let source = |n: usize| -> Vec<u8> { (0..n).map(|x| x as u8).collect() };

    // 4096 bytes in which each value 0..=255 repeats 16 times in ascending
    // order, so the first occurrence of 128 is at index 2048.
    let haystack = || -> Vec<u8> { (0..4096).map(|x| (x / 16) as u8).collect() };

    for n in [512, 4096] {
        group.bench_function(format!("mem-rotate-{}", n), |b| {
            let mut vi = source(n);
            b.iter(|| {
                black_box(&mut vi).rotate_right(1);
            });
        });
        group.bench_function(format!("mem-copy-{}", n), |b| {
            let vi = source(n);
            // Pre-sized so that refilling never has to grow the destination.
            let mut ve: Vec<u8> = Vec::with_capacity(n);
            b.iter(|| {
                ve.clear();
                black_box(&mut ve).extend_from_slice(black_box(&vi).as_slice());
            });
        });
    }

    group.bench_function("mem-find-single-value-4096", |b| {
        let vi = haystack();
        b.iter(|| {
            black_box(vi.iter().position(|&x| x == 128));
        });
    });

    group.bench_function("mem-find-one-of-two-values-4096", |b| {
        let vi = haystack();
        b.iter(|| {
            black_box(vi.iter().position(|&x| matches!(x, 128 | 192)));
        });
    });

    group.bench_function("mem-find-one-of-four-values-4096", |b| {
        let vi = haystack();
        b.iter(|| {
            black_box(vi.iter().position(|&x| matches!(x, 128 | 192 | 224 | 240)));
        });
    });
}

criterion_group!(benches, benchmark);
criterion_main!(benches);
14 changes: 9 additions & 5 deletions build/ci/coverage.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ IGNORE=(

function executables() {
echo ${MAIN_EXECUTABLE:?}
cargo test --tests --no-run --message-format=json \
cargo test --workspace --tests --no-run --message-format=json \
| jq -r 'select(.profile.test == true) | .filenames[]' \
| grep -v dSYM -
}
Expand All @@ -33,11 +33,13 @@ LLVM_COV_FLAGS=(
)

function clean() {
rm -f ${LLVM_PROFILE_PATTERN:?}
rm -f \
${LLVM_PROFILE_PATTERN:?} \
crate/encstr/${LLVM_PROFILE_PATTERN:?}
}

function test() {
cargo test --tests
cargo test --tests --workspace
cargo build
${MAIN_EXECUTABLE:?} > /dev/null
${MAIN_EXECUTABLE:?} --config= --help > /dev/null
Expand All @@ -50,8 +52,10 @@ function test() {

function merge() {
${LLVM_BIN:?}/llvm-profdata merge \
-sparse ${LLVM_PROFILE_PATTERN:?} \
-o ${PROFDATA_FILE:?}
-o ${PROFDATA_FILE:?} \
-sparse \
${LLVM_PROFILE_PATTERN:?} \
crate/encstr/${LLVM_PROFILE_PATTERN:?}
}

function report() {
Expand Down
1 change: 1 addition & 0 deletions crate/encstr/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/target
35 changes: 2 additions & 33 deletions crate/encstr/src/encstr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ where
impl Handler for &mut Vec<u8> {
#[inline(always)]
fn handle(&mut self, token: Token<'_>) -> Option<()> {
Appender::new(self).handle(token)
RawAppender::new(self).handle(token)
}
}

Expand Down Expand Up @@ -314,38 +314,7 @@ impl Builder {
impl Handler for Builder {
#[inline(always)]
fn handle(&mut self, token: Token<'_>) -> Option<()> {
Appender::new(&mut self.buffer).handle(token)
}
}

// ---

pub struct Appender<'a> {
buffer: &'a mut Vec<u8>,
}

impl<'a> Appender<'a> {
#[inline(always)]
pub fn new(buffer: &'a mut Vec<u8>) -> Self {
Self { buffer }
}
}

impl<'a> Handler for Appender<'a> {
#[inline(always)]
fn handle(&mut self, token: Token<'_>) -> Option<()> {
match token {
Token::Char(ch) => match ch {
..='\x7F' => self.buffer.push(ch as u8),
_ => {
let mut buf = [0; 4];
let s = ch.encode_utf8(&mut buf);
self.buffer.extend(s.as_bytes());
}
},
Token::Sequence(s) => self.buffer.extend(s.as_bytes()),
}
Some(())
RawAppender::new(&mut self.buffer).handle(token)
}
}

Expand Down
120 changes: 118 additions & 2 deletions crate/encstr/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,68 @@ impl<'a> From<&'a str> for JsonEncodedString<'a> {

// ---

// ---

/// A [`Handler`] that appends tokens to a byte buffer, escaping them so the
/// appended bytes form valid JSON string content (without surrounding quotes).
///
/// The appender mutably borrows the destination buffer for `'a`.
pub struct Appender<'a> {
    // Destination buffer; the appender only ever pushes bytes onto its end.
    buffer: &'a mut Vec<u8>,
}

impl<'a> Appender<'a> {
    /// Creates an appender that writes to the end of `buffer`.
    ///
    /// Existing contents of `buffer` are left untouched.
    #[inline(always)]
    pub fn new(buffer: &'a mut Vec<u8>) -> Self {
        Self { buffer }
    }
}

impl<'a> Handler for Appender<'a> {
    /// Appends `token` to the buffer in JSON-escaped form.
    ///
    /// * ASCII characters flagged in `ESCAPE` are written as a backslash
    ///   escape: the short forms `\b`, `\f`, `\n`, `\r`, `\t`, `\\`, `\"`
    ///   where one exists, otherwise `\u00XX` with lowercase hex digits.
    /// * Any other character is written as its UTF-8 encoding.
    /// * Sequences are copied verbatim except that `"` and `\` are prefixed
    ///   with a backslash.
    ///
    /// Always returns `Some(())`.
    #[inline(always)]
    fn handle(&mut self, token: Token<'_>) -> Option<()> {
        let out = &mut *self.buffer;

        match token {
            Token::Char(ch) if ch.is_ascii() => {
                let byte = ch as u8;
                if ESCAPE[usize::from(byte)] {
                    // Single-letter escape code, when JSON defines one for this byte.
                    let short = match byte {
                        b'\x08' => Some(b'b'),
                        b'\x0c' => Some(b'f'),
                        b'\n' => Some(b'n'),
                        b'\r' => Some(b'r'),
                        b'\t' => Some(b't'),
                        b'\\' | b'"' => Some(byte),
                        _ => None,
                    };
                    out.push(b'\\');
                    match short {
                        Some(code) => out.push(code),
                        None => {
                            let hi = HEX[usize::from(byte >> 4)];
                            let lo = HEX[usize::from(byte & 0x0f)];
                            out.extend_from_slice(&[b'u', b'0', b'0', hi, lo]);
                        }
                    }
                } else {
                    out.push(byte);
                }
            }
            Token::Char(ch) => {
                // Non-ASCII characters never need escaping; emit them as UTF-8.
                let mut utf8 = [0; 4];
                out.extend_from_slice(ch.encode_utf8(&mut utf8).as_bytes());
            }
            Token::Sequence(s) => {
                let mut rest = s.as_bytes();
                while let Some(pos) = rest.iter().position(|&b| b == b'"' || b == b'\\') {
                    let (plain, tail) = rest.split_at(pos);
                    out.extend_from_slice(plain);
                    out.extend_from_slice(&[b'\\', tail[0]]);
                    rest = &tail[1..];
                }
                out.extend_from_slice(rest);
            }
        }

        Some(())
    }
}

// ---

struct Parser<'a> {
input: &'a str,
index: usize,
Expand Down Expand Up @@ -287,7 +349,7 @@ impl<'a> Iterator for Tokens<'a> {

#[inline(always)]
fn decode_hex_val(val: u8) -> Option<u16> {
let n = HEX[val as usize] as u16;
let n = UNHEX[val as usize] as u16;
if n == 255 {
None
} else {
Expand Down Expand Up @@ -325,7 +387,11 @@ static ESCAPE: [bool; 256] = {
]
};

static HEX: [u8; 256] = {
static HEX: [u8; 16] = [
b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', b'a', b'b', b'c', b'd', b'e', b'f',
];

static UNHEX: [u8; 256] = {
const __: u8 = 255; // not a hex digit
[
// 1 2 3 4 5 6 7 8 9 A B C D E F
Expand Down Expand Up @@ -411,6 +477,15 @@ mod tests {
assert_eq!(tokens.next(), None);
}

#[test]
fn test_tokens_escape_b() {
    // `\b` must decode to U+0008; after the closing quote the iterator keeps
    // returning `None`.
    let mut tokens = Tokens::new(&r#""00 \b""#);
    let expected = [
        Some(Ok(Token::Sequence("00 "))),
        Some(Ok(Token::Char('\x08'))),
        None,
        None,
    ];
    for want in expected {
        assert_eq!(tokens.next(), want);
    }
}

#[test]
fn test_tokens_control() {
let mut tokens = Tokens::new(&r#""hello, \x00world""#);
Expand Down Expand Up @@ -440,4 +515,45 @@ mod tests {
assert_eq!(tokens.next(), Some(Ok(Token::Sequence("hello, "))));
assert_eq!(tokens.next(), Some(Err(Error::UnexpectedEndOfHexEscape)));
}

#[test]
fn test_append_esc_q() {
    // `\u002c` is decoded to a plain comma, while the quotes stay escaped.
    let tokens = Tokens::new(&r#""hello\u002c \"world\"""#);
    let mut buffer = Vec::new();
    let mut appender = Appender::new(&mut buffer);
    // Feed tokens until the input ends or the tokenizer reports an error.
    for token in tokens.map_while(Result::ok) {
        appender.handle(token);
    }
    assert_eq!(buffer, "hello, \\\"world\\\"".as_bytes());
}

#[test]
fn test_append_esc_bfnrt() {
    // The short escapes must round-trip unchanged through decode + append.
    let tokens = Tokens::new(r#""00 \b\f\n\r\t""#);
    let mut buffer = Vec::new();
    let mut appender = Appender::new(&mut buffer);
    // Feed tokens until the input ends or the tokenizer reports an error.
    for token in tokens.map_while(Result::ok) {
        appender.handle(token);
    }
    assert_eq!(buffer, r#"00 \b\f\n\r\t"#.as_bytes());
}

#[test]
fn test_append_esc_unicode() {
    // Non-ASCII characters are emitted as raw UTF-8, whether they appeared
    // literally or as a `\uXXXX` escape in the input.
    let tokens = Tokens::new(r#""00 ∞ \u2023""#);
    let mut buffer = Vec::new();
    let mut appender = Appender::new(&mut buffer);
    // Feed tokens until the input ends or the tokenizer reports an error.
    for token in tokens.map_while(Result::ok) {
        appender.handle(token);
    }
    assert_eq!(buffer, r#"00 ∞ ‣"#.as_bytes(), "{:?}", String::from_utf8_lossy(&buffer));
}

#[test]
fn test_append_sequence_with_quotes() {
    // A sequence handed over directly must get its quotes escaped.
    let mut buffer = Vec::new();
    Appender::new(&mut buffer).handle(Token::Sequence(r#"hello, "world""#));
    assert_eq!(buffer, r#"hello, \"world\""#.as_bytes());
}
}
3 changes: 3 additions & 0 deletions crate/encstr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@ mod encstr;

pub use encstr::*;
pub use error::*;

/// Public name for `json::Appender`, the handler that appends tokens to a
/// byte buffer in JSON-escaped form.
pub type JsonAppender<'a> = json::Appender<'a>;
/// Public name for `raw::Appender`.
// NOTE(review): presumably appends tokens without any escaping — confirm
// against the `raw` module, which is not visible here.
pub type RawAppender<'a> = raw::Appender<'a>;
Loading

0 comments on commit f756756

Please sign in to comment.