Skip to content

Commit

Permalink
WIP: reuse string bitmap
Browse files Browse the repository at this point in the history
  • Loading branch information
liuq19 committed Nov 25, 2024
1 parent 9dcdbfd commit 7c96242
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 68 deletions.
10 changes: 10 additions & 0 deletions sonic-simd/src/avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ impl Simd for Simd256i {
fn gt(&self, rhs: &Self) -> Self::Mask {
unsafe { Mask256(_mm256_cmpgt_epi8(self.0, rhs.0)) }
}

/// Converts a raw byte into this vector's lane type (`Self::Element`) with an `as` cast.
#[inline(always)]
fn element_from(b: u8) -> Self::Element {
// NOTE(review): if `Self::Element` is a signed 8-bit type, `as` reinterprets bytes
// >= 0x80 as negative values — presumably callers only pass ASCII constants; confirm.
b as Self::Element
}
}

#[derive(Debug)]
Expand Down Expand Up @@ -136,4 +141,9 @@ impl Simd for Simd256u {
fn gt(&self, _rhs: &Self) -> Self::Mask {
todo!()
}

/// Converts a raw byte into this vector's lane type (`Self::Element`) with an `as` cast.
#[inline(always)]
fn element_from(b: u8) -> Self::Element {
// NOTE(review): presumably an identity cast if `Self::Element` is `u8` — confirm.
b as Self::Element
}
}
10 changes: 10 additions & 0 deletions sonic-simd/src/bits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ macro_rules! impl_bits {
debug_assert!(n <= Self::LEN);
*self & ((u64::MAX as $ty) >> n)
}

/// Returns a copy of `self` with its lowest set bit cleared; zero stays zero.
#[inline]
fn clear_lowest_bit(&self) -> Self {
    // Subtracting one flips the lowest set bit and every bit below it, so AND-ing the
    // result with the original value drops exactly that bit. `wrapping_sub` keeps the
    // zero case from overflowing.
    let bits = *self;
    bits & bits.wrapping_sub(1)
}
}
)*
};
Expand Down Expand Up @@ -99,4 +104,9 @@ impl BitMask for NeonBits {
debug_assert!(n <= Self::LEN);
Self(self.0 & u64::MAX >> (n * 4))
}

/// Not implemented for `NeonBits` yet: calling this always panics via `unimplemented!`.
#[inline]
fn clear_lowest_bit(&self) -> Self {
// NOTE(review): the neighbouring high-bit clearing code shifts by `n * 4`, so a lane
// appears to occupy four bits in this mask; a real implementation presumably has to
// clear a whole 4-bit group rather than a single bit — confirm before filling this in.
unimplemented!("maybe ineffient here")
}
}
10 changes: 10 additions & 0 deletions sonic-simd/src/sse2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ impl Simd for Simd128i {
fn gt(&self, rhs: &Self) -> Self::Mask {
unsafe { Mask128(_mm_cmpgt_epi8(self.0, rhs.0)) }
}

/// Converts a raw byte into this vector's lane type (`Self::Element`) with an `as` cast.
#[inline(always)]
fn element_from(b: u8) -> Self::Element {
// NOTE(review): if `Self::Element` is a signed 8-bit type, `as` reinterprets bytes
// >= 0x80 as negative values — presumably callers only pass ASCII constants; confirm.
b as Self::Element
}
}

#[derive(Debug)]
Expand Down Expand Up @@ -135,4 +140,9 @@ impl Simd for Simd128u {
fn gt(&self, _rhs: &Self) -> Self::Mask {
todo!()
}

/// Converts a raw byte into this vector's lane type (`Self::Element`) with an `as` cast.
#[inline(always)]
fn element_from(b: u8) -> Self::Element {
// NOTE(review): presumably an identity cast if `Self::Element` is `u8` — confirm.
b as Self::Element
}
}
5 changes: 5 additions & 0 deletions sonic-simd/src/traits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ pub trait Simd: Sized {

fn splat(elem: Self::Element) -> Self;

/// convert a raw byte into the lane element type of this vector.
fn element_from(b: u8) -> Self::Element;

/// greater than
fn gt(&self, rhs: &Self) -> Self::Mask;

Expand Down Expand Up @@ -65,4 +67,7 @@ pub trait BitMask {

/// clear high n bits.
fn clear_high_bits(&self, n: usize) -> Self;

/// clear the lowest set bit, returning the result as a new mask.
fn clear_lowest_bit(&self) -> Self;
}
10 changes: 10 additions & 0 deletions sonic-simd/src/v512.rs
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,11 @@ impl Simd for Simd512u {
let hi = self.0 .1.gt(&rhs.0 .1);
Mask512((lo, hi))
}

/// Converts a raw byte into this vector's lane type (`Self::Element`) with an `as` cast.
#[inline(always)]
fn element_from(b: u8) -> Self::Element {
// NOTE(review): presumably an identity cast if `Self::Element` is `u8` — confirm.
b as Self::Element
}
}

impl Simd for Simd512i {
Expand Down Expand Up @@ -167,4 +172,9 @@ impl Simd for Simd512i {
let hi = self.0 .1.gt(&rhs.0 .1);
Mask512((lo, hi))
}

/// Converts a raw byte into this vector's lane type (`Self::Element`) with an `as` cast.
#[inline(always)]
fn element_from(b: u8) -> Self::Element {
// NOTE(review): if `Self::Element` is a signed 8-bit type, `as` reinterprets bytes
// >= 0x80 as negative values — presumably callers only pass ASCII constants; confirm.
b as Self::Element
}
}
83 changes: 45 additions & 38 deletions src/parser.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
use std::{
num::NonZeroU8,
ops::Deref,
slice::{from_raw_parts, from_raw_parts_mut},
str::from_utf8_unchecked,
};

use faststr::FastStr;
use serde::de::{self, Expected, Unexpected};
use sonic_number::{parse_number, ParserNumber};
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
use sonic_simd::bits::NeonBits;
use sonic_simd::{i8x32, m8x32, u8x32, u8x64, Mask, Simd};
use std::slice::from_raw_parts_mut;
use std::{num::NonZeroU8, ops::Deref, slice::from_raw_parts, str::from_utf8_unchecked};

use crate::{
config::DeserializeCfg,
Expand Down Expand Up @@ -189,6 +184,8 @@ pub(crate) struct Parser<R> {
error_index: usize, // mark the error position
nospace_bits: u64, // SIMD marked nospace bitmap
nospace_start: isize, // the start position of nospace_bits
str_start: *mut u8,
str_quote: u64,
pub(crate) cfg: DeserializeCfg,
}

Expand All @@ -209,6 +206,8 @@ where
error_index: usize::MAX,
nospace_bits: 0,
nospace_start: -128,
str_start: std::ptr::null_mut(),
str_quote: 0,
cfg: DeserializeCfg::default(),
}
}
Expand Down Expand Up @@ -304,39 +303,47 @@ where

#[inline(always)]
fn parse_string_inplace<V: JsonVisitor<'de>>(&mut self, vis: &mut V) -> Result<()> {
if !self.cfg.use_raw {
unsafe {
let mut src = self.read.cur_ptr();
let start = self.read.cur_ptr();
let cnt = parse_string_inplace(&mut src, self.cfg.utf8_lossy)
unsafe {
let mut src = self.read.cur_ptr();
let start = self.read.cur_ptr();
if self.str_quote != 0 {
let ptr = self.str_start.add(self.str_quote.trailing_zeros() as usize);
self.read.set_ptr(ptr.add(1));
let len = ptr.offset_from(start) as usize;
let s = str_from_raw_parts(start, len);
self.str_quote &= self.str_quote.wrapping_sub(1);
self.str_quote &= self.str_quote.wrapping_sub(1);
return check_visit!(self, vis.visit_borrowed_str(s));
} else if !self.cfg.use_raw {
let block = parse_string_inplace(&mut src, self.cfg.utf8_lossy)
.map_err(|e| self.error(e))?;
self.read.set_ptr(src);
let slice = from_raw_parts(start, cnt);
let slice = from_raw_parts(start, block.len);
let s = from_utf8_unchecked(slice);
self.str_start = block.start as *mut u8;
self.str_quote = block.quote;
return check_visit!(self, vis.visit_borrowed_str(s));
}
}

unsafe {
let start_idx = self.read.index();
let mut src = self.read.cur_ptr();
let start = self.read.cur_ptr();
match self.skip_string_unchecked()? {
ParseStatus::HasEscaped => {
let end = self.check_string_eof_inpadding()?;
let raw = as_str(&self.read.as_u8_slice()[start_idx - 1..end]);
let alloc = vis.allocator().unwrap();
let raw = RawStr::new_in(alloc, raw);
let cnt = parse_string_inplace(&mut src, self.cfg.utf8_lossy)
.map_err(|e| self.error(e))?;
self.read.set_ptr(src);
let s = str_from_raw_parts(start, cnt);
check_visit!(self, vis.visit_raw_str(s, raw))
}
ParseStatus::None => {
let end = self.check_string_eof_inpadding()?;
let s = as_str(&self.read.as_u8_slice()[start_idx..end - 1]);
check_visit!(self, vis.visit_borrowed_str(s))
} else {
let start_idx = self.read.index();
let mut src = self.read.cur_ptr();
let start = self.read.cur_ptr();
match self.skip_string_unchecked()? {
ParseStatus::HasEscaped => {
let end = self.check_string_eof_inpadding()?;
let raw = as_str(&self.read.as_u8_slice()[start_idx - 1..end]);
let alloc = vis.allocator().unwrap();
let raw = RawStr::new_in(alloc, raw);
let block = parse_string_inplace(&mut src, self.cfg.utf8_lossy)
.map_err(|e| self.error(e))?;
self.read.set_ptr(src);
let s = str_from_raw_parts(start, block.len);
check_visit!(self, vis.visit_raw_str(s, raw))
}
ParseStatus::None => {
let end = self.check_string_eof_inpadding()?;
let s = as_str(&self.read.as_u8_slice()[start_idx..end - 1]);
check_visit!(self, vis.visit_borrowed_str(s))
}
}
}
}
Expand Down Expand Up @@ -787,7 +794,7 @@ where
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
let mut block: StringBlock<NeonBits>;
#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))]
let mut block: StringBlock<u32>;
let mut block: StringBlock<u64>;

Check failure on line 797 in src/parser.rs

View workflow job for this annotation

GitHub Actions / clippy

the trait bound `u64: sonic_simd::Simd` is not satisfied

error[E0277]: the trait bound `u64: sonic_simd::Simd` is not satisfied --> src/parser.rs:797:24 | 797 | let mut block: StringBlock<u64>; | ^^^^^^^^^^^^^^^^ the trait `sonic_simd::Simd` is not implemented for `u64` | = help: the following other types implement trait `sonic_simd::Simd`: sonic_simd::avx2::Simd256i sonic_simd::avx2::Simd256u sonic_simd::sse2::Simd128i sonic_simd::sse2::Simd128u sonic_simd::v512::Simd512i sonic_simd::v512::Simd512u note: required by a bound in `util::string::StringBlock` --> src/util/string.rs:39:34 | 39 | pub(crate) struct StringBlock<S: Simd> { | ^^^^ required by this bound in `StringBlock`

self.parse_escaped_char(buf)?;

Expand Down Expand Up @@ -861,7 +868,7 @@ where
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
let mut block: StringBlock<NeonBits>;
#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))]
let mut block: StringBlock<u32>;
let mut block: StringBlock<u64>;

Check failure on line 871 in src/parser.rs

View workflow job for this annotation

GitHub Actions / clippy

the trait bound `u64: sonic_simd::Simd` is not satisfied in `util::string::StringBlock<u64>`

error[E0277]: the trait bound `u64: sonic_simd::Simd` is not satisfied in `util::string::StringBlock<u64>` --> src/parser.rs:871:13 | 871 | let mut block: StringBlock<u64>; | ^^^^^^^^^ within `util::string::StringBlock<u64>`, the trait `sonic_simd::Simd` is not implemented for `u64`, which is required by `util::string::StringBlock<u64>: std::marker::Sized` | = help: the following other types implement trait `sonic_simd::Simd`: sonic_simd::avx2::Simd256i sonic_simd::avx2::Simd256u sonic_simd::sse2::Simd128i sonic_simd::sse2::Simd128u sonic_simd::v512::Simd512i sonic_simd::v512::Simd512u note: required because it appears within the type `util::string::StringBlock<u64>` --> src/util/string.rs:39:19 | 39 | pub(crate) struct StringBlock<S: Simd> { | ^^^^^^^^^^^ = note: all local variables must have a statically known size = help: unsized locals are gated as an unstable feature help: consider borrowing here | 871 | let mut block: &StringBlock<u64>; | +

Check failure on line 871 in src/parser.rs

View workflow job for this annotation

GitHub Actions / clippy

the trait bound `u64: sonic_simd::Simd` is not satisfied

error[E0277]: the trait bound `u64: sonic_simd::Simd` is not satisfied --> src/parser.rs:871:24 | 871 | let mut block: StringBlock<u64>; | ^^^^^^^^^^^^^^^^ the trait `sonic_simd::Simd` is not implemented for `u64` | = help: the following other types implement trait `sonic_simd::Simd`: sonic_simd::avx2::Simd256i sonic_simd::avx2::Simd256u sonic_simd::sse2::Simd128i sonic_simd::sse2::Simd128u sonic_simd::v512::Simd512i sonic_simd::v512::Simd512u note: required by a bound in `util::string::StringBlock` --> src/util/string.rs:39:34 | 39 | pub(crate) struct StringBlock<S: Simd> { | ^^^^ required by this bound in `StringBlock`

while let Some(chunk) = self.read.peek_n(StringBlock::LANES) {
let v = unsafe { load(chunk.as_ptr()) };

Check failure on line 874 in src/parser.rs

View workflow job for this annotation

GitHub Actions / clippy

the trait bound `u64: sonic_simd::Simd` is not satisfied

error[E0277]: the trait bound `u64: sonic_simd::Simd` is not satisfied --> src/parser.rs:874:30 | 874 | let v = unsafe { load(chunk.as_ptr()) }; | ^^^^^^^^^^^^^^^^^^^^ the trait `sonic_simd::Simd` is not implemented for `u64` | = help: the following other types implement trait `sonic_simd::Simd`: sonic_simd::avx2::Simd256i sonic_simd::avx2::Simd256u sonic_simd::sse2::Simd128i sonic_simd::sse2::Simd128u sonic_simd::v512::Simd512i sonic_simd::v512::Simd512u note: required by a bound in `util::string::load` --> src/util/string.rs:92:30 | 92 | pub(crate) unsafe fn load<V: Simd>(ptr: *const u8) -> V { | ^^^^ required by this bound in `load`
Expand Down
68 changes: 38 additions & 30 deletions src/util/string.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use sonic_simd::u8x64;
use std::{
mem::MaybeUninit,
slice::{from_raw_parts, from_raw_parts_mut},
Expand Down Expand Up @@ -35,42 +36,27 @@ pub const ESCAPED_TAB: [u8; 256] = [
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];

#[derive(Debug)]
pub(crate) struct StringBlock<B: BitMask> {
pub(crate) bs_bits: B,
pub(crate) quote_bits: B,
pub(crate) unescaped_bits: B,
pub(crate) struct StringBlock<S: Simd> {
pub(crate) bs_bits: <S::Mask as Mask>::BitMask,
pub(crate) quote_bits: <S::Mask as Mask>::BitMask,
pub(crate) unescaped_bits: <S::Mask as Mask>::BitMask,
}

#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))]
impl StringBlock<u32> {
pub(crate) const LANES: usize = 32;

#[inline]
pub fn new(v: &u8x32) -> Self {
Self {
bs_bits: (v.eq(&u8x32::splat(b'\\'))).bitmask(),
quote_bits: (v.eq(&u8x32::splat(b'"'))).bitmask(),
unescaped_bits: (v.le(&u8x32::splat(0x1f))).bitmask(),
}
}
}

#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
impl StringBlock<NeonBits> {
pub(crate) const LANES: usize = 16;
impl<S: Simd> StringBlock<S> {
pub(crate) const LANES: usize = S::LANES;

#[inline]
pub fn new(v: &u8x16) -> Self {
pub fn new(v: &S) -> Self {
Self {
bs_bits: (v.eq(&u8x16::splat(b'\\'))).bitmask(),
quote_bits: (v.eq(&u8x16::splat(b'"'))).bitmask(),
unescaped_bits: (v.le(&u8x16::splat(0x1f))).bitmask(),
bs_bits: (v.eq(&S::splat(S::element_from(b'\\')))).bitmask(),
quote_bits: (v.eq(&S::splat(S::element_from(b'"')))).bitmask(),
unescaped_bits: (v.le(&S::splat(S::element_from(0x1f)))).bitmask(),
}
}
}

impl<B: BitMask> StringBlock<B> {
impl<S: Simd> StringBlock<S> {
#[inline(always)]
pub fn has_unescaped(&self) -> bool {
self.unescaped_bits.before(&self.quote_bits)
Expand Down Expand Up @@ -108,29 +94,45 @@ pub(crate) unsafe fn load<V: Simd>(ptr: *const u8) -> V {
V::from_slice_unaligned_unchecked(chunk)
}

/// Result of `parse_string_inplace`: the parsed length plus the quote positions left over
/// from the last scanned block, so the caller can reuse them for following strings.
#[derive(Debug)]
pub struct StrBits {
/// Address the last block was loaded from (the block containing the closing quote).
/// The parser treats bit `i` of `quote` as the byte at `start + i`.
pub start: *const u8,
/// Quote bitmap of that block, restricted to positions below the first backslash,
/// with its two lowest set bits already cleared (the current closing quote and the
/// next opening quote). Zero means there is nothing left to reuse.
pub quote: u64,
/// Length in bytes of the parsed string; the caller uses it as the slice length.
pub len: usize,
}

/// Parses a JSON string in place and returns a `StrBits` whose `len` is the size of the
/// actual parsed string. `repr` means invalid UTF-16 surrogates are represented with `\uFFFD`.
/// TODO: fix me, there is repeated code!!!
#[inline(always)]
pub(crate) unsafe fn parse_string_inplace(
src: &mut *mut u8,
repr: bool,
) -> std::result::Result<usize, ErrorCode> {
) -> std::result::Result<StrBits, ErrorCode> {
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
let mut block: StringBlock<NeonBits>;
#[cfg(not(all(target_feature = "neon", target_arch = "aarch64")))]
let mut block: StringBlock<u32>;
let mut block: StringBlock<u8x64>;

let sdst = *src;
let src: &mut *const u8 = std::mem::transmute(src);

// loop for string without escaped chars
loop {
block = StringBlock::new(&unsafe { load(*src) });
// string: xxx", 0"xx0"
// quote : 000100010001
// cc : 000000100010
if block.has_quote_first() {
let idx = block.quote_index();
let start = *src;
*src = src.add(idx + 1);
return Ok(src.offset_from(sdst) as usize - 1);
let len = src.offset_from(sdst) as usize - 1;
let mut quote = block.bs_bits.wrapping_sub(1) & block.quote_bits;
// remove current ending quote and the next start quotes
quote &= quote.wrapping_sub(1);
quote &= quote.wrapping_sub(1);
return Ok(StrBits { start, len, quote });
}
if block.has_unescaped() {
return Err(ControlCharacterWhileParsingString);
Expand Down Expand Up @@ -173,13 +175,19 @@ pub(crate) unsafe fn parse_string_inplace(
let v = unsafe { load(*src) };
let block = StringBlock::new(&v);
if block.has_quote_first() {
let start = *src;
// TODO: loop unrolling here
while **src != b'"' {
*dst = **src;
dst = dst.add(1);
*src = src.add(1);
}
*src = src.add(1); // skip ending quote
return Ok(dst.offset_from(sdst) as usize);
let len = dst.offset_from(sdst) as usize;
let mut quote = block.bs_bits.wrapping_sub(1) & block.quote_bits;

Check failure on line 187 in src/util/string.rs

View workflow job for this annotation

GitHub Actions / clippy

type annotations needed

error[E0282]: type annotations needed --> src/util/string.rs:187:33 | 187 | let mut quote = block.bs_bits.wrapping_sub(1) & block.quote_bits; | ^^^^^^^^^^^^^ cannot infer type
quote &= quote.wrapping_sub(1);
quote &= quote.wrapping_sub(1);
return Ok(StrBits { start, len, quote });
}
if block.has_unescaped() {
return Err(ControlCharacterWhileParsingString);
Expand Down

0 comments on commit 7c96242

Please sign in to comment.