Skip to content

Commit

Permalink
Auto merge of #16350 - roife:neon-support-for-line-index, r=Veykril
Browse files Browse the repository at this point in the history
internal: Speedup line index calculation via NEON for aarch64

This commit provides SIMD acceleration (via NEON) for `line-index` library on aarch64 architecture, which improves performance for Apple Silicon users (and potentially for future aarch64-based chips).

The algorithm used here follows the same process as the original SSE2 implementation. Most of the SSE2 vector instructions have direct NEON counterparts. The only exception is that NEON has no instruction corresponding to `_mm_movemask_epi8`; to address this, I followed the approach described in the article at https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon.
  • Loading branch information
bors committed Jan 16, 2024
2 parents 5df53c9 + df53828 commit 18abb12
Showing 1 changed file with 112 additions and 1 deletion.
113 changes: 112 additions & 1 deletion lib/line-index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,22 @@ fn analyze_source_file_dispatch(
}
}

#[cfg(target_arch = "aarch64")]
/// Dispatches source-file analysis to the NEON fast path when the CPU
/// supports it, and to the generic scalar implementation otherwise.
fn analyze_source_file_dispatch(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    if !std::arch::is_aarch64_feature_detected!("neon") {
        // No NEON on this CPU: fall back to the scalar scan over the whole input.
        analyze_source_file_generic(src, src.len(), TextSize::from(0), lines, multi_byte_chars);
        return;
    }

    // SAFETY: we just verified that the NEON feature is available.
    unsafe { analyze_source_file_neon(src, lines, multi_byte_chars) }
}

/// Checks 16 byte chunks of text at a time. If the chunk contains
/// something other than printable ASCII characters and newlines, the
/// function falls back to the generic implementation. Otherwise it uses
Expand Down Expand Up @@ -322,7 +338,102 @@ unsafe fn analyze_source_file_sse2(
}
}

// A single `target_arch = "aarch64"` check replaces the previous pair of
// attributes (`not(any(x86, x86_64))` is already implied by `aarch64`, and
// `any(...)` with one predicate is redundant).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
#[inline]
// See https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
//
// NEON has no direct equivalent of SSE2's `_mm_movemask_epi8`, so this uses
// the "shift-right-and-narrow" trick: `vshrn_n_u16(..., 4)` shifts each
// 16-bit lane right by 4 and keeps the low 8 bits, which condenses each
// 0x00/0xFF comparison byte of the input into one nibble of the result.
//
// The mask is a 64-bit integer, where each 4-bit group corresponds to a u8 in
// the input vector. The least significant 4 bits correspond to the first byte
// in the vector.
unsafe fn move_mask(v: std::arch::aarch64::uint8x16_t) -> u64 {
    use std::arch::aarch64::*;

    // Narrow the 128-bit vector down to 64 bits: one nibble per input byte.
    let nibble_mask = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
    // Extract the 64-bit scalar from the low half of the register.
    vget_lane_u64(vreinterpret_u64_u8(nibble_mask), 0)
}

#[target_feature(enable = "neon")]
#[cfg(any(target_arch = "aarch64"))]
/// Checks 16-byte chunks of text at a time with NEON. A chunk containing
/// only ASCII is scanned for newlines purely with SIMD; a chunk containing
/// any non-ASCII byte is handed to `analyze_source_file_generic` instead.
/// The trailing partial chunk is always processed generically.
///
/// # Safety
///
/// Must only be called when the NEON target feature is available
/// (see `analyze_source_file_dispatch`).
unsafe fn analyze_source_file_neon(
    src: &str,
    lines: &mut Vec<TextSize>,
    multi_byte_chars: &mut IntMap<u32, Vec<WideChar>>,
) {
    use std::arch::aarch64::*;

    // One 128-bit NEON register holds 16 bytes.
    const CHUNK_SIZE: usize = 16;

    let src_bytes = src.as_bytes();

    // Number of whole chunks; the remainder is handled after the loop.
    let chunk_count = src.len() / CHUNK_SIZE;

    // Every lane set to '\n', compared against each chunk below.
    let newline = vdupq_n_s8(b'\n' as i8);

    // This variable keeps track of where we should start decoding a
    // chunk. If a multi-byte character spans across chunk boundaries,
    // we need to skip that part in the next chunk because we already
    // handled it.
    let mut intra_chunk_offset = 0;

    for chunk_index in 0..chunk_count {
        let ptr = src_bytes.as_ptr() as *const i8;
        // SAFETY: chunk_index < chunk_count, so all 16 loaded bytes lie
        // within `src_bytes`.
        let chunk = vld1q_s8(ptr.add(chunk_index * CHUNK_SIZE));

        // For each byte in the chunk, test whether it is < 0 as i8 (i.e. has
        // the high bit set), which indicates that it's part of a multi-byte
        // UTF-8 character.
        let multibyte_test = vcltzq_s8(chunk);
        // Create a bit mask from the comparison results (4 bits per byte,
        // see `move_mask`).
        let multibyte_mask = move_mask(multibyte_test);

        // If the bit mask is all zero, we only have ASCII chars here:
        if multibyte_mask == 0 {
            // An all-ASCII chunk cannot begin inside a multi-byte character,
            // so no leftover offset from the previous chunk is possible.
            assert!(intra_chunk_offset == 0);

            // Check for newlines in the chunk
            let newlines_test = vceqq_s8(chunk, newline);
            let mut newlines_mask = move_mask(newlines_test);

            // If the bit mask is not all zero, there are newlines in this chunk.
            if newlines_mask != 0 {
                // `+ 1`: a line starts at the byte *after* its '\n'.
                let output_offset = TextSize::from((chunk_index * CHUNK_SIZE + 1) as u32);

                while newlines_mask != 0 {
                    let trailing_zeros = newlines_mask.trailing_zeros();
                    // Each input byte occupies 4 mask bits, hence `/ 4` to
                    // recover the byte index within the chunk.
                    let index = trailing_zeros / 4;

                    lines.push(TextSize::from(index) + output_offset);

                    // Clear the current 4-bit group, so we can find the next one.
                    newlines_mask &= (!0xF) << trailing_zeros;
                }
            }
            continue;
        }

        // Non-ASCII bytes present: let the generic scanner decode this chunk,
        // skipping any bytes already consumed by a character that spilled in
        // from the previous chunk. The returned value is presumably the spill
        // into the *next* chunk — it is fed back as the skip amount above.
        let scan_start = chunk_index * CHUNK_SIZE + intra_chunk_offset;
        intra_chunk_offset = analyze_source_file_generic(
            &src[scan_start..],
            CHUNK_SIZE - intra_chunk_offset,
            TextSize::from(scan_start as u32),
            lines,
            multi_byte_chars,
        );
    }

    // Process any bytes remaining after the last full chunk generically.
    let tail_start = chunk_count * CHUNK_SIZE + intra_chunk_offset;
    if tail_start < src.len() {
        analyze_source_file_generic(
            &src[tail_start..],
            src.len() - tail_start,
            TextSize::from(tail_start as u32),
            lines,
            multi_byte_chars,
        );
    }
}

#[cfg(not(any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64")))]
// The target (or compiler version) does not support SSE2 ...
fn analyze_source_file_dispatch(
src: &str,
Expand Down

0 comments on commit 18abb12

Please sign in to comment.