Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[stdlib] Micro-optimize utf8 helper functions #3896

Open
wants to merge 4 commits into
base: nightly
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions stdlib/src/collections/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -132,15 +132,11 @@ fn chr(c: Int) -> String:

Examples:
```mojo
print(chr(97)) # "a"
print(chr(8364)) # "€"
print(chr(97), chr(8364)) # "a €"
```
.
"""

if c < 0b1000_0000: # 1 byte ASCII char
return String(String._buffer_type(c, 0))

var num_bytes = _unicode_codepoint_utf8_byte_length(c)
var p = UnsafePointer[UInt8].alloc(num_bytes + 1)
_shift_unicode_to_utf8(p, c, num_bytes)
Expand Down
45 changes: 30 additions & 15 deletions stdlib/src/utils/string_slice.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ fn _unicode_codepoint_utf8_byte_length(c: Int) -> Int:
debug_assert(
0 <= c <= 0x10FFFF, "Value: ", c, " is not a valid Unicode code point"
)
alias sizes = SIMD[DType.int32, 4](0, 0b0111_1111, 0b0111_1111_1111, 0xFFFF)
return int((sizes < c).cast[DType.uint8]().reduce_add())
alias sizes = SIMD[DType.uint32, 4](0, 0x80, 0x8_00, 0x1_00_00)
return int((sizes <= c).cast[DType.uint8]().reduce_add())


@always_inline
Expand All @@ -81,10 +81,12 @@ fn _utf8_first_byte_sequence_length(b: Byte) -> Int:
(b & 0b1100_0000) != 0b1000_0000,
"Function does not work correctly if given a continuation byte.",
)
return int(count_leading_zeros(~b)) + int(b < 0b1000_0000)
return int(count_leading_zeros(~b) | (b < 0b1000_0000).cast[DType.uint8]())


fn _shift_unicode_to_utf8(ptr: UnsafePointer[UInt8], c: Int, num_bytes: Int):
fn _shift_unicode_to_utf8[
optimize_ascii: Bool = True
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add an explanation for this parameter in the docstring?

](ptr: UnsafePointer[UInt8], c: Int, num_bytes: Int):
"""Shift unicode to utf8 representation.

### Unicode (represented as UInt32 BE) to UTF-8 conversion:
Expand All @@ -99,19 +101,32 @@ fn _shift_unicode_to_utf8(ptr: UnsafePointer[UInt8], c: Int, num_bytes: Int):
- (a >> 18) | 0b11110000, (b >> 12) | 0b10000000, (c >> 6) | 0b10000000,
d | 0b10000000
"""
if num_bytes == 1:
ptr[0] = UInt8(c)
return

var shift = 6 * (num_bytes - 1)
var mask = UInt8(0xFF) >> (num_bytes + 1)
var num_bytes_marker = UInt8(0xFF) << (8 - num_bytes)
ptr[0] = ((c >> shift) & mask) | num_bytes_marker
for i in range(1, num_bytes):
shift -= 6
ptr[i] = ((c >> shift) & 0b0011_1111) | 0b1000_0000
@parameter
if optimize_ascii:
if likely(num_bytes == 1):
ptr[0] = UInt8(c)
return
var shift = 6 * (num_bytes - 1)
var mask = UInt8(0xFF) >> (num_bytes + 1)
var num_bytes_marker = UInt8(0xFF) << (8 - num_bytes)
ptr[0] = ((c >> shift) & mask) | num_bytes_marker
for i in range(1, num_bytes):
shift -= 6
ptr[i] = ((c >> shift) & 0b0011_1111) | 0b1000_0000
else:
var shift = 6 * (num_bytes - 1)
var mask = UInt8(0xFF) >> (num_bytes + int(num_bytes > 1))
var num_bytes_marker = UInt8(0xFF) << (8 - num_bytes)
ptr[0] = ((c >> shift) & mask) | (
num_bytes_marker & -int(num_bytes != 1)
)
for i in range(1, num_bytes):
shift -= 6
ptr[i] = ((c >> shift) & 0b0011_1111) | 0b1000_0000


@always_inline
fn _utf8_byte_type(b: SIMD[DType.uint8, _], /) -> __type_of(b):
"""UTF-8 byte type.

Expand All @@ -126,7 +141,7 @@ fn _utf8_byte_type(b: SIMD[DType.uint8, _], /) -> __type_of(b):
- 3 -> start of 3 byte long sequence.
- 4 -> start of 4 byte long sequence.
"""
return count_leading_zeros(~(b & UInt8(0b1111_0000)))
return count_leading_zeros(~b)


@always_inline
Expand Down
1 change: 1 addition & 0 deletions stdlib/test/collections/test_string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,7 @@ def test_ord():


def test_chr():
assert_equal("\0", chr(0))
assert_equal("A", chr(65))
assert_equal("a", chr(97))
assert_equal("!", chr(33))
Expand Down
Loading