From 5e0085b0b90abea029268a8cf2872cee010372cf Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 5 Jun 2024 15:50:44 -0400 Subject: [PATCH 01/27] start fixing isspace and str len Signed-off-by: martinvuyk --- install_id | 1 + stdlib/src/builtin/string.mojo | 120 +++++++++++++---------------- stdlib/src/utils/string_slice.mojo | 42 ++++++++++ 3 files changed, 98 insertions(+), 65 deletions(-) create mode 100644 install_id diff --git a/install_id b/install_id new file mode 100644 index 0000000000..d9bca3b7f2 --- /dev/null +++ b/install_id @@ -0,0 +1 @@ +vj3s8KOW8jfIJiUFVSYOasUNvswv4KZ_oz0P2TsU_FU \ No newline at end of file diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 5972fe35ae..c3c336246c 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -705,7 +705,7 @@ struct _StringIter[ self.length = length self.continuation_bytes = 0 for i in range(length): - if _utf8_byte_type(int(unsafe_pointer[i])) == 1: + if _utf8_byte_type(unsafe_pointer[i]) == 1: self.continuation_bytes += 1 fn __iter__(self) -> Self: @@ -716,7 +716,7 @@ struct _StringIter[ if forward: var byte_len = 1 if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(int(self.ptr[self.index])) + var byte_type = _utf8_byte_type(self.ptr[self.index]) if byte_type != 0: byte_len = int(byte_type) self.continuation_bytes -= byte_len - 1 @@ -728,11 +728,11 @@ struct _StringIter[ else: var byte_len = 1 if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(int(self.ptr[self.index - 1])) + var byte_type = _utf8_byte_type(self.ptr[self.index - 1]) if byte_type != 0: while byte_type == 1: byte_len += 1 - var b = int(self.ptr[self.index - byte_len]) + var b = self.ptr[self.index - byte_len] byte_type = _utf8_byte_type(b) self.continuation_bytes -= byte_len - 1 self.index -= byte_len @@ -1198,7 +1198,7 @@ struct String( An iterator of references to the string elements. """ return _StringIter[__lifetime_of(self)]( - unsafe_pointer=self.unsafe_ptr(), length=len(self) + unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() ) fn __reversed__(ref [_]self) -> _StringIter[__lifetime_of(self), False]: @@ -1208,7 +1208,7 @@ struct String( A reversed iterator of references to the string elements. """ return _StringIter[__lifetime_of(self), forward=False]( - unsafe_pointer=self.unsafe_ptr(), length=len(self) + unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() ) # ===------------------------------------------------------------------=== # @@ -1222,7 +1222,7 @@ struct String( Returns: True if the string length is greater than zero, and False otherwise. """ - return len(self) > 0 + return self.byte_length() > 0 @always_inline fn __len__(self) -> Int: @@ -1231,12 +1231,15 @@ struct String( Returns: The string byte length. """ - # Avoid returning -1 if the buffer is not initialized - if not self.unsafe_ptr(): - return 0 + # TODO: everything uses this method assuming it's byte length + # var unicode_length = self.byte_length() - # The negative 1 is to account for the terminator. - return len(self._buffer) - 1 + # for i in range(unicode_length): + # if _utf8_byte_type(self._buffer[i]) == 1: + # unicode_length -= 1 + + # return unicode_length + return self.byte_length() @always_inline fn __str__(self) -> String: @@ -1267,6 +1270,19 @@ struct String( # Methods # ===------------------------------------------------------------------=== # + fn byte_length(self) -> Int: + """Returns the string byte length. + + Returns: + The string byte length. + """ + # Avoid returning -1 if the buffer is not initialized + if not self.unsafe_ptr(): + return 0 + + # The negative 1 is to account for the terminator. + return len(self._buffer) - 1 + @always_inline fn _adjust_span(self, span: Slice) -> Slice: """Adjusts the span based on the string length.""" @@ -1552,49 +1568,22 @@ struct String( ) fn isspace(self) -> Bool: - """Determines whether the given String is a python - whitespace String. This corresponds to Python's + """Determines whether every character in the given String is a + python whitespace String. This corresponds to Python's [universal separators]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) `" \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029"`. Returns: - True if the String is one of the whitespace characters + True if the whole String is made up of whitespace characters listed above, otherwise False. """ - # TODO add line and paragraph separator as stringliteral - # once unicode escape secuences are accepted - var next_line = List[UInt8](0xC2, 0x85) - """TODO: \\x85""" - var unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8) - """TODO: \\u2028""" - var unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9) - """TODO: \\u2029""" - - @always_inline - fn _compare( - item1: UnsafePointer[UInt8], item2: UnsafePointer[UInt8], amnt: Int - ) -> Bool: - var ptr1 = DTypePointer(item1) - var ptr2 = DTypePointer(item2) - return memcmp(ptr1, ptr2, amnt) == 0 if len(self) == 0: return False for s in self: - var no_null_len = len(s) - var ptr = s.unsafe_ptr() - if no_null_len == 1 and not _isspace(ptr[0]): - return False - elif no_null_len == 2 and not _compare( - ptr, next_line.unsafe_ptr(), 2 - ): - return False - elif no_null_len == 3 and not ( - _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) - or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) - ): + if not s.isspace(): return False return True @@ -1623,7 +1612,7 @@ struct String( """ var output = List[String]() - var str_iter_len = len(self) - 1 + var str_byte_len = len(self) - 1 var lhs = 0 var rhs = 0 var items = 0 @@ -1631,7 +1620,7 @@ struct String( if sep_len == 0: raise Error("ValueError: empty separator") - while lhs <= str_iter_len: + while lhs <= str_byte_len: rhs = self.find(sep, lhs) if rhs == -1: output.append(self[lhs:]) @@ -1650,12 +1639,13 @@ struct String( output.append("") return output - fn split(self, *, maxsplit: Int = -1) -> List[String]: + fn split(self, sep: NoneType = None, maxsplit: Int = -1) -> List[String]: """Split the string by every Whitespace separator. Currently only uses C style separators. Args: + sep: None. maxsplit: The maximum amount of items to split from String. Defaults to unlimited. @@ -1671,41 +1661,39 @@ struct String( # Splitting a string with leading, trailing, and middle whitespaces _ = String(" hello world ").split() # ["hello", "world"] + # Splitting adjacent universal newlines: + _ = String( + "hello \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029world" + ).split() # ["hello", "world"] ``` . """ - # TODO: implement and document splitting adjacent universal newlines: - # _ = String( - # "hello \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029world" - # ).split() # ["hello", "world"] var output = List[String]() - var str_iter_len = len(self) - 1 + var str_byte_len = len(self) - 1 var lhs = 0 var rhs = 0 var items = 0 - # FIXME: this should iterate and build unicode strings - # and use self.isspace() - while lhs <= str_iter_len: + for substr in self: # Python adds all "whitespace chars" as one separator # if no separator was specified - while lhs <= str_iter_len: - if not _isspace(self._buffer.unsafe_get(lhs)[]): + while lhs <= str_byte_len: + if not substr.isspace(): break lhs += 1 # if it went until the end of the String, then # it should be sliced up until the original # start of the whitespace which was already appended - if lhs - 1 == str_iter_len: + if lhs - 1 == str_byte_len: break - elif lhs == str_iter_len: + elif lhs == str_byte_len: # if the last char is not whitespace - output.append(self[str_iter_len]) + output.append(self[str_byte_len]) break rhs = lhs + 1 - while rhs <= str_iter_len: - if _isspace(self._buffer.unsafe_get(rhs)[]): + while rhs <= str_byte_len: + if substr.isspace(): break rhs += 1 @@ -1860,9 +1848,10 @@ struct String( Returns: A copy of the string with no trailing whitespaces. """ - # TODO: should use self.__iter__ and self.isspace() var r_idx = len(self) - while r_idx > 0 and _isspace(self._buffer.unsafe_get(r_idx - 1)[]): + for s in self.__reversed__(): + if not s.isspace(): + break r_idx -= 1 return self[:r_idx] @@ -1888,9 +1877,10 @@ struct String( Returns: A copy of the string with no leading whitespaces. """ - # TODO: should use self.__iter__ and self.isspace() var l_idx = 0 - while l_idx < len(self) and _isspace(self._buffer.unsafe_get(l_idx)[]): + for s in self: + if not s.isspace(): + break l_idx += 1 return self[l_idx:] diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 6922066129..12298eb8d5 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -21,6 +21,7 @@ from utils import StringSlice """ from utils import Span +from builtin.string import _isspace struct StringSlice[ @@ -180,3 +181,44 @@ struct StringSlice[ without the string getting deallocated early. """ pass + + fn isspace(self) -> Bool: + """Determines whether the given StringSlice is a python + whitespace String. This corresponds to Python's + [universal separators]( + https://docs.python.org/3/library/stdtypes.html#str.splitlines) + `" \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029"`. + + Returns: + True if the String is one of the whitespace characters + listed above, otherwise False. + """ + # TODO add line and paragraph separator as stringliteral + # once unicode escape secuences are accepted + var next_line = List[UInt8](0xC2, 0x85) + """TODO: \\x85""" + var unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8) + """TODO: \\u2028""" + var unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9) + """TODO: \\u2029""" + + @always_inline + fn _compare( + item1: UnsafePointer[UInt8], item2: UnsafePointer[UInt8], amnt: Int + ) -> Bool: + var ptr1 = DTypePointer(item1) + var ptr2 = DTypePointer(item2) + return memcmp(ptr1, ptr2, amnt) == 0 + + var no_null_len = len(self) + var ptr = self.unsafe_ptr() + if no_null_len == 1 and not _isspace(ptr[0]): + return False + elif no_null_len == 2 and not _compare(ptr, next_line.unsafe_ptr(), 2): + return False + elif no_null_len == 3 and not ( + _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) + or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) + ): + return False + return False From 954d5ab8216f77632827969539f6e2d7b160f682 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 5 Jun 2024 16:04:08 -0400 Subject: [PATCH 02/27] remove damn install_id Signed-off-by: martinvuyk --- install_id | 1 - 1 file changed, 1 deletion(-) delete mode 100644 install_id diff --git a/install_id b/install_id deleted file mode 100644 index d9bca3b7f2..0000000000 --- a/install_id +++ /dev/null @@ -1 +0,0 @@ -vj3s8KOW8jfIJiUFVSYOasUNvswv4KZ_oz0P2TsU_FU \ No newline at end of file From 7e8a33d2c6d867849669041ff52678c7da3742d8 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 09:55:14 -0400 Subject: [PATCH 03/27] fix details Signed-off-by: martinvuyk --- .gitignore | 2 ++ stdlib/src/builtin/string.mojo | 34 ++++++++++++++++++------------ stdlib/src/utils/string_slice.mojo | 13 ++++++------ 3 files changed, 29 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 7beaae8a60..c7d479dafc 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ venv.bak/ # MacOS .DS_Store + +install_id \ No newline at end of file diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index d2e6af8048..01eff2d3d6 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1656,17 +1656,17 @@ struct String( var output = List[String]() - var str_byte_len = len(self) - 1 + var str_byte_len = self.byte_length() - 1 var lhs = 0 var rhs = 0 var items = 0 - for substr in self: + while lhs <= str_byte_len: # Python adds all "whitespace chars" as one separator # if no separator was specified - while lhs <= str_byte_len: - if not substr.isspace(): + for s in self[lhs:]: + if not s.isspace(): break - lhs += 1 + lhs += len(s) # if it went until the end of the String, then # it should be sliced up until the original # start of the whitespace which was already appended @@ -1677,10 +1677,10 @@ struct String( output.append(self[str_byte_len]) break rhs = lhs + 1 - while rhs <= str_byte_len: - if substr.isspace(): + for s in self[lhs + 1 :]: + if s.isspace(): break - rhs += 1 + rhs += len(s) if maxsplit > -1: if items == maxsplit: @@ -1834,9 +1834,12 @@ struct String( A copy of the string with no trailing whitespaces. """ var r_idx = len(self) - for s in self.__reversed__(): - if not s.isspace(): - break + # TODO: should use this once llvm intrinsics can be used at comp time + # for s in self.__reversed__(): + # if not s.isspace(): + # break + # r_idx -= 1 + while r_idx > 0 and _isspace(self._buffer.unsafe_get(r_idx - 1)[]): r_idx -= 1 return self[:r_idx] @@ -1863,9 +1866,12 @@ struct String( A copy of the string with no leading whitespaces. """ var l_idx = 0 - for s in self: - if not s.isspace(): - break + # TODO: should use this once llvm intrinsics can be used at comp time + # for s in self: + # if not s.isspace(): + # break + # l_idx += 1 + while l_idx < len(self) and _isspace(self._buffer.unsafe_get(l_idx)[]): l_idx += 1 return self[l_idx:] diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 12298eb8d5..54e7d097f0 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -212,13 +212,14 @@ struct StringSlice[ var no_null_len = len(self) var ptr = self.unsafe_ptr() - if no_null_len == 1 and not _isspace(ptr[0]): - return False - elif no_null_len == 2 and not _compare(ptr, next_line.unsafe_ptr(), 2): - return False - elif no_null_len == 3 and not ( + if no_null_len == 1 and _isspace(ptr[0]): + return True + elif no_null_len == 2 and _compare(ptr, next_line.unsafe_ptr(), 2): + return True + elif no_null_len == 3 and ( _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) ): - return False + return True + _ = next_line, unicode_line_sep, unicode_paragraph_sep return False From 944d8516aba27f487fd55cd0a1c3d46ab5109ab4 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 10:09:26 -0400 Subject: [PATCH 04/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 01eff2d3d6..81b06db539 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1236,15 +1236,14 @@ struct String( Returns: The string byte length. """ - # TODO: everything uses this method assuming it's byte length - # var unicode_length = self.byte_length() + var unicode_length = self.byte_length() + # TODO: everything uses this method assuming it's byte length # for i in range(unicode_length): # if _utf8_byte_type(self._buffer[i]) == 1: # unicode_length -= 1 - # return unicode_length - return self.byte_length() + return unicode_length @always_inline fn __str__(self) -> String: From 968a3e3d0594267145e5087606d92fdd3157489b Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 10:25:20 -0400 Subject: [PATCH 05/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 81b06db539..9ceeeb939e 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1563,7 +1563,7 @@ struct String( listed above, otherwise False. """ - if len(self) == 0: + if self.byte_length() == 0: return False for s in self: @@ -1596,11 +1596,11 @@ struct String( """ var output = List[String]() - var str_byte_len = len(self) - 1 + var str_byte_len = self.byte_length() - 1 var lhs = 0 var rhs = 0 var items = 0 - var sep_len = len(sep) + var sep_len = sep.byte_length() if sep_len == 0: raise Error("ValueError: empty separator") From 43d8b1005af6e0b8f74fcea90f19c305a1605c75 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 10:30:50 -0400 Subject: [PATCH 06/27] fix details Signed-off-by: martinvuyk --- stdlib/test/builtin/test_string.mojo | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/stdlib/test/builtin/test_string.mojo b/stdlib/test/builtin/test_string.mojo index 5eb1b077c8..73c62e0dce 100644 --- a/stdlib/test/builtin/test_string.mojo +++ b/stdlib/test/builtin/test_string.mojo @@ -628,8 +628,31 @@ fn test_split() raises: assert_true(d[0] == "hello \t" and d[1] == "" and d[2] == "\v\fworld") # Should add all whitespace-like chars as one - alias utf8_spaces = String(" \t\n\r\v\f") - var s = utf8_spaces + "hello" + utf8_spaces + "world" + utf8_spaces + # test all unicode separators + # 0 is to build a String with null terminator + alias next_line = List[UInt8](0xC2, 0x85, 0) + """TODO: \\x85""" + alias unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8, 0) + """TODO: \\u2028""" + alias unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9, 0) + """TODO: \\u2029""" + # TODO add line and paragraph separator as stringliteral once unicode + # escape secuences are accepted + var univ_sep_var = ( + String(" ") + + String("\t") + + String("\n") + + String("\r") + + String("\v") + + String("\f") + + String("\x1c") + + String("\x1d") + + String("\x1e") + + String(next_line) + + String(unicode_line_sep) + + String(unicode_paragraph_sep) + ) + var s = univ_sep_var + "hello" + univ_sep_var + "world" + univ_sep_var d = s.split() assert_true(len(d) == 2) assert_true(d[0] == "hello" and d[1] == "world") From b1f305539c8a4caba9b16947d6e89024f0877cad Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 19:04:29 -0400 Subject: [PATCH 07/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 2 +- stdlib/src/utils/string_slice.mojo | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 9ceeeb939e..832673e9c3 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1556,7 +1556,7 @@ struct String( python whitespace String. This corresponds to Python's [universal separators]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) - `" \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029"`. + `" \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. Returns: True if the whole String is made up of whitespace characters diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 54e7d097f0..9254ce63c5 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -187,7 +187,7 @@ struct StringSlice[ whitespace String. This corresponds to Python's [universal separators]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) - `" \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029"`. + `" \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. Returns: True if the String is one of the whitespace characters From 8e5c0ca9aa72aba8cdc22618349f54f096e2cc62 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 19:07:10 -0400 Subject: [PATCH 08/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 2 -- 1 file changed, 2 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 832673e9c3..76f2d42fb3 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1626,8 +1626,6 @@ struct String( fn split(self, sep: NoneType = None, maxsplit: Int = -1) -> List[String]: """Split the string by every Whitespace separator. - Currently only uses C style separators. - Args: sep: None. maxsplit: The maximum amount of items to split from String. Defaults From cbe54cdf51bd28a4effd28f0c3613dff5a4c1a92 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 19:07:45 -0400 Subject: [PATCH 09/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 76f2d42fb3..6b59db0964 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1651,8 +1651,8 @@ struct String( . """ + _ = sep var output = List[String]() - var str_byte_len = self.byte_length() - 1 var lhs = 0 var rhs = 0 From 6e7754f56351000ab21cb8af15efda204e62a5c0 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 11 Jun 2024 20:37:41 -0400 Subject: [PATCH 10/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 6b59db0964..efdbbf120b 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1645,7 +1645,7 @@ struct String( _ = String(" hello world ").split() # ["hello", "world"] # Splitting adjacent universal newlines: _ = String( - "hello \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029world" + "hello \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029world" ).split() # ["hello", "world"] ``` . From 64645eecf0a16efeab5cb61fffa7f555d94820aa Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Wed, 12 Jun 2024 11:02:19 -0400 Subject: [PATCH 11/27] fix details Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index efdbbf120b..50ad8c0c85 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1275,10 +1275,10 @@ struct String( # ===------------------------------------------------------------------=== # fn byte_length(self) -> Int: - """Returns the string byte length. + """Returns the string byte length without null terminator. Returns: - The string byte length. + The string byte length without null terminator. """ # Avoid returning -1 if the buffer is not initialized if not self.unsafe_ptr(): From 2fd95ceabe4e8cbdf03a4991835f77d946b243ca Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 2 Jul 2024 11:54:47 -0400 Subject: [PATCH 12/27] fix suggestions Signed-off-by: martinvuyk --- .gitignore | 2 -- stdlib/src/builtin/string.mojo | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index c7d479dafc..7beaae8a60 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,3 @@ venv.bak/ # MacOS .DS_Store - -install_id \ No newline at end of file diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index f42240ff89..b799c4cd6c 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1056,8 +1056,9 @@ struct String( ```mojo var string = String.format_sequence(1, ", ", 2.0, ", ", "three") - - assert_equal(string, "1, 2.0, three") + print(string) # "1, 2.0, three" + %# from testing import assert_equal + %# assert_equal(string, "1, 2.0, three") ``` . """ @@ -1780,7 +1781,6 @@ struct String( . """ - _ = sep var output = List[String]() var str_byte_len = self.byte_length() - 1 var lhs = 0 From 2cb088066a535923879aa9ef3cf07b954d6ce4dc Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 2 Jul 2024 11:58:36 -0400 Subject: [PATCH 13/27] add llvm intrinsics issue #933 Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index c690f79814..a4f80f725d 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1981,7 +1981,7 @@ struct String( A copy of the string with no trailing whitespaces. """ var r_idx = len(self) - # TODO: should use this once llvm intrinsics can be used at comp time + # TODO (#933): should use this once llvm intrinsics can be used at comp time # for s in self.__reversed__(): # if not s.isspace(): # break @@ -2013,7 +2013,7 @@ struct String( A copy of the string with no leading whitespaces. """ var l_idx = 0 - # TODO: should use this once llvm intrinsics can be used at comp time + # TODO (#933): should use this once llvm intrinsics can be used at comp time # for s in self: # if not s.isspace(): # break From 051e6ebcfc64fb67f19684a358622834533c16d8 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 2 Jul 2024 12:04:46 -0400 Subject: [PATCH 14/27] fix detail Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 2 -- 1 file changed, 2 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index a4f80f725d..e374a130b6 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1074,8 +1074,6 @@ struct String( Construct a String from several `Formattable` arguments: ```mojo - from testing import assert_equal - var string = String.format_sequence(1, ", ", 2.0, ", ", "three") print(string) # "1, 2.0, three" %# from testing import assert_equal From b32daf3e4616e451099c2c166b240d5add96419c Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 14:46:37 -0400 Subject: [PATCH 15/27] move isspace and stringiter impl to stringslice Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 136 ++++----------------- stdlib/src/builtin/string_literal.mojo | 9 +- stdlib/src/utils/string_slice.mojo | 163 ++++++++++++++++++++----- 3 files changed, 157 insertions(+), 151 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index e374a130b6..77be761554 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -25,6 +25,7 @@ from memory import DTypePointer, LegacyPointer, UnsafePointer, memcmp, memcpy from utils import Span, StaticIntTuple, StringRef, StringSlice from utils._format import Formattable, Formatter, ToFormatter +from utils.string_slice import _StringSliceIter # ===----------------------------------------------------------------------=== # # ord @@ -69,11 +70,11 @@ fn ord(s: StringSlice) -> Int: var p = s.unsafe_ptr().bitcast[UInt8]() var b1 = p[] if (b1 >> 7) == 0: # This is 1 byte ASCII char - debug_assert(s._byte_length() == 1, "input string length must be 1") + debug_assert(s.byte_length() == 1, "input string length must be 1") return int(b1) var num_bytes = countl_zero(~b1) debug_assert( - s._byte_length() == int(num_bytes), "input string must be one character" + s.byte_length() == int(num_bytes), "input string must be one character" ) debug_assert( 1 < int(num_bytes) < 5, "invalid UTF-8 byte " + str(b1) + " at index 0" @@ -782,76 +783,6 @@ fn _utf8_byte_type(b: UInt8) -> UInt8: return countl_zero(~(b & 0b1111_0000)) -@value -struct _StringIter[ - is_mutable: Bool, //, - lifetime: AnyLifetime[is_mutable].type, - forward: Bool = True, -]: - """Iterator for String. - - Parameters: - is_mutable: Whether the slice is mutable. - lifetime: The lifetime of the underlying string data. - forward: The iteration direction. `False` is backwards. - """ - - var index: Int - var continuation_bytes: Int - var ptr: UnsafePointer[UInt8] - var length: Int - - fn __init__( - inout self, *, unsafe_pointer: UnsafePointer[UInt8], length: Int - ): - self.index = 0 if forward else length - self.ptr = unsafe_pointer - self.length = length - self.continuation_bytes = 0 - for i in range(length): - if _utf8_byte_type(unsafe_pointer[i]) == 1: - self.continuation_bytes += 1 - - fn __iter__(self) -> Self: - return self - - fn __next__(inout self) -> StringSlice[lifetime]: - @parameter - if forward: - var byte_len = 1 - if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(self.ptr[self.index]) - if byte_type != 0: - byte_len = int(byte_type) - self.continuation_bytes -= byte_len - 1 - self.index += byte_len - return StringSlice[lifetime]( - unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), - len=byte_len, - ) - else: - var byte_len = 1 - if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(self.ptr[self.index - 1]) - if byte_type != 0: - while byte_type == 1: - byte_len += 1 - var b = self.ptr[self.index - byte_len] - byte_type = _utf8_byte_type(b) - self.continuation_bytes -= byte_len - 1 - self.index -= byte_len - return StringSlice[lifetime]( - unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len - ) - - fn __len__(self) -> Int: - @parameter - if forward: - return self.length - self.index - self.continuation_bytes - else: - return self.index - self.continuation_bytes - - struct String( Sized, Stringable, @@ -1314,23 +1245,25 @@ struct String( count=other_len + 1, ) - fn __iter__(ref [_]self) -> _StringIter[__lifetime_of(self)]: + fn __iter__(ref [_]self) -> _StringSliceIter[__lifetime_of(self)]: """Iterate over elements of the string, returning immutable references. Returns: An iterator of references to the string elements. """ - return _StringIter[__lifetime_of(self)]( + return _StringSliceIter[__lifetime_of(self)]( unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() ) - fn __reversed__(ref [_]self) -> _StringIter[__lifetime_of(self), False]: + fn __reversed__( + ref [_]self, + ) -> _StringSliceIter[__lifetime_of(self), False]: """Iterate backwards over the string, returning immutable references. Returns: A reversed iterator of references to the string elements. """ - return _StringIter[__lifetime_of(self), forward=False]( + return _StringSliceIter[__lifetime_of(self), forward=False]( unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() ) @@ -1400,19 +1333,6 @@ struct String( # Methods # ===------------------------------------------------------------------=== # - fn byte_length(self) -> Int: - """Returns the string byte length without null terminator. - - Returns: - The string byte length without null terminator. - """ - # Avoid returning -1 if the buffer is not initialized - if not self.unsafe_ptr(): - return 0 - - # The negative 1 is to account for the terminator. - return len(self._buffer) - 1 - fn format_to(self, inout writer: Formatter): """ Formats this string to the provided formatter. @@ -1572,19 +1492,17 @@ struct String( @always_inline fn as_bytes_slice(ref [_]self) -> Span[UInt8, __lifetime_of(self)]: - """ - Returns a contiguous slice of the bytes owned by this string. - - This does not include the trailing null terminator. - + """Returns a contiguous slice of the bytes owned by this string. Returns: A contiguous slice pointing to the bytes owned by this string. + + Notes: + This does not include the trailing null terminator. """ + # Does NOT include the NUL terminator. return Span[UInt8, __lifetime_of(self)]( - unsafe_ptr=self._buffer.unsafe_ptr(), - # Does NOT include the NUL terminator. - len=self._byte_length(), + unsafe_ptr=self._buffer.unsafe_ptr(), len=self.byte_length() ) @always_inline @@ -1599,21 +1517,16 @@ struct String( # guaranteed to be valid. return StringSlice(unsafe_from_utf8=self.as_bytes_slice()) - fn _byte_length(self) -> Int: + fn byte_length(self) -> Int: """Get the string length in bytes. - This does not include the trailing null terminator in the count. - Returns: The length of this string in bytes, excluding null terminator. - """ - - var buffer_len = len(self._buffer) - if buffer_len > 0: - return buffer_len - 1 - else: - return buffer_len + Notes: + This does not include the trailing null terminator in the count. + """ + return max(len(self._buffer) - 1, 0) fn _steal_ptr(inout self) -> UnsafePointer[UInt8]: """Transfer ownership of pointer to the underlying memory. @@ -1711,14 +1624,7 @@ struct String( True if the whole String is made up of whitespace characters listed above, otherwise False. """ - - if self.byte_length() == 0: - return False - - for s in self: - if not s.isspace(): - return False - return True + return self.as_string_slice().isspace() fn split(self, sep: String, maxsplit: Int = -1) raises -> List[String]: """Split the string by a separator. diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index c3d0ab1128..3f0f587a2d 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -191,7 +191,7 @@ struct StringLiteral( # TODO(MSTDL-160): # Properly count Unicode codepoints instead of returning this length # in bytes. - return self._byte_length() + return self.byte_length() @always_inline("nodebug") fn __bool__(self) -> Bool: @@ -262,11 +262,14 @@ struct StringLiteral( # ===-------------------------------------------------------------------===# @always_inline - fn _byte_length(self) -> Int: + fn byte_length(self) -> Int: """Get the string length in bytes. Returns: The length of this StringLiteral in bytes. + + Notes: + This does not include the trailing null terminator in the count. """ return __mlir_op.`pop.string.size`(self.value) @@ -333,7 +336,7 @@ struct StringLiteral( return Span[UInt8, ImmutableStaticLifetime]( unsafe_ptr=ptr, - len=self._byte_length(), + len=self.byte_length(), ) fn format_to(self, inout writer: Formatter): diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 0357fa0ac9..90ef29eceb 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -21,12 +21,82 @@ from utils import StringSlice """ from utils import Span -from builtin.string import _isspace +from builtin.string import _isspace, _utf8_byte_type alias StaticString = StringSlice[ImmutableStaticLifetime] """An immutable static string slice.""" +@value +struct _StringSliceIter[ + is_mutable: Bool, //, + lifetime: AnyLifetime[is_mutable].type, + forward: Bool = True, +]: + """Iterator for String. + + Parameters: + is_mutable: Whether the slice is mutable. + lifetime: The lifetime of the underlying string data. + forward: The iteration direction. `False` is backwards. + """ + + var index: Int + var continuation_bytes: Int + var ptr: UnsafePointer[UInt8] + var length: Int + + fn __init__( + inout self, *, unsafe_pointer: UnsafePointer[UInt8], length: Int + ): + self.index = 0 if forward else length + self.ptr = unsafe_pointer + self.length = length + self.continuation_bytes = 0 + for i in range(length): + if _utf8_byte_type(unsafe_pointer[i]) == 1: + self.continuation_bytes += 1 + + fn __iter__(self) -> Self: + return self + + fn __next__(inout self) -> StringSlice[lifetime]: + @parameter + if forward: + var byte_len = 1 + if self.continuation_bytes > 0: + var byte_type = _utf8_byte_type(self.ptr[self.index]) + if byte_type != 0: + byte_len = int(byte_type) + self.continuation_bytes -= byte_len - 1 + self.index += byte_len + return StringSlice[lifetime]( + unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), + len=byte_len, + ) + else: + var byte_len = 1 + if self.continuation_bytes > 0: + var byte_type = _utf8_byte_type(self.ptr[self.index - 1]) + if byte_type != 0: + while byte_type == 1: + byte_len += 1 + var b = self.ptr[self.index - byte_len] + byte_type = _utf8_byte_type(b) + self.continuation_bytes -= byte_len - 1 + self.index -= byte_len + return StringSlice[lifetime]( + unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len + ) + + fn __len__(self) -> Int: + @parameter + if forward: + return self.length - self.index - self.continuation_bytes + else: + return self.index - self.continuation_bytes + + struct StringSlice[ is_mutable: Bool, //, lifetime: AnyLifetime[is_mutable].type, @@ -69,8 +139,7 @@ struct StringSlice[ # FIXME(MSTDL-160): # Ensure StringLiteral _actually_ always uses UTF-8 encoding. self = StringSlice[lifetime]( - unsafe_from_utf8_ptr=literal.unsafe_ptr(), - len=literal._byte_length(), + unsafe_from_utf8_ptr=literal.unsafe_ptr(), len=literal.byte_length() ) @always_inline @@ -156,9 +225,13 @@ struct StringSlice[ Returns: The length in Unicode codepoints. """ - # FIXME(MSTDL-160): - # Actually perform UTF-8 decoding here to count the codepoints. - return len(self._slice) + var unicode_length = self.byte_length() + + for i in range(unicode_length): + if _utf8_byte_type(self._slice[i]) == 1: + unicode_length -= 1 + + return unicode_length fn format_to(self, inout writer: Formatter): """ @@ -258,14 +331,35 @@ struct StringSlice[ """ return not self == rhs + fn __iter__(ref [_]self) -> _StringSliceIter[__lifetime_of(self)]: + """Iterate over elements of the string, returning immutable references. + + Returns: + An iterator of references to the string elements. + """ + return _StringSliceIter[__lifetime_of(self)]( + unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() + ) + + fn __reversed__( + ref [_]self, + ) -> _StringSliceIter[__lifetime_of(self), False]: + """Iterate backwards over the string, returning immutable references. + + Returns: + A reversed iterator of references to the string elements. + """ + return _StringSliceIter[__lifetime_of(self), forward=False]( + unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() + ) + # ===------------------------------------------------------------------===# # Methods # ===------------------------------------------------------------------===# @always_inline fn as_bytes_slice(self) -> Span[UInt8, lifetime]: - """ - Get the sequence of encoded bytes as a slice of the underlying string. + """Get the sequence of encoded bytes as a slice of the underlying string. Returns: A slice containing the underlying sequence of encoded bytes. @@ -274,8 +368,7 @@ struct StringSlice[ @always_inline fn unsafe_ptr(self) -> UnsafePointer[UInt8]: - """ - Gets a pointer to the first element of this string slice. + """Gets a pointer to the first element of this string slice. Returns: A pointer pointing at the first element of this string slice. @@ -284,9 +377,8 @@ struct StringSlice[ return self._slice.unsafe_ptr() @always_inline - fn _byte_length(self) -> Int: - """ - Get the length of this string slice in bytes. + fn byte_length(self) -> Int: + """Get the length of this string slice in bytes. Returns: The length of this string slice in bytes. @@ -295,8 +387,7 @@ struct StringSlice[ return len(self.as_bytes_slice()) fn _strref_dangerous(self) -> StringRef: - """ - Returns an inner pointer to the string as a StringRef. + """Returns an inner pointer to the string as a StringRef. Safety: This functionality is extremely dangerous because Mojo eagerly @@ -304,27 +395,30 @@ struct StringSlice[ _strref_keepalive() method to keep the underlying string alive long enough. """ - return StringRef(self.unsafe_ptr(), self._byte_length()) + return StringRef(self.unsafe_ptr(), self.byte_length()) fn _strref_keepalive(self): - """ - A no-op that keeps `self` alive through the call. This + """A no-op that keeps `self` alive through the call. This can be carefully used with `_strref_dangerous()` to wield inner pointers without the string getting deallocated early. """ pass fn isspace(self) -> Bool: - """Determines whether the given StringSlice is a python - whitespace String. This corresponds to Python's + """Determines whether every character in the given StringSlice is a + python whitespace String. This corresponds to Python's [universal separators]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) `" \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. Returns: - True if the String is one of the whitespace characters + True if the whole StringSlice is made up of whitespace characters listed above, otherwise False. """ + + if self.byte_length() == 0: + return False + # TODO add line and paragraph separator as stringliteral # once unicode escape secuences are accepted var next_line = List[UInt8](0xC2, 0x85) @@ -342,16 +436,19 @@ struct StringSlice[ var ptr2 = DTypePointer(item2) return memcmp(ptr1, ptr2, amnt) == 0 - var no_null_len = len(self) - var ptr = self.unsafe_ptr() - if no_null_len == 1 and _isspace(ptr[0]): - return True - elif no_null_len == 2 and _compare(ptr, next_line.unsafe_ptr(), 2): - return True - elif no_null_len == 3 and ( - _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) - or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) - ): - return True + for s in self: + var no_null_len = s.byte_length() + var ptr = s.unsafe_ptr() + if no_null_len == 1 and _isspace(ptr[0]): + continue + elif no_null_len == 2 and _compare(ptr, next_line.unsafe_ptr(), 2): + continue + elif no_null_len == 3 and ( + _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) + or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) + ): + continue + else: + return False _ = next_line, unicode_line_sep, unicode_paragraph_sep - return False + return True From 056902551f55affbdf183643631cf98ca249ac74 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 14:58:09 -0400 Subject: [PATCH 16/27] fix byte_length usage Signed-off-by: martinvuyk --- stdlib/src/builtin/io.mojo | 4 ++-- stdlib/src/builtin/string.mojo | 12 +++++++++--- stdlib/src/builtin/string_literal.mojo | 2 +- stdlib/src/sys/ffi.mojo | 2 +- stdlib/src/utils/inline_string.mojo | 10 +++++----- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/stdlib/src/builtin/io.mojo b/stdlib/src/builtin/io.mojo index 788253ad03..a6f445a6ff 100644 --- a/stdlib/src/builtin/io.mojo +++ b/stdlib/src/builtin/io.mojo @@ -320,7 +320,7 @@ fn _put(x: DType, file: FileDescriptor = stdout): @no_inline fn _put(x: StringSlice, file: FileDescriptor = stdout): # Avoid printing "(null)" for an empty/default constructed `String` - var str_len = x._byte_length() + var str_len = x.byte_length() if not str_len: return @@ -341,7 +341,7 @@ fn _put(x: StringSlice, file: FileDescriptor = stdout): # The string can be printed, so that's fine. if str_len < MAX_STR_LEN: - _printf["%.*s"](x._byte_length(), x.unsafe_ptr(), file=file) + _printf["%.*s"](x.byte_length(), x.unsafe_ptr(), file=file) return # The string is large, then we need to chunk it. diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index d65c95957a..7b7e6e571e 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1213,11 +1213,17 @@ struct String( """ return self.byte_length() > 0 + @deprecated( + "string length, in bytes (for now) PREFER: String.byte_length(), a" + " future version will make this method return Unicode codepoints." + ) fn __len__(self) -> Int: - """Gets the string length, in bytes. + """Gets the string length, in bytes (for now) PREFER: + String.byte_length(), a future version will make this method return + Unicode codepoints. Returns: - The string length, in bytes. + The string length, in bytes (for now). """ var unicode_length = self.byte_length() @@ -2136,7 +2142,7 @@ struct String( return _is_ascii_uppercase(c) or _is_ascii_lowercase(c) for c in self: - debug_assert(c._byte_length() == 1, "only implemented for ASCII") + debug_assert(c.byte_length() == 1, "only implemented for ASCII") if is_ascii_cased(ord(c)): @parameter diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index 1085db984b..0dcdb0d4c9 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -221,7 +221,7 @@ struct StringLiteral( A new string. """ var string = String() - var length = self._byte_length() + var length = self.byte_length() var buffer = String._buffer_type() var new_capacity = length + 1 buffer._realloc(new_capacity) diff --git a/stdlib/src/sys/ffi.mojo b/stdlib/src/sys/ffi.mojo index 3c65863f0d..fd2d3b8d50 100644 --- a/stdlib/src/sys/ffi.mojo +++ b/stdlib/src/sys/ffi.mojo @@ -231,7 +231,7 @@ fn _get_global[ fn _get_global_or_null[name: StringLiteral]() -> UnsafePointer[NoneType]: return external_call[ "KGEN_CompilerRT_GetGlobalOrNull", UnsafePointer[NoneType] - ](name.unsafe_ptr(), name._byte_length()) + ](name.unsafe_ptr(), name.byte_length()) @always_inline diff --git a/stdlib/src/utils/inline_string.mojo b/stdlib/src/utils/inline_string.mojo index d5481cd223..db93624194 100644 --- a/stdlib/src/utils/inline_string.mojo +++ b/stdlib/src/utils/inline_string.mojo @@ -123,7 +123,7 @@ struct InlineString(Sized, Stringable, CollectionElement, CollectionElementNew): Args: str_slice: The string to append. """ - var total_len = len(self) + str_slice._byte_length() + var total_len = len(self) + str_slice.byte_length() # NOTE: Not guaranteed that we're in the small layout even if our # length is shorter than the small capacity. @@ -157,7 +157,7 @@ struct InlineString(Sized, Stringable, CollectionElement, CollectionElementNew): memcpy( dest=buffer.unsafe_ptr() + len(self), src=str_slice.unsafe_ptr(), - count=str_slice._byte_length(), + count=str_slice.byte_length(), ) # Record that we've initialized `total_len` count of elements @@ -441,14 +441,14 @@ struct _FixedString[CAP: Int]( inout self, str_slice: StringSlice[_], ) -> Optional[Error]: - var total_len = len(self) + str_slice._byte_length() + var total_len = len(self) + str_slice.byte_length() # Ensure there is sufficient capacity to append `str_slice` if total_len > CAP: return Optional( Error( "Insufficient capacity to append len=" - + str(str_slice._byte_length()) + + str(str_slice.byte_length()) + " string to len=" + str(len(self)) + " FixedString with capacity=" @@ -460,7 +460,7 @@ struct _FixedString[CAP: Int]( memcpy( dest=self.buffer.unsafe_ptr() + len(self), src=str_slice.unsafe_ptr(), - count=str_slice._byte_length(), + count=str_slice.byte_length(), ) self.size = total_len From 1a91e5f51e85ddf2234daf4124afe9f7acb66fd4 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 15:12:24 -0400 Subject: [PATCH 17/27] fix byte_length usage in tests Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 4 ++-- stdlib/test/builtin/test_string.mojo | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 7b7e6e571e..71123005a8 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1662,7 +1662,7 @@ struct String( for s in self[lhs:]: if not s.isspace(): break - lhs += len(s) + lhs += s.byte_length() # if it went until the end of the String, then # it should be sliced up until the original # start of the whitespace which was already appended @@ -1676,7 +1676,7 @@ struct String( for s in self[lhs + 1 :]: if s.isspace(): break - rhs += len(s) + rhs += s.byte_length() if maxsplit > -1: if items == maxsplit: diff --git a/stdlib/test/builtin/test_string.mojo b/stdlib/test/builtin/test_string.mojo index 6932cbd39c..2007ee6f75 100644 --- a/stdlib/test/builtin/test_string.mojo +++ b/stdlib/test/builtin/test_string.mojo @@ -1274,7 +1274,7 @@ def test_string_iter(): var utf8_sequence_len = 0 var byte_idx = 0 for v in item: - var byte_len = len(v) + var byte_len = v.byte_length() assert_equal(item[byte_idx : byte_idx + byte_len], v) byte_idx += byte_len utf8_sequence_len += 1 From 01e54502cc241644b004d4805bdf8feaa9e3d245 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 15:17:58 -0400 Subject: [PATCH 18/27] remove deprecation warning Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 4 ---- 1 file changed, 4 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 71123005a8..6e0f143652 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1213,10 +1213,6 @@ struct String( """ return self.byte_length() > 0 - @deprecated( - "string length, in bytes (for now) PREFER: String.byte_length(), a" - " future version will make this method return Unicode codepoints." - ) fn __len__(self) -> Int: """Gets the string length, in bytes (for now) PREFER: String.byte_length(), a future version will make this method return From cbebc06d8678e03b7ffec4c96a5474104eb79bd1 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 15:27:18 -0400 Subject: [PATCH 19/27] add deprecation warning, fix docstring Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 6e0f143652..4cc4068674 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1213,6 +1213,10 @@ struct String( """ return self.byte_length() > 0 + @deprecated( + "A future version will make this method return Unicode codepoints " + "PREFER: String.byte_length()" + ) fn __len__(self) -> Int: """Gets the string length, in bytes (for now) PREFER: String.byte_length(), a future version will make this method return @@ -1434,6 +1438,7 @@ struct String( @always_inline fn as_bytes_slice(ref [_]self) -> Span[UInt8, __lifetime_of(self)]: """Returns a contiguous slice of the bytes owned by this string. + Returns: A contiguous slice pointing to the bytes owned by this string. From 05d33ac96757b5a583603c678cf6b140f77cae5d Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 16:14:55 -0400 Subject: [PATCH 20/27] fix uses of String.__len__ Signed-off-by: martinvuyk --- stdlib/src/base64/base64.mojo | 8 +-- stdlib/src/builtin/error.mojo | 2 +- stdlib/src/builtin/file.mojo | 8 +-- stdlib/src/builtin/string.mojo | 96 ++++++++++++++++++++----------- stdlib/src/tempfile/tempfile.mojo | 4 +- 5 files changed, 75 insertions(+), 43 deletions(-) diff --git a/stdlib/src/base64/base64.mojo b/stdlib/src/base64/base64.mojo index d042c27e21..62e021d38f 100644 --- a/stdlib/src/base64/base64.mojo +++ b/stdlib/src/base64/base64.mojo @@ -72,7 +72,7 @@ fn b64encode(str: String) -> String: alias lookup = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" var b64chars = lookup.unsafe_ptr() - var length = len(str) + var length = str.byte_length() var out = String._buffer_type(capacity=length + 1) @parameter @@ -121,7 +121,7 @@ fn b64decode(str: String) -> String: Returns: The decoded string. """ - var n = len(str) + var n = str.byte_length() debug_assert(n % 4 == 0, "Input length must be divisible by 4") var p = String._buffer_type(capacity=n + 1) @@ -170,7 +170,7 @@ fn b16encode(str: String) -> String: alias lookup = "0123456789ABCDEF" var b16chars = lookup.unsafe_ptr() - var length = len(str) + var length = str.byte_length() var out = List[UInt8](capacity=length * 2 + 1) @parameter @@ -221,7 +221,7 @@ fn b16decode(str: String) -> String: return -1 - var n = len(str) + var n = str.byte_length() debug_assert(n % 2 == 0, "Input length must be divisible by 2") var p = List[UInt8](capacity=n // 2 + 1) diff --git a/stdlib/src/builtin/error.mojo b/stdlib/src/builtin/error.mojo index 59b05566bf..ba8fd223b5 100644 --- a/stdlib/src/builtin/error.mojo +++ b/stdlib/src/builtin/error.mojo @@ -80,7 +80,7 @@ struct Error( Returns: The constructed Error object. """ - var length = len(src) + var length = src.byte_length() var dest = UnsafePointer[UInt8].alloc(length + 1) memcpy( dest=dest, diff --git a/stdlib/src/builtin/file.mojo b/stdlib/src/builtin/file.mojo index 143ba314df..faa7e73653 100644 --- a/stdlib/src/builtin/file.mojo +++ b/stdlib/src/builtin/file.mojo @@ -239,7 +239,7 @@ struct FileHandle: var bytes = file.read(ptr, 8) print("bytes read", bytes) - var first_element = ptr.load(0) + var first_element = ptr[0] print(first_element) # Skip 2 elements @@ -374,7 +374,7 @@ struct FileHandle: ```mojo import os var f = open("/tmp/example.txt", "r") - f.seek(os.SEEK_CUR, 32) + _ = f.seek(32, os.SEEK_CUR) ``` Start from 32 bytes from the end of the file: @@ -382,7 +382,7 @@ struct FileHandle: ```mojo import os var f = open("/tmp/example.txt", "r") - f.seek(os.SEEK_END, -32) + _ = f.seek(-32, os.SEEK_END) ``` . """ @@ -409,7 +409,7 @@ struct FileHandle: Args: data: The data to write to the file. """ - self._write(data.unsafe_ptr(), len(data)) + self._write(data.unsafe_ptr(), data.byte_length()) fn write(self, data: Span[UInt8, _]) raises: """Write a borrowed sequence of data to the file. diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 4cc4068674..d630cbdd33 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1006,6 +1006,17 @@ struct String( Returns: A new string containing the character at the specified position. """ + # FIXME: this should work with unicode codepoints, but this doesn't yet + # work at compile time because of issue #933 + # var i = 0 + # var buf = Self._buffer_type(capacity=5) + # for s in self: + # if i != idx: + # continue + # for j in range(s.byte_length()): + # buf[j] = self._buffer[i + j] + # buf.append(0) + # buf.resize(len(buf)) var normalized_idx = normalize_index["String"](idx, self) var buf = Self._buffer_type(capacity=1) buf.append(self._buffer[normalized_idx]) @@ -1024,13 +1035,27 @@ struct String( var start: Int var end: Int var step: Int - start, end, step = span.indices(len(self)) + # FIXME: this should work with unicode codepoints, but this doesn't yet + # work at compile time because of issue #933 + # var idx = 0 + # var start = span.start.value() if span.start else 0 + # var end = span.end.value() if span.end else 0 + # var buffer = Self._buffer_type() + # for s in self: + # var amnt_bytes = s.byte_length() + # if not (start <= idx < end) or idx % span.step != 0: + # idx += amnt_bytes + # continue + # for i in range(amnt_bytes): + # buffer[idx + i] = self._buffer[idx + i] + # buffer.append(0) + # buffer.resize(len(buffer)) + # return Self(buffer^) + + start, end, step = span.indices(self.byte_length()) var r = range(start, end, step) if step == 1: - return StringRef( - self._buffer.data + start, - len(r), - ) + return StringRef(self._buffer.data + start, len(r)) var buffer = Self._buffer_type() var result_len = len(r) @@ -1127,8 +1152,8 @@ struct String( return other if not other: return self - var self_len = len(self) - var other_len = len(other) + var self_len = self.byte_length() + var other_len = other.byte_length() var total_len = self_len + other_len var buffer = Self._buffer_type() buffer.resize(total_len + 1, 0) @@ -1167,8 +1192,8 @@ struct String( return if not other: return - var self_len = len(self) - var other_len = len(other) + var self_len = self.byte_length() + var other_len = other.byte_length() var total_len = self_len + other_len self._buffer.resize(total_len + 1, 0) # Copy the data alongside the terminator. @@ -1387,7 +1412,7 @@ struct String( strings. Using this requires the use of the _strref_keepalive() method to keep the underlying string alive long enough. """ - return StringRef(self.unsafe_ptr(), len(self)) + return StringRef(self.unsafe_ptr(), self.byte_length()) fn _strref_keepalive(self): """ @@ -1512,7 +1537,7 @@ struct String( break res += 1 - offset = pos + len(substr) + offset = pos + substr.byte_length() return res @@ -1700,7 +1725,7 @@ struct String( A List of Strings containing the input split by line boundaries. """ var output = List[String]() - var length = len(self) + var length = self.byte_length() var current_offset = 0 while current_offset < length: @@ -1751,9 +1776,9 @@ struct String( var self_ptr = self.unsafe_ptr() var new_ptr = new.unsafe_ptr() - var self_len = len(self) - var old_len = len(old) - var new_len = len(new) + var self_len = self.byte_length() + var old_len = old.byte_length() + var new_len = new.byte_length() var res = List[UInt8]() res.reserve(self_len + (old_len - new_len) * occurrences + 1) @@ -1818,7 +1843,7 @@ struct String( A copy of the string with no trailing characters. """ - var r_idx = len(self) + var r_idx = self.byte_length() while r_idx > 0 and self[r_idx - 1] in chars: r_idx -= 1 @@ -1830,7 +1855,7 @@ struct String( Returns: A copy of the string with no trailing whitespaces. """ - var r_idx = len(self) + var r_idx = self.byte_length() # TODO (#933): should use this once llvm intrinsics can be used at comp time # for s in self.__reversed__(): # if not s.isspace(): @@ -1851,7 +1876,7 @@ struct String( """ var l_idx = 0 - while l_idx < len(self) and self[l_idx] in chars: + while l_idx < self.byte_length() and self[l_idx] in chars: l_idx += 1 return self[l_idx:] @@ -1868,7 +1893,9 @@ struct String( # if not s.isspace(): # break # l_idx += 1 - while l_idx < len(self) and _isspace(self._buffer.unsafe_get(l_idx)): + while l_idx < self.byte_length() and _isspace( + self._buffer.unsafe_get(l_idx) + ): l_idx += 1 return self[l_idx:] @@ -1886,9 +1913,9 @@ struct String( var res = List[UInt8]() var val_ptr = val.unsafe_ptr() var self_ptr = self.unsafe_ptr() - res.reserve(len(val) * len(self) + 1) - for i in range(len(self)): - for j in range(len(val)): + res.reserve(val.byte_length() * self.byte_length() + 1) + for i in range(self.byte_length()): + for j in range(val.byte_length()): res.append(val_ptr[j]) res.append(self_ptr[i]) res.append(0) @@ -1925,7 +1952,7 @@ struct String( var char_ptr = copy.unsafe_ptr() - for i in range(len(self)): + for i in range(self.byte_length()): var char: UInt8 = char_ptr[i] if check_case(char): var lower = _toggle_ascii_case(char) @@ -1947,7 +1974,7 @@ struct String( """ if end == -1: return StringRef( - self.unsafe_ptr() + start, len(self) - start + self.unsafe_ptr() + start, self.byte_length() - start ).startswith(prefix._strref_dangerous()) return StringRef(self.unsafe_ptr() + start, end - start).startswith( @@ -1968,7 +1995,7 @@ struct String( """ if end == -1: return StringRef( - self.unsafe_ptr() + start, len(self) - start + self.unsafe_ptr() + start, self.byte_length() - start ).endswith(suffix._strref_dangerous()) return StringRef(self.unsafe_ptr() + start, end - start).endswith( @@ -1995,7 +2022,7 @@ struct String( or a copy of the original string otherwise. """ if self.startswith(prefix): - return self[len(prefix) :] + return self[prefix.byte_length() :] return self fn removesuffix(self, suffix: String, /) -> String: @@ -2018,7 +2045,7 @@ struct String( or a copy of the original string otherwise. """ if suffix and self.endswith(suffix): - return self[: -len(suffix)] + return self[: -suffix.byte_length()] return self fn __int__(self) raises -> Int: @@ -2044,7 +2071,7 @@ struct String( """ if n <= 0: return "" - var len_self = len(self) + var len_self = self.byte_length() var count = len_self * n + 1 var buf = Self._buffer_type(capacity=count) buf.resize(count, 0) @@ -2097,7 +2124,10 @@ struct String( var current_automatic_arg_index = 0 for e in entries: - debug_assert(pos_in_self < len(self), "pos_in_self >= len(self)") + debug_assert( + pos_in_self < self.byte_length(), + "pos_in_self >= self.byte_length()", + ) res += self[pos_in_self : e[].first_curly] if e[].is_escaped_brace(): @@ -2120,8 +2150,8 @@ struct String( pos_in_self = e[].last_curly + 1 - if pos_in_self < len(self): - res += self[pos_in_self : len(self)] + if pos_in_self < self.byte_length(): + res += self[pos_in_self : self.byte_length()] return res^ @@ -2396,7 +2426,7 @@ struct _FormatCurlyEntry(CollectionElement, CollectionElementNew): var entries = List[Self]() var start = Optional[Int](None) var skip_next = False - for i in range(len(format_src)): + for i in range(format_src.byte_length()): if skip_next: skip_next = False continue @@ -2453,7 +2483,7 @@ struct _FormatCurlyEntry(CollectionElement, CollectionElementNew): start = None else: # python escapes double curlies - if (i + 1) < len(format_src): + if (i + 1) < format_src.byte_length(): if format_src[i + 1] == "}": var curren_entry = Self( first_curly=i, last_curly=i + 1, field=True diff --git a/stdlib/src/tempfile/tempfile.mojo b/stdlib/src/tempfile/tempfile.mojo index 9864ba1b5f..bd3c9d69f2 100644 --- a/stdlib/src/tempfile/tempfile.mojo +++ b/stdlib/src/tempfile/tempfile.mojo @@ -31,7 +31,9 @@ fn _get_random_name(size: Int = 8) -> String: alias characters = String("abcdefghijklmnopqrstuvwxyz0123456789_") var name_list = List[UInt8](capacity=size + 1) for _ in range(size): - var rand_index = int(random.random_ui64(0, len(characters) - 1)) + var rand_index = int( + random.random_ui64(0, characters.byte_length() - 1) + ) name_list.append(ord(characters[rand_index])) name_list.append(0) return String(name_list^) From b10fbc128d7eb8515e61ea775addd6e6a14d7ad3 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 16:27:49 -0400 Subject: [PATCH 21/27] fix uses of String.__len__ and remove deprecation warning Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 4 ---- stdlib/src/pathlib/path.mojo | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index d630cbdd33..909e8c6475 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1238,10 +1238,6 @@ struct String( """ return self.byte_length() > 0 - @deprecated( - "A future version will make this method return Unicode codepoints " - "PREFER: String.byte_length()" - ) fn __len__(self) -> Int: """Gets the string length, in bytes (for now) PREFER: String.byte_length(), a future version will make this method return diff --git a/stdlib/src/pathlib/path.mojo b/stdlib/src/pathlib/path.mojo index 9d67ddf916..43976519c7 100644 --- a/stdlib/src/pathlib/path.mojo +++ b/stdlib/src/pathlib/path.mojo @@ -162,7 +162,7 @@ struct Path( Returns: True if the path length is greater than zero, and False otherwise. """ - return len(self.path) > 0 + return self.path.byte_length() > 0 fn format_to(self, inout writer: Formatter): """ From d579222f19ce89dc2886e83899998049a1e62f38 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 17:48:18 -0400 Subject: [PATCH 22/27] add _byte_length with deprecation warning Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 14 ++++++++++++++ stdlib/src/builtin/string_literal.mojo | 13 +++++++++++++ stdlib/src/utils/string_slice.mojo | 11 +++++++++++ 3 files changed, 38 insertions(+) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 909e8c6475..d96c7b5a11 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1484,6 +1484,7 @@ struct String( # guaranteed to be valid. return StringSlice(unsafe_from_utf8=self.as_bytes_slice()) + @always_inline fn byte_length(self) -> Int: """Get the string length in bytes. @@ -1495,6 +1496,19 @@ struct String( """ return max(len(self._buffer) - 1, 0) + @always_inline + @deprecated("use byte_length() instead") + fn _byte_length(self) -> Int: + """Get the string length in bytes. + + Returns: + The length of this string in bytes, excluding null terminator. + + Notes: + This does not include the trailing null terminator in the count. + """ + return max(len(self._buffer) - 1, 0) + fn _steal_ptr(inout self) -> UnsafePointer[UInt8]: """Transfer ownership of pointer to the underlying memory. The caller is responsible for freeing up the memory. diff --git a/stdlib/src/builtin/string_literal.mojo b/stdlib/src/builtin/string_literal.mojo index 0dcdb0d4c9..650f075b2e 100644 --- a/stdlib/src/builtin/string_literal.mojo +++ b/stdlib/src/builtin/string_literal.mojo @@ -276,6 +276,19 @@ struct StringLiteral( """ return __mlir_op.`pop.string.size`(self.value) + @always_inline + @deprecated("use byte_length() instead") + fn _byte_length(self) -> Int: + """Get the string length in bytes. + + Returns: + The length of this StringLiteral in bytes. + + Notes: + This does not include the trailing null terminator in the count. + """ + return __mlir_op.`pop.string.size`(self.value) + @always_inline("nodebug") fn unsafe_ptr(self) -> UnsafePointer[UInt8]: """Get raw pointer to the underlying data. diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index 90ef29eceb..d6b9663786 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -386,6 +386,17 @@ struct StringSlice[ return len(self.as_bytes_slice()) + @always_inline + @deprecated("use byte_length() instead") + fn _byte_length(self) -> Int: + """Get the length of this string slice in bytes. + + Returns: + The length of this string slice in bytes. + """ + + return len(self.as_bytes_slice()) + fn _strref_dangerous(self) -> StringRef: """Returns an inner pointer to the string as a StringRef. From e51df82a0da9e2f92ffaf502a858b49925265fc3 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 21:32:56 -0400 Subject: [PATCH 23/27] split isspace and iter into another PR Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 124 ++++++++++++++++++++++--- stdlib/src/utils/string_slice.mojo | 141 ----------------------------- 2 files changed, 113 insertions(+), 152 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index d96c7b5a11..9d97fd5268 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -720,6 +720,76 @@ fn _utf8_byte_type(b: UInt8) -> UInt8: return countl_zero(~(b & 0b1111_0000)) +@value +struct _StringIter[ + is_mutable: Bool, //, + lifetime: AnyLifetime[is_mutable].type, + forward: Bool = True, +]: + """Iterator for String. + + Parameters: + is_mutable: Whether the slice is mutable. + lifetime: The lifetime of the underlying string data. + forward: The iteration direction. `False` is backwards. + """ + + var index: Int + var continuation_bytes: Int + var ptr: UnsafePointer[UInt8] + var length: Int + + fn __init__( + inout self, *, unsafe_pointer: UnsafePointer[UInt8], length: Int + ): + self.index = 0 if forward else length + self.ptr = unsafe_pointer + self.length = length + self.continuation_bytes = 0 + for i in range(length): + if _utf8_byte_type(int(unsafe_pointer[i])) == 1: + self.continuation_bytes += 1 + + fn __iter__(self) -> Self: + return self + + fn __next__(inout self) -> StringSlice[lifetime]: + @parameter + if forward: + var byte_len = 1 + if self.continuation_bytes > 0: + var byte_type = _utf8_byte_type(int(self.ptr[self.index])) + if byte_type != 0: + byte_len = int(byte_type) + self.continuation_bytes -= byte_len - 1 + self.index += byte_len + return StringSlice[lifetime]( + unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), + len=byte_len, + ) + else: + var byte_len = 1 + if self.continuation_bytes > 0: + var byte_type = _utf8_byte_type(int(self.ptr[self.index - 1])) + if byte_type != 0: + while byte_type == 1: + byte_len += 1 + var b = int(self.ptr[self.index - byte_len]) + byte_type = _utf8_byte_type(b) + self.continuation_bytes -= byte_len - 1 + self.index -= byte_len + return StringSlice[lifetime]( + unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len + ) + + fn __len__(self) -> Int: + @parameter + if forward: + return self.length - self.index - self.continuation_bytes + else: + return self.index - self.continuation_bytes + + struct String( Sized, Stringable, @@ -1203,25 +1273,23 @@ struct String( count=other_len + 1, ) - fn __iter__(ref [_]self) -> _StringSliceIter[__lifetime_of(self)]: + fn __iter__(ref [_]self) -> _StringIter[__lifetime_of(self)]: """Iterate over elements of the string, returning immutable references. Returns: An iterator of references to the string elements. """ - return _StringSliceIter[__lifetime_of(self)]( + return _StringIter[__lifetime_of(self)]( unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() ) - fn __reversed__( - ref [_]self, - ) -> _StringSliceIter[__lifetime_of(self), False]: + fn __reversed__(ref [_]self) -> _StringIter[__lifetime_of(self), False]: """Iterate backwards over the string, returning immutable references. Returns: A reversed iterator of references to the string elements. """ - return _StringSliceIter[__lifetime_of(self), forward=False]( + return _StringIter[__lifetime_of(self), forward=False]( unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() ) @@ -1595,17 +1663,51 @@ struct String( ) fn isspace(self) -> Bool: - """Determines whether every character in the given String is a - python whitespace String. This corresponds to Python's + """Determines whether the given String is a python + whitespace String. This corresponds to Python's [universal separators]( https://docs.python.org/3/library/stdtypes.html#str.splitlines) - `" \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. + `" \\t\\n\\r\\f\\v\\x1c\\x1e\\x85\\u2028\\u2029"`. Returns: - True if the whole String is made up of whitespace characters + True if the String is one of the whitespace characters listed above, otherwise False. """ - return self.as_string_slice().isspace() + # TODO add line and paragraph separator as stringliteral + # once unicode escape secuences are accepted + var next_line = List[UInt8](0xC2, 0x85) + """TODO: \\x85""" + var unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8) + """TODO: \\u2028""" + var unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9) + """TODO: \\u2029""" + + @always_inline + fn _compare( + item1: UnsafePointer[UInt8], item2: UnsafePointer[UInt8], amnt: Int + ) -> Bool: + var ptr1 = DTypePointer(item1) + var ptr2 = DTypePointer(item2) + return memcmp(ptr1, ptr2, amnt) == 0 + + if len(self) == 0: + return False + + for s in self: + var no_null_len = len(s) + var ptr = s.unsafe_ptr() + if no_null_len == 1 and not _isspace(ptr[0]): + return False + elif no_null_len == 2 and not _compare( + ptr, next_line.unsafe_ptr(), 2 + ): + return False + elif no_null_len == 3 and not ( + _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) + or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) + ): + return False + return True fn split(self, sep: String, maxsplit: Int = -1) raises -> List[String]: """Split the string by a separator. diff --git a/stdlib/src/utils/string_slice.mojo b/stdlib/src/utils/string_slice.mojo index d6b9663786..ca15404743 100644 --- a/stdlib/src/utils/string_slice.mojo +++ b/stdlib/src/utils/string_slice.mojo @@ -27,76 +27,6 @@ alias StaticString = StringSlice[ImmutableStaticLifetime] """An immutable static string slice.""" -@value -struct _StringSliceIter[ - is_mutable: Bool, //, - lifetime: AnyLifetime[is_mutable].type, - forward: Bool = True, -]: - """Iterator for String. - - Parameters: - is_mutable: Whether the slice is mutable. - lifetime: The lifetime of the underlying string data. - forward: The iteration direction. `False` is backwards. - """ - - var index: Int - var continuation_bytes: Int - var ptr: UnsafePointer[UInt8] - var length: Int - - fn __init__( - inout self, *, unsafe_pointer: UnsafePointer[UInt8], length: Int - ): - self.index = 0 if forward else length - self.ptr = unsafe_pointer - self.length = length - self.continuation_bytes = 0 - for i in range(length): - if _utf8_byte_type(unsafe_pointer[i]) == 1: - self.continuation_bytes += 1 - - fn __iter__(self) -> Self: - return self - - fn __next__(inout self) -> StringSlice[lifetime]: - @parameter - if forward: - var byte_len = 1 - if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(self.ptr[self.index]) - if byte_type != 0: - byte_len = int(byte_type) - self.continuation_bytes -= byte_len - 1 - self.index += byte_len - return StringSlice[lifetime]( - unsafe_from_utf8_ptr=self.ptr + (self.index - byte_len), - len=byte_len, - ) - else: - var byte_len = 1 - if self.continuation_bytes > 0: - var byte_type = _utf8_byte_type(self.ptr[self.index - 1]) - if byte_type != 0: - while byte_type == 1: - byte_len += 1 - var b = self.ptr[self.index - byte_len] - byte_type = _utf8_byte_type(b) - self.continuation_bytes -= byte_len - 1 - self.index -= byte_len - return StringSlice[lifetime]( - unsafe_from_utf8_ptr=self.ptr + self.index, len=byte_len - ) - - fn __len__(self) -> Int: - @parameter - if forward: - return self.length - self.index - self.continuation_bytes - else: - return self.index - self.continuation_bytes - - struct StringSlice[ is_mutable: Bool, //, lifetime: AnyLifetime[is_mutable].type, @@ -331,28 +261,6 @@ struct StringSlice[ """ return not self == rhs - fn __iter__(ref [_]self) -> _StringSliceIter[__lifetime_of(self)]: - """Iterate over elements of the string, returning immutable references. - - Returns: - An iterator of references to the string elements. - """ - return _StringSliceIter[__lifetime_of(self)]( - unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() - ) - - fn __reversed__( - ref [_]self, - ) -> _StringSliceIter[__lifetime_of(self), False]: - """Iterate backwards over the string, returning immutable references. - - Returns: - A reversed iterator of references to the string elements. - """ - return _StringSliceIter[__lifetime_of(self), forward=False]( - unsafe_pointer=self.unsafe_ptr(), length=self.byte_length() - ) - # ===------------------------------------------------------------------===# # Methods # ===------------------------------------------------------------------===# @@ -414,52 +322,3 @@ struct StringSlice[ without the string getting deallocated early. """ pass - - fn isspace(self) -> Bool: - """Determines whether every character in the given StringSlice is a - python whitespace String. This corresponds to Python's - [universal separators]( - https://docs.python.org/3/library/stdtypes.html#str.splitlines) - `" \\t\\n\\r\\f\\v\\x1c\\x1d\\x1e\\x85\\u2028\\u2029"`. - - Returns: - True if the whole StringSlice is made up of whitespace characters - listed above, otherwise False. - """ - - if self.byte_length() == 0: - return False - - # TODO add line and paragraph separator as stringliteral - # once unicode escape secuences are accepted - var next_line = List[UInt8](0xC2, 0x85) - """TODO: \\x85""" - var unicode_line_sep = List[UInt8](0xE2, 0x80, 0xA8) - """TODO: \\u2028""" - var unicode_paragraph_sep = List[UInt8](0xE2, 0x80, 0xA9) - """TODO: \\u2029""" - - @always_inline - fn _compare( - item1: UnsafePointer[UInt8], item2: UnsafePointer[UInt8], amnt: Int - ) -> Bool: - var ptr1 = DTypePointer(item1) - var ptr2 = DTypePointer(item2) - return memcmp(ptr1, ptr2, amnt) == 0 - - for s in self: - var no_null_len = s.byte_length() - var ptr = s.unsafe_ptr() - if no_null_len == 1 and _isspace(ptr[0]): - continue - elif no_null_len == 2 and _compare(ptr, next_line.unsafe_ptr(), 2): - continue - elif no_null_len == 3 and ( - _compare(ptr, unicode_line_sep.unsafe_ptr(), 3) - or _compare(ptr, unicode_paragraph_sep.unsafe_ptr(), 3) - ): - continue - else: - return False - _ = next_line, unicode_line_sep, unicode_paragraph_sep - return True From 1c952c4c64ad3433c8e585fa2149c0de443baab0 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 21:35:25 -0400 Subject: [PATCH 24/27] fix dangling import and isspace use Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index 9d97fd5268..e8a848fa74 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -25,7 +25,6 @@ from memory import DTypePointer, LegacyPointer, UnsafePointer, memcmp, memcpy from utils import Span, StaticIntTuple, StringRef, StringSlice from utils._format import Formattable, Formatter, ToFormatter -from utils.string_slice import _StringSliceIter # ===----------------------------------------------------------------------=== # # ord @@ -1798,7 +1797,7 @@ struct String( # Python adds all "whitespace chars" as one separator # if no separator was specified for s in self[lhs:]: - if not s.isspace(): + if not str(s).isspace(): # TODO: with StringSlice.isspace() break lhs += s.byte_length() # if it went until the end of the String, then @@ -1812,7 +1811,7 @@ struct String( break rhs = lhs + 1 for s in self[lhs + 1 :]: - if s.isspace(): + if str(s).isspace(): # TODO: with StringSlice.isspace() break rhs += s.byte_length() From 633ba241916fa7babcb66459e05e569b4a67d66a Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Mon, 8 Jul 2024 21:41:16 -0400 Subject: [PATCH 25/27] fix isspace Signed-off-by: martinvuyk --- stdlib/src/builtin/string.mojo | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index e8a848fa74..e0cace5f62 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1689,11 +1689,11 @@ struct String( var ptr2 = DTypePointer(item2) return memcmp(ptr1, ptr2, amnt) == 0 - if len(self) == 0: + if self.byte_length() == 0: return False for s in self: - var no_null_len = len(s) + var no_null_len = s.byte_length() var ptr = s.unsafe_ptr() if no_null_len == 1 and not _isspace(ptr[0]): return False From b7630b677adc26757253691ecc8226136dc1bd05 Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 9 Jul 2024 11:45:23 -0400 Subject: [PATCH 26/27] add suggestions Signed-off-by: martinvuyk --- docs/changelog.md | 5 +++++ stdlib/src/builtin/string.mojo | 29 ++--------------------------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 961c586dee..30aadc4f97 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -139,6 +139,11 @@ what we publish. - Added `StringSlice(..)` initializer from a `StringLiteral`. +- Added a `byte_length()` method to `String`, `StringSlice`, and `StringLiteral` +and deprecated their private `_byte_length()` methods. Added a warning to +`String.__len__` method that it will return length in Unicode codepoints in the +future and `StringSlice.__len__` now does return the Unicode codepoints length. + - Added new `StaticString` type alias. This can be used in place of `StringLiteral` for runtime string arguments. diff --git a/stdlib/src/builtin/string.mojo b/stdlib/src/builtin/string.mojo index e0cace5f62..8c514bdfb8 100644 --- a/stdlib/src/builtin/string.mojo +++ b/stdlib/src/builtin/string.mojo @@ -1075,17 +1075,7 @@ struct String( Returns: A new string containing the character at the specified position. """ - # FIXME: this should work with unicode codepoints, but this doesn't yet - # work at compile time because of issue #933 - # var i = 0 - # var buf = Self._buffer_type(capacity=5) - # for s in self: - # if i != idx: - # continue - # for j in range(s.byte_length()): - # buf[j] = self._buffer[i + j] - # buf.append(0) - # buf.resize(len(buf)) + # TODO(#933): implement this for unicode when we support llvm intrinsic evaluation at compile time var normalized_idx = normalize_index["String"](idx, self) var buf = Self._buffer_type(capacity=1) buf.append(self._buffer[normalized_idx]) @@ -1104,22 +1094,7 @@ struct String( var start: Int var end: Int var step: Int - # FIXME: this should work with unicode codepoints, but this doesn't yet - # work at compile time because of issue #933 - # var idx = 0 - # var start = span.start.value() if span.start else 0 - # var end = span.end.value() if span.end else 0 - # var buffer = Self._buffer_type() - # for s in self: - # var amnt_bytes = s.byte_length() - # if not (start <= idx < end) or idx % span.step != 0: - # idx += amnt_bytes - # continue - # for i in range(amnt_bytes): - # buffer[idx + i] = self._buffer[idx + i] - # buffer.append(0) - # buffer.resize(len(buffer)) - # return Self(buffer^) + # TODO(#933): implement this for unicode when we support llvm intrinsic evaluation at compile time start, end, step = span.indices(self.byte_length()) var r = range(start, end, step) From 10d88ecf3b08832ba7435b05f9457fa81e14592e Mon Sep 17 00:00:00 2001 From: martinvuyk Date: Tue, 9 Jul 2024 12:09:59 -0400 Subject: [PATCH 27/27] add suggestions Signed-off-by: martinvuyk --- docs/changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/changelog.md b/docs/changelog.md index 30aadc4f97..7759de019b 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -143,6 +143,7 @@ what we publish. and deprecated their private `_byte_length()` methods. Added a warning to `String.__len__` method that it will return length in Unicode codepoints in the future and `StringSlice.__len__` now does return the Unicode codepoints length. +([PR #2960](https://github.com/modularml/mojo/pull/2960) by [@martinvuyk](https://github.com/martinvuyk)) - Added new `StaticString` type alias. This can be used in place of `StringLiteral` for runtime string arguments.