From 000f20fe570cb1a463a8259032e7bde3670dcc9e Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 4 Dec 2024 20:43:04 -0700 Subject: [PATCH] Add back array overloads; add unit test for TryUTF8toUTF16 --- src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs | 13 ++ src/Lucene.Net/Util/UnicodeUtil.cs | 189 ++++++++++++++++--- 2 files changed, 171 insertions(+), 31 deletions(-) diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs index bb8e736a7c..be98e7a3d4 100644 --- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs +++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs @@ -327,5 +327,18 @@ public virtual void TestUTF8UTF16CharsRef() Assert.AreEqual(cRef.ToString(), unicode); } } + + [Test] + [LuceneNetSpecific] // this is a Lucene.NET specific method + public void TestTryUTF8toUTF16() + { + string unicode = TestUtil.RandomRealisticUnicodeString(Random); + var utf8 = new BytesRef(IOUtils.CHARSET_UTF_8.GetBytes(unicode)); + + bool success = UnicodeUtil.TryUTF8toUTF16(utf8, out var chars); + + Assert.IsTrue(success); + Assert.AreEqual(unicode, chars?.ToString()); + } } } diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 65dd2fabc9..3069ef0379 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -6,6 +6,7 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Text; +#nullable enable namespace Lucene.Net.Util { @@ -108,7 +109,10 @@ public static class UnicodeUtil /// /// WARNING: this is not a valid UTF8 Term /// - public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }); // TODO this is unrelated here find a better place for it + public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] + { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }); // TODO this is unrelated here find a better place for it public const int UNI_SUR_HIGH_START = 0xD800; public const int UNI_SUR_HIGH_END = 0xDBFF; @@ -121,7 +125,8 @@ public static class UnicodeUtil private const long HALF_SHIFT = 10; private const long HALF_MASK = 0x3FFL; - private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; + private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - + (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; /// /// Encode characters from a (with generic type argument ) , starting at @@ -149,6 +154,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) { @out = result.Bytes = new byte[maxLen]; } + result.Offset = 0; while (i < end) @@ -189,6 +195,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -196,12 +203,13 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) @out[upto++] = 0xBD; } } + //assert matches(source, offset, length, out, upto); result.Length = upto; } /// - /// Encode characters from a (with generic type argument ) , starting at + /// Encode characters from a , starting at /// for chars. After encoding, result.Offset will always be 0. /// /// or is null. @@ -213,6 +221,31 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) /// and refer to a location outside of . /// // TODO: broken if incoming result.offset != 0 + public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) + { + // LUCENENET: Added guard clauses + if (source is null) + throw new ArgumentNullException(nameof(source)); + + UTF16toUTF8(source.AsSpan(), offset, length, result); + } + + /// + /// Encode characters from a (with generic type argument ) , starting at + /// for chars. After encoding, result.Offset will always be 0. + /// + /// is null. + /// + /// or is less than zero. + /// + /// -or- + /// + /// and refer to a location outside of . + /// + /// + /// LUCENENET specific overload. + /// + // TODO: broken if incoming result.offset != 0 public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result) { // LUCENENET: Added guard clauses @@ -223,7 +256,8 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int upto = 0; int i = offset; @@ -235,6 +269,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length { @out = result.Bytes = new byte[maxLen]; } + result.Offset = 0; while (i < end) @@ -275,6 +310,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -282,6 +318,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length @out[upto++] = 0xBD; } } + //assert matches(source, offset, length, out, upto); result.Length = upto; } @@ -311,7 +348,8 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int end = offset + length; @@ -362,6 +400,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -369,6 +408,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt @out[upto++] = 0xBD; } } + //assert matches(s, offset, length, out, upto); result.Length = upto; } @@ -400,7 +440,8 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int end = offset + length; @@ -451,6 +492,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -458,6 +500,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r @out[upto++] = 0xBD; } } + //assert matches(s, offset, length, out, upto); result.Length = upto; } @@ -535,19 +578,19 @@ public static bool ValidUTF16String(ICharSequence s) // Valid surrogate pair } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -556,7 +599,8 @@ public static bool ValidUTF16String(ICharSequence s) return true; } - public static bool ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence + public static bool + ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence { int size = s.Length; for (int i = 0; i < size; i++) @@ -573,19 +617,19 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec // Valid surrogate pair } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -594,7 +638,9 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec return true; } - public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence + public static bool + ValidUTF16String( + StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence { int size = s.Length; for (int i = 0; i < size; i++) @@ -611,19 +657,19 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl // Valid surrogate pair } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -632,6 +678,8 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl return true; } + public static bool ValidUTF16String(char[] s, int size) => ValidUTF16String(s.AsSpan(), size); + public static bool ValidUTF16String(ReadOnlySpan s, int size) { for (int i = 0; i < size; i++) @@ -658,7 +706,7 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -676,10 +724,13 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) /* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF) * means illegal prefix. see RFC 2279 for details */ internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength(); - private static int[] LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + + private static int[] + LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) { - int v = int.MinValue; - return new int[] { + const int v = int.MinValue; + return new int[] + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -720,12 +771,31 @@ public static int CodePointCount(BytesRef utf8) for (; pos < limit; codePointCount++) { int v = bytes[pos] & 0xFF; - if (v < /* 0xxx xxxx */ 0x80) { pos += 1; continue; } - if (v >= /* 110x xxxx */ 0xc0) + if (v < /* 0xxx xxxx */ 0x80) + { + pos += 1; + continue; + } + + if (v >= /* 110x xxxx */ 0xc0) { - if (v < /* 111x xxxx */ 0xe0) { pos += 2; continue; } - if (v < /* 1111 xxxx */ 0xf0) { pos += 3; continue; } - if (v < /* 1111 1xxx */ 0xf8) { pos += 4; continue; } + if (v < /* 111x xxxx */ 0xe0) + { + pos += 2; + continue; + } + + if (v < /* 1111 xxxx */ 0xf0) + { + pos += 3; + continue; + } + + if (v < /* 1111 1xxx */ 0xf8) + { + pos += 4; + continue; + } // fallthrough, consider 5 and 6 byte sequences invalid. } @@ -756,6 +826,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) { utf32.Int32s = new int[utf8.Length]; } + int utf32Count = 0; int utf8Upto = utf8.Offset; int[] ints = utf32.Int32s; @@ -795,6 +866,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) { v = v << 6 | bytes[utf8Upto++] & 63; } + ints[utf32Count++] = v; } @@ -824,7 +896,25 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) /// /// Value that all lead surrogate starts with. - private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); + private const int LEAD_SURROGATE_OFFSET_ = + LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); + + /// + /// Cover JDK 1.5 API. Create a String from an array of . + /// + /// The code point array. + /// The start of the text in the code point array. + /// The number of code points. + /// a String representing the code points between offset and count. + /// If an invalid code point is encountered. + /// If the offset or count are out of bounds. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static string NewString(int[] codePoints, int offset, int count) + { + // LUCENENET: Character.ToString() was optimized to use the stack for arrays + // of codepoints 256 or less, so it performs better than using ToCharArray(). + return Character.ToString(codePoints, offset, count); + } /// /// Cover JDK 1.5 API. Create a String from a span of . @@ -843,6 +933,23 @@ public static string NewString(ReadOnlySpan codePoints, int offset, int cou return Character.ToString(codePoints, offset, count); } + /// + /// Generates char array that represents the provided input code points. + /// + /// LUCENENET specific. + /// + /// The code array. + /// The start of the text in the code point array. + /// The number of code points. + /// a char array representing the code points between offset and count. + // LUCENENET NOTE: This code was originally in the NewString() method (above). + // It has been refactored from the original to remove the exception throw/catch and + // instead proactively resizes the array instead of relying on exceptions + copy operations + public static char[] ToCharArray(int[] codePoints, int offset, int count) + { + return ToCharArray(codePoints.AsSpan(), offset, count); + } + /// /// Generates char array that represents the provided input code points. /// @@ -949,6 +1056,20 @@ public static string ToHexString(string s) return sb.ToString(); } + /// + /// Interprets the given byte array as UTF-8 and converts to UTF-16. The will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: Full characters are read, even if this reads past the length passed (and + /// can result in an if invalid UTF-8 is passed). + /// Explicit checks for valid UTF-8 are not performed. + /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) + { + UTF8toUTF16(utf8.AsSpan(), offset, length, chars); + } + /// /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. @@ -958,7 +1079,7 @@ public static string ToHexString(string s) /// Explicit checks for valid UTF-8 are not performed. /// /// - /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// LUCENENET specific overload. /// // TODO: broken if chars.offset != 0 public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars) @@ -1003,7 +1124,6 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, chars.Length = out_offset - chars.Offset; } - #nullable enable /// /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . /// @@ -1077,7 +1197,6 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt chars = result; return true; } - #nullable restore /// /// Utility method for @@ -1085,7 +1204,15 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { - UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars); + UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars); + } + + /// + /// Utility method for + /// + public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars) + { + return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars); } } }