From 1845c38fcc37c6d335b5347488fb3da326761317 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 4 Dec 2024 16:52:33 -0700 Subject: [PATCH 01/10] UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan methods, #1024 --- src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs | 51 +++++--- src/Lucene.Net/Util/BytesRef.cs | 20 +++ src/Lucene.Net/Util/UnicodeUtil.cs | 122 ++++++++++++++---- 3 files changed, 152 insertions(+), 41 deletions(-) diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs index bab16426f7..cdace9c1cc 100644 --- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs +++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs @@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs public static class BlockTreeTermsWriter { /// - /// Suggested default value for the - /// minItemsInBlock parameter to + /// Suggested default value for the + /// minItemsInBlock parameter to /// . /// public const int DEFAULT_MIN_BLOCK_SIZE = 25; /// - /// Suggested default value for the - /// maxItemsInBlock parameter to + /// Suggested default value for the + /// maxItemsInBlock parameter to /// . /// public const int DEFAULT_MAX_BLOCK_SIZE = 48; @@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long /// to set state. It is *optional* and can be used when overriding the WriteHeader(), /// WriteIndexHeader(). It only matters in the case where the state /// is required inside of any of those methods that is passed in to the subclass constructor. - /// + /// /// When passed to the constructor, it is set to the protected field m_subclassState before /// any of the above methods are called where it is available for reading when overriding the above methods. - /// + /// /// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so. 
- /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, + /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, /// so the overrides of those methods won't specifically need to use this field (although they could for consistency). /// [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")] @@ -468,7 +468,20 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f public override string ToString() { - return "BLOCK: " + Prefix.Utf8ToString(); + return $"BLOCK: {Prefix.Utf8ToString()}"; + } + + #nullable enable + public bool TryToString([NotNullWhen(true)] out string? result) + { + if (Prefix.TryUtf8ToString(out string? prefixString)) + { + result = $"BLOCK: {prefixString}"; + return true; + } + + result = null; + return false; } // LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions @@ -476,12 +489,11 @@ public override string ToString() // to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors. // This struct defers formatting the string until it is actually used as a parameter // in string.Format(). - private struct PendingBlocksFormatter // For assert + private readonly struct PendingBlocksFormatter // For assert { -#pragma warning disable IDE0044 // Add readonly modifier - private IList blocks; -#pragma warning restore IDE0044 // Add readonly modifier - public PendingBlocksFormatter(IList blocks) + private readonly IList? blocks; + + public PendingBlocksFormatter(IList? blocks) { this.blocks = blocks; // May be null } @@ -500,17 +512,17 @@ public override string ToString() // For assert it.MoveNext(); while (true) { - var e = it.Current; + var e = it.Current ?? 
throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above."); // There is a chance that the Prefix will contain invalid UTF8, // so we catch that and use the alternative way of displaying it - try + if (e.TryToString(out string? eString)) { - sb.Append(e.ToString()); + sb.Append(eString); } - catch (IndexOutOfRangeException) + else { sb.Append("BLOCK: "); - sb.Append(e.Prefix.ToString()); + sb.Append(e.Prefix); } if (!it.MoveNext()) { @@ -520,6 +532,7 @@ public override string ToString() // For assert } } } + #nullable restore public void CompileIndex(IList floorBlocks, RAMOutputStream scratchBytes) { @@ -1351,4 +1364,4 @@ protected override void Dispose(bool disposing) } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs index 8012c77282..2e3679f8e0 100644 --- a/src/Lucene.Net/Util/BytesRef.cs +++ b/src/Lucene.Net/Util/BytesRef.cs @@ -243,6 +243,26 @@ public string Utf8ToString() return @ref.ToString(); } + #nullable enable + /// + /// Tries to interpret the stored bytes as UTF8 bytes, returning the + /// resulting as an output parameter . + /// + /// The resulting string output. + /// true if successful, false otherwise. + public bool TryUtf8ToString([NotNullWhen(true)] out string? result) + { + if (UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? 
@ref)) + { + result = @ref.ToString(); + return true; + } + + result = null; + return false; + } + #nullable restore + /// /// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] public override string ToString() diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 5974af1a16..65dd2fabc9 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -3,6 +3,7 @@ using Lucene.Net.Diagnostics; using Lucene.Net.Support; using System; +using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Text; @@ -123,13 +124,13 @@ public static class UnicodeUtil private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; /// - /// Encode characters from a , starting at + /// Encode characters from a (with generic type argument ) , starting at /// and ending at . After encoding, result.Offset will always be 0. /// /// is null. // TODO: broken if incoming result.offset != 0 // LUCENENET specific overload - public static void UTF16toUTF8(Span source, BytesRef result) + public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) { // LUCENENET: Added guard clause if (result is null) @@ -200,7 +201,7 @@ public static void UTF16toUTF8(Span source, BytesRef result) } /// - /// Encode characters from a , starting at + /// Encode characters from a (with generic type argument ) , starting at /// for chars. After encoding, result.Offset will always be 0. /// /// or is null. @@ -212,11 +213,9 @@ public static void UTF16toUTF8(Span source, BytesRef result) /// and refer to a location outside of . 
/// // TODO: broken if incoming result.offset != 0 - public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) + public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result) { // LUCENENET: Added guard clauses - if (source is null) - throw new ArgumentNullException(nameof(source)); if (result is null) throw new ArgumentNullException(nameof(result)); if (offset < 0) @@ -633,7 +632,7 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl return true; } - public static bool ValidUTF16String(char[] s, int size) + public static bool ValidUTF16String(ReadOnlySpan s, int size) { for (int i = 0; i < size; i++) { @@ -828,16 +827,16 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); /// - /// Cover JDK 1.5 API. Create a String from an array of . + /// Cover JDK 1.5 API. Create a String from a span of . /// - /// The code array. - /// The start of the text in the code point array. + /// The code point span. + /// The start of the text in the code point span. /// The number of code points. /// a String representing the code points between offset and count. /// If an invalid code point is encountered. /// If the offset or count are out of bounds. [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static string NewString(int[] codePoints, int offset, int count) + public static string NewString(ReadOnlySpan codePoints, int offset, int count) { // LUCENENET: Character.ToString() was optimized to use the stack for arrays // of codepoints 256 or less, so it performs better than using ToCharArray(). @@ -849,26 +848,26 @@ public static string NewString(int[] codePoints, int offset, int count) /// /// LUCENENET specific. /// - /// The code array. - /// The start of the text in the code point array. + /// The code span. 
+ /// The start of the text in the code point span. /// The number of code points. /// a char array representing the code points between offset and count. // LUCENENET NOTE: This code was originally in the NewString() method (above). // It has been refactored from the original to remove the exception throw/catch and - // instead proactively resizes the array instead of relying on excpetions + copy operations - public static char[] ToCharArray(int[] codePoints, int offset, int count) + // instead proactively resizes the array instead of relying on exceptions + copy operations + public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int count) { if (count < 0) { throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) } - const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 + const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 // LUCENENET: as a first approximation, assume each codepoint // is 2 characters (since it cannot be longer than this) int arrayLength = count * 2; - // LUCENENET: if we go over the threashold, count the number of + // LUCENENET: if we go over the threshold, count the number of // chars we will need so we can allocate the precise amount of memory - if (count > countThreashold) + if (count > countThreshold) { arrayLength = 0; for (int r = offset, e = offset + count; r < e; ++r) @@ -951,15 +950,18 @@ public static string ToHexString(string s) } /// - /// Interprets the given byte array as UTF-8 and converts to UTF-16. The will be extended if + /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. 
/// /// NOTE: Full characters are read, even if this reads past the length passed (and /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// // TODO: broken if chars.offset != 0 - public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) + public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars) { int out_offset = chars.Offset = 0; char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length); @@ -1001,9 +1003,85 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha chars.Length = out_offset - chars.Offset; } + #nullable enable + /// + /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . + /// + /// NOTE: Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// + public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? 
chars) + { + CharsRef result = new CharsRef(length); + int out_offset = 0; + char[] @out = result.Chars; + int limit = offset + length; + while (offset < limit) + { + if (utf8.Length <= offset) + { + chars = null; + return false; + } + + int b = utf8[offset++] & 0xff; + if (b < 0xc0) + { + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); + @out[out_offset++] = (char)b; + } + else if (b < 0xe0) + { + if (utf8.Length <= offset) + { + chars = null; + return false; + } + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + } + else if (b < 0xf0) + { + if (utf8.Length <= offset + 1) + { + chars = null; + return false; + } + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); + offset += 2; + } + else + { + if (utf8.Length <= offset + 2) + { + chars = null; + return false; + } + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); + int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); + offset += 3; + if (ch < UNI_MAX_BMP) + { + @out[out_offset++] = (char)ch; + } + else + { + int chHalf = ch - 0x0010000; + @out[out_offset++] = (char)((chHalf >> 10) + 0xD800); + @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00); + } + } + } + result.Length = out_offset; + chars = result; + return true; + } + #nullable restore + /// - /// Utility method for - /// + /// Utility method for + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { From e24d99c5d64ba97fe6dd23a8b15f78afa4ab1deb Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 4 Dec 2024 20:43:04 -0700 Subject: [PATCH 02/10] Add back array overloads; add unit test for TryUTF8toUTF16 --- src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs | 13 ++ src/Lucene.Net/Util/UnicodeUtil.cs | 189 ++++++++++++++++--- 2 files changed, 171 insertions(+), 31 deletions(-) diff --git 
a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs index bb8e736a7c..be98e7a3d4 100644 --- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs +++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs @@ -327,5 +327,18 @@ public virtual void TestUTF8UTF16CharsRef() Assert.AreEqual(cRef.ToString(), unicode); } } + + [Test] + [LuceneNetSpecific] // this is a Lucene.NET specific method + public void TestTryUTF8toUTF16() + { + string unicode = TestUtil.RandomRealisticUnicodeString(Random); + var utf8 = new BytesRef(IOUtils.CHARSET_UTF_8.GetBytes(unicode)); + + bool success = UnicodeUtil.TryUTF8toUTF16(utf8, out var chars); + + Assert.IsTrue(success); + Assert.AreEqual(unicode, chars?.ToString()); + } } } diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 65dd2fabc9..3069ef0379 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -6,6 +6,7 @@ using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Text; +#nullable enable namespace Lucene.Net.Util { @@ -108,7 +109,10 @@ public static class UnicodeUtil /// /// WARNING: this is not a valid UTF8 Term /// - public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }); // TODO this is unrelated here find a better place for it + public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] + { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }); // TODO this is unrelated here find a better place for it public const int UNI_SUR_HIGH_START = 0xD800; public const int UNI_SUR_HIGH_END = 0xDBFF; @@ -121,7 +125,8 @@ public static class UnicodeUtil private const long HALF_SHIFT = 10; private const long HALF_MASK = 0x3FFL; - private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; + private const int SURROGATE_OFFSET = 
Character.MinSupplementaryCodePoint - + (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; /// /// Encode characters from a (with generic type argument ) , starting at @@ -149,6 +154,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) { @out = result.Bytes = new byte[maxLen]; } + result.Offset = 0; while (i < end) @@ -189,6 +195,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -196,12 +203,13 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) @out[upto++] = 0xBD; } } + //assert matches(source, offset, length, out, upto); result.Length = upto; } /// - /// Encode characters from a (with generic type argument ) , starting at + /// Encode characters from a , starting at /// for chars. After encoding, result.Offset will always be 0. /// /// or is null. @@ -213,6 +221,31 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) /// and refer to a location outside of . /// // TODO: broken if incoming result.offset != 0 + public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) + { + // LUCENENET: Added guard clauses + if (source is null) + throw new ArgumentNullException(nameof(source)); + + UTF16toUTF8(source.AsSpan(), offset, length, result); + } + + /// + /// Encode characters from a (with generic type argument ) , starting at + /// for chars. After encoding, result.Offset will always be 0. + /// + /// is null. + /// + /// or is less than zero. + /// + /// -or- + /// + /// and refer to a location outside of . + /// + /// + /// LUCENENET specific overload. 
+ /// + // TODO: broken if incoming result.offset != 0 public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result) { // LUCENENET: Added guard clauses @@ -223,7 +256,8 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int upto = 0; int i = offset; @@ -235,6 +269,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length { @out = result.Bytes = new byte[maxLen]; } + result.Offset = 0; while (i < end) @@ -275,6 +310,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -282,6 +318,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length @out[upto++] = 0xBD; } } + //assert matches(source, offset, length, out, upto); result.Length = upto; } @@ -311,7 +348,8 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. 
For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int end = offset + length; @@ -362,6 +400,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -369,6 +408,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt @out[upto++] = 0xBD; } } + //assert matches(s, offset, length, out, upto); result.Length = upto; } @@ -400,7 +440,8 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. 
For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int end = offset + length; @@ -451,6 +492,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -458,6 +500,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r @out[upto++] = 0xBD; } } + //assert matches(s, offset, length, out, upto); result.Length = upto; } @@ -535,19 +578,19 @@ public static bool ValidUTF16String(ICharSequence s) // Valid surrogate pair } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -556,7 +599,8 @@ public static bool ValidUTF16String(ICharSequence s) return true; } - public static bool ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence + public static bool + ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence { int size = s.Length; for (int i = 0; i < size; i++) @@ -573,19 +617,19 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec // Valid surrogate pair } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -594,7 +638,9 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec return true; } - public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overload because 
StringBuilder doesn't implement ICharSequence + public static bool + ValidUTF16String( + StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence { int size = s.Length; for (int i = 0; i < size; i++) @@ -611,19 +657,19 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl // Valid surrogate pair } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else - // Unmatched high surrogate + // Unmatched high surrogate { return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -632,6 +678,8 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl return true; } + public static bool ValidUTF16String(char[] s, int size) => ValidUTF16String(s.AsSpan(), size); + public static bool ValidUTF16String(ReadOnlySpan s, int size) { for (int i = 0; i < size; i++) @@ -658,7 +706,7 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate + // Unmatched low surrogate { return false; } @@ -676,10 +724,13 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) /* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF) * means illegal prefix. 
see RFC 2279 for details */ internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength(); - private static int[] LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + + private static int[] + LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) { - int v = int.MinValue; - return new int[] { + const int v = int.MinValue; + return new int[] + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -720,12 +771,31 @@ public static int CodePointCount(BytesRef utf8) for (; pos < limit; codePointCount++) { int v = bytes[pos] & 0xFF; - if (v < /* 0xxx xxxx */ 0x80) { pos += 1; continue; } - if (v >= /* 110x xxxx */ 0xc0) + if (v < /* 0xxx xxxx */ 0x80) + { + pos += 1; + continue; + } + + if (v >= /* 110x xxxx */ 0xc0) { - if (v < /* 111x xxxx */ 0xe0) { pos += 2; continue; } - if (v < /* 1111 xxxx */ 0xf0) { pos += 3; continue; } - if (v < /* 1111 1xxx */ 0xf8) { pos += 4; continue; } + if (v < /* 111x xxxx */ 0xe0) + { + pos += 2; + continue; + } + + if (v < /* 1111 xxxx */ 0xf0) + { + pos += 3; + continue; + } + + if (v < /* 1111 1xxx */ 0xf8) + { + pos += 4; + continue; + } // fallthrough, consider 5 and 6 byte sequences invalid. } @@ -756,6 +826,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) { utf32.Int32s = new int[utf8.Length]; } + int utf32Count = 0; int utf8Upto = utf8.Offset; int[] ints = utf32.Int32s; @@ -795,6 +866,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) { v = v << 6 | bytes[utf8Upto++] & 63; } + ints[utf32Count++] = v; } @@ -824,7 +896,25 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) /// /// Value that all lead surrogate starts with. 
- private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); + private const int LEAD_SURROGATE_OFFSET_ = + LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); + + /// + /// Cover JDK 1.5 API. Create a String from an array of . + /// + /// The code point array. + /// The start of the text in the code point array. + /// The number of code points. + /// a String representing the code points between offset and count. + /// If an invalid code point is encountered. + /// If the offset or count are out of bounds. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static string NewString(int[] codePoints, int offset, int count) + { + // LUCENENET: Character.ToString() was optimized to use the stack for arrays + // of codepoints 256 or less, so it performs better than using ToCharArray(). + return Character.ToString(codePoints, offset, count); + } /// /// Cover JDK 1.5 API. Create a String from a span of . @@ -843,6 +933,23 @@ public static string NewString(ReadOnlySpan codePoints, int offset, int cou return Character.ToString(codePoints, offset, count); } + /// + /// Generates char array that represents the provided input code points. + /// + /// LUCENENET specific. + /// + /// The code array. + /// The start of the text in the code point array. + /// The number of code points. + /// a char array representing the code points between offset and count. + // LUCENENET NOTE: This code was originally in the NewString() method (above). + // It has been refactored from the original to remove the exception throw/catch and + // instead proactively resizes the array instead of relying on exceptions + copy operations + public static char[] ToCharArray(int[] codePoints, int offset, int count) + { + return ToCharArray(codePoints.AsSpan(), offset, count); + } + /// /// Generates char array that represents the provided input code points. 
/// @@ -949,6 +1056,20 @@ public static string ToHexString(string s) return sb.ToString(); } + /// + /// Interprets the given byte array as UTF-8 and converts to UTF-16. The will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: Full characters are read, even if this reads past the length passed (and + /// can result in an if invalid UTF-8 is passed). + /// Explicit checks for valid UTF-8 are not performed. + /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) + { + UTF8toUTF16(utf8.AsSpan(), offset, length, chars); + } + /// /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. @@ -958,7 +1079,7 @@ public static string ToHexString(string s) /// Explicit checks for valid UTF-8 are not performed. /// /// - /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// LUCENENET specific overload. /// // TODO: broken if chars.offset != 0 public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars) @@ -1003,7 +1124,6 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, chars.Length = out_offset - chars.Offset; } - #nullable enable /// /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . 
/// @@ -1077,7 +1197,6 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt chars = result; return true; } - #nullable restore /// /// Utility method for @@ -1085,7 +1204,15 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { - UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars); + UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars); + } + + /// + /// Utility method for + /// + public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars) + { + return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars); } } } From 0afee05c8b2122d024cb2d8ed99bdfdada8a59b7 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 4 Dec 2024 20:45:57 -0700 Subject: [PATCH 03/10] Fix comment formatting --- src/Lucene.Net/Util/UnicodeUtil.cs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 3069ef0379..75af2b53ea 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -578,20 +578,20 @@ public static bool ValidUTF16String(ICharSequence s) // Valid surrogate pair } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } @@ -617,20 +617,20 @@ public static bool // Valid surrogate pair } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + 
// Unmatched low surrogate return false; } } @@ -657,20 +657,20 @@ public static bool // Valid surrogate pair } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } @@ -706,8 +706,8 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } From 95966c7f886a7b7d81f0f39af51c297300164360 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Tue, 17 Dec 2024 13:22:46 -0700 Subject: [PATCH 04/10] Remove offset/length parameters from Span-based methods, #1024 --- src/Lucene.Net/Util/UnicodeUtil.cs | 198 +++++++++-------------------- 1 file changed, 60 insertions(+), 138 deletions(-) diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 75af2b53ea..af53ec15d5 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -159,7 +159,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) while (i < end) { - int code = (int)source[i++]; + var code = (int)source[i++]; if (code < 0x80) { @@ -227,100 +227,7 @@ public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef r if (source is null) throw new ArgumentNullException(nameof(source)); - UTF16toUTF8(source.AsSpan(), offset, length, result); - } - - /// - /// Encode characters from a (with generic type argument ) , starting at - /// for chars. After encoding, result.Offset will always be 0. - /// - /// is null. - /// - /// or is less than zero. - /// - /// -or- - /// - /// and refer to a location outside of . - /// - /// - /// LUCENENET specific overload. 
- /// - // TODO: broken if incoming result.offset != 0 - public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result) - { - // LUCENENET: Added guard clauses - if (result is null) - throw new ArgumentNullException(nameof(result)); - if (offset < 0) - throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative."); - if (length < 0) - throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); - if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), - $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); - - int upto = 0; - int i = offset; - int end = offset + length; - var @out = result.Bytes; - // Pre-allocate for worst case 4-for-1 - int maxLen = length * 4; - if (@out.Length < maxLen) - { - @out = result.Bytes = new byte[maxLen]; - } - - result.Offset = 0; - - while (i < end) - { - int code = (int)source[i++]; - - if (code < 0x80) - { - @out[upto++] = (byte)code; - } - else if (code < 0x800) - { - @out[upto++] = (byte)(0xC0 | (code >> 6)); - @out[upto++] = (byte)(0x80 | (code & 0x3F)); - } - else if (code < 0xD800 || code > 0xDFFF) - { - @out[upto++] = (byte)(0xE0 | (code >> 12)); - @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); - @out[upto++] = (byte)(0x80 | (code & 0x3F)); - } - else - { - // surrogate pair - // confirm valid high surrogate - if (code < 0xDC00 && i < end) - { - var utf32 = (int)source[i]; - // confirm valid low surrogate and write pair - if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) - { - utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; - i++; - @out[upto++] = (byte)(0xF0 | (utf32 >> 18)); - @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); - @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); - @out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); - continue; - } - } - - // replace 
unpaired surrogate or out-of-order low surrogate - // with substitution character - @out[upto++] = 0xEF; - @out[upto++] = 0xBF; - @out[upto++] = 0xBD; - } - } - - //assert matches(source, offset, length, out, upto); - result.Length = upto; + UTF16toUTF8(source.AsSpan(offset, length), result); } /// @@ -599,8 +506,8 @@ public static bool ValidUTF16String(ICharSequence s) return true; } - public static bool - ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence + // LUCENENET specific overload because string doesn't implement ICharSequence + public static bool ValidUTF16String(string s) { int size = s.Length; for (int i = 0; i < size; i++) @@ -638,9 +545,8 @@ public static bool return true; } - public static bool - ValidUTF16String( - StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence + // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence + public static bool ValidUTF16String(StringBuilder s) { int size = s.Length; for (int i = 0; i < size; i++) @@ -725,8 +631,8 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) * means illegal prefix. 
see RFC 2279 for details */ internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength(); - private static int[] - LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + private static int[] LoadUTF8CodeLength() { const int v = int.MinValue; return new int[] @@ -947,7 +853,7 @@ public static string NewString(ReadOnlySpan codePoints, int offset, int cou // instead proactively resizes the array instead of relying on exceptions + copy operations public static char[] ToCharArray(int[] codePoints, int offset, int count) { - return ToCharArray(codePoints.AsSpan(), offset, count); + return ToCharArray(codePoints.AsSpan(offset), count); } /// @@ -956,13 +862,12 @@ public static char[] ToCharArray(int[] codePoints, int offset, int count) /// LUCENENET specific. /// /// The code span. - /// The start of the text in the code point span. /// The number of code points. /// a char array representing the code points between offset and count. // LUCENENET NOTE: This code was originally in the NewString() method (above). // It has been refactored from the original to remove the exception throw/catch and // instead proactively resizes the array instead of relying on exceptions + copy operations - public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int count) + public static char[] ToCharArray(ReadOnlySpan codePoints, int count) { if (count < 0) { @@ -977,7 +882,7 @@ public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int c if (count > countThreshold) { arrayLength = 0; - for (int r = offset, e = offset + count; r < e; ++r) + for (int r = 0; r < count; ++r) { arrayLength += codePoints[r] < 0x010000 ? 
1 : 2; } @@ -990,7 +895,7 @@ public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int c // It is now safe to assume we have enough space for all of the characters. char[] chars = new char[arrayLength]; int w = 0; - for (int r = offset, e = offset + count; r < e; ++r) + for (int r = 0; r < count; ++r) { int cp = codePoints[r]; if (cp < 0 || cp > 0x10ffff) @@ -1064,10 +969,11 @@ public static string ToHexString(string s) /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// + /// // TODO: broken if chars.offset != 0 public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) { - UTF8toUTF16(utf8.AsSpan(), offset, length, chars); + UTF8toUTF16(utf8.AsSpan(offset, length), chars); } /// @@ -1082,14 +988,15 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha /// LUCENENET specific overload. /// // TODO: broken if chars.offset != 0 - public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars) + public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) { int out_offset = chars.Offset = 0; - char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length); - int limit = offset + length; - while (offset < limit) + char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); + int i = 0; + + while (i < utf8.Length) { - int b = utf8[offset++] & 0xff; + int b = utf8[i++] & 0xff; if (b < 0xc0) { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); @@ -1097,18 +1004,18 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, } else if (b < 0xe0) { - @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } else if (b < 0xf0) { - @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); - offset += 2; + @out[out_offset++] = (char)(((b & 0xf) << 12) 
+ ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; } else { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); - int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); - offset += 3; + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; if (ch < UNI_MAX_BMP) { @out[out_offset++] = (char)ch; @@ -1132,21 +1039,36 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, /// /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. /// - public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars) + /// + public static bool TryUTF8toUTF16(byte[] utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars) { - CharsRef result = new CharsRef(length); + return TryUTF8toUTF16(utf8.AsSpan(offset, length), out chars); + } + + /// + /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . + /// + /// NOTE: Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// + public static bool TryUTF8toUTF16(ReadOnlySpan utf8, [NotNullWhen(true)] out CharsRef? 
chars) + { + CharsRef result = new CharsRef(utf8.Length); int out_offset = 0; char[] @out = result.Chars; - int limit = offset + length; - while (offset < limit) + int i = 0; + + while (i < utf8.Length) { - if (utf8.Length <= offset) + if (utf8.Length <= i) { chars = null; return false; } - int b = utf8[offset++] & 0xff; + int b = utf8[i++] & 0xff; if (b < 0xc0) { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); @@ -1154,33 +1076,33 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt } else if (b < 0xe0) { - if (utf8.Length <= offset) + if (utf8.Length <= i) { chars = null; return false; } - @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } else if (b < 0xf0) { - if (utf8.Length <= offset + 1) + if (utf8.Length <= i + 1) { chars = null; return false; } - @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); - offset += 2; + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; } else { - if (utf8.Length <= offset + 2) + if (utf8.Length <= i + 2) { chars = null; return false; } if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); - int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); - offset += 3; + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; if (ch < UNI_MAX_BMP) { @out[out_offset++] = (char)ch; @@ -1199,20 +1121,20 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt } /// - /// Utility method for - /// + /// Utility method for + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { - UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars); + 
UTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), chars); } /// - /// Utility method for - /// + /// Utility method for + /// public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars) { - return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars); + return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), out chars); } } } From f131d978a9cb3fa5e8f89c5aa91ee58ab268b00e Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Tue, 17 Dec 2024 15:43:49 -0700 Subject: [PATCH 05/10] Move ToCharArray methods to ObsoleteAPI for removal in 4.8.0 RC, #1024 --- .../Support/ObsoleteAPI/UnicodeUtil.cs | 175 ++++++++++++++++++ src/Lucene.Net/Util/UnicodeUtil.cs | 85 +-------- 2 files changed, 178 insertions(+), 82 deletions(-) create mode 100644 src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs diff --git a/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs new file mode 100644 index 0000000000..325218f5c8 --- /dev/null +++ b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs @@ -0,0 +1,175 @@ +using Lucene.Net.Support; +using System; + +#nullable enable + +namespace Lucene.Net.Util +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /* + * Some of this code came from the excellent Unicode + * conversion examples from: + * + * http://www.unicode.org/Public/PROGRAMS/CVTUTF + * + * Full Copyright for that code follows: + */ + + /* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * this source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute this Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + /* + * Additional code came from the IBM ICU library. + * + * http://www.icu-project.org + * + * Full Copyright for that code follows. + */ + + /* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN this NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF this SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + + public static partial class UnicodeUtil + { + /// + /// Generates char array that represents the provided input code points. + /// + /// LUCENENET specific. + /// + /// The code array. + /// The start of the text in the code point array. + /// The number of code points. + /// a char array representing the code points between offset and count. + // LUCENENET NOTE: This code was originally in the NewString() method. 
+ // It has been refactored from the original to remove the exception throw/catch and + // instead proactively resizes the array instead of relying on exceptions + copy operations + [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)] + public static char[] ToCharArray(int[] codePoints, int offset, int count) + { + return ToCharArray(codePoints.AsSpan(offset), count); + } + + /// + /// Generates char array that represents the provided input code points. + /// + /// LUCENENET specific. + /// + /// The code span. + /// The number of code points. + /// a char array representing the first count code points of the span. + // LUCENENET NOTE: This code was originally in the NewString() method. + // It has been refactored from the original to remove the exception throw/catch and + // instead proactively resizes the array instead of relying on exceptions + copy operations + [Obsolete("Use NewString method instead.
This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)] + public static char[] ToCharArray(ReadOnlySpan codePoints, int count) + { + if (count < 0) + { + throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) + } + const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 + // LUCENENET: as a first approximation, assume each codepoint + // is 2 characters (since it cannot be longer than this) + int arrayLength = count * 2; + // LUCENENET: if we go over the threshold, count the number of + // chars we will need so we can allocate the precise amount of memory + if (count > countThreshold) + { + arrayLength = 0; + for (int r = 0; r < count; ++r) + { + arrayLength += codePoints[r] < 0x010000 ? 1 : 2; + } + if (arrayLength < 1) + { + arrayLength = count * 2; + } + } + // Initialize our array to our exact or oversized length. + // It is now safe to assume we have enough space for all of the characters. 
+ char[] chars = new char[arrayLength]; + int w = 0; + for (int r = 0; r < count; ++r) + { + int cp = codePoints[r]; + if (cp < 0 || cp > 0x10ffff) + { + throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints)); + } + if (cp < 0x010000) + { + chars[w++] = (char)cp; + } + else + { + chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); + chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); + } + } + + var result = new char[w]; + Arrays.Copy(chars, result, w); + return result; + } + } +} diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index af53ec15d5..ee7b66c041 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -101,7 +101,7 @@ namespace Lucene.Net.Util /// /// @lucene.internal /// - public static class UnicodeUtil + public static partial class UnicodeUtil { /// /// A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms @@ -818,7 +818,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) public static string NewString(int[] codePoints, int offset, int count) { // LUCENENET: Character.ToString() was optimized to use the stack for arrays - // of codepoints 256 or less, so it performs better than using ToCharArray(). + // of codepoints 256 or less, so it performs better than the Lucene implementation. return Character.ToString(codePoints, offset, count); } @@ -835,89 +835,10 @@ public static string NewString(int[] codePoints, int offset, int count) public static string NewString(ReadOnlySpan codePoints, int offset, int count) { // LUCENENET: Character.ToString() was optimized to use the stack for arrays - // of codepoints 256 or less, so it performs better than using ToCharArray(). + // of codepoints 256 or less, so it performs better than the Lucene implementation. 
return Character.ToString(codePoints, offset, count); } - /// - /// Generates char array that represents the provided input code points. - /// - /// LUCENENET specific. - /// - /// The code array. - /// The start of the text in the code point array. - /// The number of code points. - /// a char array representing the code points between offset and count. - // LUCENENET NOTE: This code was originally in the NewString() method (above). - // It has been refactored from the original to remove the exception throw/catch and - // instead proactively resizes the array instead of relying on exceptions + copy operations - public static char[] ToCharArray(int[] codePoints, int offset, int count) - { - return ToCharArray(codePoints.AsSpan(offset), count); - } - - /// - /// Generates char array that represents the provided input code points. - /// - /// LUCENENET specific. - /// - /// The code span. - /// The number of code points. - /// a char array representing the code points between offset and count. - // LUCENENET NOTE: This code was originally in the NewString() method (above). 
- // It has been refactored from the original to remove the exception throw/catch and - // instead proactively resizes the array instead of relying on exceptions + copy operations - public static char[] ToCharArray(ReadOnlySpan codePoints, int count) - { - if (count < 0) - { - throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) - } - const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 - // LUCENENET: as a first approximation, assume each codepoint - // is 2 characters (since it cannot be longer than this) - int arrayLength = count * 2; - // LUCENENET: if we go over the threshold, count the number of - // chars we will need so we can allocate the precise amount of memory - if (count > countThreshold) - { - arrayLength = 0; - for (int r = 0; r < count; ++r) - { - arrayLength += codePoints[r] < 0x010000 ? 1 : 2; - } - if (arrayLength < 1) - { - arrayLength = count * 2; - } - } - // Initialize our array to our exact or oversized length. - // It is now safe to assume we have enough space for all of the characters. 
- char[] chars = new char[arrayLength]; - int w = 0; - for (int r = 0; r < count; ++r) - { - int cp = codePoints[r]; - if (cp < 0 || cp > 0x10ffff) - { - throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints)); - } - if (cp < 0x010000) - { - chars[w++] = (char)cp; - } - else - { - chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); - chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); - } - } - - var result = new char[w]; - Arrays.Copy(chars, result, w); - return result; - } - // for debugging public static string ToHexString(string s) { From 9a13647cc0d8b413c736e4f4a11e10223c96213a Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Tue, 17 Dec 2024 16:46:49 -0700 Subject: [PATCH 06/10] Add fallback version of UTF8toUTF16 --- src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs | 12 +++ .../ExceptionHandling/ExceptionExtensions.cs | 4 +- src/Lucene.Net/Util/BytesRef.cs | 17 ++++ src/Lucene.Net/Util/UnicodeUtil.cs | 90 +++++++++++++++++-- 4 files changed, 115 insertions(+), 8 deletions(-) diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs index be98e7a3d4..716124af5a 100644 --- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs +++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs @@ -340,5 +340,17 @@ public void TestTryUTF8toUTF16() Assert.IsTrue(success); Assert.AreEqual(unicode, chars?.ToString()); } + + [Test] + [LuceneNetSpecific] // this is a Lucene.NET specific method + public void TestUTF8toUTF16WithFallback() + { + byte[] invalidUtf8 = { 0x63, 0xc3 }; // Invalid ending UTF-8 sequence + var scratch = new CharsRef(); + + UnicodeUtil.UTF8toUTF16WithFallback(invalidUtf8, scratch); + + Assert.AreEqual("c\ufffd", scratch.ToString()); + } } } diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs index 5a25bf64d2..1c3ed526ad 100644 --- 
a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs +++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs @@ -305,7 +305,7 @@ public static bool IsNoSuchFileExceptionOrFileNotFoundException(this Exception e [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsParseException(this Exception e) { - // LUCENNET: Added this exception in J2N to cover this case because it is not a RuntimeException + // LUCENENET: Added this exception in J2N to cover this case because it is not a RuntimeException // which makes it different from NumberFormatException in Java and FormatException in .NET. return e is ParseException; } @@ -591,7 +591,7 @@ public static bool IsIllegalStateException(this Exception e) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsStackOverflowError(this Exception e) { - return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with + return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with } /// diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs index 2e3679f8e0..a7c339f7c9 100644 --- a/src/Lucene.Net/Util/BytesRef.cs +++ b/src/Lucene.Net/Util/BytesRef.cs @@ -243,6 +243,23 @@ public string Utf8ToString() return @ref.ToString(); } + /// + /// Interprets stored bytes as UTF8 bytes, returning the + /// resulting . + /// + /// + /// LUCENENET specific version that does not throw exceptions, + /// primarily for use in ToString() and other methods that + /// should not throw exceptions. 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public string Utf8ToStringWithFallback() + { + CharsRef @ref = new CharsRef(Length); + UnicodeUtil.UTF8toUTF16WithFallback(bytes, Offset, Length, @ref); + return @ref.ToString(); + } + #nullable enable /// /// Tries to interpret the stored bytes as UTF8 bytes, returning the diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index ee7b66c041..8bcb24f450 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -952,6 +952,90 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) chars.Length = out_offset - chars.Offset; } + /// + /// Interprets the given byte array as UTF-8 and converts to UTF-16. The will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: This method will replace any invalid UTF-8 byte sequences with the Unicode replacement character U+FFFD. + /// + /// + /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions. + /// + /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16WithFallback(byte[] utf8, int offset, int length, CharsRef chars) + { + UTF8toUTF16WithFallback(utf8.AsSpan(offset, length), chars); + } + + /// + /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: This method will replace any invalid UTF-8 byte sequences with the Unicode replacement character U+FFFD. + /// + /// + /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions.
+ /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16WithFallback(ReadOnlySpan utf8, CharsRef chars) + { + int out_offset = chars.Offset = 0; + char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); + int i = 0; + + while (i < utf8.Length) + { + int b = utf8[i++] & 0xff; + if (b < 0xc0) + { + // A stray continuation byte (0x80-0xBF) cannot start a sequence: substitute U+FFFD instead of asserting. + @out[out_offset++] = b < 0x80 ? (char)b : (char)0xfffd; + } + else if (b < 0xe0) + { + if (utf8.Length <= i) + { + @out[out_offset++] = (char)0xfffd; + continue; + } + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); + } + else if (b < 0xf0) + { + if (utf8.Length <= i + 1) + { + @out[out_offset++] = (char)0xfffd; + break; + } + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; + } + else + { + if (utf8.Length <= i + 2) + { + @out[out_offset++] = (char)0xfffd; + break; + } + if (b >= 0xf8) { @out[out_offset++] = (char)0xfffd; continue; } // invalid lead byte: substitute U+FFFD instead of asserting + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; + if (ch < UNI_MAX_BMP) + { + @out[out_offset++] = (char)ch; + } + else + { + int chHalf = ch - 0x0010000; + @out[out_offset++] = (char)((chHalf >> 10) + 0xD800); + @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00); + } + } + } + chars.Length = out_offset - chars.Offset; + } + /// /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new .
/// @@ -983,12 +1067,6 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, [NotNullWhen(true)] o while (i < utf8.Length) { - if (utf8.Length <= i) - { - chars = null; - return false; - } - int b = utf8[i++] & 0xff; if (b < 0xc0) { From a04df3a197cd18fe7d4ef979f2d979346a629359 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Tue, 17 Dec 2024 20:54:56 -0700 Subject: [PATCH 07/10] Throw ParseException on out of range in UTF8toUTF16, add more tests --- src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs | 32 +++++++++++++++++--- src/Lucene.Net/Util/UnicodeUtil.cs | 12 ++++++++ 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs index 716124af5a..6cad0a4e4e 100644 --- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs +++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs @@ -328,8 +328,29 @@ public virtual void TestUTF8UTF16CharsRef() } } + [Test] + [LuceneNetSpecific] + [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, start of 2-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, start of 3-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, start of 4-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon + public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow) + { + var scratch = new CharsRef(); + + if (shouldThrow) + { + Assert.Throws(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch)); + } + else + { + UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch); + } + } + [Test] [LuceneNetSpecific] // this is a Lucene.NET specific method + [Repeat(100)] public void TestTryUTF8toUTF16() { string unicode = TestUtil.RandomRealisticUnicodeString(Random); @@ -343,14 +364,17 @@ public void TestTryUTF8toUTF16() [Test] [LuceneNetSpecific] // this is a Lucene.NET specific method - public void TestUTF8toUTF16WithFallback() + [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, 
start of 2-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")] + public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected) { - byte[] invalidUtf8 = { 0x63, 0xc3 }; // Invalid ending UTF-8 sequence var scratch = new CharsRef(); - UnicodeUtil.UTF8toUTF16WithFallback(invalidUtf8, scratch); + UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch); - Assert.AreEqual("c\ufffd", scratch.ToString()); + Assert.AreEqual(expected, scratch.ToString()); } } } diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 8bcb24f450..434ca2d265 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -925,15 +925,27 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) } else if (b < 0xe0) { + if (utf8.Length <= i) + { + throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1); + } @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } else if (b < 0xf0) { + if (utf8.Length <= i + 1) + { + throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1); + } @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); i += 2; } else { + if (utf8.Length <= i + 2) + { + throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1); + } if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); i += 3; From 3f2dbc13bdc1c9574974f5fe972216228a1d09ea Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Tue, 17 Dec 2024 21:16:09 -0700 Subject: [PATCH 08/10] Use Utf8ToStringWithFallback in ToString and exception/logging message building 
--- .../SimpleText/SimpleTextUtil.cs | 7 ++-- .../DefaultSortedSetDocValuesReaderState.cs | 9 ++--- .../AbstractGroupFacetCollector.cs | 2 +- .../TermsIncludingScoreQuery.cs | 34 +++++++++---------- src/Lucene.Net.Misc/Misc/TermStats.cs | 3 +- src/Lucene.Net.Queries/TermsFilter.cs | 2 +- .../Suggest/Fst/FSTCompletion.cs | 3 +- src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs | 6 ++-- .../Codecs/Lucene3x/Lucene3xFields.cs | 23 ++++++++----- src/Lucene.Net/Util/BytesRef.cs | 12 +++---- 10 files changed, 56 insertions(+), 45 deletions(-) diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs index 695e817552..c3b8f41e6b 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs @@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch) { break; } - + scratch.Bytes[upto++] = b; } } @@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input) if (StringHelper.StartsWith(scratch, CHECKSUM) == false) { + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + - scratch.Utf8ToString() + " (resource=" + input + ")"); + scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")"); } var actualChecksum = (new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString(); @@ -124,4 +125,4 @@ public static void CheckFooter(ChecksumIndexInput input) } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs index 91d4c355ab..3ca21e13f5 100644 --- a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs +++ b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs @@ -41,7 +41,7 @@ public class 
DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat /// /// Creates this, pulling doc values from the specified - /// field. + /// field. /// public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME) { @@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F string[] components = FacetsConfig.StringToPath(spare.Utf8ToString()); if (components.Length != 2) { - throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString()); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback()); } if (!components[0].Equals(lastDim, StringComparison.Ordinal)) { @@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F /// /// Return top-level doc values. 
/// - public override SortedSetDocValues GetDocValues() + public override SortedSetDocValues GetDocValues() { return topReader.GetSortedSetDocValues(field); } @@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim) /// public override int Count => valueCount; } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs index 8abb12520e..89599b49a6 100644 --- a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs +++ b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs @@ -275,7 +275,7 @@ public override int GetHashCode() public override string ToString() { return "FacetEntry{" + - "value=" + value.Utf8ToString() + + "value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes ", count=" + count + '}'; } diff --git a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs index 9d91aa64c1..9a5c9c1476 100644 --- a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs +++ b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs @@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi private TermsEnum segmentTermsEnum; - + public override Explanation Explain(AtomicReaderContext context, int doc) { SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null); @@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost) { originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost); } - + public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) { Terms terms = context.AtomicReader.GetTerms(outerInstance._field); @@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost); } - + public override 
BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs) { if (scoreDocsInOrder) @@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, //_cost = cost; // LUCENENET: Never read _doc = -1; } - + public override bool Score(ICollector collector, int max) { FakeScorer fakeScorer = new FakeScorer(); @@ -285,12 +285,12 @@ private int NextDocOutOfOrder() } } } - + protected virtual int DocsEnumNextDoc() { return docsEnum.NextDoc(); } - + internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal { int docId; @@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit } while (docId != DocIdSetIterator.NO_MORE_DOCS); return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]], - "Score based on join value " + _termsEnum.Term.Utf8ToString()); + "Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes } } @@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer internal readonly FixedBitSet alreadyEmittedDocs; internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */ - IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) + IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) : base(outerInstance, /*weight, // LUCENENET: Never read */ acceptDocs, termsEnum /*, cost // LUCENENET: Never read */) { alreadyEmittedDocs = new FixedBitSet(maxDoc); } - + protected override int DocsEnumNextDoc() { while (true) @@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer internal readonly long cost; internal int currentDoc = -1; - + [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud 
issue")] [SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")] internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs, - TermsEnum termsEnum, int maxDoc, long cost) + TermsEnum termsEnum, int maxDoc, long cost) : base(weight) { this.m_outerInstance = outerInstance; @@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, matchingDocsIterator = matchingDocs.GetIterator(); this.cost = cost; } - + protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs, TermsEnum termsEnum) { @@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD } } } - + public override float GetScore() { return scores[currentDoc]; } - + public override int Freq => 1; public override int DocID => currentDoc; @@ -412,7 +412,7 @@ public override int NextDoc() { return currentDoc = matchingDocsIterator.NextDoc(); } - + public override int Advance(int target) { return currentDoc = matchingDocsIterator.Advance(target); @@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, : base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost) { } - + protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs, TermsEnum termsEnum) { @@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Misc/Misc/TermStats.cs b/src/Lucene.Net.Misc/Misc/TermStats.cs index 7ac5fbc918..c2d1664870 100644 --- a/src/Lucene.Net.Misc/Misc/TermStats.cs +++ b/src/Lucene.Net.Misc/Misc/TermStats.cs @@ -45,7 +45,8 @@ internal string GetTermText() public override string ToString() { - return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq); + // LUCENENET specific - 
use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq; } } } diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs index 3aae8295f3..ced20eed59 100644 --- a/src/Lucene.Net.Queries/TermsFilter.cs +++ b/src/Lucene.Net.Queries/TermsFilter.cs @@ -319,7 +319,7 @@ public override string ToString() } first = false; builder.Append(current.field).Append(':'); - builder.Append(spare.Utf8ToString()); + builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes } } diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs index 06587b33ca..ae71302ae0 100644 --- a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs +++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs @@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket) public override string ToString() { - return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); } /// diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs index cdace9c1cc..e771023d59 100644 --- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs +++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs @@ -440,7 +440,8 @@ public PendingTerm(BytesRef term, BlockTermState state) public override string ToString() { - return Term.Utf8ToString(); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return Term.Utf8ToStringWithFallback(); } } @@ -468,7 +469,8 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f public 
override string ToString() { - return $"BLOCK: {Prefix.Utf8ToString()}"; + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return $"BLOCK: {Prefix.Utf8ToStringWithFallback()}"; } #nullable enable diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs index 26a5b54a7a..fc941fab63 100644 --- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs +++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs @@ -48,7 +48,7 @@ namespace Lucene.Net.Codecs.Lucene3x /// /// Exposes flex API on a pre-flex index, as a codec. /// - /// @lucene.experimental + /// @lucene.experimental /// [Obsolete("(4.0)")] internal class Lucene3xFields : FieldsProducer @@ -344,7 +344,8 @@ private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) if (DEBUG_SURROGATES) { - Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback())); } // Seek "back": @@ -488,7 +489,8 @@ private bool DoPop() if (DEBUG_SURROGATES) { - Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString()); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString()); } // TODO: more efficient seek? 
can we simply swap @@ -599,10 +601,11 @@ private void SurrogateDance() if (DEBUG_SURROGATES) { + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes Console.WriteLine(" dance"); - Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString())); + Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToStringWithFallback())); Console.WriteLine(" " + prevTerm.ToString()); - Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString())); + Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback())); Console.WriteLine(" " + scratchTerm.ToString()); } @@ -679,7 +682,8 @@ private void DoPushes() if (DEBUG_SURROGATES) { - Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); } // Seek "forward": @@ -777,7 +781,7 @@ internal virtual void Reset(FieldInfo fieldInfo) { //System.out.println("pff.reset te=" + termEnum); this.fieldInfo = fieldInfo; - + internedFieldName = fieldInfo.Name.Intern(); Term term = new Term(internedFieldName); @@ -832,7 +836,8 @@ public override SeekStatus SeekCeil(BytesRef term) { if (DEBUG_SURROGATES) { - Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString())); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback())); } skipNext = false; TermInfosReader tis = outerInstance.TermsDict; @@ -1232,4 +1237,4 @@ public override void CheckIntegrity() { } } -} \ No newline at end of file +} diff --git 
a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs index a7c339f7c9..0124269216 100644 --- a/src/Lucene.Net/Util/BytesRef.cs +++ b/src/Lucene.Net/Util/BytesRef.cs @@ -248,9 +248,9 @@ public string Utf8ToString() /// resulting . /// /// - /// LUCENENET specific version that does not throw exceptions, - /// primarily for use in ToString() and other methods that - /// should not throw exceptions. + /// LUCENENET specific version that does not throw exceptions on invalid UTF-8, + /// primarily for use in ToString() and other cases that should not throw exceptions, + /// such as when building a message for another exception. /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public string Utf8ToStringWithFallback() @@ -604,11 +604,11 @@ public override string ToString() switch (format) { case BytesRefFormat.UTF8: - try + if (bytesRef.TryUtf8ToString(out var utf8String)) { - return bytesRef.Utf8ToString(); + return utf8String; } - catch (Exception e) when (e.IsIndexOutOfBoundsException()) + else { return bytesRef.ToString(); } From 76bb2da03527ffe86b2b40b12a42bbdc5c60601d Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 18 Dec 2024 08:41:46 -0700 Subject: [PATCH 09/10] Use FormatException instead of ParseException --- src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs | 2 +- src/Lucene.Net/Util/UnicodeUtil.cs | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs index 6cad0a4e4e..a35c0a4d62 100644 --- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs +++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs @@ -340,7 +340,7 @@ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow) if (shouldThrow) { - Assert.Throws(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch)); + Assert.Throws(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch)); } else { diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs 
b/src/Lucene.Net/Util/UnicodeUtil.cs index 434ca2d265..9a08d8bb4c 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -1,7 +1,6 @@ using J2N; using J2N.Text; using Lucene.Net.Diagnostics; -using Lucene.Net.Support; using System; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; @@ -887,7 +886,7 @@ public static string ToHexString(string s) /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. /// /// NOTE: Full characters are read, even if this reads past the length passed (and - /// can result in an if invalid UTF-8 is passed). + /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// /// @@ -902,7 +901,7 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. /// /// NOTE: Full characters are read, even if this reads past the length passed (and - /// can result in an if invalid UTF-8 is passed). + /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. 
/// /// @@ -927,7 +926,7 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) { if (utf8.Length <= i) { - throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1); + throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}"); } @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } @@ -935,7 +934,7 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) { if (utf8.Length <= i + 1) { - throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1); + throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}"); } @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); i += 2; @@ -944,7 +943,7 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) { if (utf8.Length <= i + 2) { - throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1); + throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}"); } if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); From 99d2076c74b04a3923d2e189cf7e1ec647c0bdbf Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Wed, 18 Dec 2024 08:45:35 -0700 Subject: [PATCH 10/10] Add FormatException case to Utf8ToString() catch --- src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs index 8b8b08e6e0..1125feaffc 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs @@ -407,10 +407,10 @@ public override int GetOrd(int docId) SimpleTextUtil.ReadLine(_input, _scratch); try { - // 
LUCNENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in. + // LUCENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in. return int.Parse(_scratch.Utf8ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture) - 1; } - catch (Exception pe) when (pe.IsParseException()) + catch (Exception pe) when (pe.IsParseException() || pe.IsNumberFormatException()) { var e = new CorruptIndexException($"failed to parse ord (resource={_input})", pe); throw e;