From 1845c38fcc37c6d335b5347488fb3da326761317 Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Wed, 4 Dec 2024 16:52:33 -0700
Subject: [PATCH 01/10] UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan
 methods, #1024

---
 src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs |  51 +++++---
 src/Lucene.Net/Util/BytesRef.cs               |  20 +++
 src/Lucene.Net/Util/UnicodeUtil.cs            | 122 ++++++++++++++----
 3 files changed, 152 insertions(+), 41 deletions(-)
diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
index bab16426f7..cdace9c1cc 100644
--- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
+++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
@@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs
     public static class BlockTreeTermsWriter
     {
         /// <summary>
-        /// Suggested default value for the 
-        /// <c>minItemsInBlock</c> parameter to 
+        /// Suggested default value for the
+        /// <c>minItemsInBlock</c> parameter to
         /// <see cref="BlockTreeTermsWriter{TSubclassState}(SegmentWriteState, PostingsWriterBase, int, int, TSubclassState)"/>.
         /// </summary>
         public const int DEFAULT_MIN_BLOCK_SIZE = 25;
 
         /// <summary>
-        /// Suggested default value for the 
-        /// <c>maxItemsInBlock</c> parameter to 
+        /// Suggested default value for the
+        /// <c>maxItemsInBlock</c> parameter to
         /// <see cref="BlockTreeTermsWriter{TSubclassState}(SegmentWriteState, PostingsWriterBase, int, int, TSubclassState)"/>.
         /// </summary>
         public const int DEFAULT_MAX_BLOCK_SIZE = 48;
@@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long
         /// to set state. It is *optional* and can be used when overriding the WriteHeader(),
         /// WriteIndexHeader(). It only matters in the case where the state
         /// is required inside of any of those methods that is passed in to the subclass constructor.
-        /// 
+        ///
         /// When passed to the constructor, it is set to the protected field m_subclassState before
         /// any of the above methods are called where it is available for reading when overriding the above methods.
-        /// 
+        ///
         /// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so.
-        /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, 
+        /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
         /// so the overrides of those methods won't specifically need to use this field (although they could for consistency).
         /// </param>
         [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
@@ -468,7 +468,20 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f
 
             public override string ToString()
             {
-                return "BLOCK: " + Prefix.Utf8ToString();
+                return $"BLOCK: {Prefix.Utf8ToString()}";
+            }
+
+            #nullable enable
+            /// <summary>Non-throwing counterpart of <see cref="ToString()"/>; <paramref name="result"/> is <c>null</c> when <c>Prefix.TryUtf8ToString</c> fails.</summary>
+            public bool TryToString([NotNullWhen(true)] out string? result)
+            {
+                if (!Prefix.TryUtf8ToString(out string? decodedPrefix))
+                {
+                    result = null;
+                    return false;
+                }
+                result = $"BLOCK: {decodedPrefix}";
+                return true;
+            }
 
             // LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions
@@ -476,12 +489,11 @@ public override string ToString()
             // to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors.
             // This struct defers formatting the string until it is actually used as a parameter
             // in string.Format().
-            private struct PendingBlocksFormatter // For assert
+            private readonly struct PendingBlocksFormatter // For assert
             {
-#pragma warning disable IDE0044 // Add readonly modifier
-                private IList<PendingBlock> blocks;
-#pragma warning restore IDE0044 // Add readonly modifier
-                public PendingBlocksFormatter(IList<PendingBlock> blocks)
+                private readonly IList<PendingBlock>? blocks;
+
+                public PendingBlocksFormatter(IList<PendingBlock>? blocks)
                 {
                     this.blocks = blocks; // May be null
                 }
@@ -500,17 +512,17 @@ public override string ToString() // For assert
                     it.MoveNext();
                     while (true)
                     {
-                        var e = it.Current;
+                        var e = it.Current ?? throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above.");
                         // There is a chance that the Prefix will contain invalid UTF8,
                         // so we catch that and use the alternative way of displaying it
-                        try
+                        if (e.TryToString(out string? eString))
                         {
-                            sb.Append(e.ToString());
+                            sb.Append(eString);
                         }
-                        catch (IndexOutOfRangeException)
+                        else
                         {
                             sb.Append("BLOCK: ");
-                            sb.Append(e.Prefix.ToString());
+                            sb.Append(e.Prefix);
                         }
                         if (!it.MoveNext())
                         {
@@ -520,6 +532,7 @@ public override string ToString() // For assert
                     }
                 }
             }
+            #nullable restore
 
             public void CompileIndex(IList<PendingBlock> floorBlocks, RAMOutputStream scratchBytes)
             {
@@ -1351,4 +1364,4 @@ protected override void Dispose(bool disposing)
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
index 8012c77282..2e3679f8e0 100644
--- a/src/Lucene.Net/Util/BytesRef.cs
+++ b/src/Lucene.Net/Util/BytesRef.cs
@@ -243,6 +243,26 @@ public string Utf8ToString()
             return @ref.ToString();
         }
 
+        #nullable enable
+        /// <summary>
+        /// Non-throwing counterpart of <see cref="Utf8ToString()"/>: attempts to decode the
+        /// stored bytes as UTF8 and hands the text back through <paramref name="result"/>.
+        /// </summary>
+        /// <param name="result">The decoded string on success; <c>null</c> on failure.</param>
+        /// <returns><c>true</c> if the bytes could be decoded, <c>false</c> otherwise.</returns>
+        public bool TryUtf8ToString([NotNullWhen(true)] out string? result)
+        {
+            if (!UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? decoded))
+            {
+                result = null;
+                return false;
+            }
+
+            result = decoded.ToString();
+            return true;
+        }
+        #nullable restore
+
         /// <summary>
         /// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] </summary>
         public override string ToString()
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 5974af1a16..65dd2fabc9 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -3,6 +3,7 @@
 using Lucene.Net.Diagnostics;
 using Lucene.Net.Support;
 using System;
+using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Text;
 
@@ -123,13 +124,13 @@ public static class UnicodeUtil
         private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
 
         /// <summary>
-        /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
+        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
         /// and ending at <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
         /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
         // TODO: broken if incoming result.offset != 0
         // LUCENENET specific overload
-        public static void UTF16toUTF8(Span<char> source, BytesRef result)
+        public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
         {
             // LUCENENET: Added guard clause
             if (result is null)
@@ -200,7 +201,7 @@ public static void UTF16toUTF8(Span<char> source, BytesRef result)
         }
 
         /// <summary>
-        /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
+        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
         /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
         /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
@@ -212,11 +213,9 @@ public static void UTF16toUTF8(Span<char> source, BytesRef result)
         /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
         /// </exception>
         // TODO: broken if incoming result.offset != 0
-        public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
+        public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length, BytesRef result)
         {
             // LUCENENET: Added guard clauses
-            if (source is null)
-                throw new ArgumentNullException(nameof(source));
             if (result is null)
                 throw new ArgumentNullException(nameof(result));
             if (offset < 0)
@@ -633,7 +632,7 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
             return true;
         }
 
-        public static bool ValidUTF16String(char[] s, int size)
+        public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
         {
             for (int i = 0; i < size; i++)
             {
@@ -828,16 +827,16 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
         private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
 
         /// <summary>
-        /// Cover JDK 1.5 API. Create a String from an array of <paramref name="codePoints"/>.
+        /// Cover JDK 1.5 API. Create a String from a span of <paramref name="codePoints"/>.
         /// </summary>
-        /// <param name="codePoints"> The code array. </param>
-        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="codePoints"> The code point span. </param>
+        /// <param name="offset"> The start of the text in the code point span. </param>
         /// <param name="count"> The number of code points. </param>
         /// <returns> a String representing the code points between offset and count. </returns>
         /// <exception cref="ArgumentException"> If an invalid code point is encountered. </exception>
         /// <exception cref="IndexOutOfRangeException"> If the offset or count are out of bounds. </exception>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public static string NewString(int[] codePoints, int offset, int count)
+        public static string NewString(ReadOnlySpan<int> codePoints, int offset, int count)
         {
             // LUCENENET: Character.ToString() was optimized to use the stack for arrays
             // of codepoints 256 or less, so it performs better than using ToCharArray().
@@ -849,26 +848,26 @@ public static string NewString(int[] codePoints, int offset, int count)
         /// <para/>
         /// LUCENENET specific.
         /// </summary>
-        /// <param name="codePoints"> The code array. </param>
-        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="codePoints"> The code span. </param>
+        /// <param name="offset"> The start of the text in the code point span. </param>
         /// <param name="count"> The number of code points. </param>
         /// <returns> a char array representing the code points between offset and count. </returns>
         // LUCENENET NOTE: This code was originally in the NewString() method (above).
         // It has been refactored from the original to remove the exception throw/catch and
-        // instead proactively resizes the array instead of relying on excpetions + copy operations
-        public static char[] ToCharArray(int[] codePoints, int offset, int count)
+        // instead proactively resizes the array instead of relying on exceptions + copy operations
+        public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int count)
         {
             if (count < 0)
             {
                 throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
             }
-            const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
+            const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
             // LUCENENET: as a first approximation, assume each codepoint
             // is 2 characters (since it cannot be longer than this)
             int arrayLength = count * 2;
-            // LUCENENET: if we go over the threashold, count the number of
+            // LUCENENET: if we go over the threshold, count the number of
             // chars we will need so we can allocate the precise amount of memory
-            if (count > countThreashold)
+            if (count > countThreshold)
             {
                 arrayLength = 0;
                 for (int r = offset, e = offset + count; r < e; ++r)
@@ -951,15 +950,18 @@ public static string ToHexString(string s)
         }
 
         /// <summary>
-        /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+        /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
         /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
         /// <para/>
         /// NOTE: Full characters are read, even if this reads past the length passed (and
         /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
         /// Explicit checks for valid UTF-8 are not performed.
         /// </summary>
+        /// <remarks>
+        /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+        /// </remarks>
         // TODO: broken if chars.offset != 0
-        public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
+        public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, CharsRef chars)
         {
             int out_offset = chars.Offset = 0;
             char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
@@ -1001,9 +1003,85 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
             chars.Length = out_offset - chars.Offset;
         }
 
+        #nullable enable
+        /// <summary>
+        /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
+        /// Only the <paramref name="length"/> bytes starting at <paramref name="offset"/> are read: if that window is not inside
+        /// <paramref name="utf8"/>, or a multi-byte sequence is cut off by the end of the window, <c>false</c> is returned.
+        /// <para/>
+        /// NOTE: Explicit checks for valid UTF-8 are not performed.
+        /// </summary>
+        /// <remarks>LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].</remarks>
+        /// <returns><c>true</c> with the decoded text in <paramref name="chars"/>; otherwise <c>false</c> and <paramref name="chars"/> is <c>null</c>.</returns>
+        public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
+        {
+            if (offset < 0 || length < 0 || offset > utf8.Length - length) // window must lie inside the span (written to avoid int overflow)
+            {
+                chars = null;
+                return false;
+            }
+            CharsRef result = new CharsRef(length); // never more chars than window bytes, because sequences may not run past the window
+            int out_offset = 0;
+            char[] @out = result.Chars;
+            int limit = offset + length;
+            while (offset < limit)
+            {
+                int b = utf8[offset++] & 0xff;
+                if (b < 0xc0)
+                {
+                    if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
+                    @out[out_offset++] = (char)b;
+                }
+                else if (b < 0xe0)
+                {
+                    if (limit <= offset) // the single trail byte would lie outside the window
+                    {
+                        chars = null;
+                        return false;
+                    }
+                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
+                }
+                else if (b < 0xf0)
+                {
+                    if (limit <= offset + 1) // fewer than two trail bytes left in the window
+                    {
+                        chars = null;
+                        return false;
+                    }
+                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
+                    offset += 2;
+                }
+                else
+                {
+                    if (limit <= offset + 2) // fewer than three trail bytes left in the window
+                    {
+                        chars = null;
+                        return false;
+                    }
+                    if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
+                    int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
+                    offset += 3;
+                    if (ch < UNI_MAX_BMP)
+                    {
+                        @out[out_offset++] = (char)ch;
+                    }
+                    else
+                    {
+                        int chHalf = ch - 0x0010000; // split the supplementary code point into a surrogate pair
+                        @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
+                        @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
+                    }
+                }
+            }
+            result.Length = out_offset;
+            chars = result;
+            return true;
+        }
+        #nullable restore
+
         /// <summary>
-        /// Utility method for <see cref="UTF8toUTF16(byte[], int, int, CharsRef)"/> </summary>
-        /// <seealso cref="UTF8toUTF16(byte[], int, int, CharsRef)"/>
+        /// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/> </summary>
+        /// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
         {

From e24d99c5d64ba97fe6dd23a8b15f78afa4ab1deb Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Wed, 4 Dec 2024 20:43:04 -0700
Subject: [PATCH 02/10] Add back array overloads; add unit test for
 TryUTF8toUTF16

---
 src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs |  13 ++
 src/Lucene.Net/Util/UnicodeUtil.cs           | 189 ++++++++++++++++---
 2 files changed, 171 insertions(+), 31 deletions(-)

diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
index bb8e736a7c..be98e7a3d4 100644
--- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
+++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -327,5 +327,18 @@ public virtual void TestUTF8UTF16CharsRef()
                 Assert.AreEqual(cRef.ToString(), unicode);
             }
         }
+
+        [Test]
+        [LuceneNetSpecific] // this is a Lucene.NET specific method
+        public void TestTryUTF8toUTF16()
+        {
+            // Arrange: encode a random string to UTF-8 bytes via IOUtils.CHARSET_UTF_8
+            string expected = TestUtil.RandomRealisticUnicodeString(Random);
+            BytesRef encoded = new BytesRef(IOUtils.CHARSET_UTF_8.GetBytes(expected));
+
+            // Act + Assert: decoding must succeed and round-trip to the same text
+            Assert.IsTrue(UnicodeUtil.TryUTF8toUTF16(encoded, out var decoded));
+            Assert.AreEqual(expected, decoded?.ToString());
+        }
     }
 }
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 65dd2fabc9..3069ef0379 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -6,6 +6,7 @@
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Text;
+#nullable enable
 
 namespace Lucene.Net.Util
 {
@@ -108,7 +109,10 @@ public static class UnicodeUtil
         /// <para/>
         /// WARNING: this is not a valid UTF8 Term
         /// </summary>
-        public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }); // TODO this is unrelated here find a better place for it
+        public static readonly BytesRef BIG_TERM = new BytesRef(new byte[]
+        {
+            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+        }); // TODO this is unrelated here find a better place for it
 
         public const int UNI_SUR_HIGH_START = 0xD800;
         public const int UNI_SUR_HIGH_END = 0xDBFF;
@@ -121,7 +125,8 @@ public static class UnicodeUtil
         private const long HALF_SHIFT = 10;
         private const long HALF_MASK = 0x3FFL;
 
-        private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
+        private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint -
+                                             (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
 
         /// <summary>
         /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
@@ -149,6 +154,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
             {
                 @out = result.Bytes = new byte[maxLen];
             }
+
             result.Offset = 0;
 
             while (i < end)
@@ -189,6 +195,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
                             continue;
                         }
                     }
+
                     // replace unpaired surrogate or out-of-order low surrogate
                     // with substitution character
                     @out[upto++] = 0xEF;
@@ -196,12 +203,13 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
                     @out[upto++] = 0xBD;
                 }
             }
+
             //assert matches(source, offset, length, out, upto);
             result.Length = upto;
         }
 
         /// <summary>
-        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
+        /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at
         /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
         /// </summary>
         /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception>
@@ -213,6 +221,31 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
         /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
         /// </exception>
         // TODO: broken if incoming result.offset != 0
+        public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result)
+        {
+            // LUCENENET: The null guard lives here because a null array would otherwise be
+            // wrapped as an empty span; the remaining argument checks happen in the span overload.
+            ReadOnlySpan<char> chars = source ?? throw new ArgumentNullException(nameof(source));
+
+            UTF16toUTF8(chars, offset, length, result);
+        }
+
+        /// <summary>
+        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
+        /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
+        /// <para/>
+        /// -or-
+        /// <para/>
+        /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
+        /// </exception>
+        /// <remarks>
+        /// LUCENENET specific overload.
+        /// </remarks>
+        // TODO: broken if incoming result.offset != 0
         public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length, BytesRef result)
         {
             // LUCENENET: Added guard clauses
@@ -223,7 +256,8 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length
             if (length < 0)
                 throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
             if (offset > source.Length - length) // Checks for int overflow
-                throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+                throw new ArgumentOutOfRangeException(nameof(length),
+                    $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
 
             int upto = 0;
             int i = offset;
@@ -235,6 +269,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length
             {
                 @out = result.Bytes = new byte[maxLen];
             }
+
             result.Offset = 0;
 
             while (i < end)
@@ -275,6 +310,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length
                             continue;
                         }
                     }
+
                     // replace unpaired surrogate or out-of-order low surrogate
                     // with substitution character
                     @out[upto++] = 0xEF;
@@ -282,6 +318,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length
                     @out[upto++] = 0xBD;
                 }
             }
+
             //assert matches(source, offset, length, out, upto);
             result.Length = upto;
         }
@@ -311,7 +348,8 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
             if (length < 0)
                 throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
             if (offset > source.Length - length) // Checks for int overflow
-                throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+                throw new ArgumentOutOfRangeException(nameof(length),
+                    $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
 
             int end = offset + length;
 
@@ -362,6 +400,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
                             continue;
                         }
                     }
+
                     // replace unpaired surrogate or out-of-order low surrogate
                     // with substitution character
                     @out[upto++] = 0xEF;
@@ -369,6 +408,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
                     @out[upto++] = 0xBD;
                 }
             }
+
             //assert matches(s, offset, length, out, upto);
             result.Length = upto;
         }
@@ -400,7 +440,8 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
             if (length < 0)
                 throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
             if (offset > source.Length - length) // Checks for int overflow
-                throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+                throw new ArgumentOutOfRangeException(nameof(length),
+                    $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
 
             int end = offset + length;
 
@@ -451,6 +492,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
                             continue;
                         }
                     }
+
                     // replace unpaired surrogate or out-of-order low surrogate
                     // with substitution character
                     @out[upto++] = 0xEF;
@@ -458,6 +500,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
                     @out[upto++] = 0xBD;
                 }
             }
+
             //assert matches(s, offset, length, out, upto);
             result.Length = upto;
         }
@@ -535,19 +578,19 @@ public static bool ValidUTF16String(ICharSequence s)
                             // Valid surrogate pair
                         }
                         else
-                        // Unmatched high surrogate
+                            // Unmatched high surrogate
                         {
                             return false;
                         }
                     }
                     else
-                    // Unmatched high surrogate
+                        // Unmatched high surrogate
                     {
                         return false;
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                // Unmatched low surrogate
+                    // Unmatched low surrogate
                 {
                     return false;
                 }
@@ -556,7 +599,8 @@ public static bool ValidUTF16String(ICharSequence s)
             return true;
         }
 
-        public static bool ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
+        public static bool
+            ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
         {
             int size = s.Length;
             for (int i = 0; i < size; i++)
@@ -573,19 +617,19 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec
                             // Valid surrogate pair
                         }
                         else
-                        // Unmatched high surrogate
+                            // Unmatched high surrogate
                         {
                             return false;
                         }
                     }
                     else
-                    // Unmatched high surrogate
+                        // Unmatched high surrogate
                     {
                         return false;
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                // Unmatched low surrogate
+                    // Unmatched low surrogate
                 {
                     return false;
                 }
@@ -594,7 +638,9 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec
             return true;
         }
 
-        public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
+        public static bool
+            ValidUTF16String(
+                StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
         {
             int size = s.Length;
             for (int i = 0; i < size; i++)
@@ -611,19 +657,19 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
                             // Valid surrogate pair
                         }
                         else
-                        // Unmatched high surrogate
+                            // Unmatched high surrogate
                         {
                             return false;
                         }
                     }
                     else
-                    // Unmatched high surrogate
+                        // Unmatched high surrogate
                     {
                         return false;
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                // Unmatched low surrogate
+                    // Unmatched low surrogate
                 {
                     return false;
                 }
@@ -632,6 +678,8 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
             return true;
         }
 
+        public static bool ValidUTF16String(char[] s, int size) => ValidUTF16String(s.AsSpan(), size);
+
         public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
         {
             for (int i = 0; i < size; i++)
@@ -658,7 +706,7 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                // Unmatched low surrogate
+                    // Unmatched low surrogate
                 {
                     return false;
                 }
@@ -676,10 +724,13 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
         /* Map UTF-8 encoded prefix byte to sequence length.  -1 (0xFF)
          * means illegal prefix.  see RFC 2279 for details */
         internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength();
-        private static int[] LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+
+        private static int[]
+            LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
         {
-            int v = int.MinValue;
-            return new int[] {
+            const int v = int.MinValue;
+            return new int[]
+            {
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -720,12 +771,31 @@ public static int CodePointCount(BytesRef utf8)
             for (; pos < limit; codePointCount++)
             {
                 int v = bytes[pos] & 0xFF;
-                if (v <   /* 0xxx xxxx */ 0x80) { pos += 1; continue; }
-                if (v >=  /* 110x xxxx */ 0xc0)
+                if (v < /* 0xxx xxxx */ 0x80)
+                {
+                    pos += 1;
+                    continue;
+                }
+
+                if (v >= /* 110x xxxx */ 0xc0)
                 {
-                    if (v < /* 111x xxxx */ 0xe0) { pos += 2; continue; }
-                    if (v < /* 1111 xxxx */ 0xf0) { pos += 3; continue; }
-                    if (v < /* 1111 1xxx */ 0xf8) { pos += 4; continue; }
+                    if (v < /* 111x xxxx */ 0xe0)
+                    {
+                        pos += 2;
+                        continue;
+                    }
+
+                    if (v < /* 1111 xxxx */ 0xf0)
+                    {
+                        pos += 3;
+                        continue;
+                    }
+
+                    if (v < /* 1111 1xxx */ 0xf8)
+                    {
+                        pos += 4;
+                        continue;
+                    }
                     // fallthrough, consider 5 and 6 byte sequences invalid.
                 }
 
@@ -756,6 +826,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
             {
                 utf32.Int32s = new int[utf8.Length];
             }
+
             int utf32Count = 0;
             int utf8Upto = utf8.Offset;
             int[] ints = utf32.Int32s;
@@ -795,6 +866,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
                 {
                     v = v << 6 | bytes[utf8Upto++] & 63;
                 }
+
                 ints[utf32Count++] = v;
             }
 
@@ -824,7 +896,25 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
 
         /// <summary>
         /// Value that all lead surrogate starts with. </summary>
-        private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+        private const int LEAD_SURROGATE_OFFSET_ =
+            LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+
+        /// <summary>
+        /// Cover JDK 1.5 API. Create a String from an array of <paramref name="codePoints"/>.
+        /// </summary>
+        /// <param name="codePoints"> The code point array. </param>
+        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="count"> The number of code points. </param>
+        /// <returns> a String representing the <paramref name="count"/> code points starting at <paramref name="offset"/>. </returns>
+        /// <exception cref="ArgumentException"> If an invalid code point is encountered. </exception>
+        /// <exception cref="IndexOutOfRangeException"> If the offset or count are out of bounds. </exception>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static string NewString(int[] codePoints, int offset, int count)
+        {
+            // LUCENENET: Character.ToString() was optimized to use the stack for arrays
+            // of codepoints 256 or less, so it performs better than using ToCharArray().
+            return Character.ToString(codePoints, offset, count);
+        }
 
         /// <summary>
         /// Cover JDK 1.5 API. Create a String from a span of <paramref name="codePoints"/>.
@@ -843,6 +933,23 @@ public static string NewString(ReadOnlySpan<int> codePoints, int offset, int cou
             return Character.ToString(codePoints, offset, count);
         }
 
+        /// <summary>
+        /// Generates char array that represents the provided input code points.
+        /// <para/>
+        /// LUCENENET specific.
+        /// </summary>
+        /// <param name="codePoints"> The code array. </param>
+        /// <param name="offset"> The start of the text in the code point array. </param>
+        /// <param name="count"> The number of code points. </param>
+        /// <returns> a char array representing the code points between offset and count. </returns>
+        // LUCENENET NOTE: This code was originally in the NewString() method (above).
+        // It has been refactored from the original to remove the exception throw/catch and
+        // instead proactively resizes the array instead of relying on exceptions + copy operations
+        public static char[] ToCharArray(int[] codePoints, int offset, int count)
+        {
+            return ToCharArray(codePoints.AsSpan(), offset, count);
+        }
+
         /// <summary>
         /// Generates char array that represents the provided input code points.
         /// <para/>
@@ -949,6 +1056,20 @@ public static string ToHexString(string s)
             return sb.ToString();
         }
 
+        /// <summary>
+        /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+        /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+        /// <para/>
+        /// NOTE: Full characters are read, even if this reads past the length passed (and
+        /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
+        /// Explicit checks for valid UTF-8 are not performed.
+        /// </summary>
+        // TODO: broken if chars.offset != 0
+        public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
+        {
+            UTF8toUTF16(utf8.AsSpan(), offset, length, chars);
+        }
+
         /// <summary>
         /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
         /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
@@ -958,7 +1079,7 @@ public static string ToHexString(string s)
         /// Explicit checks for valid UTF-8 are not performed.
         /// </summary>
         /// <remarks>
-        /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+        /// LUCENENET specific overload.
         /// </remarks>
         // TODO: broken if chars.offset != 0
         public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, CharsRef chars)
@@ -1003,7 +1124,6 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length,
             chars.Length = out_offset - chars.Offset;
         }
 
-        #nullable enable
         /// <summary>
         /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
         /// <para/>
@@ -1077,7 +1197,6 @@ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int lengt
             chars = result;
             return true;
         }
-        #nullable restore
 
         /// <summary>
         /// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/> </summary>
@@ -1085,7 +1204,15 @@ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int lengt
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
         {
-            UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars);
+            UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars);
+        }
+
+        /// <summary>
+        /// Utility method for <see cref="TryUTF8toUTF16(ReadOnlySpan{byte}, int, int, out CharsRef)"/> </summary>
+        /// <seealso cref="TryUTF8toUTF16(ReadOnlySpan{byte}, int, int, out CharsRef)"/>
+        public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars)
+        {
+            return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars);
         }
     }
 }

From 0afee05c8b2122d024cb2d8ed99bdfdada8a59b7 Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Wed, 4 Dec 2024 20:45:57 -0700
Subject: [PATCH 03/10] Fix comment formatting

---
 src/Lucene.Net/Util/UnicodeUtil.cs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 3069ef0379..75af2b53ea 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -578,20 +578,20 @@ public static bool ValidUTF16String(ICharSequence s)
                             // Valid surrogate pair
                         }
                         else
-                            // Unmatched high surrogate
                         {
+                            // Unmatched high surrogate
                             return false;
                         }
                     }
                     else
-                        // Unmatched high surrogate
                     {
+                        // Unmatched high surrogate
                         return false;
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                    // Unmatched low surrogate
                 {
+                    // Unmatched low surrogate
                     return false;
                 }
             }
@@ -617,20 +617,20 @@ public static bool
                             // Valid surrogate pair
                         }
                         else
-                            // Unmatched high surrogate
                         {
+                            // Unmatched high surrogate
                             return false;
                         }
                     }
                     else
-                        // Unmatched high surrogate
                     {
+                        // Unmatched high surrogate
                         return false;
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                    // Unmatched low surrogate
                 {
+                    // Unmatched low surrogate
                     return false;
                 }
             }
@@ -657,20 +657,20 @@ public static bool
                             // Valid surrogate pair
                         }
                         else
-                            // Unmatched high surrogate
                         {
+                            // Unmatched high surrogate
                             return false;
                         }
                     }
                     else
-                        // Unmatched high surrogate
                     {
+                        // Unmatched high surrogate
                         return false;
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                    // Unmatched low surrogate
                 {
+                    // Unmatched low surrogate
                     return false;
                 }
             }
@@ -706,8 +706,8 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
                     }
                 }
                 else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
-                    // Unmatched low surrogate
                 {
+                    // Unmatched low surrogate
                     return false;
                 }
             }

From 95966c7f886a7b7d81f0f39af51c297300164360 Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Tue, 17 Dec 2024 13:22:46 -0700
Subject: [PATCH 04/10] Remove offset/length parameters from Span-based
 methods, #1024

---
 src/Lucene.Net/Util/UnicodeUtil.cs | 198 +++++++++--------------------
 1 file changed, 60 insertions(+), 138 deletions(-)

diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 75af2b53ea..af53ec15d5 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -159,7 +159,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)
 
             while (i < end)
             {
-                int code = (int)source[i++];
+                var code = (int)source[i++];
 
                 if (code < 0x80)
                 {
@@ -227,100 +227,7 @@ public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef r
             if (source is null)
                 throw new ArgumentNullException(nameof(source));
 
-            UTF16toUTF8(source.AsSpan(), offset, length, result);
-        }
-
-        /// <summary>
-        /// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
-        /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
-        /// </summary>
-        /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
-        /// <exception cref="ArgumentOutOfRangeException">
-        /// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
-        /// <para/>
-        /// -or-
-        /// <para/>
-        /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
-        /// </exception>
-        /// <remarks>
-        /// LUCENENET specific overload.
-        /// </remarks>
-        // TODO: broken if incoming result.offset != 0
-        public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length, BytesRef result)
-        {
-            // LUCENENET: Added guard clauses
-            if (result is null)
-                throw new ArgumentNullException(nameof(result));
-            if (offset < 0)
-                throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
-            if (length < 0)
-                throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
-            if (offset > source.Length - length) // Checks for int overflow
-                throw new ArgumentOutOfRangeException(nameof(length),
-                    $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
-
-            int upto = 0;
-            int i = offset;
-            int end = offset + length;
-            var @out = result.Bytes;
-            // Pre-allocate for worst case 4-for-1
-            int maxLen = length * 4;
-            if (@out.Length < maxLen)
-            {
-                @out = result.Bytes = new byte[maxLen];
-            }
-
-            result.Offset = 0;
-
-            while (i < end)
-            {
-                int code = (int)source[i++];
-
-                if (code < 0x80)
-                {
-                    @out[upto++] = (byte)code;
-                }
-                else if (code < 0x800)
-                {
-                    @out[upto++] = (byte)(0xC0 | (code >> 6));
-                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
-                }
-                else if (code < 0xD800 || code > 0xDFFF)
-                {
-                    @out[upto++] = (byte)(0xE0 | (code >> 12));
-                    @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
-                    @out[upto++] = (byte)(0x80 | (code & 0x3F));
-                }
-                else
-                {
-                    // surrogate pair
-                    // confirm valid high surrogate
-                    if (code < 0xDC00 && i < end)
-                    {
-                        var utf32 = (int)source[i];
-                        // confirm valid low surrogate and write pair
-                        if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
-                        {
-                            utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
-                            i++;
-                            @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
-                            @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
-                            @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
-                            @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
-                            continue;
-                        }
-                    }
-
-                    // replace unpaired surrogate or out-of-order low surrogate
-                    // with substitution character
-                    @out[upto++] = 0xEF;
-                    @out[upto++] = 0xBF;
-                    @out[upto++] = 0xBD;
-                }
-            }
-
-            //assert matches(source, offset, length, out, upto);
-            result.Length = upto;
+            UTF16toUTF8(source.AsSpan(offset, length), result);
         }
 
         /// <summary>
@@ -599,8 +506,8 @@ public static bool ValidUTF16String(ICharSequence s)
             return true;
         }
 
-        public static bool
-            ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
+        // LUCENENET specific overload because string doesn't implement ICharSequence
+        public static bool ValidUTF16String(string s)
         {
             int size = s.Length;
             for (int i = 0; i < size; i++)
@@ -638,9 +545,8 @@ public static bool
             return true;
         }
 
-        public static bool
-            ValidUTF16String(
-                StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
+        // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
+        public static bool ValidUTF16String(StringBuilder s)
         {
             int size = s.Length;
             for (int i = 0; i < size; i++)
@@ -725,8 +631,8 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
          * means illegal prefix.  see RFC 2279 for details */
         internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength();
 
-        private static int[]
-            LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+        // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+        private static int[] LoadUTF8CodeLength()
         {
             const int v = int.MinValue;
             return new int[]
@@ -947,7 +853,7 @@ public static string NewString(ReadOnlySpan<int> codePoints, int offset, int cou
         // instead proactively resizes the array instead of relying on exceptions + copy operations
         public static char[] ToCharArray(int[] codePoints, int offset, int count)
         {
-            return ToCharArray(codePoints.AsSpan(), offset, count);
+            return ToCharArray(codePoints.AsSpan(offset), count);
         }
 
         /// <summary>
@@ -956,13 +862,12 @@ public static char[] ToCharArray(int[] codePoints, int offset, int count)
         /// LUCENENET specific.
         /// </summary>
         /// <param name="codePoints"> The code span. </param>
-        /// <param name="offset"> The start of the text in the code point span. </param>
         /// <param name="count"> The number of code points. </param>
         /// <returns> a char array representing the code points between offset and count. </returns>
         // LUCENENET NOTE: This code was originally in the NewString() method (above).
         // It has been refactored from the original to remove the exception throw/catch and
         // instead proactively resizes the array instead of relying on exceptions + copy operations
-        public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int count)
+        public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int count)
         {
             if (count < 0)
             {
@@ -977,7 +882,7 @@ public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int c
             if (count > countThreshold)
             {
                 arrayLength = 0;
-                for (int r = offset, e = offset + count; r < e; ++r)
+                for (int r = 0; r < count; ++r)
                 {
                     arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
                 }
@@ -990,7 +895,7 @@ public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int c
             // It is now safe to assume we have enough space for all of the characters.
             char[] chars = new char[arrayLength];
             int w = 0;
-            for (int r = offset, e = offset + count; r < e; ++r)
+            for (int r = 0; r < count; ++r)
             {
                 int cp = codePoints[r];
                 if (cp < 0 || cp > 0x10ffff)
@@ -1064,10 +969,11 @@ public static string ToHexString(string s)
         /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
         /// Explicit checks for valid UTF-8 are not performed.
         /// </summary>
+        /// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
         // TODO: broken if chars.offset != 0
         public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
         {
-            UTF8toUTF16(utf8.AsSpan(), offset, length, chars);
+            UTF8toUTF16(utf8.AsSpan(offset, length), chars);
         }
 
         /// <summary>
@@ -1082,14 +988,15 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
         /// LUCENENET specific overload.
         /// </remarks>
         // TODO: broken if chars.offset != 0
-        public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, CharsRef chars)
+        public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
         {
             int out_offset = chars.Offset = 0;
-            char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
-            int limit = offset + length;
-            while (offset < limit)
+            char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length);
+            int i = 0;
+
+            while (i < utf8.Length)
             {
-                int b = utf8[offset++] & 0xff;
+                int b = utf8[i++] & 0xff;
                 if (b < 0xc0)
                 {
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
@@ -1097,18 +1004,18 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length,
                 }
                 else if (b < 0xe0)
                 {
-                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
+                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
                 }
                 else if (b < 0xf0)
                 {
-                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
-                    offset += 2;
+                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
+                    i += 2;
                 }
                 else
                 {
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
-                    int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
-                    offset += 3;
+                    int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
+                    i += 3;
                     if (ch < UNI_MAX_BMP)
                     {
                         @out[out_offset++] = (char)ch;
@@ -1132,21 +1039,36 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length,
         /// <remarks>
         /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
         /// </remarks>
-        public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
+        /// <seealso cref="TryUTF8toUTF16(ReadOnlySpan{byte}, out CharsRef)"/>
+        public static bool TryUTF8toUTF16(byte[] utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
         {
-            CharsRef result = new CharsRef(length);
+            return TryUTF8toUTF16(utf8.AsSpan(offset, length), out chars);
+        }
+
+        /// <summary>
+        /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
+        /// <para/>
+        /// NOTE: Explicit checks for valid UTF-8 are not performed.
+        /// </summary>
+        /// <remarks>
+        /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+        /// </remarks>
+        public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, [NotNullWhen(true)] out CharsRef? chars)
+        {
+            CharsRef result = new CharsRef(utf8.Length);
             int out_offset = 0;
             char[] @out = result.Chars;
-            int limit = offset + length;
-            while (offset < limit)
+            int i = 0;
+
+            while (i < utf8.Length)
             {
-                if (utf8.Length <= offset)
+                if (utf8.Length <= i)
                 {
                     chars = null;
                     return false;
                 }
 
-                int b = utf8[offset++] & 0xff;
+                int b = utf8[i++] & 0xff;
                 if (b < 0xc0)
                 {
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
@@ -1154,33 +1076,33 @@ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int lengt
                 }
                 else if (b < 0xe0)
                 {
-                    if (utf8.Length <= offset)
+                    if (utf8.Length <= i)
                     {
                         chars = null;
                         return false;
                     }
-                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
+                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
                 }
                 else if (b < 0xf0)
                 {
-                    if (utf8.Length <= offset + 1)
+                    if (utf8.Length <= i + 1)
                     {
                         chars = null;
                         return false;
                     }
-                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
-                    offset += 2;
+                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
+                    i += 2;
                 }
                 else
                 {
-                    if (utf8.Length <= offset + 2)
+                    if (utf8.Length <= i + 2)
                     {
                         chars = null;
                         return false;
                     }
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
-                    int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
-                    offset += 3;
+                    int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
+                    i += 3;
                     if (ch < UNI_MAX_BMP)
                     {
                         @out[out_offset++] = (char)ch;
@@ -1199,20 +1121,20 @@ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int lengt
         }
 
         /// <summary>
-        /// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/> </summary>
-        /// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/>
+        /// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/> </summary>
+        /// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
         {
-            UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars);
+            UTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), chars);
         }
 
         /// <summary>
-        /// Utility method for <see cref="TryUTF8toUTF16(ReadOnlySpan{byte}, int, int, out CharsRef)"/> </summary>
-        /// <seealso cref="TryUTF8toUTF16(ReadOnlySpan{byte}, int, int, out CharsRef)"/>
+        /// Utility method for <see cref="TryUTF8toUTF16(ReadOnlySpan{byte}, out CharsRef)"/> </summary>
+        /// <seealso cref="TryUTF8toUTF16(ReadOnlySpan{byte}, out CharsRef)"/>
         public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars)
         {
-            return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars);
+            return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), out chars);
         }
     }
 }

From f131d978a9cb3fa5e8f89c5aa91ee58ab268b00e Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Tue, 17 Dec 2024 15:43:49 -0700
Subject: [PATCH 05/10] Move ToCharArray methods to ObsoleteAPI for removal in
 4.8.0 RC, #1024

---
 .../Support/ObsoleteAPI/UnicodeUtil.cs        | 175 ++++++++++++++++++
 src/Lucene.Net/Util/UnicodeUtil.cs            |  85 +--------
 2 files changed, 178 insertions(+), 82 deletions(-)
 create mode 100644 src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs

diff --git a/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs
new file mode 100644
index 0000000000..325218f5c8
--- /dev/null
+++ b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs
@@ -0,0 +1,175 @@
+using Lucene.Net.Support;
+using System;
+
+#nullable enable
+
+namespace Lucene.Net.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /*
+     * Some of this code came from the excellent Unicode
+     * conversion examples from:
+     *
+     *   http://www.unicode.org/Public/PROGRAMS/CVTUTF
+     *
+     * Full Copyright for that code follows:
+    */
+
+    /*
+     * Copyright 2001-2004 Unicode, Inc.
+     *
+     * Disclaimer
+     *
+     * this source code is provided as is by Unicode, Inc. No claims are
+     * made as to fitness for any particular purpose. No warranties of any
+     * kind are expressed or implied. The recipient agrees to determine
+     * applicability of information provided. If this file has been
+     * purchased on magnetic or optical media from Unicode, Inc., the
+     * sole remedy for any claim will be exchange of defective media
+     * within 90 days of receipt.
+     *
+     * Limitations on Rights to Redistribute this Code
+     *
+     * Unicode, Inc. hereby grants the right to freely use the information
+     * supplied in this file in the creation of products supporting the
+     * Unicode Standard, and to make copies of this file in any form
+     * for internal or external distribution as long as this notice
+     * remains attached.
+     */
+
+    /*
+     * Additional code came from the IBM ICU library.
+     *
+     *  http://www.icu-project.org
+     *
+     * Full Copyright for that code follows.
+     */
+
+    /*
+     * Copyright (C) 1999-2010, International Business Machines
+     * Corporation and others.  All Rights Reserved.
+     *
+     * Permission is hereby granted, free of charge, to any person obtaining a copy
+     * of this software and associated documentation files (the "Software"), to deal
+     * in the Software without restriction, including without limitation the rights
+     * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+     * Software, and to permit persons to whom the Software is furnished to do so,
+     * provided that the above copyright notice(s) and this permission notice appear
+     * in all copies of the Software and that both the above copyright notice(s) and
+     * this permission notice appear in supporting documentation.
+     *
+     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+     * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN this NOTICE BE
+     * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+     * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+     * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+     * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF this SOFTWARE.
+     *
+     * Except as contained in this notice, the name of a copyright holder shall not
+     * be used in advertising or otherwise to promote the sale, use or other
+     * dealings in this Software without prior written authorization of the
+     * copyright holder.
+     */
+
+    public static partial class UnicodeUtil
+    {
+        /// <summary>
+        /// Generates a char array that represents the provided input code points.
+        /// <para/>
+        /// LUCENENET specific.
+        /// </summary>
+        /// <param name="codePoints"> The array of code points. </param>
+        /// <param name="offset"> The index of the first code point to convert in <paramref name="codePoints"/>. </param>
+        /// <param name="count"> The number of code points to convert. </param>
+        /// <returns> A char array holding the UTF-16 encoding of <paramref name="count"/> code points starting at <paramref name="offset"/>. </returns>
+        // LUCENENET NOTE: This code was originally in the NewString() method.
+        // This overload only slices the array at offset and forwards to the
+        // ReadOnlySpan<int> overload, which performs the validation and the conversion.
+        [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
+        public static char[] ToCharArray(int[] codePoints, int offset, int count)
+        {
+            return ToCharArray(codePoints.AsSpan(offset), count);
+        }
+
+        /// <summary>
+        /// Generates a char array that represents the provided input code points.
+        /// <para/>
+        /// LUCENENET specific.
+        /// </summary>
+        /// <param name="codePoints"> The span of code points. </param>
+        /// <param name="count"> The number of code points to convert, starting at the beginning of <paramref name="codePoints"/>. </param>
+        /// <returns> A char array holding the UTF-16 encoding of the first <paramref name="count"/> code points. </returns>
+        // LUCENENET NOTE: This code was originally in the NewString() method.
+        // It has been refactored from the original to remove the exception throw/catch;
+        // the buffer is sized proactively instead of relying on exceptions + copy operations
+        [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
+        public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int count)
+        {
+            if (count < 0)
+            {
+                throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
+            }
+            const int countThreshold = 1024; // If the number of code points exceeds this, we count the chars needed instead of allocating count * 2
+            // LUCENENET: as a first approximation, assume each code point
+            // needs 2 chars (a surrogate pair is the longest possible encoding)
+            int arrayLength = count * 2;
+            // LUCENENET: if we go over the threshold, count the number of
+            // chars we will need so we can allocate the precise amount of memory
+            if (count > countThreshold)
+            {
+                arrayLength = 0;
+                for (int r = 0; r < count; ++r)
+                {
+                    arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
+                }
+                if (arrayLength < 1) // NOTE(review): looks unreachable, since each of the count (> 1024) iterations above adds at least 1
+                {
+                    arrayLength = count * 2;
+                }
+            }
+            // Allocate the scratch array at the exact (counted) or oversized (count * 2) length.
+            // Either way it has room for every char written by the loop below.
+            char[] chars = new char[arrayLength];
+            int w = 0;
+            for (int r = 0; r < count; ++r)
+            {
+                int cp = codePoints[r];
+                if (cp < 0 || cp > 0x10ffff)
+                {
+                    throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints));
+                }
+                if (cp < 0x010000)
+                {
+                    chars[w++] = (char)cp;
+                }
+                else
+                {
+                    chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
+                    chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
+                }
+            }
+
+            var result = new char[w]; // trim to the number of chars actually written
+            Arrays.Copy(chars, result, w);
+            return result;
+        }
+    }
+}
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index af53ec15d5..ee7b66c041 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -101,7 +101,7 @@ namespace Lucene.Net.Util
     /// <para/>
     /// @lucene.internal
     /// </summary>
-    public static class UnicodeUtil
+    public static partial class UnicodeUtil
     {
         /// <summary>
         /// A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
@@ -818,7 +818,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
         public static string NewString(int[] codePoints, int offset, int count)
         {
             // LUCENENET: Character.ToString() was optimized to use the stack for arrays
-            // of codepoints 256 or less, so it performs better than using ToCharArray().
+            // of codepoints 256 or less, so it performs better than the Lucene implementation.
             return Character.ToString(codePoints, offset, count);
         }
 
@@ -835,89 +835,10 @@ public static string NewString(int[] codePoints, int offset, int count)
         public static string NewString(ReadOnlySpan<int> codePoints, int offset, int count)
         {
             // LUCENENET: Character.ToString() was optimized to use the stack for arrays
-            // of codepoints 256 or less, so it performs better than using ToCharArray().
+            // of codepoints 256 or less, so it performs better than the Lucene implementation.
             return Character.ToString(codePoints, offset, count);
         }
 
-        /// <summary>
-        /// Generates char array that represents the provided input code points.
-        /// <para/>
-        /// LUCENENET specific.
-        /// </summary>
-        /// <param name="codePoints"> The code array. </param>
-        /// <param name="offset"> The start of the text in the code point array. </param>
-        /// <param name="count"> The number of code points. </param>
-        /// <returns> a char array representing the code points between offset and count. </returns>
-        // LUCENENET NOTE: This code was originally in the NewString() method (above).
-        // It has been refactored from the original to remove the exception throw/catch and
-        // instead proactively resizes the array instead of relying on exceptions + copy operations
-        public static char[] ToCharArray(int[] codePoints, int offset, int count)
-        {
-            return ToCharArray(codePoints.AsSpan(offset), count);
-        }
-
-        /// <summary>
-        /// Generates char array that represents the provided input code points.
-        /// <para/>
-        /// LUCENENET specific.
-        /// </summary>
-        /// <param name="codePoints"> The code span. </param>
-        /// <param name="count"> The number of code points. </param>
-        /// <returns> a char array representing the code points between offset and count. </returns>
-        // LUCENENET NOTE: This code was originally in the NewString() method (above).
-        // It has been refactored from the original to remove the exception throw/catch and
-        // instead proactively resizes the array instead of relying on exceptions + copy operations
-        public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int count)
-        {
-            if (count < 0)
-            {
-                throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
-            }
-            const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
-            // LUCENENET: as a first approximation, assume each codepoint
-            // is 2 characters (since it cannot be longer than this)
-            int arrayLength = count * 2;
-            // LUCENENET: if we go over the threshold, count the number of
-            // chars we will need so we can allocate the precise amount of memory
-            if (count > countThreshold)
-            {
-                arrayLength = 0;
-                for (int r = 0; r < count; ++r)
-                {
-                    arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
-                }
-                if (arrayLength < 1)
-                {
-                    arrayLength = count * 2;
-                }
-            }
-            // Initialize our array to our exact or oversized length.
-            // It is now safe to assume we have enough space for all of the characters.
-            char[] chars = new char[arrayLength];
-            int w = 0;
-            for (int r = 0; r < count; ++r)
-            {
-                int cp = codePoints[r];
-                if (cp < 0 || cp > 0x10ffff)
-                {
-                    throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints));
-                }
-                if (cp < 0x010000)
-                {
-                    chars[w++] = (char)cp;
-                }
-                else
-                {
-                    chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
-                    chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
-                }
-            }
-
-            var result = new char[w];
-            Arrays.Copy(chars, result, w);
-            return result;
-        }
-
         // for debugging
         public static string ToHexString(string s)
         {

From 9a13647cc0d8b413c736e4f4a11e10223c96213a Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Tue, 17 Dec 2024 16:46:49 -0700
Subject: [PATCH 06/10] Add fallback version of UTF8toUTF16

---
 src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs  | 12 +++
 .../ExceptionHandling/ExceptionExtensions.cs  |  4 +-
 src/Lucene.Net/Util/BytesRef.cs               | 17 ++++
 src/Lucene.Net/Util/UnicodeUtil.cs            | 90 +++++++++++++++++--
 4 files changed, 115 insertions(+), 8 deletions(-)

diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
index be98e7a3d4..716124af5a 100644
--- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
+++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -340,5 +340,17 @@ public void TestTryUTF8toUTF16()
             Assert.IsTrue(success);
             Assert.AreEqual(unicode, chars?.ToString());
         }
+
+        [Test]
+        [LuceneNetSpecific] // this is a Lucene.NET specific method
+        public void TestUTF8toUTF16WithFallback()
+        {
+            byte[] invalidUtf8 = { 0x63, 0xc3 }; // Invalid ending UTF-8 sequence
+            var scratch = new CharsRef();
+
+            UnicodeUtil.UTF8toUTF16WithFallback(invalidUtf8, scratch);
+
+            Assert.AreEqual("c\ufffd", scratch.ToString());
+        }
     }
 }
diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
index 5a25bf64d2..1c3ed526ad 100644
--- a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
+++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
@@ -305,7 +305,7 @@ public static bool IsNoSuchFileExceptionOrFileNotFoundException(this Exception e
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static bool IsParseException(this Exception e)
         {
-            // LUCENNET: Added this exception in J2N to cover this case because it is not a RuntimeException
+            // LUCENENET: Added this exception in J2N to cover this case because it is not a RuntimeException
             // which makes it different from NumberFormatException in Java and FormatException in .NET.
             return e is ParseException;
         }
@@ -591,7 +591,7 @@ public static bool IsIllegalStateException(this Exception e)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static bool IsStackOverflowError(this Exception e)
         {
-            return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with 
+            return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with
         }
 
         /// <summary>
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
index 2e3679f8e0..a7c339f7c9 100644
--- a/src/Lucene.Net/Util/BytesRef.cs
+++ b/src/Lucene.Net/Util/BytesRef.cs
@@ -243,6 +243,23 @@ public string Utf8ToString()
             return @ref.ToString();
         }
 
+        /// <summary>
+        /// Interprets stored bytes as UTF8 bytes, returning the
+        /// resulting <see cref="string"/>.
+        /// </summary>
+        /// <remarks>
+        /// LUCENENET specific version that does not throw exceptions,
+        /// primarily for use in ToString() and other methods that
+        /// should not throw exceptions.
+        /// </remarks>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public string Utf8ToStringWithFallback()
+        {
+            CharsRef @ref = new CharsRef(Length);
+            UnicodeUtil.UTF8toUTF16WithFallback(bytes, Offset, Length, @ref);
+            return @ref.ToString();
+        }
+
         #nullable enable
         /// <summary>
         /// Tries to interpret the stored bytes as UTF8 bytes, returning the
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index ee7b66c041..8bcb24f450 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -952,6 +952,90 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
             chars.Length = out_offset - chars.Offset;
         }
 
+        /// <summary>
+        /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+        /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+        /// <para/>
+        /// NOTE: A multi-byte sequence that is cut off by the end of the input is replaced with the Unicode replacement character U+FFFD instead of throwing.
+        /// </summary>
+        /// <remarks>
+        /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions.
+        /// </remarks>
+        /// <seealso cref="UTF8toUTF16WithFallback(ReadOnlySpan{byte}, CharsRef)"/>
+        // TODO: broken if chars.offset != 0
+        public static void UTF8toUTF16WithFallback(byte[] utf8, int offset, int length, CharsRef chars)
+        {
+            UTF8toUTF16WithFallback(utf8.AsSpan(offset, length), chars); // LUCENENET: must delegate to the fallback span overload, not to UTF8toUTF16
+        }
+
+        /// <summary>
+        /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+        /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+        /// <para/>
+        /// NOTE: A multi-byte sequence that is cut off by the end of the input is replaced with the Unicode replacement character U+FFFD; continuation bytes are not otherwise validated.
+        /// </summary>
+        /// <remarks>
+        /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions.
+        /// </remarks>
+        // TODO: broken if chars.offset != 0 (the offset is reset to 0 on entry)
+        public static void UTF8toUTF16WithFallback(ReadOnlySpan<byte> utf8, CharsRef chars)
+        {
+            int out_offset = chars.Offset = 0;
+            char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); // the loop never writes more chars than it consumes bytes
+            int i = 0;
+
+            while (i < utf8.Length)
+            {
+                int b = utf8[i++] & 0xff;
+                if (b < 0xc0)
+                {
+                    if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); // 0x80-0xbf in lead position is only caught when asserts are enabled
+                    @out[out_offset++] = (char)b;
+                }
+                else if (b < 0xe0)
+                {
+                    if (utf8.Length <= i) // 2-byte lead with no continuation byte left
+                    {
+                        @out[out_offset++] = (char)0xfffd;
+                        continue; // i == utf8.Length here, so the while condition ends the loop
+                    }
+                    @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
+                }
+                else if (b < 0xf0)
+                {
+                    if (utf8.Length <= i + 1) // 3-byte lead with fewer than 2 bytes left
+                    {
+                        @out[out_offset++] = (char)0xfffd;
+                        break; // whatever remains of the truncated sequence is dropped
+                    }
+                    @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
+                    i += 2;
+                }
+                else
+                {
+                    if (utf8.Length <= i + 2) // 4-byte lead with fewer than 3 bytes left
+                    {
+                        @out[out_offset++] = (char)0xfffd;
+                        break; // whatever remains of the truncated sequence is dropped
+                    }
+                    if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
+                    int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
+                    i += 3;
+                    if (ch < UNI_MAX_BMP)
+                    {
+                        @out[out_offset++] = (char)ch;
+                    }
+                    else
+                    {
+                        int chHalf = ch - 0x0010000; // split the supplementary code point into a surrogate pair
+                        @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
+                        @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
+                    }
+                }
+            }
+            chars.Length = out_offset - chars.Offset;
+        }
+
         /// <summary>
         /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
         /// <para/>
@@ -983,12 +1067,6 @@ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, [NotNullWhen(true)] o
 
             while (i < utf8.Length)
             {
-                if (utf8.Length <= i)
-                {
-                    chars = null;
-                    return false;
-                }
-
                 int b = utf8[i++] & 0xff;
                 if (b < 0xc0)
                 {

From a04df3a197cd18fe7d4ef979f2d979346a629359 Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Tue, 17 Dec 2024 20:54:56 -0700
Subject: [PATCH 07/10] Throw ParseException on out of range in UTF8toUTF16,
 add more tests

---
 src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs | 32 +++++++++++++++++---
 src/Lucene.Net/Util/UnicodeUtil.cs           | 12 ++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
index 716124af5a..6cad0a4e4e 100644
--- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
+++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -328,8 +328,29 @@ public virtual void TestUTF8UTF16CharsRef()
             }
         }
 
+        [Test]
+        [LuceneNetSpecific]
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, start of 2-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, start of 3-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, start of 4-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon
+        public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
+        {
+            var scratch = new CharsRef();
+
+            if (shouldThrow)
+            {
+                Assert.Throws<ParseException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch));
+            }
+            else
+            {
+                UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch);
+            }
+        }
+
         [Test]
         [LuceneNetSpecific] // this is a Lucene.NET specific method
+        [Repeat(100)]
         public void TestTryUTF8toUTF16()
         {
             string unicode = TestUtil.RandomRealisticUnicodeString(Random);
@@ -343,14 +364,17 @@ public void TestTryUTF8toUTF16()
 
         [Test]
         [LuceneNetSpecific] // this is a Lucene.NET specific method
-        public void TestUTF8toUTF16WithFallback()
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence
+        [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")]
+        public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
         {
-            byte[] invalidUtf8 = { 0x63, 0xc3 }; // Invalid ending UTF-8 sequence
             var scratch = new CharsRef();
 
-            UnicodeUtil.UTF8toUTF16WithFallback(invalidUtf8, scratch);
+            UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch);
 
-            Assert.AreEqual("c\ufffd", scratch.ToString());
+            Assert.AreEqual(expected, scratch.ToString());
         }
     }
 }
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 8bcb24f450..434ca2d265 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -925,15 +925,27 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
                 }
                 else if (b < 0xe0)
                 {
+                    if (utf8.Length <= i)
+                    {
+                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                    }
                     @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
                 }
                 else if (b < 0xf0)
                 {
+                    if (utf8.Length <= i + 1)
+                    {
+                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                    }
                     @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
                     i += 2;
                 }
                 else
                 {
+                    if (utf8.Length <= i + 2)
+                    {
+                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                    }
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
                     int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
                     i += 3;

From 3f2dbc13bdc1c9574974f5fe972216228a1d09ea Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Tue, 17 Dec 2024 21:16:09 -0700
Subject: [PATCH 08/10] Use Utf8ToStringWithFallback in ToString and
 exception/logging message building

---
 .../SimpleText/SimpleTextUtil.cs              |  7 ++--
 .../DefaultSortedSetDocValuesReaderState.cs   |  9 ++---
 .../AbstractGroupFacetCollector.cs            |  2 +-
 .../TermsIncludingScoreQuery.cs               | 34 +++++++++----------
 src/Lucene.Net.Misc/Misc/TermStats.cs         |  3 +-
 src/Lucene.Net.Queries/TermsFilter.cs         |  2 +-
 .../Suggest/Fst/FSTCompletion.cs              |  3 +-
 src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs |  6 ++--
 .../Codecs/Lucene3x/Lucene3xFields.cs         | 23 ++++++++-----
 src/Lucene.Net/Util/BytesRef.cs               | 12 +++----
 10 files changed, 56 insertions(+), 45 deletions(-)

diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
index 695e817552..c3b8f41e6b 100644
--- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
+++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
@@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch)
                     {
                         break;
                     }
-                    
+
                     scratch.Bytes[upto++] = b;
                 }
             }
@@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input)
 
             if (StringHelper.StartsWith(scratch, CHECKSUM) == false)
             {
+                // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
                 throw new CorruptIndexException("SimpleText failure: expected checksum line but got " +
-                                                scratch.Utf8ToString() + " (resource=" + input + ")");
+                                                scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")");
             }
             var actualChecksum =
                 (new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString();
@@ -124,4 +125,4 @@ public static void CheckFooter(ChecksumIndexInput input)
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs
index 91d4c355ab..3ca21e13f5 100644
--- a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs
+++ b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs
@@ -41,7 +41,7 @@ public class DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat
 
         /// <summary>
         /// Creates this, pulling doc values from the specified
-        /// field. 
+        /// field.
         /// </summary>
         public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
         {
@@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
                 string[] components = FacetsConfig.StringToPath(spare.Utf8ToString());
                 if (components.Length != 2)
                 {
-                    throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString());
+                    // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                    throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback());
                 }
                 if (!components[0].Equals(lastDim, StringComparison.Ordinal))
                 {
@@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
         /// <summary>
         /// Return top-level doc values.
         /// </summary>
-        public override SortedSetDocValues GetDocValues() 
+        public override SortedSetDocValues GetDocValues()
         {
             return topReader.GetSortedSetDocValues(field);
         }
@@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim)
         /// </summary>
         public override int Count => valueCount;
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
index 8abb12520e..89599b49a6 100644
--- a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
+++ b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
@@ -275,7 +275,7 @@ public override int GetHashCode()
             public override string ToString()
             {
                 return "FacetEntry{" +
-                    "value=" + value.Utf8ToString() +
+                    "value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
                     ", count=" + count +
                     '}';
             }
diff --git a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
index 9d91aa64c1..9a5c9c1476 100644
--- a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
+++ b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
@@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi
 
 
             private TermsEnum segmentTermsEnum;
-            
+
             public override Explanation Explain(AtomicReaderContext context, int doc)
             {
                 SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null);
@@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost)
             {
                 originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost);
             }
-            
+
             public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
             {
                 Terms terms = context.AtomicReader.GetTerms(outerInstance._field);
@@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
 
                 return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost);
             }
-            
+
             public override BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs)
             {
                 if (scoreDocsInOrder)
@@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight,
                 //_cost = cost; // LUCENENET: Never read
                 _doc = -1;
             }
-            
+
             public override bool Score(ICollector collector, int max)
             {
                 FakeScorer fakeScorer = new FakeScorer();
@@ -285,12 +285,12 @@ private int NextDocOutOfOrder()
                     }
                 }
             }
-            
+
             protected virtual int DocsEnumNextDoc()
             {
                 return docsEnum.NextDoc();
             }
-            
+
             internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal
             {
                 int docId;
@@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit
                 } while (docId != DocIdSetIterator.NO_MORE_DOCS);
 
                 return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]],
-                    "Score based on join value " + _termsEnum.Term.Utf8ToString());
+                    "Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
             }
         }
 
@@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer
             internal readonly FixedBitSet alreadyEmittedDocs;
 
             internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */
-                IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) 
+                IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
                 : base(outerInstance, /*weight, // LUCENENET: Never read */
                       acceptDocs, termsEnum /*, cost // LUCENENET: Never read */)
             {
                 alreadyEmittedDocs = new FixedBitSet(maxDoc);
             }
-            
+
             protected override int DocsEnumNextDoc()
             {
                 while (true)
@@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer
             internal readonly long cost;
 
             internal int currentDoc = -1;
-            
+
             [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
             [SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")]
             internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs,
-                TermsEnum termsEnum, int maxDoc, long cost) 
+                TermsEnum termsEnum, int maxDoc, long cost)
                 : base(weight)
             {
                 this.m_outerInstance = outerInstance;
@@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
                 matchingDocsIterator = matchingDocs.GetIterator();
                 this.cost = cost;
             }
-            
+
             protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
                 TermsEnum termsEnum)
             {
@@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD
                     }
                 }
             }
-            
+
             public override float GetScore()
             {
                 return scores[currentDoc];
             }
-            
+
             public override int Freq => 1;
 
             public override int DocID => currentDoc;
@@ -412,7 +412,7 @@ public override int NextDoc()
             {
                 return currentDoc = matchingDocsIterator.NextDoc();
             }
-            
+
             public override int Advance(int target)
             {
                 return currentDoc = matchingDocsIterator.Advance(target);
@@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
                 : base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost)
             {
             }
-            
+
             protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
                 TermsEnum termsEnum)
             {
@@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept
             }
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Misc/Misc/TermStats.cs b/src/Lucene.Net.Misc/Misc/TermStats.cs
index 7ac5fbc918..c2d1664870 100644
--- a/src/Lucene.Net.Misc/Misc/TermStats.cs
+++ b/src/Lucene.Net.Misc/Misc/TermStats.cs
@@ -45,7 +45,8 @@ internal string GetTermText()
 
         public override string ToString()
         {
-            return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq);
+            // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+            return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq;
         }
     }
 }
diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs
index 3aae8295f3..ced20eed59 100644
--- a/src/Lucene.Net.Queries/TermsFilter.cs
+++ b/src/Lucene.Net.Queries/TermsFilter.cs
@@ -319,7 +319,7 @@ public override string ToString()
                     }
                     first = false;
                     builder.Append(current.field).Append(':');
-                    builder.Append(spare.Utf8ToString());
+                    builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
                 }
             }
 
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
index 06587b33ca..ae71302ae0 100644
--- a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
@@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket)
 
             public override string ToString()
             {
-                return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
+                // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
             }
 
             /// <seealso cref="BytesRef.CompareTo(object)"></seealso>
diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
index cdace9c1cc..e771023d59 100644
--- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
+++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
@@ -440,7 +440,8 @@ public PendingTerm(BytesRef term, BlockTermState state)
 
             public override string ToString()
             {
-                return Term.Utf8ToString();
+                // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                return Term.Utf8ToStringWithFallback();
             }
         }
 
@@ -468,7 +469,8 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f
 
             public override string ToString()
             {
-                return $"BLOCK: {Prefix.Utf8ToString()}";
+                // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                return $"BLOCK: {Prefix.Utf8ToStringWithFallback()}";
             }
 
             #nullable enable
diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
index 26a5b54a7a..fc941fab63 100644
--- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
+++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
@@ -48,7 +48,7 @@ namespace Lucene.Net.Codecs.Lucene3x
     /// <summary>
     /// Exposes flex API on a pre-flex index, as a codec.
     /// <para/>
-    /// @lucene.experimental 
+    /// @lucene.experimental
     /// </summary>
     [Obsolete("(4.0)")]
     internal class Lucene3xFields : FieldsProducer
@@ -344,7 +344,8 @@ private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
 
                 if (DEBUG_SURROGATES)
                 {
-                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
+                    // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                    Console.WriteLine("      try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback()));
                 }
 
                 // Seek "back":
@@ -488,7 +489,8 @@ private bool DoPop()
 
                     if (DEBUG_SURROGATES)
                     {
-                        Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
+                        // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                        Console.WriteLine("    seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString());
                     }
 
                     // TODO: more efficient seek?  can we simply swap
@@ -599,10 +601,11 @@ private void SurrogateDance()
 
                 if (DEBUG_SURROGATES)
                 {
+                    // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
                     Console.WriteLine("  dance");
-                    Console.WriteLine("    prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
+                    Console.WriteLine("    prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToStringWithFallback()));
                     Console.WriteLine("         " + prevTerm.ToString());
-                    Console.WriteLine("    term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
+                    Console.WriteLine("    term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()));
                     Console.WriteLine("         " + scratchTerm.ToString());
                 }
 
@@ -679,7 +682,8 @@ private void DoPushes()
 
                         if (DEBUG_SURROGATES)
                         {
-                            Console.WriteLine("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
+                            // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                            Console.WriteLine("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
                         }
 
                         // Seek "forward":
@@ -777,7 +781,7 @@ internal virtual void Reset(FieldInfo fieldInfo)
             {
                 //System.out.println("pff.reset te=" + termEnum);
                 this.fieldInfo = fieldInfo;
-                
+
                 internedFieldName = fieldInfo.Name.Intern();
 
                 Term term = new Term(internedFieldName);
@@ -832,7 +836,8 @@ public override SeekStatus SeekCeil(BytesRef term)
             {
                 if (DEBUG_SURROGATES)
                 {
-                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
+                    // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+                    Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback()));
                 }
                 skipNext = false;
                 TermInfosReader tis = outerInstance.TermsDict;
@@ -1232,4 +1237,4 @@ public override void CheckIntegrity()
         {
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
index a7c339f7c9..0124269216 100644
--- a/src/Lucene.Net/Util/BytesRef.cs
+++ b/src/Lucene.Net/Util/BytesRef.cs
@@ -248,9 +248,9 @@ public string Utf8ToString()
         /// resulting <see cref="string"/>.
         /// </summary>
         /// <remarks>
-        /// LUCENENET specific version that does not throw exceptions,
-        /// primarily for use in ToString() and other methods that
-        /// should not throw exceptions.
+        /// LUCENENET specific version that does not throw exceptions on invalid UTF-8,
+        /// primarily for use in ToString() and other cases that should not throw exceptions,
+        /// such as when building a message for another exception.
         /// </remarks>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public string Utf8ToStringWithFallback()
@@ -604,11 +604,11 @@ public override string ToString()
             switch (format)
             {
                 case BytesRefFormat.UTF8:
-                    try
+                    if (bytesRef.TryUtf8ToString(out var utf8String))
                     {
-                        return bytesRef.Utf8ToString();
+                        return utf8String;
                     }
-                    catch (Exception e) when (e.IsIndexOutOfBoundsException())
+                    else
                     {
                         return bytesRef.ToString();
                     }

From 76bb2da03527ffe86b2b40b12a42bbdc5c60601d Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Wed, 18 Dec 2024 08:41:46 -0700
Subject: [PATCH 09/10] Use FormatException instead of ParseException

---
 src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs |  2 +-
 src/Lucene.Net/Util/UnicodeUtil.cs           | 11 +++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
index 6cad0a4e4e..a35c0a4d62 100644
--- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
+++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -340,7 +340,7 @@ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
 
             if (shouldThrow)
             {
-                Assert.Throws<ParseException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch));
+                Assert.Throws<FormatException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch));
             }
             else
             {
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 434ca2d265..9a08d8bb4c 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -1,7 +1,6 @@
 ﻿using J2N;
 using J2N.Text;
 using Lucene.Net.Diagnostics;
-using Lucene.Net.Support;
 using System;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
@@ -887,7 +886,7 @@ public static string ToHexString(string s)
         /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
         /// <para/>
         /// NOTE: Full characters are read, even if this reads past the length passed (and
-        /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
+        /// can result in a <see cref="FormatException"/> if invalid UTF-8 is passed).
         /// Explicit checks for valid UTF-8 are not performed.
         /// </summary>
         /// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
@@ -902,7 +901,7 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
         /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
         /// <para/>
         /// NOTE: Full characters are read, even if this reads past the length passed (and
-        /// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
+        /// can result in a <see cref="FormatException"/> if invalid UTF-8 is passed).
         /// Explicit checks for valid UTF-8 are not performed.
         /// </summary>
         /// <remarks>
@@ -927,7 +926,7 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
                 {
                     if (utf8.Length <= i)
                     {
-                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                        throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
                     }
                     @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
                 }
@@ -935,7 +934,7 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
                 {
                     if (utf8.Length <= i + 1)
                     {
-                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                        throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
                     }
                     @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
                     i += 2;
@@ -944,7 +943,7 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
                 {
                     if (utf8.Length <= i + 2)
                     {
-                        throw new ParseException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}", i - 1);
+                        throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
                     }
                     if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
                     int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);

From 99d2076c74b04a3923d2e189cf7e1ec647c0bdbf Mon Sep 17 00:00:00 2001
From: Paul Irwin <paulirwin@gmail.com>
Date: Wed, 18 Dec 2024 08:45:35 -0700
Subject: [PATCH 10/10] Add FormatException case to Utf8ToString() catch

---
 src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
index 8b8b08e6e0..1125feaffc 100644
--- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
+++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
@@ -407,10 +407,10 @@ public override int GetOrd(int docId)
                     SimpleTextUtil.ReadLine(_input, _scratch);
                     try
                     {
-                        // LUCNENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in.
+                        // LUCENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in.
                         return int.Parse(_scratch.Utf8ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture) - 1;
                     }
-                    catch (Exception pe) when (pe.IsParseException())
+                    catch (Exception pe) when (pe.IsParseException() || pe.IsNumberFormatException())
                     {
                         var e = new CorruptIndexException($"failed to parse ord (resource={_input})", pe);
                         throw e;