diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
index 8b8b08e6e0..1125feaffc 100644
--- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
+++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
@@ -407,10 +407,10 @@ public override int GetOrd(int docId)
SimpleTextUtil.ReadLine(_input, _scratch);
try
{
- // LUCNENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in.
+ // LUCENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in.
return int.Parse(_scratch.Utf8ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture) - 1;
}
- catch (Exception pe) when (pe.IsParseException())
+ catch (Exception pe) when (pe.IsParseException() || pe.IsNumberFormatException())
{
var e = new CorruptIndexException($"failed to parse ord (resource={_input})", pe);
throw e;
diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
index 695e817552..c3b8f41e6b 100644
--- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
+++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
@@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch)
{
break;
}
-
+
scratch.Bytes[upto++] = b;
}
}
@@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input)
if (StringHelper.StartsWith(scratch, CHECKSUM) == false)
{
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
throw new CorruptIndexException("SimpleText failure: expected checksum line but got " +
- scratch.Utf8ToString() + " (resource=" + input + ")");
+ scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")");
}
var actualChecksum =
(new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString();
@@ -124,4 +125,4 @@ public static void CheckFooter(ChecksumIndexInput input)
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs
index 91d4c355ab..3ca21e13f5 100644
--- a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs
+++ b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs
@@ -41,7 +41,7 @@ public class DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat
///
/// Creates this, pulling doc values from the specified
- /// field.
+ /// field.
///
public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
{
@@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
string[] components = FacetsConfig.StringToPath(spare.Utf8ToString());
if (components.Length != 2)
{
- throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString());
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback());
}
if (!components[0].Equals(lastDim, StringComparison.Ordinal))
{
@@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
///
/// Return top-level doc values.
///
- public override SortedSetDocValues GetDocValues()
+ public override SortedSetDocValues GetDocValues()
{
return topReader.GetSortedSetDocValues(field);
}
@@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim)
///
public override int Count => valueCount;
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
index 8abb12520e..89599b49a6 100644
--- a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
+++ b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
@@ -275,7 +275,7 @@ public override int GetHashCode()
public override string ToString()
{
return "FacetEntry{" +
- "value=" + value.Utf8ToString() +
+ "value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
", count=" + count +
'}';
}
diff --git a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
index 9d91aa64c1..9a5c9c1476 100644
--- a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
+++ b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
@@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi
private TermsEnum segmentTermsEnum;
-
+
public override Explanation Explain(AtomicReaderContext context, int doc)
{
SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null);
@@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost)
{
originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost);
}
-
+
public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
{
Terms terms = context.AtomicReader.GetTerms(outerInstance._field);
@@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost);
}
-
+
public override BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs)
{
if (scoreDocsInOrder)
@@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight,
//_cost = cost; // LUCENENET: Never read
_doc = -1;
}
-
+
public override bool Score(ICollector collector, int max)
{
FakeScorer fakeScorer = new FakeScorer();
@@ -285,12 +285,12 @@ private int NextDocOutOfOrder()
}
}
}
-
+
protected virtual int DocsEnumNextDoc()
{
return docsEnum.NextDoc();
}
-
+
internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal
{
int docId;
@@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit
} while (docId != DocIdSetIterator.NO_MORE_DOCS);
return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]],
- "Score based on join value " + _termsEnum.Term.Utf8ToString());
+ "Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
}
}
@@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer
internal readonly FixedBitSet alreadyEmittedDocs;
internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */
- IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
+ IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
: base(outerInstance, /*weight, // LUCENENET: Never read */
acceptDocs, termsEnum /*, cost // LUCENENET: Never read */)
{
alreadyEmittedDocs = new FixedBitSet(maxDoc);
}
-
+
protected override int DocsEnumNextDoc()
{
while (true)
@@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer
internal readonly long cost;
internal int currentDoc = -1;
-
+
[SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
[SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")]
internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs,
- TermsEnum termsEnum, int maxDoc, long cost)
+ TermsEnum termsEnum, int maxDoc, long cost)
: base(weight)
{
this.m_outerInstance = outerInstance;
@@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
matchingDocsIterator = matchingDocs.GetIterator();
this.cost = cost;
}
-
+
protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
TermsEnum termsEnum)
{
@@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD
}
}
}
-
+
public override float GetScore()
{
return scores[currentDoc];
}
-
+
public override int Freq => 1;
public override int DocID => currentDoc;
@@ -412,7 +412,7 @@ public override int NextDoc()
{
return currentDoc = matchingDocsIterator.NextDoc();
}
-
+
public override int Advance(int target)
{
return currentDoc = matchingDocsIterator.Advance(target);
@@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
: base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost)
{
}
-
+
protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
TermsEnum termsEnum)
{
@@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Misc/Misc/TermStats.cs b/src/Lucene.Net.Misc/Misc/TermStats.cs
index 7ac5fbc918..c2d1664870 100644
--- a/src/Lucene.Net.Misc/Misc/TermStats.cs
+++ b/src/Lucene.Net.Misc/Misc/TermStats.cs
@@ -45,7 +45,8 @@ internal string GetTermText()
public override string ToString()
{
- return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq);
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq;
}
}
}
diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs
index 3aae8295f3..ced20eed59 100644
--- a/src/Lucene.Net.Queries/TermsFilter.cs
+++ b/src/Lucene.Net.Queries/TermsFilter.cs
@@ -319,7 +319,7 @@ public override string ToString()
}
first = false;
builder.Append(current.field).Append(':');
- builder.Append(spare.Utf8ToString());
+ builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
}
}
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
index 06587b33ca..ae71302ae0 100644
--- a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
@@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket)
public override string ToString()
{
- return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
}
///
diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
index bb8e736a7c..a35c0a4d62 100644
--- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
+++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
@@ -327,5 +327,54 @@ public virtual void TestUTF8UTF16CharsRef()
Assert.AreEqual(cRef.ToString(), unicode);
}
}
+
+ [Test]
+ [LuceneNetSpecific]
+ [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, start of 2-byte sequence
+ [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, start of 3-byte sequence
+ [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, start of 4-byte sequence
+ [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon
+ public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
+ {
+ var scratch = new CharsRef();
+
+ if (shouldThrow)
+ {
+ Assert.Throws(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch));
+ }
+ else
+ {
+ UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch);
+ }
+ }
+
+ [Test]
+ [LuceneNetSpecific] // this is a Lucene.NET specific method
+ [Repeat(100)]
+ public void TestTryUTF8toUTF16()
+ {
+ string unicode = TestUtil.RandomRealisticUnicodeString(Random);
+ var utf8 = new BytesRef(IOUtils.CHARSET_UTF_8.GetBytes(unicode));
+
+ bool success = UnicodeUtil.TryUTF8toUTF16(utf8, out var chars);
+
+ Assert.IsTrue(success);
+ Assert.AreEqual(unicode, chars?.ToString());
+ }
+
+ [Test]
+ [LuceneNetSpecific] // this is a Lucene.NET specific method
+ [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence
+ [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence
+ [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence
+ [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")]
+ public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
+ {
+ var scratch = new CharsRef();
+
+ UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch);
+
+ Assert.AreEqual(expected, scratch.ToString());
+ }
}
}
diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
index bab16426f7..e771023d59 100644
--- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
+++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs
@@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs
public static class BlockTreeTermsWriter
{
///
- /// Suggested default value for the
- /// minItemsInBlock parameter to
+ /// Suggested default value for the
+ /// minItemsInBlock parameter to
/// .
///
public const int DEFAULT_MIN_BLOCK_SIZE = 25;
///
- /// Suggested default value for the
- /// maxItemsInBlock parameter to
+ /// Suggested default value for the
+ /// maxItemsInBlock parameter to
/// .
///
public const int DEFAULT_MAX_BLOCK_SIZE = 48;
@@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long
/// to set state. It is *optional* and can be used when overriding the WriteHeader(),
/// WriteIndexHeader(). It only matters in the case where the state
/// is required inside of any of those methods that is passed in to the subclass constructor.
- ///
+ ///
/// When passed to the constructor, it is set to the protected field m_subclassState before
/// any of the above methods are called where it is available for reading when overriding the above methods.
- ///
+ ///
/// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so.
- /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
+ /// All other virtual members of BlockTreeTermsWriter are not called in the constructor,
/// so the overrides of those methods won't specifically need to use this field (although they could for consistency).
///
[SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
@@ -440,7 +440,8 @@ public PendingTerm(BytesRef term, BlockTermState state)
public override string ToString()
{
- return Term.Utf8ToString();
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ return Term.Utf8ToStringWithFallback();
}
}
@@ -468,7 +469,21 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f
public override string ToString()
{
- return "BLOCK: " + Prefix.Utf8ToString();
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ return $"BLOCK: {Prefix.Utf8ToStringWithFallback()}";
+ }
+
+ #nullable enable
+ public bool TryToString([NotNullWhen(true)] out string? result)
+ {
+ if (Prefix.TryUtf8ToString(out string? prefixString))
+ {
+ result = $"BLOCK: {prefixString}";
+ return true;
+ }
+
+ result = null;
+ return false;
}
// LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions
@@ -476,12 +491,11 @@ public override string ToString()
// to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors.
// This struct defers formatting the string until it is actually used as a parameter
// in string.Format().
- private struct PendingBlocksFormatter // For assert
+ private readonly struct PendingBlocksFormatter // For assert
{
-#pragma warning disable IDE0044 // Add readonly modifier
- private IList blocks;
-#pragma warning restore IDE0044 // Add readonly modifier
- public PendingBlocksFormatter(IList blocks)
+ private readonly IList? blocks;
+
+ public PendingBlocksFormatter(IList? blocks)
{
this.blocks = blocks; // May be null
}
@@ -500,17 +514,17 @@ public override string ToString() // For assert
it.MoveNext();
while (true)
{
- var e = it.Current;
+ var e = it.Current ?? throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above.");
// There is a chance that the Prefix will contain invalid UTF8,
// so we catch that and use the alternative way of displaying it
- try
+ if (e.TryToString(out string? eString))
{
- sb.Append(e.ToString());
+ sb.Append(eString);
}
- catch (IndexOutOfRangeException)
+ else
{
sb.Append("BLOCK: ");
- sb.Append(e.Prefix.ToString());
+ sb.Append(e.Prefix);
}
if (!it.MoveNext())
{
@@ -520,6 +534,7 @@ public override string ToString() // For assert
}
}
}
+ #nullable restore
public void CompileIndex(IList floorBlocks, RAMOutputStream scratchBytes)
{
@@ -1351,4 +1366,4 @@ protected override void Dispose(bool disposing)
}
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
index 26a5b54a7a..fc941fab63 100644
--- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
+++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs
@@ -48,7 +48,7 @@ namespace Lucene.Net.Codecs.Lucene3x
///
/// Exposes flex API on a pre-flex index, as a codec.
///
- /// @lucene.experimental
+ /// @lucene.experimental
///
[Obsolete("(4.0)")]
internal class Lucene3xFields : FieldsProducer
@@ -344,7 +344,8 @@ private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos)
if (DEBUG_SURROGATES)
{
- Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback()));
}
// Seek "back":
@@ -488,7 +489,8 @@ private bool DoPop()
if (DEBUG_SURROGATES)
{
- Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString());
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString());
}
// TODO: more efficient seek? can we simply swap
@@ -599,10 +601,11 @@ private void SurrogateDance()
if (DEBUG_SURROGATES)
{
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
Console.WriteLine(" dance");
- Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString()));
+ Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToStringWithFallback()));
Console.WriteLine(" " + prevTerm.ToString());
- Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()));
+ Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()));
Console.WriteLine(" " + scratchTerm.ToString());
}
@@ -679,7 +682,8 @@ private void DoPushes()
if (DEBUG_SURROGATES)
{
- Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length);
}
// Seek "forward":
@@ -777,7 +781,7 @@ internal virtual void Reset(FieldInfo fieldInfo)
{
//System.out.println("pff.reset te=" + termEnum);
this.fieldInfo = fieldInfo;
-
+
internedFieldName = fieldInfo.Name.Intern();
Term term = new Term(internedFieldName);
@@ -832,7 +836,8 @@ public override SeekStatus SeekCeil(BytesRef term)
{
if (DEBUG_SURROGATES)
{
- Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString()));
+ // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
+ Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback()));
}
skipNext = false;
TermInfosReader tis = outerInstance.TermsDict;
@@ -1232,4 +1237,4 @@ public override void CheckIntegrity()
{
}
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
index 5a25bf64d2..1c3ed526ad 100644
--- a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
+++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs
@@ -305,7 +305,7 @@ public static bool IsNoSuchFileExceptionOrFileNotFoundException(this Exception e
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsParseException(this Exception e)
{
- // LUCENNET: Added this exception in J2N to cover this case because it is not a RuntimeException
+ // LUCENENET: Added this exception in J2N to cover this case because it is not a RuntimeException
// which makes it different from NumberFormatException in Java and FormatException in .NET.
return e is ParseException;
}
@@ -591,7 +591,7 @@ public static bool IsIllegalStateException(this Exception e)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsStackOverflowError(this Exception e)
{
- return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with
+ return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with
}
///
diff --git a/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs
new file mode 100644
index 0000000000..325218f5c8
--- /dev/null
+++ b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs
@@ -0,0 +1,175 @@
+using Lucene.Net.Support;
+using System;
+
+#nullable enable
+
+namespace Lucene.Net.Util
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /*
+ * Some of this code came from the excellent Unicode
+ * conversion examples from:
+ *
+ * http://www.unicode.org/Public/PROGRAMS/CVTUTF
+ *
+ * Full Copyright for that code follows:
+ */
+
+ /*
+ * Copyright 2001-2004 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * this source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute this Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+ /*
+ * Additional code came from the IBM ICU library.
+ *
+ * http://www.icu-project.org
+ *
+ * Full Copyright for that code follows.
+ */
+
+ /*
+ * Copyright (C) 1999-2010, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+ * Software, and to permit persons to whom the Software is furnished to do so,
+ * provided that the above copyright notice(s) and this permission notice appear
+ * in all copies of the Software and that both the above copyright notice(s) and
+ * this permission notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN this NOTICE BE
+ * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+ * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF this SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall not
+ * be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization of the
+ * copyright holder.
+ */
+
+ public static partial class UnicodeUtil
+ {
+     /// <summary>
+     /// Generates a char array that represents the provided input code points.
+     /// <para/>
+     /// LUCENENET specific.
+     /// </summary>
+     /// <param name="codePoints">The code point array.</param>
+     /// <param name="offset">The start of the text in the code point array.</param>
+     /// <param name="count">The number of code points.</param>
+     /// <returns>A char array encoding <paramref name="count"/> code points starting at <paramref name="offset"/>.</returns>
+     // LUCENENET NOTE: This code was originally in the NewString() method.
+     // It has been refactored from the original to remove the exception throw/catch;
+     // the output array is sized up front rather than grown via exceptions + copy operations.
+     [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
+     public static char[] ToCharArray(int[] codePoints, int offset, int count)
+     {
+         return ToCharArray(codePoints.AsSpan(offset), count);
+     }
+
+     /// <summary>
+     /// Generates a char array that represents the provided input code points.
+     /// <para/>
+     /// LUCENENET specific.
+     /// </summary>
+     /// <param name="codePoints">The code point span.</param>
+     /// <param name="count">The number of code points to convert, starting at the beginning of the span.</param>
+     /// <returns>A char array encoding the first <paramref name="count"/> code points of <paramref name="codePoints"/>.</returns>
+     // LUCENENET NOTE: This code was originally in the NewString() method.
+     // It has been refactored from the original to remove the exception throw/catch;
+     // the output array is sized up front rather than grown via exceptions + copy operations.
+     [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
+     public static char[] ToCharArray(ReadOnlySpan codePoints, int count)
+     {
+         if (count < 0)
+         {
+             throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
+         }
+         const int countThreshold = 1024; // If the number of code points exceeds this, we count the chars needed instead of allocating count * 2
+         // LUCENENET: as a first approximation, assume each codepoint
+         // is 2 characters (since it cannot be longer than this)
+         int arrayLength = count * 2;
+         // LUCENENET: if we go over the threshold, count the number of
+         // chars we will need so we can allocate the precise amount of memory
+         if (count > countThreshold)
+         {
+             arrayLength = 0;
+             for (int r = 0; r < count; ++r)
+             {
+                 arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
+             }
+             if (arrayLength < 1) // NOTE(review): presumably guards against int overflow of the running total - confirm
+             {
+                 arrayLength = count * 2;
+             }
+         }
+         // Initialize our array to our exact or oversized length.
+         // It is now safe to assume we have enough space for all of the characters.
+         char[] chars = new char[arrayLength];
+         int w = 0;
+         for (int r = 0; r < count; ++r)
+         {
+             int cp = codePoints[r];
+             if (cp < 0 || cp > 0x10ffff)
+             {
+                 throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints));
+             }
+             if (cp < 0x010000)
+             {
+                 chars[w++] = (char)cp;
+             }
+             else
+             {
+                 chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
+                 chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
+             }
+         }
+
+         // Skip the second allocation + copy when the buffer is already exactly full (typically the
+         // case on the counted path above); otherwise trim to the w chars actually written.
+         return w == chars.Length ? chars : chars.AsSpan(0, w).ToArray();
+     }
+ }
+}
diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs
index 8012c77282..0124269216 100644
--- a/src/Lucene.Net/Util/BytesRef.cs
+++ b/src/Lucene.Net/Util/BytesRef.cs
@@ -243,6 +243,43 @@ public string Utf8ToString()
return @ref.ToString();
}
+ /// <summary>
+ /// Decodes the stored bytes as UTF-8 and returns the
+ /// resulting <see cref="string"/>.
+ /// </summary>
+ /// <remarks>
+ /// LUCENENET specific: this variant delegates to UnicodeUtil.UTF8toUTF16WithFallback()
+ /// so that it does not throw exceptions on invalid UTF-8. It is primarily meant for ToString()
+ /// and other code that should not throw, such as building a message for another exception.
+ /// </remarks>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public string Utf8ToStringWithFallback()
+ {
+     var decoded = new CharsRef(Length);
+     UnicodeUtil.UTF8toUTF16WithFallback(bytes, Offset, Length, decoded);
+     return decoded.ToString();
+ }
+
+ #nullable enable
+ /// <summary>
+ /// Attempts to decode the stored bytes as UTF-8, handing back the
+ /// decoded <see cref="string"/> through <paramref name="result"/>.
+ /// </summary>
+ /// <param name="result">The decoded string when this method returns true; otherwise null.</param>
+ /// <returns>true if decoding succeeded, false otherwise.</returns>
+ public bool TryUtf8ToString([NotNullWhen(true)] out string? result)
+ {
+     if (!UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? decoded))
+     {
+         result = null;
+         return false;
+     }
+
+     result = decoded.ToString();
+     return true;
+ }
+ #nullable restore
+
///
/// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65]
public override string ToString()
@@ -567,11 +604,11 @@ public override string ToString()
switch (format)
{
case BytesRefFormat.UTF8:
- try
+ if (bytesRef.TryUtf8ToString(out var utf8String))
{
- return bytesRef.Utf8ToString();
+ return utf8String;
}
- catch (Exception e) when (e.IsIndexOutOfBoundsException())
+ else
{
return bytesRef.ToString();
}
diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs
index 5974af1a16..9a08d8bb4c 100644
--- a/src/Lucene.Net/Util/UnicodeUtil.cs
+++ b/src/Lucene.Net/Util/UnicodeUtil.cs
@@ -1,10 +1,11 @@
using J2N;
using J2N.Text;
using Lucene.Net.Diagnostics;
-using Lucene.Net.Support;
using System;
+using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Text;
+#nullable enable
namespace Lucene.Net.Util
{
@@ -99,7 +100,7 @@ namespace Lucene.Net.Util
///
/// @lucene.internal
///
- public static class UnicodeUtil
+ public static partial class UnicodeUtil
{
///
/// A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms
@@ -107,7 +108,10 @@ public static class UnicodeUtil
///
/// WARNING: this is not a valid UTF8 Term
///
- public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }); // TODO this is unrelated here find a better place for it
+ public static readonly BytesRef BIG_TERM = new BytesRef(new byte[]
+ {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+ }); // TODO this is unrelated here find a better place for it
public const int UNI_SUR_HIGH_START = 0xD800;
public const int UNI_SUR_HIGH_END = 0xDBFF;
@@ -120,16 +124,17 @@ public static class UnicodeUtil
private const long HALF_SHIFT = 10;
private const long HALF_MASK = 0x3FFL;
- private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
+ private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint -
+ (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START;
///
- /// Encode characters from a , starting at
+ /// Encode characters from a (with generic type argument ) , starting at
/// and ending at . After encoding, result.Offset will always be 0.
///
/// is null.
// TODO: broken if incoming result.offset != 0
// LUCENENET specific overload
- public static void UTF16toUTF8(Span source, BytesRef result)
+ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result)
{
// LUCENENET: Added guard clause
if (result is null)
@@ -148,11 +153,12 @@ public static void UTF16toUTF8(Span source, BytesRef result)
{
@out = result.Bytes = new byte[maxLen];
}
+
result.Offset = 0;
while (i < end)
{
- int code = (int)source[i++];
+ var code = (int)source[i++];
if (code < 0x80)
{
@@ -188,6 +194,7 @@ public static void UTF16toUTF8(Span source, BytesRef result)
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -195,6 +202,7 @@ public static void UTF16toUTF8(Span source, BytesRef result)
@out[upto++] = 0xBD;
}
}
+
//assert matches(source, offset, length, out, upto);
result.Length = upto;
}
@@ -217,74 +225,8 @@ public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef r
// LUCENENET: Added guard clauses
if (source is null)
throw new ArgumentNullException(nameof(source));
- if (result is null)
- throw new ArgumentNullException(nameof(result));
- if (offset < 0)
- throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
- if (length < 0)
- throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
- if (offset > source.Length - length) // Checks for int overflow
- throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
-
- int upto = 0;
- int i = offset;
- int end = offset + length;
- var @out = result.Bytes;
- // Pre-allocate for worst case 4-for-1
- int maxLen = length * 4;
- if (@out.Length < maxLen)
- {
- @out = result.Bytes = new byte[maxLen];
- }
- result.Offset = 0;
-
- while (i < end)
- {
- int code = (int)source[i++];
- if (code < 0x80)
- {
- @out[upto++] = (byte)code;
- }
- else if (code < 0x800)
- {
- @out[upto++] = (byte)(0xC0 | (code >> 6));
- @out[upto++] = (byte)(0x80 | (code & 0x3F));
- }
- else if (code < 0xD800 || code > 0xDFFF)
- {
- @out[upto++] = (byte)(0xE0 | (code >> 12));
- @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
- @out[upto++] = (byte)(0x80 | (code & 0x3F));
- }
- else
- {
- // surrogate pair
- // confirm valid high surrogate
- if (code < 0xDC00 && i < end)
- {
- var utf32 = (int)source[i];
- // confirm valid low surrogate and write pair
- if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
- {
- utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
- i++;
- @out[upto++] = (byte)(0xF0 | (utf32 >> 18));
- @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
- @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
- @out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
- continue;
- }
- }
- // replace unpaired surrogate or out-of-order low surrogate
- // with substitution character
- @out[upto++] = 0xEF;
- @out[upto++] = 0xBF;
- @out[upto++] = 0xBD;
- }
- }
- //assert matches(source, offset, length, out, upto);
- result.Length = upto;
+ UTF16toUTF8(source.AsSpan(offset, length), result);
}
///
@@ -312,7 +254,8 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
if (length < 0)
throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
if (offset > source.Length - length) // Checks for int overflow
- throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+ throw new ArgumentOutOfRangeException(nameof(length),
+ $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
int end = offset + length;
@@ -363,6 +306,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -370,6 +314,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt
@out[upto++] = 0xBD;
}
}
+
//assert matches(s, offset, length, out, upto);
result.Length = upto;
}
@@ -401,7 +346,8 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
if (length < 0)
throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
if (offset > source.Length - length) // Checks for int overflow
- throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
+ throw new ArgumentOutOfRangeException(nameof(length),
+ $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");
int end = offset + length;
@@ -452,6 +398,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
continue;
}
}
+
// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@@ -459,6 +406,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r
@out[upto++] = 0xBD;
}
}
+
//assert matches(s, offset, length, out, upto);
result.Length = upto;
}
@@ -536,20 +484,20 @@ public static bool ValidUTF16String(ICharSequence s)
// Valid surrogate pair
}
else
- // Unmatched high surrogate
{
+ // Unmatched high surrogate
return false;
}
}
else
- // Unmatched high surrogate
{
+ // Unmatched high surrogate
return false;
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
{
+ // Unmatched low surrogate
return false;
}
}
@@ -557,7 +505,8 @@ public static bool ValidUTF16String(ICharSequence s)
return true;
}
- public static bool ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
+ // LUCENENET specific overload because string doesn't implement ICharSequence
+ public static bool ValidUTF16String(string s)
{
int size = s.Length;
for (int i = 0; i < size; i++)
@@ -574,20 +523,20 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec
// Valid surrogate pair
}
else
- // Unmatched high surrogate
{
+ // Unmatched high surrogate
return false;
}
}
else
- // Unmatched high surrogate
{
+ // Unmatched high surrogate
return false;
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
{
+ // Unmatched low surrogate
return false;
}
}
@@ -595,7 +544,8 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec
return true;
}
- public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
+ // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
+ public static bool ValidUTF16String(StringBuilder s)
{
int size = s.Length;
for (int i = 0; i < size; i++)
@@ -612,20 +562,20 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
// Valid surrogate pair
}
else
- // Unmatched high surrogate
{
+ // Unmatched high surrogate
return false;
}
}
else
- // Unmatched high surrogate
{
+ // Unmatched high surrogate
return false;
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
{
+ // Unmatched low surrogate
return false;
}
}
@@ -633,7 +583,9 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl
return true;
}
- public static bool ValidUTF16String(char[] s, int size)
+ public static bool ValidUTF16String(char[] s, int size) => ValidUTF16String(s.AsSpan(), size); // array overload: wraps the whole array in a span and forwards to the span-based overload
+
+ public static bool ValidUTF16String(ReadOnlySpan s, int size)
{
for (int i = 0; i < size; i++)
{
@@ -659,8 +611,8 @@ public static bool ValidUTF16String(char[] s, int size)
}
}
else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
- // Unmatched low surrogate
{
+ // Unmatched low surrogate
return false;
}
}
@@ -677,10 +629,13 @@ public static bool ValidUTF16String(char[] s, int size)
/* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF)
* means illegal prefix. see RFC 2279 for details */
internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength();
- private static int[] LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+
+ // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+ private static int[] LoadUTF8CodeLength()
{
- int v = int.MinValue;
- return new int[] {
+ const int v = int.MinValue;
+ return new int[]
+ {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -721,12 +676,31 @@ public static int CodePointCount(BytesRef utf8)
for (; pos < limit; codePointCount++)
{
int v = bytes[pos] & 0xFF;
- if (v < /* 0xxx xxxx */ 0x80) { pos += 1; continue; }
- if (v >= /* 110x xxxx */ 0xc0)
+ if (v < /* 0xxx xxxx */ 0x80)
+ {
+ pos += 1;
+ continue;
+ }
+
+ if (v >= /* 110x xxxx */ 0xc0)
{
- if (v < /* 111x xxxx */ 0xe0) { pos += 2; continue; }
- if (v < /* 1111 xxxx */ 0xf0) { pos += 3; continue; }
- if (v < /* 1111 1xxx */ 0xf8) { pos += 4; continue; }
+ if (v < /* 111x xxxx */ 0xe0)
+ {
+ pos += 2;
+ continue;
+ }
+
+ if (v < /* 1111 xxxx */ 0xf0)
+ {
+ pos += 3;
+ continue;
+ }
+
+ if (v < /* 1111 1xxx */ 0xf8)
+ {
+ pos += 4;
+ continue;
+ }
// fallthrough, consider 5 and 6 byte sequences invalid.
}
@@ -757,6 +731,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
{
utf32.Int32s = new int[utf8.Length];
}
+
int utf32Count = 0;
int utf8Upto = utf8.Offset;
int[] ints = utf32.Int32s;
@@ -796,6 +771,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
{
v = v << 6 | bytes[utf8Upto++] & 63;
}
+
ints[utf32Count++] = v;
}
@@ -825,12 +801,13 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
///
/// Value that all lead surrogate starts with.
- private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
+ private const int LEAD_SURROGATE_OFFSET_ =
+ LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_);
///
/// Cover JDK 1.5 API. Create a String from an array of .
///
- /// The code array.
+ /// The code point array.
/// The start of the text in the code point array.
/// The number of code points.
/// a String representing the code points between offset and count.
@@ -840,71 +817,25 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32)
public static string NewString(int[] codePoints, int offset, int count)
{
// LUCENENET: Character.ToString() was optimized to use the stack for arrays
- // of codepoints 256 or less, so it performs better than using ToCharArray().
+ // of codepoints 256 or less, so it performs better than the Lucene implementation.
return Character.ToString(codePoints, offset, count);
}
///
- /// Generates char array that represents the provided input code points.
- ///
- /// LUCENENET specific.
+ /// Cover JDK 1.5 API. Create a String from a span of .
///
- /// The code array.
- /// The start of the text in the code point array.
+ /// The code point span.
+ /// The start of the text in the code point span.
/// The number of code points.
- /// a char array representing the code points between offset and count.
- // LUCENENET NOTE: This code was originally in the NewString() method (above).
- // It has been refactored from the original to remove the exception throw/catch and
- // instead proactively resizes the array instead of relying on excpetions + copy operations
- public static char[] ToCharArray(int[] codePoints, int offset, int count)
+ /// a String representing the code points between offset and count.
+ /// If an invalid code point is encountered.
+ /// If the offset or count are out of bounds.
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static string NewString(ReadOnlySpan codePoints, int offset, int count)
{
- if (count < 0)
- {
- throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention)
- }
- const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2
- // LUCENENET: as a first approximation, assume each codepoint
- // is 2 characters (since it cannot be longer than this)
- int arrayLength = count * 2;
- // LUCENENET: if we go over the threashold, count the number of
- // chars we will need so we can allocate the precise amount of memory
- if (count > countThreashold)
- {
- arrayLength = 0;
- for (int r = offset, e = offset + count; r < e; ++r)
- {
- arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
- }
- if (arrayLength < 1)
- {
- arrayLength = count * 2;
- }
- }
- // Initialize our array to our exact or oversized length.
- // It is now safe to assume we have enough space for all of the characters.
- char[] chars = new char[arrayLength];
- int w = 0;
- for (int r = offset, e = offset + count; r < e; ++r)
- {
- int cp = codePoints[r];
- if (cp < 0 || cp > 0x10ffff)
- {
- throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints));
- }
- if (cp < 0x010000)
- {
- chars[w++] = (char)cp;
- }
- else
- {
- chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_));
- chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_));
- }
- }
-
- var result = new char[w];
- Arrays.Copy(chars, result, w);
- return result;
+ // LUCENENET: Character.ToString() was optimized to use the stack for arrays
+ // of codepoints 256 or less, so it performs better than the Lucene implementation.
+ return Character.ToString(codePoints, offset, count);
}
// for debugging
@@ -955,18 +886,37 @@ public static string ToHexString(string s)
/// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
///
/// NOTE: Full characters are read, even if this reads past the length passed (and
- /// can result in an if invalid UTF-8 is passed).
+ /// can result in an if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
///
+ ///
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
+ {
+ UTF8toUTF16(utf8.AsSpan(offset, length), chars); // decodes only the [offset, offset + length) slice via the span overload; NOTE(review): that overload throws FormatException when a multi-byte sequence is cut off by the slice end, so bytes past `length` are no longer read -- confirm the remark above still matches
+ }
+
+ ///
+ /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+ /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+ ///
+ /// NOTE: Full characters are read; a multi-byte sequence that is cut off by the end of the span
+ /// results in a <see cref="FormatException"/>.
+ /// Explicit checks for valid UTF-8 are not performed.
+ ///
+ ///
+ /// LUCENENET specific overload.
+ ///
+ // TODO: broken if chars.offset != 0
+ public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars)
{
int out_offset = chars.Offset = 0;
- char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
- int limit = offset + length;
- while (offset < limit)
+ char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length);
+ int i = 0;
+
+ while (i < utf8.Length)
{
- int b = utf8[offset++] & 0xff;
+ int b = utf8[i++] & 0xff;
if (b < 0xc0)
{
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
@@ -974,18 +924,30 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
}
else if (b < 0xe0)
{
- @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
+ if (utf8.Length <= i)
+ {
+ throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
+ }
+ @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
}
else if (b < 0xf0)
{
- @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
- offset += 2;
+ if (utf8.Length <= i + 1)
+ {
+ throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
+ }
+ @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
+ i += 2;
}
else
{
+ if (utf8.Length <= i + 2)
+ {
+ throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}");
+ }
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
- int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
- offset += 3;
+ int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
+ i += 3;
if (ch < UNI_MAX_BMP)
{
@out[out_offset++] = (char)ch;
@@ -1002,12 +964,187 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
}
///
- /// Utility method for
- ///
+ /// Interprets the given byte array as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+ /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+ ///
+ /// NOTE: This method will replace any invalid UTF-8 byte sequences with the Unicode replacement character U+FFFD.
+ ///
+ ///
+ /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions.
+ ///
+ ///
+ // TODO: broken if chars.offset != 0
+ public static void UTF8toUTF16WithFallback(byte[] utf8, int offset, int length, CharsRef chars)
+ {
+ UTF8toUTF16WithFallback(utf8.AsSpan(offset, length), chars); // must forward to the fallback span overload: UTF8toUTF16 throws FormatException on a truncated sequence instead of emitting U+FFFD
+ }
+
+ ///
+ /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <see cref="CharsRef"/> will be extended if
+ /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint.
+ ///
+ /// NOTE: This method will replace any invalid UTF-8 byte sequences with the Unicode replacement character U+FFFD.
+ ///
+ ///
+ /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions.
+ ///
+ // TODO: broken if chars.offset != 0
+ public static void UTF8toUTF16WithFallback(ReadOnlySpan<byte> utf8, CharsRef chars)
+ {
+ int out_offset = chars.Offset = 0; // output is always written from index 0 of chars
+ char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); // the loop emits at most one char per input byte; presumably Grow guarantees at least utf8.Length chars
+ int i = 0;
+
+ while (i < utf8.Length)
+ {
+ int b = utf8[i++] & 0xff;
+ if (b < 0xc0)
+ {
+ if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); // NOTE(review): a stray continuation byte (0x80-0xBF) is only asserted, not replaced with U+FFFD -- confirm against the documented contract
+ @out[out_offset++] = (char)b;
+ }
+ else if (b < 0xe0)
+ {
+ if (utf8.Length <= i)
+ {
+ @out[out_offset++] = (char)0xfffd; // 2-byte lead with no continuation byte left: substitute U+FFFD
+ break; // input is exhausted here, so this exits exactly like the other truncated-tail branches
+ }
+ @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
+ }
+ else if (b < 0xf0)
+ {
+ if (utf8.Length <= i + 1)
+ {
+ @out[out_offset++] = (char)0xfffd; // 3-byte lead with fewer than 2 bytes left: one U+FFFD for the whole truncated tail
+ break;
+ }
+ @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
+ i += 2;
+ }
+ else
+ {
+ if (utf8.Length <= i + 2)
+ {
+ @out[out_offset++] = (char)0xfffd; // 4-byte lead with fewer than 3 bytes left: one U+FFFD for the whole truncated tail
+ break;
+ }
+ if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); // NOTE(review): lead bytes 0xF8-0xFF are only asserted, not replaced -- confirm
+ int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
+ i += 3;
+ if (ch < UNI_MAX_BMP)
+ {
+ @out[out_offset++] = (char)ch;
+ }
+ else
+ {
+ int chHalf = ch - 0x0010000; // supplementary code point: split into a UTF-16 surrogate pair
+ @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
+ @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
+ }
+ }
+ }
+ chars.Length = out_offset - chars.Offset;
+ }
+
+ ///
+ /// Tries to interpret the given byte array range as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
+ ///
+ /// NOTE: Explicit checks for valid UTF-8 are not performed.
+ ///
+ ///
+ /// LUCENENET specific: This overload accepts a byte[] range and forwards to the <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) overload.
+ ///
+ ///
+ public static bool TryUTF8toUTF16(byte[] utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
+ {
+ return TryUTF8toUTF16(utf8.AsSpan(offset, length), out chars); // decodes only the [offset, offset + length) slice via the span overload
+ }
+
+ ///
+ /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
+ ///
+ /// NOTE: Explicit checks for valid UTF-8 are not performed.
+ ///
+ ///
+ /// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
+ ///
+ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, [NotNullWhen(true)] out CharsRef? chars)
+ {
+ CharsRef result = new CharsRef(utf8.Length); // fresh buffer; the loop emits at most one char per input byte (presumably the ctor allocates utf8.Length chars)
+ int out_offset = 0;
+ char[] @out = result.Chars;
+ int i = 0;
+
+ while (i < utf8.Length)
+ {
+ int b = utf8[i++] & 0xff;
+ if (b < 0xc0)
+ {
+ if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); // NOTE(review): a stray continuation byte (0x80-0xBF) is asserted rather than reported as false -- confirm
+ @out[out_offset++] = (char)b;
+ }
+ else if (b < 0xe0)
+ {
+ if (utf8.Length <= i)
+ {
+ chars = null; // 2-byte lead with no continuation byte left: report failure
+ return false;
+ }
+ @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
+ }
+ else if (b < 0xf0)
+ {
+ if (utf8.Length <= i + 1)
+ {
+ chars = null; // 3-byte lead with fewer than 2 bytes left: report failure
+ return false;
+ }
+ @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
+ i += 2;
+ }
+ else
+ {
+ if (utf8.Length <= i + 2)
+ {
+ chars = null; // 4-byte lead with fewer than 3 bytes left: report failure
+ return false;
+ }
+ if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); // NOTE(review): lead bytes 0xF8-0xFF are asserted rather than reported as false -- confirm
+ int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
+ i += 3;
+ if (ch < UNI_MAX_BMP)
+ {
+ @out[out_offset++] = (char)ch;
+ }
+ else
+ {
+ int chHalf = ch - 0x0010000; // supplementary code point: split into a UTF-16 surrogate pair
+ @out[out_offset++] = (char)((chHalf >> 10) + 0xD800);
+ @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00);
+ }
+ }
+ }
+ result.Length = out_offset; // only publish the result once the whole span decoded
+ chars = result;
+ return true;
+ }
+
+ ///
+ /// Utility method for
+ ///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{
- UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars);
+ UTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), chars);
+ }
+
+ ///
+ /// Utility method for
+ ///
+ public static bool TryUTF8toUTF16(BytesRef bytesRef, [NotNullWhen(true)] out CharsRef? chars) // NotNullWhen mirrors the byte[] and span overloads so callers get a non-null result after a true return
+ {
+ return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), out chars); // decodes only the BytesRef's [Offset, Offset + Length) window
}
}
}