diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs index 8b8b08e6e0..1125feaffc 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs @@ -407,10 +407,10 @@ public override int GetOrd(int docId) SimpleTextUtil.ReadLine(_input, _scratch); try { - // LUCNENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in. + // LUCENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in. return int.Parse(_scratch.Utf8ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture) - 1; } - catch (Exception pe) when (pe.IsParseException()) + catch (Exception pe) when (pe.IsParseException() || pe.IsNumberFormatException()) { var e = new CorruptIndexException($"failed to parse ord (resource={_input})", pe); throw e; diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs index 695e817552..c3b8f41e6b 100644 --- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs +++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs @@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch) { break; } - + scratch.Bytes[upto++] = b; } } @@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input) if (StringHelper.StartsWith(scratch, CHECKSUM) == false) { + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes throw new CorruptIndexException("SimpleText failure: expected checksum line but got " + - scratch.Utf8ToString() + " (resource=" + input + ")"); + scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")"); } var actualChecksum = (new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString(); @@ -124,4 +125,4 @@ public static void 
CheckFooter(ChecksumIndexInput input) } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs index 91d4c355ab..3ca21e13f5 100644 --- a/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs +++ b/src/Lucene.Net.Facet/SortedSet/DefaultSortedSetDocValuesReaderState.cs @@ -41,7 +41,7 @@ public class DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat /// /// Creates this, pulling doc values from the specified - /// field. + /// field. /// public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME) { @@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F string[] components = FacetsConfig.StringToPath(spare.Utf8ToString()); if (components.Length != 2) { - throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString()); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback()); } if (!components[0].Equals(lastDim, StringComparison.Ordinal)) { @@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F /// /// Return top-level doc values. 
/// - public override SortedSetDocValues GetDocValues() + public override SortedSetDocValues GetDocValues() { return topReader.GetSortedSetDocValues(field); } @@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim) /// public override int Count => valueCount; } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs index 8abb12520e..89599b49a6 100644 --- a/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs +++ b/src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs @@ -275,7 +275,7 @@ public override int GetHashCode() public override string ToString() { return "FacetEntry{" + - "value=" + value.Utf8ToString() + + "value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes ", count=" + count + '}'; } diff --git a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs index 9d91aa64c1..9a5c9c1476 100644 --- a/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs +++ b/src/Lucene.Net.Join/TermsIncludingScoreQuery.cs @@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi private TermsEnum segmentTermsEnum; - + public override Explanation Explain(AtomicReaderContext context, int doc) { SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null); @@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost) { originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost); } - + public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) { Terms terms = context.AtomicReader.GetTerms(outerInstance._field); @@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs) return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost); } - + public override 
BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs) { if (scoreDocsInOrder) @@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, //_cost = cost; // LUCENENET: Never read _doc = -1; } - + public override bool Score(ICollector collector, int max) { FakeScorer fakeScorer = new FakeScorer(); @@ -285,12 +285,12 @@ private int NextDocOutOfOrder() } } } - + protected virtual int DocsEnumNextDoc() { return docsEnum.NextDoc(); } - + internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal { int docId; @@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit } while (docId != DocIdSetIterator.NO_MORE_DOCS); return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]], - "Score based on join value " + _termsEnum.Term.Utf8ToString()); + "Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes } } @@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer internal readonly FixedBitSet alreadyEmittedDocs; internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */ - IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) + IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */) : base(outerInstance, /*weight, // LUCENENET: Never read */ acceptDocs, termsEnum /*, cost // LUCENENET: Never read */) { alreadyEmittedDocs = new FixedBitSet(maxDoc); } - + protected override int DocsEnumNextDoc() { while (true) @@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer internal readonly long cost; internal int currentDoc = -1; - + [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud 
issue")] [SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")] internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs, - TermsEnum termsEnum, int maxDoc, long cost) + TermsEnum termsEnum, int maxDoc, long cost) : base(weight) { this.m_outerInstance = outerInstance; @@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, matchingDocsIterator = matchingDocs.GetIterator(); this.cost = cost; } - + protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs, TermsEnum termsEnum) { @@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD } } } - + public override float GetScore() { return scores[currentDoc]; } - + public override int Freq => 1; public override int DocID => currentDoc; @@ -412,7 +412,7 @@ public override int NextDoc() { return currentDoc = matchingDocsIterator.NextDoc(); } - + public override int Advance(int target) { return currentDoc = matchingDocsIterator.Advance(target); @@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, : base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost) { } - + protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs, TermsEnum termsEnum) { @@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Misc/Misc/TermStats.cs b/src/Lucene.Net.Misc/Misc/TermStats.cs index 7ac5fbc918..c2d1664870 100644 --- a/src/Lucene.Net.Misc/Misc/TermStats.cs +++ b/src/Lucene.Net.Misc/Misc/TermStats.cs @@ -45,7 +45,8 @@ internal string GetTermText() public override string ToString() { - return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq); + // LUCENENET specific - 
use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq; } } } diff --git a/src/Lucene.Net.Queries/TermsFilter.cs b/src/Lucene.Net.Queries/TermsFilter.cs index 3aae8295f3..ced20eed59 100644 --- a/src/Lucene.Net.Queries/TermsFilter.cs +++ b/src/Lucene.Net.Queries/TermsFilter.cs @@ -319,7 +319,7 @@ public override string ToString() } first = false; builder.Append(current.field).Append(':'); - builder.Append(spare.Utf8ToString()); + builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes } } diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs index 06587b33ca..ae71302ae0 100644 --- a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs +++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs @@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket) public override string ToString() { - return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture); } /// diff --git a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs index bb8e736a7c..a35c0a4d62 100644 --- a/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs +++ b/src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs @@ -327,5 +327,54 @@ public virtual void TestUTF8UTF16CharsRef() Assert.AreEqual(cRef.ToString(), unicode); } } + + [Test] + [LuceneNetSpecific] + [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // ca�, start of 2-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // ca�, start of 3-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // ca�, start of 4-byte 
sequence + [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // cañon + public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow) + { + var scratch = new CharsRef(); + + if (shouldThrow) + { + Assert.Throws(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch)); + } + else + { + UnicodeUtil.UTF8toUTF16(invalidUtf8, scratch); + } + } + + [Test] + [LuceneNetSpecific] // this is a Lucene.NET specific method + [Repeat(100)] + public void TestTryUTF8toUTF16() + { + string unicode = TestUtil.RandomRealisticUnicodeString(Random); + var utf8 = new BytesRef(IOUtils.CHARSET_UTF_8.GetBytes(unicode)); + + bool success = UnicodeUtil.TryUTF8toUTF16(utf8, out var chars); + + Assert.IsTrue(success); + Assert.AreEqual(unicode, chars?.ToString()); + } + + [Test] + [LuceneNetSpecific] // this is a Lucene.NET specific method + [TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // ca�, start of 2-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // ca�, start of 3-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // ca�, start of 4-byte sequence + [TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")] + public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected) + { + var scratch = new CharsRef(); + + UnicodeUtil.UTF8toUTF16WithFallback(utf8, scratch); + + Assert.AreEqual(expected, scratch.ToString()); + } } } diff --git a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs index bab16426f7..e771023d59 100644 --- a/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs +++ b/src/Lucene.Net/Codecs/BlockTreeTermsWriter.cs @@ -50,15 +50,15 @@ namespace Lucene.Net.Codecs public static class BlockTreeTermsWriter { /// - /// Suggested default value for the - /// minItemsInBlock parameter to + /// Suggested default value for the + /// minItemsInBlock parameter to /// . 
/// public const int DEFAULT_MIN_BLOCK_SIZE = 25; /// - /// Suggested default value for the - /// maxItemsInBlock parameter to + /// Suggested default value for the + /// maxItemsInBlock parameter to /// . /// public const int DEFAULT_MAX_BLOCK_SIZE = 48; @@ -296,12 +296,12 @@ public FieldMetaData(FieldInfo fieldInfo, BytesRef rootCode, long numTerms, long /// to set state. It is *optional* and can be used when overriding the WriteHeader(), /// WriteIndexHeader(). It only matters in the case where the state /// is required inside of any of those methods that is passed in to the subclass constructor. - /// + /// /// When passed to the constructor, it is set to the protected field m_subclassState before /// any of the above methods are called where it is available for reading when overriding the above methods. - /// + /// /// If your subclass needs to pass more than one piece of data, you can create a class or struct to do so. - /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, + /// All other virtual members of BlockTreeTermsWriter are not called in the constructor, /// so the overrides of those methods won't specifically need to use this field (although they could for consistency). 
/// [SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")] @@ -440,7 +440,8 @@ public PendingTerm(BytesRef term, BlockTermState state) public override string ToString() { - return Term.Utf8ToString(); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return Term.Utf8ToStringWithFallback(); } } @@ -468,7 +469,21 @@ public PendingBlock(BytesRef prefix, long fp, bool hasTerms, bool isFloor, int f public override string ToString() { - return "BLOCK: " + Prefix.Utf8ToString(); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + return $"BLOCK: {Prefix.Utf8ToStringWithFallback()}"; + } + + #nullable enable + public bool TryToString([NotNullWhen(true)] out string? result) + { + if (Prefix.TryUtf8ToString(out string? prefixString)) + { + result = $"BLOCK: {prefixString}"; + return true; + } + + result = null; + return false; } // LUCENENET specific - to keep the Debug.Assert statement from throwing exceptions @@ -476,12 +491,11 @@ public override string ToString() // to using PendingBlock.Prefix.ToString() if PendingBlock.ToString() errors. // This struct defers formatting the string until it is actually used as a parameter // in string.Format(). - private struct PendingBlocksFormatter // For assert + private readonly struct PendingBlocksFormatter // For assert { -#pragma warning disable IDE0044 // Add readonly modifier - private IList blocks; -#pragma warning restore IDE0044 // Add readonly modifier - public PendingBlocksFormatter(IList blocks) + private readonly IList? blocks; + + public PendingBlocksFormatter(IList? blocks) { this.blocks = blocks; // May be null } @@ -500,17 +514,17 @@ public override string ToString() // For assert it.MoveNext(); while (true) { - var e = it.Current; + var e = it.Current ?? 
throw new InvalidOperationException("Expected a non-null value in the enumerator due to Count check above."); // There is a chance that the Prefix will contain invalid UTF8, // so we catch that and use the alternative way of displaying it - try + if (e.TryToString(out string? eString)) { - sb.Append(e.ToString()); + sb.Append(eString); } - catch (IndexOutOfRangeException) + else { sb.Append("BLOCK: "); - sb.Append(e.Prefix.ToString()); + sb.Append(e.Prefix); } if (!it.MoveNext()) { @@ -520,6 +534,7 @@ public override string ToString() // For assert } } } + #nullable restore public void CompileIndex(IList floorBlocks, RAMOutputStream scratchBytes) { @@ -1351,4 +1366,4 @@ protected override void Dispose(bool disposing) } } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs index 26a5b54a7a..fc941fab63 100644 --- a/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs +++ b/src/Lucene.Net/Codecs/Lucene3x/Lucene3xFields.cs @@ -48,7 +48,7 @@ namespace Lucene.Net.Codecs.Lucene3x /// /// Exposes flex API on a pre-flex index, as a codec. 
/// - /// @lucene.experimental + /// @lucene.experimental /// [Obsolete("(4.0)")] internal class Lucene3xFields : FieldsProducer @@ -344,7 +344,8 @@ private bool SeekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) if (DEBUG_SURROGATES) { - Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToString())); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" try seek term=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback())); } // Seek "back": @@ -488,7 +489,8 @@ private bool DoPop() if (DEBUG_SURROGATES) { - Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString()); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" seek to term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString()); } // TODO: more efficient seek? can we simply swap @@ -599,10 +601,11 @@ private void SurrogateDance() if (DEBUG_SURROGATES) { + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes Console.WriteLine(" dance"); - Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToString())); + Console.WriteLine(" prev=" + UnicodeUtil.ToHexString(prevTerm.Utf8ToStringWithFallback())); Console.WriteLine(" " + prevTerm.ToString()); - Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString())); + Console.WriteLine(" term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback())); Console.WriteLine(" " + scratchTerm.ToString()); } @@ -679,7 +682,8 @@ private void DoPushes() if (DEBUG_SURROGATES) { - Console.WriteLine(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToString()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine(" try 
seek 1 pos=" + upTo + " term=" + UnicodeUtil.ToHexString(scratchTerm.Utf8ToStringWithFallback()) + " " + scratchTerm.ToString() + " len=" + scratchTerm.Length); } // Seek "forward": @@ -777,7 +781,7 @@ internal virtual void Reset(FieldInfo fieldInfo) { //System.out.println("pff.reset te=" + termEnum); this.fieldInfo = fieldInfo; - + internedFieldName = fieldInfo.Name.Intern(); Term term = new Term(internedFieldName); @@ -832,7 +836,8 @@ public override SeekStatus SeekCeil(BytesRef term) { if (DEBUG_SURROGATES) { - Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToString())); + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes + Console.WriteLine("TE.seek target=" + UnicodeUtil.ToHexString(term.Utf8ToStringWithFallback())); } skipNext = false; TermInfosReader tis = outerInstance.TermsDict; @@ -1232,4 +1237,4 @@ public override void CheckIntegrity() { } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs index 5a25bf64d2..1c3ed526ad 100644 --- a/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs +++ b/src/Lucene.Net/Support/ExceptionHandling/ExceptionExtensions.cs @@ -305,7 +305,7 @@ public static bool IsNoSuchFileExceptionOrFileNotFoundException(this Exception e [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsParseException(this Exception e) { - // LUCENNET: Added this exception in J2N to cover this case because it is not a RuntimeException + // LUCENENET: Added this exception in J2N to cover this case because it is not a RuntimeException // which makes it different from NumberFormatException in Java and FormatException in .NET. 
return e is ParseException; } @@ -591,7 +591,7 @@ public static bool IsIllegalStateException(this Exception e) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsStackOverflowError(this Exception e) { - return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with + return e is StackOverflowException; // Uncatchable in .NET core, be sure to use with } /// diff --git a/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs new file mode 100644 index 0000000000..325218f5c8 --- /dev/null +++ b/src/Lucene.Net/Support/ObsoleteAPI/UnicodeUtil.cs @@ -0,0 +1,175 @@ +using Lucene.Net.Support; +using System; + +#nullable enable + +namespace Lucene.Net.Util +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /* + * Some of this code came from the excellent Unicode + * conversion examples from: + * + * http://www.unicode.org/Public/PROGRAMS/CVTUTF + * + * Full Copyright for that code follows: + */ + + /* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * this source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. 
No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute this Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + + /* + * Additional code came from the IBM ICU library. + * + * http://www.icu-project.org + * + * Full Copyright for that code follows. + */ + + /* + * Copyright (C) 1999-2010, International Business Machines + * Corporation and others. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, and/or sell copies of the + * Software, and to permit persons to whom the Software is furnished to do so, + * provided that the above copyright notice(s) and this permission notice appear + * in all copies of the Software and that both the above copyright notice(s) and + * this permission notice appear in supporting documentation. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN this NOTICE BE + * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR + * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER + * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF this SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization of the + * copyright holder. + */ + + public static partial class UnicodeUtil + { + /// + /// Generates char array that represents the provided input code points. + /// + /// LUCENENET specific. + /// + /// The code array. + /// The start of the text in the code point array. + /// The number of code points. + /// a char array representing the code points between offset and count. + // LUCENENET NOTE: This code was originally in the NewString() method. + // It has been refactored from the original to remove the exception throw/catch and + // instead proactively resizes the array instead of relying on exceptions + copy operations + [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)] + public static char[] ToCharArray(int[] codePoints, int offset, int count) + { + return ToCharArray(codePoints.AsSpan(offset), count); + } + + /// + /// Generates char array that represents the provided input code points. + /// + /// LUCENENET specific. + /// + /// The code span. + /// The number of code points. + /// a char array representing the code points between offset and count. + // LUCENENET NOTE: This code was originally in the NewString() method. 
+ // It has been refactored from the original to remove the exception throw/catch and + // instead proactively resizes the array instead of relying on exceptions + copy operations + [Obsolete("Use NewString method instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)] + public static char[] ToCharArray(ReadOnlySpan codePoints, int count) + { + if (count < 0) + { + throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) + } + const int countThreshold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 + // LUCENENET: as a first approximation, assume each codepoint + // is 2 characters (since it cannot be longer than this) + int arrayLength = count * 2; + // LUCENENET: if we go over the threshold, count the number of + // chars we will need so we can allocate the precise amount of memory + if (count > countThreshold) + { + arrayLength = 0; + for (int r = 0; r < count; ++r) + { + arrayLength += codePoints[r] < 0x010000 ? 1 : 2; + } + if (arrayLength < 1) + { + arrayLength = count * 2; + } + } + // Initialize our array to our exact or oversized length. + // It is now safe to assume we have enough space for all of the characters. 
+ char[] chars = new char[arrayLength]; + int w = 0; + for (int r = 0; r < count; ++r) + { + int cp = codePoints[r]; + if (cp < 0 || cp > 0x10ffff) + { + throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints)); + } + if (cp < 0x010000) + { + chars[w++] = (char)cp; + } + else + { + chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); + chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); + } + } + + var result = new char[w]; + Arrays.Copy(chars, result, w); + return result; + } + } +} diff --git a/src/Lucene.Net/Util/BytesRef.cs b/src/Lucene.Net/Util/BytesRef.cs index 8012c77282..0124269216 100644 --- a/src/Lucene.Net/Util/BytesRef.cs +++ b/src/Lucene.Net/Util/BytesRef.cs @@ -243,6 +243,43 @@ public string Utf8ToString() return @ref.ToString(); } + /// + /// Interprets stored bytes as UTF8 bytes, returning the + /// resulting . + /// + /// + /// LUCENENET specific version that does not throw exceptions on invalid UTF-8, + /// primarily for use in ToString() and other cases that should not throw exceptions, + /// such as when building a message for another exception. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public string Utf8ToStringWithFallback() + { + CharsRef @ref = new CharsRef(Length); + UnicodeUtil.UTF8toUTF16WithFallback(bytes, Offset, Length, @ref); + return @ref.ToString(); + } + + #nullable enable + /// + /// Tries to interpret the stored bytes as UTF8 bytes, returning the + /// resulting as an output parameter . + /// + /// The resulting string output. + /// true if successful, false otherwise. + public bool TryUtf8ToString([NotNullWhen(true)] out string? result) + { + if (UnicodeUtil.TryUTF8toUTF16(bytes, Offset, Length, out CharsRef? 
@ref)) + { + result = @ref.ToString(); + return true; + } + + result = null; + return false; + } + #nullable restore + /// /// Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] public override string ToString() @@ -567,11 +604,11 @@ public override string ToString() switch (format) { case BytesRefFormat.UTF8: - try + if (bytesRef.TryUtf8ToString(out var utf8String)) { - return bytesRef.Utf8ToString(); + return utf8String; } - catch (Exception e) when (e.IsIndexOutOfBoundsException()) + else { return bytesRef.ToString(); } diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 5974af1a16..9a08d8bb4c 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -1,10 +1,11 @@ using J2N; using J2N.Text; using Lucene.Net.Diagnostics; -using Lucene.Net.Support; using System; +using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Text; +#nullable enable namespace Lucene.Net.Util { @@ -99,7 +100,7 @@ namespace Lucene.Net.Util /// /// @lucene.internal /// - public static class UnicodeUtil + public static partial class UnicodeUtil { /// /// A binary term consisting of a number of 0xff bytes, likely to be bigger than other terms @@ -107,7 +108,10 @@ public static class UnicodeUtil /// /// WARNING: this is not a valid UTF8 Term /// - public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }); // TODO this is unrelated here find a better place for it + public static readonly BytesRef BIG_TERM = new BytesRef(new byte[] + { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }); // TODO this is unrelated here find a better place for it public const int UNI_SUR_HIGH_START = 0xD800; public const int UNI_SUR_HIGH_END = 0xDBFF; @@ -120,16 +124,17 @@ public static class UnicodeUtil private const long HALF_SHIFT = 10; private const long HALF_MASK = 0x3FFL; - private const int 
SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; + private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - + (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; /// - /// Encode characters from a , starting at + /// Encode characters from a (with generic type argument ) , starting at /// and ending at . After encoding, result.Offset will always be 0. /// /// is null. // TODO: broken if incoming result.offset != 0 // LUCENENET specific overload - public static void UTF16toUTF8(Span source, BytesRef result) + public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) { // LUCENENET: Added guard clause if (result is null) @@ -148,11 +153,12 @@ public static void UTF16toUTF8(Span source, BytesRef result) { @out = result.Bytes = new byte[maxLen]; } + result.Offset = 0; while (i < end) { - int code = (int)source[i++]; + var code = (int)source[i++]; if (code < 0x80) { @@ -188,6 +194,7 @@ public static void UTF16toUTF8(Span source, BytesRef result) continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -195,6 +202,7 @@ public static void UTF16toUTF8(Span source, BytesRef result) @out[upto++] = 0xBD; } } + //assert matches(source, offset, length, out, upto); result.Length = upto; } @@ -217,74 +225,8 @@ public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef r // LUCENENET: Added guard clauses if (source is null) throw new ArgumentNullException(nameof(source)); - if (result is null) - throw new ArgumentNullException(nameof(result)); - if (offset < 0) - throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative."); - if (length < 0) - throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); - if (offset > source.Length - length) // Checks for int overflow - throw new 
ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); - - int upto = 0; - int i = offset; - int end = offset + length; - var @out = result.Bytes; - // Pre-allocate for worst case 4-for-1 - int maxLen = length * 4; - if (@out.Length < maxLen) - { - @out = result.Bytes = new byte[maxLen]; - } - result.Offset = 0; - - while (i < end) - { - int code = (int)source[i++]; - if (code < 0x80) - { - @out[upto++] = (byte)code; - } - else if (code < 0x800) - { - @out[upto++] = (byte)(0xC0 | (code >> 6)); - @out[upto++] = (byte)(0x80 | (code & 0x3F)); - } - else if (code < 0xD800 || code > 0xDFFF) - { - @out[upto++] = (byte)(0xE0 | (code >> 12)); - @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); - @out[upto++] = (byte)(0x80 | (code & 0x3F)); - } - else - { - // surrogate pair - // confirm valid high surrogate - if (code < 0xDC00 && i < end) - { - var utf32 = (int)source[i]; - // confirm valid low surrogate and write pair - if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) - { - utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; - i++; - @out[upto++] = (byte)(0xF0 | (utf32 >> 18)); - @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); - @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); - @out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); - continue; - } - } - // replace unpaired surrogate or out-of-order low surrogate - // with substitution character - @out[upto++] = 0xEF; - @out[upto++] = 0xBF; - @out[upto++] = 0xBD; - } - } - //assert matches(source, offset, length, out, upto); - result.Length = upto; + UTF16toUTF8(source.AsSpan(offset, length), result); } /// @@ -312,7 +254,8 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new 
ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int end = offset + length; @@ -363,6 +306,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -370,6 +314,7 @@ public static void UTF16toUTF8(ICharSequence source, int offset, int length, Byt @out[upto++] = 0xBD; } } + //assert matches(s, offset, length, out, upto); result.Length = upto; } @@ -401,7 +346,8 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r if (length < 0) throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + throw new ArgumentOutOfRangeException(nameof(length), + $"Index and length must refer to a location within the string. 
For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); int end = offset + length; @@ -452,6 +398,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r continue; } } + // replace unpaired surrogate or out-of-order low surrogate // with substitution character @out[upto++] = 0xEF; @@ -459,6 +406,7 @@ public static void UTF16toUTF8(string source, int offset, int length, BytesRef r @out[upto++] = 0xBD; } } + //assert matches(s, offset, length, out, upto); result.Length = upto; } @@ -536,20 +484,20 @@ public static bool ValidUTF16String(ICharSequence s) // Valid surrogate pair } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } @@ -557,7 +505,8 @@ public static bool ValidUTF16String(ICharSequence s) return true; } - public static bool ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence + // LUCENENET specific overload because string doesn't implement ICharSequence + public static bool ValidUTF16String(string s) { int size = s.Length; for (int i = 0; i < size; i++) @@ -574,20 +523,20 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec // Valid surrogate pair } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } @@ -595,7 +544,8 @@ public static bool ValidUTF16String(string s) // LUCENENET specific overload bec return true; } - public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overload because 
StringBuilder doesn't implement ICharSequence + // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence + public static bool ValidUTF16String(StringBuilder s) { int size = s.Length; for (int i = 0; i < size; i++) @@ -612,20 +562,20 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl // Valid surrogate pair } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else - // Unmatched high surrogate { + // Unmatched high surrogate return false; } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } @@ -633,7 +583,9 @@ public static bool ValidUTF16String(StringBuilder s) // LUCENENET specific overl return true; } - public static bool ValidUTF16String(char[] s, int size) + public static bool ValidUTF16String(char[] s, int size) => ValidUTF16String(s.AsSpan(), size); + + public static bool ValidUTF16String(ReadOnlySpan s, int size) { for (int i = 0; i < size; i++) { @@ -659,8 +611,8 @@ public static bool ValidUTF16String(char[] s, int size) } } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) - // Unmatched low surrogate { + // Unmatched low surrogate return false; } } @@ -677,10 +629,13 @@ public static bool ValidUTF16String(char[] s, int size) /* Map UTF-8 encoded prefix byte to sequence length. -1 (0xFF) * means illegal prefix. 
see RFC 2279 for details */ internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength(); - private static int[] LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + + // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + private static int[] LoadUTF8CodeLength() { - int v = int.MinValue; - return new int[] { + const int v = int.MinValue; + return new int[] + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -721,12 +676,31 @@ public static int CodePointCount(BytesRef utf8) for (; pos < limit; codePointCount++) { int v = bytes[pos] & 0xFF; - if (v < /* 0xxx xxxx */ 0x80) { pos += 1; continue; } - if (v >= /* 110x xxxx */ 0xc0) + if (v < /* 0xxx xxxx */ 0x80) + { + pos += 1; + continue; + } + + if (v >= /* 110x xxxx */ 0xc0) { - if (v < /* 111x xxxx */ 0xe0) { pos += 2; continue; } - if (v < /* 1111 xxxx */ 0xf0) { pos += 3; continue; } - if (v < /* 1111 1xxx */ 0xf8) { pos += 4; continue; } + if (v < /* 111x xxxx */ 0xe0) + { + pos += 2; + continue; + } + + if (v < /* 1111 xxxx */ 0xf0) + { + pos += 3; + continue; + } + + if (v < /* 1111 1xxx */ 0xf8) + { + pos += 4; + continue; + } // fallthrough, consider 5 and 6 byte sequences invalid. } @@ -757,6 +731,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) { utf32.Int32s = new int[utf8.Length]; } + int utf32Count = 0; int utf8Upto = utf8.Offset; int[] ints = utf32.Int32s; @@ -796,6 +771,7 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) { v = v << 6 | bytes[utf8Upto++] & 63; } + ints[utf32Count++] = v; } @@ -825,12 +801,13 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) /// /// Value that all lead surrogate starts with. 
- private const int LEAD_SURROGATE_OFFSET_ = LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); + private const int LEAD_SURROGATE_OFFSET_ = + LEAD_SURROGATE_MIN_VALUE - (SUPPLEMENTARY_MIN_VALUE >> LEAD_SURROGATE_SHIFT_); /// /// Cover JDK 1.5 API. Create a String from an array of . /// - /// The code array. + /// The code point array. /// The start of the text in the code point array. /// The number of code points. /// a String representing the code points between offset and count. @@ -840,71 +817,25 @@ public static void UTF8toUTF32(BytesRef utf8, Int32sRef utf32) public static string NewString(int[] codePoints, int offset, int count) { // LUCENENET: Character.ToString() was optimized to use the stack for arrays - // of codepoints 256 or less, so it performs better than using ToCharArray(). + // of codepoints 256 or less, so it performs better than the Lucene implementation. return Character.ToString(codePoints, offset, count); } /// - /// Generates char array that represents the provided input code points. - /// - /// LUCENENET specific. + /// Cover JDK 1.5 API. Create a String from a span of . /// - /// The code array. - /// The start of the text in the code point array. + /// The code point span. + /// The start of the text in the code point span. /// The number of code points. - /// a char array representing the code points between offset and count. - // LUCENENET NOTE: This code was originally in the NewString() method (above). - // It has been refactored from the original to remove the exception throw/catch and - // instead proactively resizes the array instead of relying on excpetions + copy operations - public static char[] ToCharArray(int[] codePoints, int offset, int count) + /// a String representing the code points between offset and count. + /// If an invalid code point is encountered. + /// If the offset or count are out of bounds. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static string NewString(ReadOnlySpan codePoints, int offset, int count) { - if (count < 0) - { - throw new ArgumentOutOfRangeException(nameof(count), "count must be >= 0"); // LUCENENET specific - changed from IllegalArgumentException to ArgumentOutOfRangeException (.NET convention) - } - const int countThreashold = 1024; // If the number of chars exceeds this, we count them instead of allocating count * 2 - // LUCENENET: as a first approximation, assume each codepoint - // is 2 characters (since it cannot be longer than this) - int arrayLength = count * 2; - // LUCENENET: if we go over the threashold, count the number of - // chars we will need so we can allocate the precise amount of memory - if (count > countThreashold) - { - arrayLength = 0; - for (int r = offset, e = offset + count; r < e; ++r) - { - arrayLength += codePoints[r] < 0x010000 ? 1 : 2; - } - if (arrayLength < 1) - { - arrayLength = count * 2; - } - } - // Initialize our array to our exact or oversized length. - // It is now safe to assume we have enough space for all of the characters. - char[] chars = new char[arrayLength]; - int w = 0; - for (int r = offset, e = offset + count; r < e; ++r) - { - int cp = codePoints[r]; - if (cp < 0 || cp > 0x10ffff) - { - throw new ArgumentException($"Invalid code point: {cp}", nameof(codePoints)); - } - if (cp < 0x010000) - { - chars[w++] = (char)cp; - } - else - { - chars[w++] = (char)(LEAD_SURROGATE_OFFSET_ + (cp >> LEAD_SURROGATE_SHIFT_)); - chars[w++] = (char)(TRAIL_SURROGATE_MIN_VALUE + (cp & TRAIL_SURROGATE_MASK_)); - } - } - - var result = new char[w]; - Arrays.Copy(chars, result, w); - return result; + // LUCENENET: Character.ToString() was optimized to use the stack for arrays + // of codepoints 256 or less, so it performs better than the Lucene implementation. 
+ return Character.ToString(codePoints, offset, count); } // for debugging @@ -955,18 +886,37 @@ public static string ToHexString(string s) /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. /// /// NOTE: Full characters are read, even if this reads past the length passed (and - /// can result in an if invalid UTF-8 is passed). + /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// + /// // TODO: broken if chars.offset != 0 public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) + { + UTF8toUTF16(utf8.AsSpan(offset, length), chars); + } + + /// + /// Interprets the given byte span as UTF-8 and converts to UTF-16. The will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: Full characters are read, even if this reads past the length passed (and + /// can result in an if invalid UTF-8 is passed). + /// Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific overload. 
+ /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) { int out_offset = chars.Offset = 0; - char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length); - int limit = offset + length; - while (offset < limit) + char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); + int i = 0; + + while (i < utf8.Length) { - int b = utf8[offset++] & 0xff; + int b = utf8[i++] & 0xff; if (b < 0xc0) { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); @@ -974,18 +924,30 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha } else if (b < 0xe0) { - @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + if (utf8.Length <= i) + { + throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}"); + } + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } else if (b < 0xf0) { - @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); - offset += 2; + if (utf8.Length <= i + 1) + { + throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}"); + } + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; } else { + if (utf8.Length <= i + 2) + { + throw new FormatException($"Invalid UTF-8 starting at [{b:x2}] at offset {i - 1}"); + } if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); - int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); - offset += 3; + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; if (ch < UNI_MAX_BMP) { @out[out_offset++] = (char)ch; @@ -1002,12 +964,187 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha } /// - /// Utility method for - /// + /// Interprets the given byte array as 
UTF-8 and converts to UTF-16. The <paramref name="chars"/> will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: A multi-byte sequence that is truncated at the end of the input is replaced with the Unicode replacement character U+FFFD instead of throwing; this overload delegates to the span-based WithFallback overload. + /// + /// + /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions. + /// + /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16WithFallback(byte[] utf8, int offset, int length, CharsRef chars) + { + UTF8toUTF16WithFallback(utf8.AsSpan(offset, length), chars); + } + + /// + /// Interprets the given byte span as UTF-8 and converts to UTF-16. The <paramref name="chars"/> will be extended if + /// it doesn't provide enough space to hold the worst case of each byte becoming a UTF-16 codepoint. + /// + /// NOTE: A multi-byte sequence that is truncated at the end of the input is replaced with the Unicode replacement character U+FFFD instead of throwing; other bytes are not validated. + /// + /// + /// LUCENENET specific, for use in ToString() where we want to avoid throwing exceptions. 
+ /// + // TODO: broken if chars.offset != 0 + public static void UTF8toUTF16WithFallback(ReadOnlySpan<byte> utf8, CharsRef chars) + { + int out_offset = chars.Offset = 0; + char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); + int i = 0; + + while (i < utf8.Length) + { + int b = utf8[i++] & 0xff; + if (b < 0xc0) + { + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); + @out[out_offset++] = (char)b; + } + else if (b < 0xe0) + { + if (utf8.Length <= i) + { + @out[out_offset++] = (char)0xfffd; + break; + } + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); + } + else if (b < 0xf0) + { + if (utf8.Length <= i + 1) + { + @out[out_offset++] = (char)0xfffd; + break; + } + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; + } + else + { + if (utf8.Length <= i + 2) + { + @out[out_offset++] = (char)0xfffd; + break; + } + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; + if (ch < UNI_MAX_BMP) + { + @out[out_offset++] = (char)ch; + } + else + { + int chHalf = ch - 0x0010000; + @out[out_offset++] = (char)((chHalf >> 10) + 0xD800); + @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00); + } + } + } + chars.Length = out_offset - chars.Offset; + } + + /// + /// Tries to interpret the given byte array as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>. + /// + /// NOTE: Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific: array-based convenience overload that slices utf8 with AsSpan(offset, length) and delegates to the span-based overload. + /// + /// + public static bool TryUTF8toUTF16(byte[] utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? 
chars) + { + return TryUTF8toUTF16(utf8.AsSpan(offset, length), out chars); + } + + /// + /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . + /// + /// NOTE: Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// + public static bool TryUTF8toUTF16(ReadOnlySpan utf8, [NotNullWhen(true)] out CharsRef? chars) + { + CharsRef result = new CharsRef(utf8.Length); + int out_offset = 0; + char[] @out = result.Chars; + int i = 0; + + while (i < utf8.Length) + { + int b = utf8[i++] & 0xff; + if (b < 0xc0) + { + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); + @out[out_offset++] = (char)b; + } + else if (b < 0xe0) + { + if (utf8.Length <= i) + { + chars = null; + return false; + } + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); + } + else if (b < 0xf0) + { + if (utf8.Length <= i + 1) + { + chars = null; + return false; + } + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; + } + else + { + if (utf8.Length <= i + 2) + { + chars = null; + return false; + } + if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; + if (ch < UNI_MAX_BMP) + { + @out[out_offset++] = (char)ch; + } + else + { + int chHalf = ch - 0x0010000; + @out[out_offset++] = (char)((chHalf >> 10) + 0xD800); + @out[out_offset++] = (char)((chHalf & HALF_MASK) + 0xDC00); + } + } + } + result.Length = out_offset; + chars = result; + return true; + } + + /// + /// Utility method for + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { - UTF8toUTF16(bytesRef.Bytes, bytesRef.Offset, bytesRef.Length, chars); + UTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, 
bytesRef.Length), chars); + } + + /// + /// Utility method for the span-based TryUTF8toUTF16 overload: converts the bytes referenced by bytesRef (Bytes, Offset, Length), returning false with chars set to null when a multi-byte sequence is truncated. + /// + public static bool TryUTF8toUTF16(BytesRef bytesRef, [NotNullWhen(true)] out CharsRef? chars) + { + return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), out chars); + } } }