Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UnicodeUtil updates: TryUTF8toUTF16, ReadOnlySpan methods, #1024 #1057

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
4 changes: 2 additions & 2 deletions src/Lucene.Net.Codecs/SimpleText/SimpleTextDocValuesReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -407,10 +407,10 @@ public override int GetOrd(int docId)
SimpleTextUtil.ReadLine(_input, _scratch);
try
{
// LUCNENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in.
// LUCENENET: .NET doesn't have a way to specify a pattern with integer, but all of the standard ones are built in.
return int.Parse(_scratch.Utf8ToString(), NumberStyles.Integer, CultureInfo.InvariantCulture) - 1;
}
catch (Exception pe) when (pe.IsParseException())
catch (Exception pe) when (pe.IsParseException() || pe.IsNumberFormatException())
{
var e = new CorruptIndexException($"failed to parse ord (resource={_input})", pe);
throw e;
Expand Down
7 changes: 4 additions & 3 deletions src/Lucene.Net.Codecs/SimpleText/SimpleTextUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ public static void ReadLine(DataInput input, BytesRef scratch)
{
break;
}

scratch.Bytes[upto++] = b;
}
}
Expand All @@ -106,8 +106,9 @@ public static void CheckFooter(ChecksumIndexInput input)

if (StringHelper.StartsWith(scratch, CHECKSUM) == false)
{
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
throw new CorruptIndexException("SimpleText failure: expected checksum line but got " +
scratch.Utf8ToString() + " (resource=" + input + ")");
scratch.Utf8ToStringWithFallback() + " (resource=" + input + ")");
}
var actualChecksum =
(new BytesRef(scratch.Bytes, CHECKSUM.Length, scratch.Length - CHECKSUM.Length)).Utf8ToString();
Expand All @@ -124,4 +125,4 @@ public static void CheckFooter(ChecksumIndexInput input)
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public class DefaultSortedSetDocValuesReaderState : SortedSetDocValuesReaderStat

/// <summary>
/// Creates this, pulling doc values from the specified
/// field.
/// field.
/// </summary>
public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
{
Expand Down Expand Up @@ -79,7 +79,8 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
string[] components = FacetsConfig.StringToPath(spare.Utf8ToString());
if (components.Length != 2)
{
throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToString());
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
throw new ArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.ToString(components) + " " + spare.Utf8ToStringWithFallback());
}
if (!components[0].Equals(lastDim, StringComparison.Ordinal))
{
Expand All @@ -101,7 +102,7 @@ public DefaultSortedSetDocValuesReaderState(IndexReader reader, string field = F
/// <summary>
/// Return top-level doc values.
/// </summary>
public override SortedSetDocValues GetDocValues()
public override SortedSetDocValues GetDocValues()
{
return topReader.GetSortedSetDocValues(field);
}
Expand Down Expand Up @@ -132,4 +133,4 @@ public override OrdRange GetOrdRange(string dim)
/// </summary>
public override int Count => valueCount;
}
}
}
2 changes: 1 addition & 1 deletion src/Lucene.Net.Grouping/AbstractGroupFacetCollector.cs
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,7 @@ public override int GetHashCode()
public override string ToString()
{
return "FacetEntry{" +
"value=" + value.Utf8ToString() +
"value=" + value.Utf8ToStringWithFallback() + // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
", count=" + count +
'}';
}
Expand Down
34 changes: 17 additions & 17 deletions src/Lucene.Net.Join/TermsIncludingScoreQuery.cs
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public WeightAnonymousClass(TermsIncludingScoreQuery outerInstance, Weight origi


private TermsEnum segmentTermsEnum;

public override Explanation Explain(AtomicReaderContext context, int doc)
{
SVInnerScorer scorer = (SVInnerScorer) GetBulkScorer(context, false, null);
Expand All @@ -161,7 +161,7 @@ public override void Normalize(float norm, float topLevelBoost)
{
originalWeight.Normalize(norm, topLevelBoost*outerInstance.Boost);
}

public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)
{
Terms terms = context.AtomicReader.GetTerms(outerInstance._field);
Expand All @@ -181,7 +181,7 @@ public override Scorer GetScorer(AtomicReaderContext context, IBits acceptDocs)

return new SVInOrderScorer(outerInstance, this, acceptDocs, segmentTermsEnum, context.AtomicReader.MaxDoc, cost);
}

public override BulkScorer GetBulkScorer(AtomicReaderContext context, bool scoreDocsInOrder, IBits acceptDocs)
{
if (scoreDocsInOrder)
Expand Down Expand Up @@ -236,7 +236,7 @@ internal SVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight,
//_cost = cost; // LUCENENET: Never read
_doc = -1;
}

public override bool Score(ICollector collector, int max)
{
FakeScorer fakeScorer = new FakeScorer();
Expand Down Expand Up @@ -285,12 +285,12 @@ private int NextDocOutOfOrder()
}
}
}

/// <summary>
/// Advances the underlying <c>docsEnum</c> and returns whatever its
/// <c>NextDoc()</c> call yields. Virtual so subclasses can substitute
/// their own advancing logic.
/// </summary>
protected virtual int DocsEnumNextDoc() => docsEnum.NextDoc();

internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibility from private to internal
{
int docId;
Expand All @@ -314,7 +314,7 @@ internal Explanation Explain(int target) // LUCENENET NOTE: changed accessibilit
} while (docId != DocIdSetIterator.NO_MORE_DOCS);

return new ComplexExplanation(true, outerInstance._scores[outerInstance._ords[_scoreUpto]],
"Score based on join value " + _termsEnum.Term.Utf8ToString());
"Score based on join value " + _termsEnum.Term.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
}
}

Expand All @@ -326,13 +326,13 @@ internal class MVInnerScorer : SVInnerScorer
internal readonly FixedBitSet alreadyEmittedDocs;

internal MVInnerScorer(TermsIncludingScoreQuery outerInstance, /* Weight weight, // LUCENENET: Never read */
IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
IBits acceptDocs, TermsEnum termsEnum, int maxDoc /*, long cost // LUCENENET: Never read */)
: base(outerInstance, /*weight, // LUCENENET: Never read */
acceptDocs, termsEnum /*, cost // LUCENENET: Never read */)
{
alreadyEmittedDocs = new FixedBitSet(maxDoc);
}

protected override int DocsEnumNextDoc()
{
while (true)
Expand Down Expand Up @@ -360,11 +360,11 @@ internal class SVInOrderScorer : Scorer
internal readonly long cost;

internal int currentDoc = -1;

[SuppressMessage("CodeQuality", "IDE0079:Remove unnecessary suppression", Justification = "This is a SonarCloud issue")]
[SuppressMessage("CodeQuality", "S1699:Constructors should only call non-overridable methods", Justification = "Internal class")]
internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight, IBits acceptDocs,
TermsEnum termsEnum, int maxDoc, long cost)
TermsEnum termsEnum, int maxDoc, long cost)
: base(weight)
{
this.m_outerInstance = outerInstance;
Expand All @@ -374,7 +374,7 @@ internal SVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
matchingDocsIterator = matchingDocs.GetIterator();
this.cost = cost;
}

protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
TermsEnum termsEnum)
{
Expand All @@ -398,12 +398,12 @@ protected virtual void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptD
}
}
}

/// <summary>
/// Returns the entry of <c>scores</c> indexed by the current document
/// (<c>currentDoc</c>).
/// </summary>
public override float GetScore() => scores[currentDoc];

/// <summary>
/// Always reports a frequency of 1.
/// </summary>
public override int Freq => 1;

/// <summary>
/// The document this scorer is currently positioned on (<c>currentDoc</c>).
/// </summary>
public override int DocID => currentDoc;
Expand All @@ -412,7 +412,7 @@ public override int NextDoc()
{
return currentDoc = matchingDocsIterator.NextDoc();
}

public override int Advance(int target)
{
return currentDoc = matchingDocsIterator.Advance(target);
Expand All @@ -432,7 +432,7 @@ internal MVInOrderScorer(TermsIncludingScoreQuery outerInstance, Weight weight,
: base(outerInstance, weight, acceptDocs, termsEnum, maxDoc, cost)
{
}

protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits acceptDocs,
TermsEnum termsEnum)
{
Expand Down Expand Up @@ -465,4 +465,4 @@ protected override void FillDocsAndScores(FixedBitSet matchingDocs, IBits accept
}
}
}
}
}
3 changes: 2 additions & 1 deletion src/Lucene.Net.Misc/Misc/TermStats.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ internal string GetTermText()

public override string ToString()
{
return ("TermStats: Term=" + TermText.Utf8ToString() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq);
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
return "TermStats: Term=" + TermText.Utf8ToStringWithFallback() + " DocFreq=" + DocFreq + " TotalTermFreq=" + TotalTermFreq;
}
}
}
2 changes: 1 addition & 1 deletion src/Lucene.Net.Queries/TermsFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ public override string ToString()
}
first = false;
builder.Append(current.field).Append(':');
builder.Append(spare.Utf8ToString());
builder.Append(spare.Utf8ToStringWithFallback()); // LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
}
}

Expand Down
3 changes: 2 additions & 1 deletion src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ internal Completion(BytesRef key, int bucket)

public override string ToString()
{
return Utf8.Utf8ToString() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
// LUCENENET specific - use Utf8ToStringWithFallback() to handle invalid UTF-8 bytes
return Utf8.Utf8ToStringWithFallback() + "/" + Bucket.ToString("0.0", CultureInfo.InvariantCulture);
}

/// <seealso cref="BytesRef.CompareTo(object)"></seealso>
Expand Down
49 changes: 49 additions & 0 deletions src/Lucene.Net.Tests/Util/TestUnicodeUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -327,5 +327,54 @@ public virtual void TestUTF8UTF16CharsRef()
Assert.AreEqual(cRef.ToString(), unicode);
}
}

/// <summary>
/// Checks the throwing behavior of <c>UnicodeUtil.UTF8toUTF16</c>: inputs that end
/// in a dangling multi-byte lead byte are expected to raise <see cref="FormatException"/>,
/// while the well-formed input is expected to convert without throwing.
/// </summary>
/// <param name="invalidUtf8">The UTF-8 bytes handed to the converter (well-formed when <paramref name="shouldThrow"/> is false).</param>
/// <param name="shouldThrow">Whether the conversion is expected to throw.</param>
[Test]
[LuceneNetSpecific]
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, true)] // "ca" + dangling lead byte of a 2-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, true)] // "ca" + dangling lead byte of a 3-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, true)] // "ca" + dangling lead byte of a 4-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, false)] // "cañon", well-formed
public void TestUTF8toUTF16Exception(byte[] invalidUtf8, bool shouldThrow)
{
    var target = new CharsRef();

    if (!shouldThrow)
    {
        // Well-formed input: the call itself is the check; any exception fails the test.
        UnicodeUtil.UTF8toUTF16(invalidUtf8, target);
        return;
    }

    Assert.Throws<FormatException>(() => UnicodeUtil.UTF8toUTF16(invalidUtf8, target));
}

/// <summary>
/// Round-trips a random realistic Unicode string through UTF-8 bytes and
/// <c>UnicodeUtil.TryUTF8toUTF16</c>, asserting that the call reports success
/// and that the decoded characters equal the original string.
/// </summary>
[Test]
[LuceneNetSpecific] // this is a Lucene.NET specific method
[Repeat(100)]
public void TestTryUTF8toUTF16()
{
    // Arrange: a random string and its UTF-8 encoding wrapped in a BytesRef.
    string original = TestUtil.RandomRealisticUnicodeString(Random);
    var encoded = IOUtils.CHARSET_UTF_8.GetBytes(original);
    var input = new BytesRef(encoded);

    // Act
    bool converted = UnicodeUtil.TryUTF8toUTF16(input, out var decoded);

    // Assert
    Assert.IsTrue(converted);
    Assert.AreEqual(original, decoded?.ToString());
}

/// <summary>
/// Checks that <c>UnicodeUtil.UTF8toUTF16WithFallback</c> produces the expected
/// string: a dangling multi-byte lead byte is expected to decode as U+FFFD,
/// and well-formed input is expected to decode unchanged.
/// </summary>
/// <param name="utf8">The UTF-8 bytes to decode.</param>
/// <param name="expected">The string the decoded output must equal.</param>
[Test]
[LuceneNetSpecific] // this is a Lucene.NET specific method
[TestCase(new byte[] { 0x63, 0x61, 0xc3 }, "ca\ufffd")] // "ca" + dangling lead byte of a 2-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xe3 }, "ca\ufffd")] // "ca" + dangling lead byte of a 3-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xf3 }, "ca\ufffd")] // "ca" + dangling lead byte of a 4-byte sequence
[TestCase(new byte[] { 0x63, 0x61, 0xc3, 0xb1, 0x6f, 0x6e }, "cañon")]
public void TestUTF8toUTF16WithFallback(byte[] utf8, string expected)
{
    var decoded = new CharsRef();
    UnicodeUtil.UTF8toUTF16WithFallback(utf8, decoded);

    string actual = decoded.ToString();
    Assert.AreEqual(expected, actual);
}
}
}
Loading
Loading