Skip to content

Commit

Permalink
More tests, and various minor fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Daniel Lemire committed Mar 1, 2024
1 parent ee9ee16 commit 4f34a47
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 60 deletions.
2 changes: 1 addition & 1 deletion benchmark/UTF8_runtime.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
namespace DotnetRuntime
{

internal static unsafe partial class Utf8Utility
public static unsafe partial class Utf8Utility
{
/// <summary>
/// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>
Expand Down
1 change: 0 additions & 1 deletion benchmark/benchmark.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

<ItemGroup>
<ProjectReference Include="..\src\SimdUnicode.csproj" />
<ProjectReference Include="..\test\tests.csproj" />
</ItemGroup>

<ItemGroup>
Expand Down
46 changes: 14 additions & 32 deletions src/UTF8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,21 +45,6 @@ public static class UTF8
uint codePoint = 0;
while (pos < inputLength)
{
// If the next 16 bytes are ascii, we can skip them.
nextPos = pos + 16;
if (nextPos <= inputLength)
{ // if it is safe to read 16 more bytes, check that they are ascii
ulong v1 = *(ulong*)pInputBuffer;
ulong v2 = *(ulong*)(pInputBuffer + 8);
ulong v = v1 | v2;

if ((v & 0x8080808080808080) == 0)
{
pos = nextPos;
continue;
}

}
byte firstByte = pInputBuffer[pos];
while (firstByte < 0b10000000)
{
Expand Down Expand Up @@ -233,17 +218,17 @@ public static class UTF8

for (; processedLength + 32 <= inputLength; processedLength += 32)
{

Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);

int mask = Avx2.MoveMask(currentBlock);
if (mask == 0)
{
// We have an ASCII block, no need to process it, but
// we need to check if the previous block was incomplete.
if (Avx2.MoveMask(prevIncomplete) != 0)
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
{
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
int off = processedLength >= 32 ? processedLength - 32 : processedLength;
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
}
prevIncomplete = Vector256<byte>.Zero;
}
Expand All @@ -264,34 +249,31 @@ public static class UTF8
Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
Vector256<byte> must23As80 = Avx2.And(must23, v80);
Vector256<byte> error = Avx2.Xor(must23As80, sc);
if (Avx2.MoveMask(error) != 0)
if (!Avx2.TestZ(error, error))
{
return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
int off = processedLength >= 32 ? processedLength - 32 : processedLength;
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
}
prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
}
}

if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
{
int off = processedLength >= 32 ? processedLength - 32 : processedLength;
return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
}
}
}
// We have processed all the blocks using SIMD, we need to process the remaining bytes.

// We have processed all the blocks using SIMD, we need to process the remaining bytes.
// Process the remaining bytes with the scalar function
if (processedLength < inputLength)
{
// We need to possibly backtrack to the start of the last code point
// worst possible case is 4 bytes, where we need to backtrack 3 bytes
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
{
processedLength -= 1;
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
{
processedLength -= 1;
if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
{
processedLength -= 1;
}
}
}
byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
if (invalidBytePointer != pInputBuffer + inputLength)
Expand Down
87 changes: 63 additions & 24 deletions test/UTF8ValidationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,35 @@ public class Utf8SIMDValidationTests
private readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1);
private static readonly Random rand = new Random();


/*
[Fact]
public void TooLongErrorTest()
{
for (int trial = 0; trial < 10; trial++)
{
Console.WriteLine("Trial {0}", trial);
byte[] utf8 = generator.Generate(3*64);
for (int i = 0; i < utf8.Length; i++)
{
Console.WriteLine("Trial {0} i = {1}", trial, i);
if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes
{
byte oldByte = utf8[i];
utf8[i] = 0b10000000; // Forcing a too long error
Assert.False(ValidateUtf8(utf8));
Assert.True(InvalidateUtf8(utf8, i));
utf8[i] = oldByte; // Restore the original byte
}
Console.WriteLine("Trial {0} i = {1} DONE", trial, i);
}
}
}*/


[Fact]
public void TestGoodSequences()
{
Expand Down Expand Up @@ -38,7 +67,7 @@ public void TestGoodSequences()

byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult,
$"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); // byte* result = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
$"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}");
}
}
}
Expand Down Expand Up @@ -92,7 +121,7 @@ public void TestBadSequences()

byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult,
$"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}"); // byte* result = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
$"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}");

}
}
Expand All @@ -111,9 +140,7 @@ public void NoErrorTest()
{
for (int trial = 0; trial < NumTrials; trial++)
{
// Console.WriteLine("Trial run:" + trial);
byte[] utf8 = generator.Generate(512);
// Assert.True(ValidateUtf8(utf8),$"Failure NoErrorTest: {utf8}");
bool isValidUtf8 = ValidateUtf8(utf8);
string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}");
Expand Down Expand Up @@ -150,19 +177,6 @@ public void NoErrorTest4Bytes()
RunTestForByteLength(4);
}

// private void RunTestForByteLength(int byteLength)
// {
// for (int trial = 0; trial < NumTrials; trial++)
// {
// // Console.WriteLine($"Trial run {trial} for byte length {byteLength}");
// byte[] utf8 = generator.Generate(990, byteLength);
// bool isValidUtf8 = ValidateUtf8(utf8);
// // string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
// // Assert.True(isValidUtf8, $"Failure NoErrorTest for {byteLength}-byte UTF8. Sequence: {utf8HexString}");
// Assert.True(isValidUtf8);
// }
// }

private void RunTestForByteLength(int byteLength)
{
int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths
Expand Down Expand Up @@ -191,6 +205,7 @@ public void HeaderBitsErrorTest()
byte oldByte = utf8[i];
utf8[i] = 0b11111000; // Forcing a header bits error
Assert.False(ValidateUtf8(utf8));
Assert.True(InvalidateUtf8(utf8, i));
utf8[i] = oldByte; // Restore the original byte
}
}
Expand All @@ -211,6 +226,7 @@ public void TooShortErrorTest()
byte oldByte = utf8[i];
utf8[i] = 0b11100000; // Forcing a too short error
Assert.False(ValidateUtf8(utf8));
Assert.True(InvalidateUtf8(utf8, i));
utf8[i] = oldByte; // Restore the original byte
}
}
Expand All @@ -231,14 +247,13 @@ public void TooLongErrorTest()
byte oldByte = utf8[i];
utf8[i] = 0b10000000; // Forcing a too long error
Assert.False(ValidateUtf8(utf8));
Assert.True(InvalidateUtf8(utf8, i));
utf8[i] = oldByte; // Restore the original byte
}
}
}
}

//

[Fact]
public void OverlongErrorTest()
{
Expand Down Expand Up @@ -294,7 +309,7 @@ public void TooLargeErrorTest()
utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100);

Assert.False(ValidateUtf8(utf8));

Assert.True(InvalidateUtf8(utf8, i));
utf8[i] = old;
}
}
Expand Down Expand Up @@ -322,6 +337,7 @@ public void SurrogateErrorTest()
utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2));

Assert.False(ValidateUtf8(utf8));
Assert.True(InvalidateUtf8(utf8, i));
}

utf8[i] = old;
Expand All @@ -334,8 +350,6 @@ public void SurrogateErrorTest()
[Fact]
public void BruteForceTest()
{
// Random rand = new Random(); // Random instance for test

for (int i = 0; i < NumTrials; i++)
{
// Generate random UTF-8 sequence
Expand Down Expand Up @@ -368,9 +382,7 @@ public void BruteForceTest()
}
}


// credit: based on code from Google Fuchsia (Apache Licensed)

public static bool ValidateUtf8Fuschia(byte[] data)
{
int pos = 0;
Expand Down Expand Up @@ -424,7 +436,34 @@ public static bool ValidateUtf8Fuschia(byte[] data)
return true;
}

// Check that all functions agree on the result when the input might be invalid.
/// <summary>
/// Runs the scalar validator, the SIMD validator and the .NET runtime reference
/// validator over <paramref name="utf8"/> and reports whether all three stop at
/// the same byte offset.
/// </summary>
/// <param name="utf8">Buffer to validate; typically a sequence with a deliberately injected error.</param>
/// <param name="badindex">Index where the caller injected the error. Used only in the diagnostic output.</param>
/// <returns>
/// <see langword="true"/> when the three validators return the same position;
/// <see langword="false"/> otherwise (a diagnostic line is written to the console for each disagreement).
/// </returns>
private bool InvalidateUtf8(byte[] utf8, int badindex)
{
    unsafe
    {
        fixed (byte* pInput = utf8)
        {
            // Convert every returned pointer to an offset from the start of the buffer so that
            // the comparison and the diagnostics below talk about the same quantities.
            int scalarOffset = (int)(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length) - pInput);
            int simdOffset = (int)(SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length) - pInput);

            // The reference implementation also reports two count adjustments; they are not needed here.
            int dotnetOffset = (int)(DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, utf8.Length, out int _, out int _) - pInput);

            bool scalarAgrees = scalarOffset == simdOffset;
            bool dotnetAgrees = dotnetOffset == simdOffset;

            if (!scalarAgrees)
            {
                Console.WriteLine("Surprisingly, scalarOffset != simdOffset: {0} != {1}, badindex = {2}, length = {3}", scalarOffset, simdOffset, badindex, utf8.Length);
            }
            if (!dotnetAgrees)
            {
                Console.WriteLine("Surprisingly, dotnetOffset != simdOffset: {0} != {1}, badindex = {2}, length = {3}", dotnetOffset, simdOffset, badindex, utf8.Length);
            }

            return scalarAgrees && dotnetAgrees;
        }
    }
}

// check that all methods agree that the result is valid
private bool ValidateUtf8(byte[] utf8)
{
unsafe
Expand Down
3 changes: 1 addition & 2 deletions test/tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,7 @@

<ItemGroup>
<ProjectReference Include="..\src\SimdUnicode.csproj" />
<!-- <ProjectReference Include="..\test\randomutf8.csproj" /> -->

<ProjectReference Include="..\benchmark\benchmark.csproj" />
</ItemGroup>

</Project>

0 comments on commit 4f34a47

Please sign in to comment.