More tests, and various minor fixes

simdutf · Mar 1, 2024 · 4f34a47 · 4f34a47
1 parent ee9ee16
commit 4f34a47
Show file tree

Hide file tree

Showing 5 changed files with 79 additions and 60 deletions.
diff --git a/benchmark/UTF8_runtime.cs b/benchmark/UTF8_runtime.cs
@@ -28,7 +28,7 @@
 namespace DotnetRuntime
 {
 
-    internal static unsafe partial class Utf8Utility
+    public static unsafe partial class Utf8Utility
     {
         /// <summary>
         /// Returns <see langword="true"/> iff the low byte of <paramref name="value"/>

diff --git a/benchmark/benchmark.csproj b/benchmark/benchmark.csproj
@@ -14,7 +14,6 @@
 
   <ItemGroup>
     <ProjectReference Include="..\src\SimdUnicode.csproj" />
-    <ProjectReference Include="..\test\tests.csproj" />
   </ItemGroup>
 
   <ItemGroup>

diff --git a/src/UTF8.cs b/src/UTF8.cs
@@ -45,21 +45,6 @@ public static class UTF8
             uint codePoint = 0;
             while (pos < inputLength)
             {
-                // If the next  16 bytes are ascii, we can skip them.
-                nextPos = pos + 16;
-                if (nextPos <= inputLength)
-                { // if it is safe to read 16 more bytes, check that they are ascii
-                    ulong v1 = *(ulong*)pInputBuffer;
-                    ulong v2 = *(ulong*)(pInputBuffer + 8);
-                    ulong v = v1 | v2;
-
-                    if ((v & 0x8080808080808080) == 0)
-                    {
-                        pos = nextPos;
-                        continue;
-                    }
-
-                }
                 byte firstByte = pInputBuffer[pos];
                 while (firstByte < 0b10000000)
                 {
@@ -233,17 +218,17 @@ public static class UTF8
 
                     for (; processedLength + 32 <= inputLength; processedLength += 32)
                     {
-
                         Vector256<byte> currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
 
                         int mask = Avx2.MoveMask(currentBlock);
                         if (mask == 0)
                         {
                             // We have an ASCII block, no need to process it, but
                             // we need to check if the previous block was incomplete.
-                            if (Avx2.MoveMask(prevIncomplete) != 0)
+                            if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
                             {
-                                return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
+                                int off = processedLength >= 32 ? processedLength - 32 : processedLength;
+                                return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
                             }
                             prevIncomplete = Vector256<byte>.Zero;
                         }
@@ -264,34 +249,31 @@ public static class UTF8
                             Vector256<byte> must23 = Avx2.Or(isThirdByte, isFourthByte);
                             Vector256<byte> must23As80 = Avx2.And(must23, v80);
                             Vector256<byte> error = Avx2.Xor(must23As80, sc);
-                            if (Avx2.MoveMask(error) != 0)
+                            if (!Avx2.TestZ(error, error))
                             {
-                                return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength);
+                                int off = processedLength >= 32 ? processedLength - 32 : processedLength;
+                                return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
                             }
                             prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
                         }
                     }
+
+                                    if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
+                {
+                    int off = processedLength >= 32 ? processedLength - 32 : processedLength;
+                    return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off);
+                }
                 }
             }
-            // We have processed all the blocks using SIMD, we need to process the remaining bytes.
 
+            // We have processed all the blocks using SIMD, we need to process the remaining bytes.
             // Process the remaining bytes with the scalar function
             if (processedLength < inputLength)
             {
                 // We need to possibly backtrack to the start of the last code point
-                // worst possible case is 4 bytes, where we need to backtrack 3 bytes
-                // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
-                if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
+                while (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
                 {
                     processedLength -= 1;
-                    if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
-                    {
-                        processedLength -= 1;
-                        if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
-                        {
-                            processedLength -= 1;
-                        }
-                    }
                 }
                 byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength);
                 if (invalidBytePointer != pInputBuffer + inputLength)

diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs
@@ -11,6 +11,35 @@ public class Utf8SIMDValidationTests
     private readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1);
     private static readonly Random rand = new Random();
 
+
+    /*
+       [Fact]
+       public void TooLongErrorTest()
+       {
+           for (int trial = 0; trial < 10; trial++)
+           {
+               Console.WriteLine("Trial {0}", trial);
+
+               byte[] utf8 = generator.Generate(3*64);
+               for (int i = 0; i < utf8.Length; i++)
+               {
+                               Console.WriteLine("Trial {0} i = {1}", trial, i);
+
+                   if ((utf8[i] & 0b11000000) != 0b10000000) // Only process leading bytes
+                   {
+                       byte oldByte = utf8[i];
+                       utf8[i] = 0b10000000; // Forcing a too long error
+                       Assert.False(ValidateUtf8(utf8));
+                       Assert.True(InvalidateUtf8(utf8, i));
+                       utf8[i] = oldByte; // Restore the original byte
+                   }
+                                               Console.WriteLine("Trial {0} i = {1} DONE", trial, i);
+
+               }
+           }
+       }*/
+
+
     [Fact]
     public void TestGoodSequences()
     {
@@ -38,7 +67,7 @@ public void TestGoodSequences()
 
                     byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
                     Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult,
-                                $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}");                // byte* result = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
+                                $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}");
                 }
             }
         }
@@ -92,7 +121,7 @@ public void TestBadSequences()
 
                     byte* SIMDResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
                     Assert.True((IntPtr)(pInput + input.Length) == (IntPtr)SIMDResult,
-                                $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}");                // byte* result = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, input.Length);
+                                $"Failure in SIMD function: Utf8Utility.GetPointerToFirstInvalidByte.Sequence: {seq}");
 
                 }
             }
@@ -111,9 +140,7 @@ public void NoErrorTest()
     {
         for (int trial = 0; trial < NumTrials; trial++)
         {
-            // Console.WriteLine("Trial run:" + trial);
             byte[] utf8 = generator.Generate(512);
-            // Assert.True(ValidateUtf8(utf8),$"Failure NoErrorTest: {utf8}");
             bool isValidUtf8 = ValidateUtf8(utf8);
             string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
             Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}");
@@ -150,19 +177,6 @@ public void NoErrorTest4Bytes()
         RunTestForByteLength(4);
     }
 
-    // private void RunTestForByteLength(int byteLength)
-    // {
-    //     for (int trial = 0; trial < NumTrials; trial++)
-    //     {
-    //         // Console.WriteLine($"Trial run {trial} for byte length {byteLength}");
-    //         byte[] utf8 = generator.Generate(990, byteLength);
-    //         bool isValidUtf8 = ValidateUtf8(utf8);
-    //         // string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
-    //         // Assert.True(isValidUtf8, $"Failure NoErrorTest for {byteLength}-byte UTF8. Sequence: {utf8HexString}");
-    //         Assert.True(isValidUtf8);
-    //     }
-    // }
-
     private void RunTestForByteLength(int byteLength)
     {
         int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths
@@ -191,6 +205,7 @@ public void HeaderBitsErrorTest()
                     byte oldByte = utf8[i];
                     utf8[i] = 0b11111000; // Forcing a header bits error
                     Assert.False(ValidateUtf8(utf8));
+                    Assert.True(InvalidateUtf8(utf8, i));
                     utf8[i] = oldByte; // Restore the original byte
                 }
             }
@@ -211,6 +226,7 @@ public void TooShortErrorTest()
                     byte oldByte = utf8[i];
                     utf8[i] = 0b11100000; // Forcing a too short error
                     Assert.False(ValidateUtf8(utf8));
+                    Assert.True(InvalidateUtf8(utf8, i));
                     utf8[i] = oldByte; // Restore the original byte
                 }
             }
@@ -231,14 +247,13 @@ public void TooLongErrorTest()
                     byte oldByte = utf8[i];
                     utf8[i] = 0b10000000; // Forcing a too long error
                     Assert.False(ValidateUtf8(utf8));
+                    Assert.True(InvalidateUtf8(utf8, i));
                     utf8[i] = oldByte; // Restore the original byte
                 }
             }
         }
     }
 
-    // 
-
     [Fact]
     public void OverlongErrorTest()
     {
@@ -294,7 +309,7 @@ public void TooLargeErrorTest()
                     utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100);
 
                     Assert.False(ValidateUtf8(utf8));
-
+                    Assert.True(InvalidateUtf8(utf8, i));
                     utf8[i] = old;
                 }
             }
@@ -322,6 +337,7 @@ public void SurrogateErrorTest()
                         utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2));
 
                         Assert.False(ValidateUtf8(utf8));
+                        Assert.True(InvalidateUtf8(utf8, i));
                     }
 
                     utf8[i] = old;
@@ -334,8 +350,6 @@ public void SurrogateErrorTest()
     [Fact]
     public void BruteForceTest()
     {
-        // Random rand = new Random(); // Random instance for test
-
         for (int i = 0; i < NumTrials; i++)
         {
             // Generate random UTF-8 sequence
@@ -368,9 +382,7 @@ public void BruteForceTest()
         }
     }
 
-
     // credit: based on code from Google Fuchsia (Apache Licensed)
-
     public static bool ValidateUtf8Fuschia(byte[] data)
     {
         int pos = 0;
@@ -424,7 +436,34 @@ public static bool ValidateUtf8Fuschia(byte[] data)
         return true;
     }
 
+    // Runs the scalar, SIMD and DotnetRuntime validators on the same buffer, logs any disagreement in the reported offset, and returns true only if all three return the same pointer.
+    private bool InvalidateUtf8(byte[] utf8, int badindex) // badindex is only echoed in the diagnostic messages below
+    {
+        unsafe
+        {
+            fixed (byte* pInput = utf8) // pin the array so the raw pointers stay valid across all three calls
+            {
+                byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length);
+                int scalarOffset = (int)(scalarResult - pInput);
+                byte* simdResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInput, utf8.Length);
+                int simdOffset = (int)(simdResult - pInput);
+                int utf16CodeUnitCountAdjustment, scalarCountAdjustment; // out values demanded by the DotnetRuntime signature; not inspected here
+                byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, utf8.Length, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+                int dotnetOffset = (int)(dotnetResult - pInput);
+                if (scalarOffset != simdOffset) // diagnostic only: the mismatch is printed, the verdict comes from the return below
+                {
+                    Console.WriteLine("Suprisingly, scalarResult != simdResult {0} != {1}, badindex = {2}, length = {3}", scalarOffset, simdOffset, badindex, utf8.Length);
+                }
+                if (dotnetOffset != simdOffset) // same diagnostic, comparing the DotnetRuntime offset against the SIMD offset
+                {
+                    Console.WriteLine("Suprisingly, dotnetOffset != simdResult {0} != {1}, badindex = {2}, length = {3}", dotnetOffset, simdOffset, badindex, utf8.Length);
+                }
+                return (scalarResult == simdResult) && (simdResult == dotnetResult); // true only when all three pointers are identical
+            }
+        }
+    }
 
+    // check that all methods agree that the result is valid
     private bool ValidateUtf8(byte[] utf8)
     {
         unsafe

diff --git a/test/tests.csproj b/test/tests.csproj
@@ -26,8 +26,7 @@
 
   <ItemGroup>
     <ProjectReference Include="..\src\SimdUnicode.csproj" />
-    <!-- <ProjectReference Include="..\test\randomutf8.csproj" /> -->
-
+    <ProjectReference Include="..\benchmark\benchmark.csproj" />
   </ItemGroup>
 
 </Project>