diff --git a/README.md b/README.md index 6915db7..83245c7 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,31 @@ This is a fast C# library to validate UTF-8 strings. ## Motivation -We seek to speed up the `Utf8Utility.GetPointerToFirstInvalidByte` function. Using the algorithm used by Node.js, Oracle GraalVM and other important systems. - -- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice and Experience 51 (5), 2021 +We seek to speed up the `Utf8Utility.GetPointerToFirstInvalidByte` function from the C# runtime library. +[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually. -The algorithm in question is part of popular JavaScript runtimes such as Node.js and Bun, [by PHP](https://github.com/php/php-src/blob/90e0ce7f0db99767c58dc21e4213c0f8763f657a/ext/mbstring/mbstring.c#L5270), by Oracle GraalVM and many important systems. +Specifically, we provide the function `SimdUnicode.UTF8.GetPointerToFirstInvalidByte` which is a faster +drop-in replacement: +```cs +// Returns &inputBuffer[inputLength] if the input buffer is valid. +/// <summary> +/// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>, +/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>. +/// The parameter <paramref name="Utf16CodeUnitCountAdjustment"/> is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 2-byte character, -2 for each 3-byte and 4-byte characters. +/// The parameter <paramref name="ScalarCodeUnitCountAdjustment"/> is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 4-byte character. +/// </summary> +/// <remarks> +/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed. 
+/// </remarks> +public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment); +``` -[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually. +The function uses advanced instructions (SIMD) on 64-bit ARM and x64 processors, but falls back on a +conventional implementation on other systems. We provide extensive tests and benchmarks. +We apply the algorithm used by Node.js, Bun, Oracle GraalVM, the PHP interpreter and other important systems. The algorithm has been described in the following article: +- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice and Experience 51 (5), 2021 ## Requirements @@ -30,6 +46,11 @@ dotnet test To see which tests are running, we recommend setting the verbosity level: +``` +dotnet test -v=normal +``` + +For more details, use the detailed verbosity level: ``` dotnet test -v d ``` @@ -44,7 +65,7 @@ To run specific tests, it is helpful to use the filter parameter: ``` -dotnet test --filter TooShortErrorAVX +dotnet test --filter TooShortErrorAvx2 ``` Or to target specific categories: @@ -89,7 +110,6 @@ dotnet build We recommend you use `dotnet format`. E.g., ``` -cd test dotnet format ``` @@ -115,6 +135,7 @@ You can print the content of a vector register like so: ## Performance tips - Be careful: `Vector128.Shuffle` is not the same as `Ssse3.Shuffle` nor is `Vector128.Shuffle` the same as `Avx2.Shuffle`. Prefer the latter. +- Similarly, `Vector128.Shuffle` is not the same as `AdvSimd.Arm64.VectorTableLookup`; use the latter. 
## More reading diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs index c48e075..0900059 100644 --- a/benchmark/Benchmark.cs +++ b/benchmark/Benchmark.cs @@ -27,13 +27,17 @@ public class Speed : IColumn { public string GetValue(Summary summary, BenchmarkCase benchmarkCase) { + if (summary is null || benchmarkCase is null || benchmarkCase.Parameters is null) + { + return "N/A"; + } var ourReport = summary.Reports.First(x => x.BenchmarkCase.Equals(benchmarkCase)); var fileName = (string)benchmarkCase.Parameters["FileName"]; - long length = new System.IO.FileInfo(fileName).Length; - if (ourReport.ResultStatistics is null) + if (ourReport is null || ourReport.ResultStatistics is null) { return "N/A"; } + long length = new System.IO.FileInfo(fileName).Length; var mean = ourReport.ResultStatistics.Mean; return $"{(length / ourReport.ResultStatistics.Mean):#####.00}"; } @@ -46,8 +50,8 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase) public string ColumnName { get; } = "Speed (GB/s)"; public bool AlwaysShow { get; } = true; public ColumnCategory Category { get; } = ColumnCategory.Custom; - public int PriorityInCategory { get; } = 0; - public bool IsNumeric { get; } = false; + public int PriorityInCategory { get; } + public bool IsNumeric { get; } public UnitType UnitType { get; } = UnitType.Dimensionless; public string Legend { get; } = "The speed in gigabytes per second"; } @@ -57,8 +61,8 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase) [Config(typeof(Config))] public class RealDataBenchmark { - - private class Config : ManualConfig +#pragma warning disable CA1812 + private sealed class Config : ManualConfig { public Config() { @@ -67,6 +71,7 @@ public Config() if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64) { +#pragma warning disable CA1303 Console.WriteLine("ARM64 system detected."); AddFilter(new AnyCategoriesFilter(["arm64", "scalar", "runtime"])); @@ -75,21 +80,25 @@ public Config() { if 
(Vector512.IsHardwareAccelerated && System.Runtime.Intrinsics.X86.Avx512Vbmi.IsSupported) { +#pragma warning disable CA1303 Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX-512 support."); AddFilter(new AnyCategoriesFilter(["avx512", "avx", "sse", "scalar", "runtime"])); } else if (Avx2.IsSupported) { +#pragma warning disable CA1303 Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX2 support."); AddFilter(new AnyCategoriesFilter(["avx", "sse", "scalar", "runtime"])); } else if (Ssse3.IsSupported) { +#pragma warning disable CA1303 Console.WriteLine("X64 system detected (Intel, AMD,...) with Sse4.2 support."); AddFilter(new AnyCategoriesFilter(["sse", "scalar", "runtime"])); } else { +#pragma warning disable CA1303 Console.WriteLine("X64 system detected (Intel, AMD,...) without relevant SIMD support."); AddFilter(new AnyCategoriesFilter(["scalar", "runtime"])); } @@ -130,14 +139,15 @@ public Config() @"data/thai.utf8.txt", @"data/turkish.utf8.txt", @"data/vietnamese.utf8.txt")] +#pragma warning disable CA1051 public string? 
FileName; - public byte[] allLinesUtf8 = new byte[0]; + private byte[] allLinesUtf8 = Array.Empty(); public unsafe delegate byte* Utf8ValidationFunction(byte* pUtf8, int length); public unsafe delegate byte* DotnetRuntimeUtf8ValidationFunction(byte* pUtf8, int length, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); - public void RunUtf8ValidationBenchmark(byte[] data, Utf8ValidationFunction validationFunction) + private void RunUtf8ValidationBenchmark(byte[] data, Utf8ValidationFunction validationFunction) { unsafe { @@ -146,13 +156,13 @@ public void RunUtf8ValidationBenchmark(byte[] data, Utf8ValidationFunction valid var res = validationFunction(pUtf8, data.Length); if (res != pUtf8 + data.Length) { - throw new Exception("Invalid UTF-8: I expected the pointer to be at the end of the buffer."); + throw new ArgumentException("Invalid UTF-8: I expected the pointer to be at the end of the buffer."); } } } } - public void RunDotnetRuntimeUtf8ValidationBenchmark(byte[] data, DotnetRuntimeUtf8ValidationFunction validationFunction) + private void RunDotnetRuntimeUtf8ValidationBenchmark(byte[] data, DotnetRuntimeUtf8ValidationFunction validationFunction) { unsafe { @@ -183,20 +193,17 @@ public unsafe void SIMDUtf8ValidationRealData() { if (allLinesUtf8 != null) { - // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByte); + RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) => + { + int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment; + // Call the method with additional out parameters within the lambda. + // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate. 
+ return SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment); + }); } } [Benchmark] - // [BenchmarkCategory("scalar")] - // public unsafe void Utf8ValidationRealDataScalar() - // { - // if (allLinesUtf8 != null) - // { - // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); - // } - // } - [BenchmarkCategory("scalar")] public unsafe void Utf8ValidationRealDataScalar() { @@ -213,45 +220,48 @@ public unsafe void Utf8ValidationRealDataScalar() } } - [Benchmark] [BenchmarkCategory("arm64")] public unsafe void SIMDUtf8ValidationRealDataArm64() { if (allLinesUtf8 != null) { - RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) => + { + int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment; + // Call the method with additional out parameters within the lambda. + // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate. 
+ return SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment); + }); } + } - // [Benchmark] - // [BenchmarkCategory("avx")] - // public unsafe void SIMDUtf8ValidationRealDataAvx2() - // { - // if (allLinesUtf8 != null) - // { - // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); - // } - // } + [Benchmark] - [BenchmarkCategory("sse")] - public unsafe void SIMDUtf8ValidationRealDataSse() + [BenchmarkCategory("avx")] + public unsafe void SIMDUtf8ValidationRealDataAvx2() { if (allLinesUtf8 != null) { - RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); + RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) => + { + int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment; + // Call the method with additional out parameters within the lambda. + // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate. 
+ return SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment); + }); } } - /* - // TODO: enable this benchmark when the AVX-512 implementation is ready + [Benchmark] - [BenchmarkCategory("avx512")] - public unsafe void SIMDUtf8ValidationRealDataAvx512() + [BenchmarkCategory("sse")] + public unsafe void SIMDUtf8ValidationRealDataSse() { if (allLinesUtf8 != null) { - RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); + RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); } - }*/ + } } public class Program diff --git a/src/Ascii.cs b/src/Ascii.cs index e92845b..9eb94b5 100644 --- a/src/Ascii.cs +++ b/src/Ascii.cs @@ -25,6 +25,7 @@ public unsafe static class Ascii public static bool IsAscii(this string s) { + if (s == null) return true; foreach (var c in s) { if (!c.IsAscii()) return false; diff --git a/src/UTF8.cs b/src/UTF8.cs index 6eef42d..0a96ece 100644 --- a/src/UTF8.cs +++ b/src/UTF8.cs @@ -10,8 +10,42 @@ namespace SimdUnicode public static class UTF8 { + // Returns &inputBuffer[inputLength] if the input buffer is valid. + /// + /// Given an input buffer of byte length , + /// returns a pointer to where the first invalid data appears in . + /// The parameter is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 2-byte character, -2 for each 3-byte and 4-byte characters. + /// The parameter is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 4-byte character. + /// + /// + /// Returns a pointer to the end of if the buffer is well-formed. 
+ /// + public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment) + { + + if (AdvSimd.Arm64.IsSupported) + { + return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); + } + if (Avx2.IsSupported) + { + return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); + } + /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) + { + return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength); + }*/ + // if (Ssse3.IsSupported) + // { + // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); + // } + // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); + + return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); + + } // prevents double counting in case there is a toolong error on the edge - public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte) + private static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte) { // Check if the header byte belongs to a 2-byte UTF-8 character if ((headerByte & 0b11100000) == 0b11000000) @@ -32,10 +66,141 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt return (0, 0); } - - public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the + // pointer to the first invalid byte. 
+ private unsafe static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) { + int extraLen = 0; + bool foundLeadingBytes = false; + + for (int i = 0; i <= howFarBack; i++) + { + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + + if (foundLeadingBytes) + { + buf -= i; + extraLen = i; + break; + } + } + + if (!foundLeadingBytes) + { + return buf - howFarBack; + } + + int pos = 0; + int nextPos; + uint codePoint = 0; + + len += extraLen; + + while (pos < len) + { + + byte firstByte = buf[pos]; + + while (firstByte < 0b10000000) + { + if (++pos == len) + { + return buf + len; + } + firstByte = buf[pos]; + } + + if ((firstByte & 0b11100000) == 0b11000000) + { + nextPos = pos + 2; + if (nextPos > len) + { + return buf + pos; + } // Too short + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { + return buf + pos; + } // Too short + // range check + codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buf[pos + 1] & 0b00111111); + if ((codePoint < 0x80) || (0x7ff < codePoint)) + { + return buf + pos; + } // Overlong + } + else if ((firstByte & 0b11110000) == 0b11100000) + { + nextPos = pos + 3; + if (nextPos > len) + { + return buf + pos; + } // Too short + // range check + codePoint = (uint)(firstByte & 0b00001111) << 12 | + (uint)(buf[pos + 1] & 0b00111111) << 6 | + (uint)(buf[pos + 2] & 0b00111111); + // Either overlong or too large: + if ((codePoint < 0x800) || (0xffff < codePoint) || + (0xd7ff < codePoint && codePoint < 0xe000)) + { + return buf + pos; + } + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { + return buf + pos; + } // Too short + if ((buf[pos + 2] & 0b11000000) != 0b10000000) + { + return buf + pos; + } // Too short + } + else if ((firstByte & 0b11111000) == 0b11110000) + { + nextPos = pos + 4; + if (nextPos > len) + { + return buf + pos; + } + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { + return buf + pos; + } + if ((buf[pos + 2] & 0b11000000) != 0b10000000) + { 
+ return buf + pos; + } + if ((buf[pos + 3] & 0b11000000) != 0b10000000) + { + return buf + pos; + } + // range check + codePoint = + (uint)(firstByte & 0b00000111) << 18 | (uint)(buf[pos + 1] & 0b00111111) << 12 | + (uint)(buf[pos + 2] & 0b00111111) << 6 | (uint)(buf[pos + 3] & 0b00111111); + if (codePoint <= 0xffff || 0x10ffff < codePoint) + { + return buf + pos; + } + } + else + { + // we may have a continuation/too long error + return buf + pos; + } + pos = nextPos; + } + + return buf + len; // no error + } + + // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the + // pointer to the first invalid byte. Also updated the utf16CodeUnitCountAdjustment and scalarCountAdjustment + private unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment) + { int extraLen = 0; bool foundLeadingBytes = false; @@ -252,7 +417,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt // ... pInputBuffer[returnedvalue - 1] should be continuation bytes. // Note that this function is unsafe, and it is the caller's responsibility // to ensure that we can read at least 4 bytes before pInputBuffer. - public unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii, int contbyte, int n4) adjustmentFactor(byte* pInputBuffer) + private unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii, int contbyte, int n4) adjustmentFactor(byte* pInputBuffer) { // Find the first non-continuation byte, working backward. 
int i = 1; @@ -281,7 +446,7 @@ public unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii, return (4 - i, i, 0, contbyteadjust, -1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte. } - public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) + private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte) { int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte; int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte; @@ -291,7 +456,7 @@ public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustment return (utfadjust, scalaradjust); } - public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) + private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes) { // Calculate the total bytes from start_point to processedLength int totalbyte = processedLength - start_point; @@ -461,16 +626,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment = 0; - int TempScalarCountAdjustment = 0; - - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - if (pInputBuffer == null || inputLength <= 0) { - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment; + 
utf16CodeUnitCountAdjustment = 0; + scalarCountAdjustment = 0; return pInputBuffer; } if (inputLength > 128) @@ -598,12 +757,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( int asciibytes = 0; // number of ascii bytes in the block (could also be called n1) int contbytes = 0; // number of continuation bytes in the block int n4 = 0; // number of 4-byte sequences that start in this block - for (; processedLength + 32 <= inputLength; processedLength += 32) { Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength); - int mask = Avx2.MoveMask(currentBlock); if (mask == 0) { @@ -612,14 +769,20 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) { - int totalbyteasciierror = processedLength - start_point; - var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); - - utf16CodeUnitCountAdjustment = utfadjustasciierror; - scalarCountAdjustment = scalaradjustasciierror; - int off = processedLength >= 3 ? 
processedLength - 3 : processedLength; - return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment); + byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + // So the code is correct up to invalidBytePointer + if (invalidBytePointer < pInputBuffer + processedLength) + { + removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes); + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + int totalbyteasciierror = processedLength - start_point; + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); + return invalidBytePointer; } prevIncomplete = Vector256.Zero; } @@ -646,31 +809,29 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( if (!Avx2.TestZ(error, error)) { - int off = processedLength > 32 ? 
processedLength - 32 : processedLength;// this does not backup ff processedlength = 32 - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment); - utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TailScalarCodeUnitCountAdjustment; - - int totalbyteasciierror = processedLength - start_point; - var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes); - - utf16CodeUnitCountAdjustment += utfadjustasciierror; - scalarCountAdjustment += scalaradjustasciierror; - + byte* invalidBytePointer; + if (processedLength == 0) + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + } + else + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + } + if (invalidBytePointer < pInputBuffer + processedLength) + { + removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes); + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point)); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed); return invalidBytePointer; } prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); - - if (!Avx2.TestZ(prevIncomplete, prevIncomplete)) - { - // We have an unterminated sequence. 
- var (totalbyteadjustment, i, tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32); - processedLength -= i; - n4 += tempn4; - contbytes += tempcont; - } - contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(byte_2_high)); // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte))); @@ -681,50 +842,53 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // and no expensive operation: asciibytes += (int)(32 - Popcnt.PopCount((uint)mask)); } + // We may still have an error. + if (processedLength < inputLength || !Avx2.TestZ(prevIncomplete, prevIncomplete)) + { + byte* invalidBytePointer; + if (processedLength == 0) + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + } + else + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); - int totalbyte = processedLength - start_point; - var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte); - - TempUtf16CodeUnitCountAdjustment = utf16adjust; - TempScalarCountAdjustment = scalaradjust; - } - - - } - // We have processed all the blocks using SIMD, we need to process the remaining bytes. 
- // Process the remaining bytes with the scalar function - - // worst possible case is 4 bytes, where we need to backtrack 3 bytes - // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte - if (processedLength < inputLength) - { - - byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment); - if (invalidBytePointer != pInputBuffer + inputLength) - { - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment; - - // An invalid byte was found by the scalar function - return invalidBytePointer; + } + if (invalidBytePointer != pInputBuffer + inputLength) + { + if (invalidBytePointer < pInputBuffer + processedLength) + { + removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes); + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point)); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed); + return invalidBytePointer; + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + } + int final_total_bytes_processed = inputLength - start_point; + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, final_total_bytes_processed); + return pInputBuffer + inputLength; } } - utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment; - scalarCountAdjustment = TempScalarCountAdjustment + 
TailScalarCodeUnitCountAdjustment; - - return pInputBuffer + inputLength; + return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); } - public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength) + public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { int processedLength = 0; - int TempUtf16CodeUnitCountAdjustment = 0; - int TempScalarCountAdjustment = 0; - - int utf16CodeUnitCountAdjustment = 0, scalarCountAdjustment = 0; - if (pInputBuffer == null || inputLength <= 0) { + utf16CodeUnitCountAdjustment = 0; + scalarCountAdjustment = 0; return pInputBuffer; } if (inputLength > 128) @@ -793,18 +957,37 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( Vector128 v0f = Vector128.Create((byte)0x0F); Vector128 v80 = Vector128.Create((byte)0x80); // Performance note: we could process 64 bytes at a time for better speed in some cases. + int start_point = processedLength; + + // The block goes from processedLength to processedLength/16*16. + int asciibytes = 0; // number of ascii bytes in the block (could also be called n1) + int contbytes = 0; // number of continuation bytes in the block + int n4 = 0; // number of 4-byte sequences that start in this block for (; processedLength + 16 <= inputLength; processedLength += 16) { Vector128 currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength); - if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() > 127) + if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. 
if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment); + int off = processedLength >= 3 ? processedLength - 3 : processedLength; + byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + // So the code is correct up to invalidBytePointer + if (invalidBytePointer < pInputBuffer + processedLength) + { + removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes); + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + int totalbyteasciierror = processedLength - start_point; + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror); + return invalidBytePointer; } prevIncomplete = Vector128.Zero; } @@ -812,9 +995,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( { // Contains non-ASCII characters, we need to do non-trivial processing Vector128 prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1)); - Vector128 byte_1_high = Vector128.Shuffle(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); - Vector128 byte_1_low = Vector128.Shuffle(shuf2, (prev1 & v0f)); - Vector128 byte_2_high = Vector128.Shuffle(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); + // Vector128.Shuffle vs AdvSimd.Arm64.VectorTableLookup: prefer the latter!!! 
+ Vector128 byte_1_high = AdvSimd.Arm64.VectorTableLookup(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); + Vector128 byte_1_low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f)); + Vector128 byte_2_high = AdvSimd.Arm64.VectorTableLookup(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); Vector128 sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high); Vector128 prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2)); Vector128 prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3)); @@ -829,68 +1013,116 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust( // hardware: if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)).ToScalar() != 0) { - return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment); + byte* invalidBytePointer; + if (processedLength == 0) + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + } + else + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + } + if (invalidBytePointer < pInputBuffer + processedLength) + { + removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes); + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point)); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed); + return invalidBytePointer; } prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, 
maxValue); + Vector128 largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111 + contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar(); + + // computing n4 is more expensive than we would like: + Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1)); + Vector128 largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne); + byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar(); + int negn4add = (int)(byte)-n4add; + n4 += negn4add; } + asciibytes -= (sbyte)AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThan(currentBlock, v80)).ToScalar(); } - } - } - // We have processed all the blocks using SIMD, we need to process the remaining bytes. - // Process the remaining bytes with the scalar function - if (processedLength < inputLength) - { - // We need to possibly backtrack to the start of the last code point - // worst possible case is 4 bytes, where we need to backtrack 3 bytes - // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte - if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) - { - processedLength -= 1; - if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) + // We may still have an error. 
+ if (processedLength < inputLength || AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0) { - processedLength -= 1; - if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65) + byte* invalidBytePointer; + if (processedLength == 0) { - processedLength -= 1; + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + } + else + { + invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + } + if (invalidBytePointer != pInputBuffer + inputLength) + { + if (invalidBytePointer < pInputBuffer + processedLength) + { + removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes); + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); + } + int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point)); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed); + return invalidBytePointer; + } + else + { + addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes); } } + int final_total_bytes_processed = inputLength - start_point; + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, final_total_bytes_processed); + return pInputBuffer + inputLength; } - int TailScalarCodeUnitCountAdjustment = 0; - int TailUtf16CodeUnitCountAdjustment = 0; - byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out TailUtf16CodeUnitCountAdjustment, out TailScalarCodeUnitCountAdjustment); - if (invalidBytePointer != pInputBuffer + inputLength) + } + return 
GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + } + + private static unsafe void removeCounters(byte* start, byte* end, ref int asciibytes, ref int n4, ref int contbytes) + { + for (byte* p = start; p < end; p++) + { + if ((*p & 0b10000000) == 0) { - // An invalid byte was found by the scalar function - return invalidBytePointer; + asciibytes -= 1; + } + if ((*p & 0b11000000) == 0b10000000) + { + contbytes -= 1; + } + if ((*p & 0b11110000) == 0b11110000) + { + n4 -= 1; } } - - return pInputBuffer + inputLength; } - public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment) - { - // if (AdvSimd.Arm64.IsSupported) - // { - // return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength); - // } - if (Avx2.IsSupported) + private static unsafe void addCounters(byte* start, byte* end, ref int asciibytes, ref int n4, ref int contbytes) + { + for (byte* p = start; p < end; p++) { - return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); + if ((*p & 0b10000000) == 0) + { + asciibytes += 1; + } + if ((*p & 0b11000000) == 0b10000000) + { + contbytes += 1; + } + if ((*p & 0b11110000) == 0b11110000) + { + n4 += 1; + } } - /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported) - { - return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength); - }*/ - // if (Ssse3.IsSupported) - // { - // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength); - // } - // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength); - - return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); - } } diff --git a/test/AsciiTest.cs b/test/AsciiTest.cs index 6b04362..2928302 
100644 --- a/test/AsciiTest.cs +++ b/test/AsciiTest.cs @@ -89,7 +89,7 @@ public void HardCodedSequencesTest() } [Fact] - public void Test_ASCII_generator() + public void TestASCIIGenerator() { const int NUM_TRIALS = 1000; const int MAX_LENGTH = 255; @@ -112,7 +112,7 @@ public void Test_ASCII_generator() // Assertion or check to ensure all sequences were valid ASCII if (validSequencesCount != NUM_TRIALS) { - throw new Exception($"Invalid ASCII sequences were generated for {length}-byte sequences!"); + throw new ArgumentException($"Invalid ASCII sequences were generated for {length}-byte sequences!"); } } } @@ -136,7 +136,7 @@ public void TestNoErrorGetIndexOfFirstNonAsciiByte() nuint result = SimdUnicode.Ascii.GetIndexOfFirstNonAsciiByte(pAscii, (nuint)ascii.Length); if (result != (nuint)ascii.Length) { - throw new Exception($"Unexpected non-ASCII character found at index {result}"); + throw new ArgumentException($"Unexpected non-ASCII character found at index {result}"); } } } @@ -166,7 +166,7 @@ public void TestErrorGetIndexOfFirstNonAsciiByte() nuint result = SimdUnicode.Ascii.GetIndexOfFirstNonAsciiByte(pAscii, (nuint)ascii.Length); if (result != (nuint)i) { - throw new Exception($"Expected non-ASCII character at index {i}, but found at index {result}"); + throw new ArgumentException($"Expected non-ASCII character at index {i}, but found at index {result}"); } } } diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs index d30e6e1..eef06e1 100644 --- a/test/UTF8ValidationTests.cs +++ b/test/UTF8ValidationTests.cs @@ -13,12 +13,12 @@ public unsafe class Utf8SIMDValidationTests { - private const int NumTrials = 100; + private const int NumTrials = 400; // Number of trials for the brute force tests, takes about a minute to run on a powerful server. private static readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1); - private static readonly Random rand = new Random(); + // Never use non-deterministic random number generators in tests. 
E.g., never do new Random() in a test. + private static readonly Random rand = new Random(1245); - // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 }; - static int[] outputLengths = { 128, 345, 1000 }; + static int[] outputLengths = { 128, 129, 345, 512, 735, 1000, 2010 }; [Flags] public enum TestSystemRequirements @@ -31,7 +31,7 @@ public enum TestSystemRequirements // Add more as needed } - public class FactOnSystemRequirementAttribute : FactAttribute + private sealed class FactOnSystemRequirementAttribute : FactAttribute { private TestSystemRequirements RequiredSystems; @@ -45,7 +45,7 @@ public FactOnSystemRequirementAttribute(TestSystemRequirements requiredSystems) } } - private bool IsSystemSupported(TestSystemRequirements requiredSystems) + private static bool IsSystemSupported(TestSystemRequirements requiredSystems) { switch (RuntimeInformation.ProcessArchitecture) { @@ -62,21 +62,27 @@ private bool IsSystemSupported(TestSystemRequirements requiredSystems) } - public class TestIfCondition : FactAttribute + private sealed class TestIfCondition : FactAttribute { public 
TestIfCondition(Func condition, string skipReason) { // Only set the Skip property if the condition evaluates to false if (!condition.Invoke()) { + if(skipReason == null) { + throw new ArgumentNullException(nameof(skipReason), "skipReason cannot be null when condition is false."); + } Skip = skipReason; } } + + public Func? Condition { get; } + public string? SkipReason { get; } } - - private void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) + + private void simpleGoodSequences(Utf8ValidationFunction utf8ValidationDelegate) { string[] goodSequences = { "a", @@ -96,10 +102,10 @@ private void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate) { fixed (byte* pInput = input) { - Assert.True(ValidateUtf8(input,utf8ValidationDelegate), + Assert.True(ValidateUtf8(input, utf8ValidationDelegate), $"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}"); - Assert.True(ValidateCount(input,utf8ValidationDelegate)); + Assert.True(ValidateCount(input, utf8ValidationDelegate)); } } } @@ -112,40 +118,21 @@ public void simpleGoodSequencesScalar() simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void simpleGoodSequencesSse() - // { - // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void simpleGoodSequencesAvx512() - // { - // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void simpleGoodSequencesArm64() - // { - // 
simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void simpleGoodSequencesAVX() + public void simpleGoodSequencesAvx2() { simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void simpleGoodSequencesArm64() + { + simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } - private void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) + private void BadSequences(Utf8ValidationFunction utf8ValidationDelegate) { string[] badSequences = { "\xC3\x28", @@ -186,8 +173,8 @@ private void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate) { fixed (byte* pInput = input) { - ValidateUtf8(input,utf8ValidationDelegate); - Assert.True(ValidateCount(input,utf8ValidationDelegate)); + ValidateUtf8(input, utf8ValidationDelegate); + Assert.True(ValidateCount(input, utf8ValidationDelegate)); } } } @@ -200,59 +187,41 @@ public void BadSequencesScalar() BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void BadSequencesSse() - // { - // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void BadSequencesAvx512() - // { - // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void BadSequencesArm64() - // { - // 
BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void BadSequencesAVX() + public void BadSequencesAvx2() { BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void BadSequencesArm64() + { + BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + // this was in the C++ code - private void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate) + private void Node48995Test(Utf8ValidationFunction utf8ValidationDelegate) { byte[] bad = new byte[] { 0x80 }; - Assert.False(ValidateUtf8(bad,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(bad, utf8ValidationDelegate)); } - private void NoError(Utf8ValidationDelegate utf8ValidationDelegate) + private void NoError(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) { byte[] utf8 = generator.Generate(outputLength).ToArray(); - bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); - string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); + bool isValidUtf8 = ValidateUtf8(utf8, utf8ValidationDelegate); + string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ", System.StringComparison.InvariantCulture); try { Assert.True(isValidUtf8, $"Failure NoErrorTest. 
Sequence: {utf8HexString}"); - Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, utf8.Length, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -270,47 +239,29 @@ public void NoErrorScalar() NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void NoErrorSse() - // { - // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void NoErrorAvx512() - // { - // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void NoErrorArm64() - // { - // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void NoErrorAVX() + public void NoErrorAvx2() { NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - private void NoErrorSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void NoErrorArm64() + { + NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + + private void NoErrorSpecificByteCount(Utf8ValidationFunction utf8ValidationDelegate) { - RunTestForByteLength(1,utf8ValidationDelegate); - RunTestForByteLength(2,utf8ValidationDelegate); - RunTestForByteLength(3,utf8ValidationDelegate); - 
RunTestForByteLength(4,utf8ValidationDelegate); + RunTestForByteLength(1, utf8ValidationDelegate); + RunTestForByteLength(2, utf8ValidationDelegate); + RunTestForByteLength(3, utf8ValidationDelegate); + RunTestForByteLength(4, utf8ValidationDelegate); } - private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8ValidationDelegate) + private void RunTestForByteLength(int byteLength, Utf8ValidationFunction utf8ValidationDelegate) { // int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths foreach (int outputLength in outputLengths) @@ -318,11 +269,11 @@ private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8Vali for (int trial = 0; trial < NumTrials; trial++) { byte[] utf8 = generator.Generate(outputLength, byteLength).ToArray(); - bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); + bool isValidUtf8 = ValidateUtf8(utf8, utf8ValidationDelegate); try { Assert.True(isValidUtf8, $"Failure NoErrorTest. "); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -341,144 +292,105 @@ public void NoErrorSpecificByteCountScalar() NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void NoErrorSpecificByteCountSse() - // { - // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void NoErrorSpecificByteCountAvx512() - // { - // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // 
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void NoErrorSpecificByteCountArm64() - // { - // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void NoErrorSpecificByteCountAVX() + public void NoErrorSpecificByteCountAvx2() { NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } -private void NoErrorIncompleteThenASCII(Utf8ValidationDelegate utf8ValidationDelegate) -{ - foreach (int outputLength in outputLengths){ - for (int trial = 0; trial < NumTrials; trial++) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void NoErrorSpecificByteCountArm64() + { + NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + private void NoErrorIncompleteThenASCII(Utf8ValidationFunction utf8ValidationDelegate) + { + foreach (int outputLength in outputLengths) { - var allAscii = new List(Enumerable.Repeat((byte)0, outputLength)); - int firstCodeLength = rand.Next(2, 5); - List singleBytes = generator.Generate(1, firstCodeLength); - - int incompleteLocation = 128 - rand.Next(1, firstCodeLength - 1); - allAscii.InsertRange(incompleteLocation, singleBytes); - - var utf8 = allAscii.ToArray(); - int cutOffLength = 128;//utf8.Length - rand.Next(1, firstCodeLength); - cutOffLength = Math.Min(cutOffLength, outputLength); // Ensure it doesn't exceed the length of truncatedUtf8 - byte[] truncatedUtf8 = new byte[outputLength]; // Initialized to zero - - Array.Copy(utf8, 0, truncatedUtf8, 0, cutOffLength); - - bool isValidUtf8 = ValidateUtf8(truncatedUtf8, utf8ValidationDelegate); - // string utf8HexString = BitConverter.ToString(truncatedUtf8).Replace("-", " "); - try - { - Assert.False(isValidUtf8); - Assert.True(InvalidateUtf8(truncatedUtf8, truncatedUtf8.Length, 
utf8ValidationDelegate)); - Assert.True(ValidateCount(truncatedUtf8, utf8ValidationDelegate)); - } - catch (Xunit.Sdk.XunitException) + for (int trial = 0; trial < NumTrials; trial++) { - PrintHexAndBinary(truncatedUtf8, incompleteLocation); - throw; + var allAscii = new List(Enumerable.Repeat((byte)0, outputLength)); +#pragma warning disable CA5394 + int firstCodeLength = rand.Next(2, 5); + List singleBytes = generator.Generate(1, firstCodeLength); + + int incompleteLocation = 128 - rand.Next(1, firstCodeLength - 1); + allAscii.InsertRange(incompleteLocation, singleBytes); + + var utf8 = allAscii.ToArray(); + int cutOffLength = 128;//utf8.Length - rand.Next(1, firstCodeLength); + cutOffLength = Math.Min(cutOffLength, outputLength); // Ensure it doesn't exceed the length of truncatedUtf8 + byte[] truncatedUtf8 = new byte[outputLength]; // Initialized to zero + + Array.Copy(utf8, 0, truncatedUtf8, 0, cutOffLength); + + bool isValidUtf8 = ValidateUtf8(truncatedUtf8, utf8ValidationDelegate); + // string utf8HexString = BitConverter.ToString(truncatedUtf8).Replace("-", " "); + try + { + Assert.False(isValidUtf8); + Assert.True(InvalidateUtf8(truncatedUtf8, truncatedUtf8.Length, utf8ValidationDelegate)); + Assert.True(ValidateCount(truncatedUtf8, utf8ValidationDelegate)); + } + catch (Xunit.Sdk.XunitException) + { + PrintHexAndBinary(truncatedUtf8, incompleteLocation); + throw; + } } } } -} - [Fact] + [Fact] [Trait("Category", "scalar")] public void NoErrorIncompleteThenASCIIScalar() { NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void NoErrorIncompleteThenASCIISse() - // { - // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // 
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void NoErrorIncompleteThenASCIIAvx512() - // { - // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void NoErrorIncompleteThenASCIIArm64() - // { - // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void NoErrorIncompleteThenASCIIAVX() + public void NoErrorIncompleteThenASCIIAvx2() { NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void NoErrorIncompleteThenASCIIArm64() + { + NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } - private void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate) + private void NoErrorIncompleteAt256Vector(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) { - - - // var allAscii = generator.Generate(outputLength,1); var allAscii = new List(Enumerable.Repeat((byte)0, 256)); - int firstcodeLength = rand.Next(2,5); - List singlebytes = generator.Generate(1,firstcodeLength);//recall:generate a utf8 code between 2 and 4 bytes - int incompleteLocation = 128 - rand.Next(1,firstcodeLength - 1); - allAscii.InsertRange(incompleteLocation,singlebytes); + int firstcodeLength = rand.Next(2, 5); + List singlebytes = generator.Generate(1, firstcodeLength); //recall:generate a utf8 code between 2 and 4 bytes + int incompleteLocation = 128 - rand.Next(1, firstcodeLength - 1); + allAscii.InsertRange(incompleteLocation, 
singlebytes); var utf8 = allAscii.ToArray(); - bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate); - string utf8HexString = BitConverter.ToString(utf8).Replace("-", " "); + bool isValidUtf8 = ValidateUtf8(utf8, utf8ValidationDelegate); + string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ", System.StringComparison.InvariantCulture); try { Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}"); - Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, utf8.Length, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { - PrintHexAndBinary(utf8,incompleteLocation); + PrintHexAndBinary(utf8, incompleteLocation); throw; // Rethrow the exception to fail the test. } } @@ -493,42 +405,24 @@ public void NoErrorIncompleteAt256VectorScalar() NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void NoErrorIncompleteAt256VectorSse() - // { - // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void NoErrorIncompleteAt256VectorAvx512() - // { - // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void NoErrorIncompleteAt256VectorArm64() - // { - // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] 
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void NoErrorIncompleteAt256VectorAVX() + public void NoErrorIncompleteAt256VectorAvx2() { NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - private void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void NoErrorIncompleteAt256VectorArm64() + { + NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + + private void BadHeaderBits(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) - { + { for (int trial = 0; trial < NumTrials; trial++) { @@ -541,9 +435,9 @@ private void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i] = 0b11111000; // Forcing a header bits error try { - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -566,47 +460,28 @@ public void BadHeaderBitsScalar() { BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - - - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void BadHeaderBitsSse() - // { - // BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void BadHeaderBitsAvx512() - // { - // BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // 
TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void NoErrorSpecificByteCountArm64() - // { - // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void BadHeaderBitsAVX() + public void BadHeaderBitsAvx2() { BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - private void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void BadHeaderBitsArm64() + { + BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + + private void TooShortError(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { for (int trial = 0; trial < NumTrials; trial++) { - byte[] utf8 = generator.Generate(outputLength).ToArray(); + byte[] utf8 = generator.Generate(outputLength).ToArray(); for (int i = 0; i < utf8.Length; i++) { @@ -614,24 +489,24 @@ private void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate) { byte oldByte = utf8[i]; utf8[i] = 0b11100000; // Forcing a too short error - try - { - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); - } - catch (Xunit.Sdk.XunitException) - { - Console.WriteLine($"Assertion failed at index: {i}"); - PrintHexAndBinary(utf8, i); - throw; // Rethrow the exception to fail the test. 
- } + try + { + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); + } + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Assertion failed at index: {i}"); + PrintHexAndBinary(utf8, i); + throw; // Rethrow the exception to fail the test. + } utf8[i] = oldByte; // Restore the original byte } } } } - + } [Fact] @@ -641,39 +516,21 @@ public void TooShortErrorScalar() TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void TooShortErrorSse() - // { - // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void TooShortErrorAvx512() - // { - // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void TooShortErrorArm64() - // { - // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void TooShortErrorAVX() + public void TooShortErrorAvx2() { TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - private void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void TooShortErrorArm64() + { + TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + + private void TooLongError(Utf8ValidationFunction utf8ValidationDelegate) 
{ foreach (int outputLength in outputLengths) @@ -690,9 +547,9 @@ private void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i] = 0b10000000; // Forcing a too long error try { - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } catch (Xunit.Sdk.XunitException) { @@ -714,39 +571,21 @@ public void TooLongErrorScalar() TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void TooLongErrorSse() - // { - // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void TooLongErrorAvx512() - // { - // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void TooLongErrorArm64() - // { - // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void TooLongErrorAVX() + public void TooLongErrorAvx2() { TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - private void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void TooLongErrorArm64() + { + 
TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + + private void OverlongError(Utf8ValidationFunction utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { @@ -777,9 +616,9 @@ private void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate) utf8[i + 1] = (byte)(utf8[i + 1] & 0b11001111); } - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); utf8[i] = old; utf8[i + 1] = secondOld; @@ -796,66 +635,44 @@ public void OverlongErrorScalar() OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void OverlongErrorSse() - // { - // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void OverlongErrorAvx512() - // { - // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void OverlongErrorArm64() - // { - // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - - [Trait("Category", "avx")] - [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void OverlongErrorAVX() + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void OverlongErrorArm64() { - 
OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); } - private void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) + private void TooShortErrorAtEnd(Utf8ValidationFunction utf8ValidationDelegate) { for (int trial = 0; trial < NumTrials; trial++) { foreach (int outputLength in outputLengths) { - byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1).ToArray(); - + byte[] utf8 = generator.Generate(outputLength, byteCountInUnit: 1).ToArray(); + unsafe { fixed (byte* pInput = utf8) { for (int i = 0; i < utf8.Length; i++) - { - int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; - byte currentByte = utf8[i]; - int offset = 0; + { + int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment; + byte currentByte = utf8[i]; + int offset = 0; - if ((currentByte & 0b11100000) == 0b11000000) { // This is a header byte of a 2-byte sequence + if ((currentByte & 0b11100000) == 0b11000000) + { // This is a header byte of a 2-byte sequence offset = 0; - } - if ((currentByte & 0b11110000) == 0b11100000) { + } + if ((currentByte & 0b11110000) == 0b11100000) + { // This is a header byte of a 3-byte sequence offset = rand.Next(0, 3); - } - if ((currentByte & 0b11111000) == 0b11110000) { + } + if ((currentByte & 0b11111000) == 0b11110000) + { // This is a header byte of a 4-byte sequence offset = rand.Next(0, 4); } @@ -866,10 +683,10 @@ private void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate) byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); Assert.True(dotnetResult == pInput + i + offset); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); - } + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); + } - } + } } } } @@ -883,48 +700,23 @@ public void TooShortErrorAtEndScalar() 
TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void TooShortErrorAtEndSse() - // { - // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void TooShortErrorAtEndAvx512() - // { - // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void TooShortErrorAtEndArm64() - // { - // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void TooShortErrorAtEndAVX() + public void TooShortErrorAtEndAvx2() { TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - [Trait("Category", "avx")] - [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void TooShortErrorAtEndAvx2() + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void TooShortErrorAtEndArm64() { - TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); } - //corresponds to condition 5.4.1 in the paper - private void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate) + private void Invalid0xf50xff(Utf8ValidationFunction utf8ValidationDelegate) { var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF @@ -936,9 +728,9 @@ private void Invalid0xf50xff(Utf8ValidationDelegate 
utf8ValidationDelegate) foreach (var invalidByte in invalidBytes) { utf8[position] = invalidByte; - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte - Assert.True(InvalidateUtf8(utf8,position,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte + Assert.True(InvalidateUtf8(utf8, position, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } } } @@ -951,37 +743,6 @@ public void Invalid0xf50xffScalar() Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void Invalid0xf50xffSse() - // { - // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void Invalid0xf50xffAvx512() - // { - // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void Invalid0xf50xffArm64() - // { - // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - - [Trait("Category", "avx")] - [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void Invalid0xf50xffAVX() - { - Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); - } [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] @@ -990,71 +751,82 @@ public void Invalid0xf50xffAvx2() Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } 
-// helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index -static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) -{ - int chunkSize = 16; // 128 bits = 16 bytes - // Process each chunk for hexadecimal - Console.Write("Hex: "); - for (int i = 0; i < bytes.Length; i++) + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void Invalid0xf50xffArm64() { - if (i > 0 && i % chunkSize == 0) - Console.WriteLine(); // New line after every 16 bytes - - if (i == highlightIndex) - { - Console.ForegroundColor = ConsoleColor.Red; - Console.Write($"{bytes[i]:X2} "); - Console.ResetColor(); - } - else if (i % (chunkSize * 2) == 0) // print green every 256 bytes - { - Console.ForegroundColor = ConsoleColor.Green; - Console.Write($"{bytes[i]:X2} "); - Console.ResetColor(); - } - else - { - Console.Write($"{bytes[i]:X2} "); - } - - if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line + Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); } - Console.WriteLine("\n"); // New line for readability and to separate hex from binary - // Process each chunk for binary - Console.Write("Binary: "); - for (int i = 0; i < bytes.Length; i++) + // helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index + static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1) { - if (i > 0 && i % chunkSize == 0) - Console.WriteLine(); // New line after every 16 bytes + int chunkSize = 16; // 128 bits = 16 bytes - string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); - if (i == highlightIndex) + // Process each chunk for hexadecimal +#pragma warning disable CA1303 + Console.Write("Hex: "); + for (int i = 0; i < bytes.Length; i++) { - Console.ForegroundColor = ConsoleColor.Red; - Console.Write($"{binaryString} "); - Console.ResetColor(); - } - else if (i % 
(chunkSize * 2) == 0) // print green every 256 bytes - { - Console.ForegroundColor = ConsoleColor.Green; - Console.Write($"{binaryString} "); - Console.ResetColor(); + if (i > 0 && i % chunkSize == 0) + Console.WriteLine(); // New line after every 16 bytes + + if (i == highlightIndex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.Write($"{bytes[i]:X2} "); + Console.ResetColor(); + } + else if (i % (chunkSize * 2) == 0) // print green every 256 bytes + { + Console.ForegroundColor = ConsoleColor.Green; + Console.Write($"{bytes[i]:X2} "); + Console.ResetColor(); + } + else + { + Console.Write($"{bytes[i]:X2} "); + } +#pragma warning disable CA1303 + if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line } - else +#pragma warning disable CA1303 + Console.WriteLine("\n"); // New line for readability and to separate hex from binary + + // Process each chunk for binary +#pragma warning disable CA1303 + Console.Write("Binary: "); + for (int i = 0; i < bytes.Length; i++) { - Console.Write($"{binaryString} "); - } + if (i > 0 && i % chunkSize == 0) + Console.WriteLine(); // New line after every 16 bytes - if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line + string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0'); + if (i == highlightIndex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.Write($"{binaryString} "); + Console.ResetColor(); + } + else if (i % (chunkSize * 2) == 0) // print green every 256 bytes + { + Console.ForegroundColor = ConsoleColor.Green; + Console.Write($"{binaryString} "); + Console.ResetColor(); + } + else + { + Console.Write($"{binaryString} "); + } +#pragma warning disable CA1303 + if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line + } + Console.WriteLine(); // New line for readability } - Console.WriteLine(); // New line for readability -} - 
private void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) + private void TooLargeError(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1069,9 +841,9 @@ private void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate) byte old = utf8[i]; utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100); - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i+1,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i + 1, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); utf8[i] = old; } } @@ -1086,31 +858,6 @@ public void TooLargeErrorScalar() TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void TooLargeErrorSse() - // { - // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void TooLargeErrorAvx512() - // { - // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void TooLargeErrorArm64() - // { - // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] public void TooLargeErrorAvx() @@ -1119,7 +866,14 @@ public void TooLargeErrorAvx() } - private void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8ValidationDelegate) + 
[Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void TooLargeErrorArm64() + { + TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + + private void AsciiPlusContinuationAtEndError(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1127,21 +881,21 @@ private void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8Validati { for (int i = 1; i <= 4; i++) { - byte[] filler = generator.Generate(outputLength,byteCountInUnit:1).ToArray(); - byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)).ToArray(); + byte[] filler = generator.Generate(outputLength, byteCountInUnit: 1).ToArray(); + byte[] toolong = generator.AppendContinuationByte(generator.Generate(1, i)).ToArray(); - generator.ReplaceEndOfArray(filler,toolong); + RandomUtf8.ReplaceEndOfArray(filler, toolong); - Assert.False(ValidateUtf8(filler,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(filler, filler.Length - 1,utf8ValidationDelegate)); - Assert.True(ValidateCount(filler,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(filler, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(filler, filler.Length - 1, utf8ValidationDelegate)); + Assert.True(ValidateCount(filler, utf8ValidationDelegate)); } } } } - + [Fact] [Trait("Category", "scalar")] public void AsciiPlusContinuationAtEndErrorScalar() @@ -1149,36 +903,11 @@ public void AsciiPlusContinuationAtEndErrorScalar() AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void AsciiPlusContinuationAtEndErrorSse() - // { - // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // 
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void AsciiPlusContinuationAtEndErrorAvx512() - // { - // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void AsciiPlusContinuationAtEndErrorArm64() - // { - // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - - [Trait("Category", "avx")] - [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void AsciiPlusContinuationAtEndErrorAVX() + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void AsciiPlusContinuationAtEndErrorArm64() { - AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); + AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); } [Trait("Category", "avx")] @@ -1188,7 +917,7 @@ public void AsciiPlusContinuationAtEndErrorAvx2() AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } - private void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) + private void SurrogateErrorTest(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1209,9 +938,9 @@ private void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate) { utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2)); - Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); - Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate)); - Assert.True(ValidateCount(utf8,utf8ValidationDelegate)); + Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); + Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate)); + Assert.True(ValidateCount(utf8, utf8ValidationDelegate)); } utf8[i] = old; @@ 
-1230,40 +959,21 @@ public void SurrogateErrorTestScalar() SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void SurrogateErrorTestSse() - // { - // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void SurrogateErrorTestAvx512() - // { - // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void SurrogateErrorTestArm64() - // { - // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void SurrogateErrorTestAVX() + public void SurrogateErrorTestAvx2() { SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void SurrogateErrorTestArm64() + { + SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } - private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) + private void BruteForceTest(Utf8ValidationFunction utf8ValidationDelegate) { foreach (int outputLength in outputLengths) { @@ -1273,7 +983,7 @@ private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) // Generate random UTF-8 sequence byte[] utf8 = generator.Generate(rand.Next(outputLength)).ToArray(); - Assert.True(ValidateUtf8(utf8,utf8ValidationDelegate), "Initial UTF-8 validation (primary) failed."); + Assert.True(ValidateUtf8(utf8, 
utf8ValidationDelegate), "Initial UTF-8 validation (primary) failed."); Assert.True(ValidateUtf8Fuschia(utf8), "Initial UTF-8 validation (Fuschia) failed."); @@ -1291,68 +1001,56 @@ private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate) modifiedUtf8[byteIndex] ^= (byte)bitFlip; // Validate the modified sequence with both methods - bool isValidPrimary = ValidateUtf8(modifiedUtf8,utf8ValidationDelegate); + bool isValidPrimary = ValidateUtf8(modifiedUtf8, utf8ValidationDelegate); bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8); // Ensure both methods agree on the validation result - try{ Assert.Equal(isValidPrimary, isValidFuschia); - Assert.True(ValidateCount(modifiedUtf8,utf8ValidationDelegate)); - } - catch (Xunit.Sdk.XunitException) - { - Console.WriteLine($"Assertion failed. Byte randomly changed at index: {byteIndex}"); - PrintHexAndBinary(utf8, byteIndex); - throw; // Rethrow the exception to fail the test. - } - + try + { + Assert.Equal(isValidPrimary, isValidFuschia); + Assert.True(ValidateCount(modifiedUtf8, utf8ValidationDelegate)); + } + catch (Xunit.Sdk.XunitException) + { + Console.WriteLine($"Fushia validation: {isValidFuschia}"); + Console.WriteLine($"Testing validation: {isValidPrimary}"); + + Console.WriteLine($"Assertion failed. Byte randomly changed at index: {byteIndex}"); + PrintHexAndBinary(utf8, byteIndex); + throw; // Rethrow the exception to fail the test. 
+ } + } } } } - [Fact] + [Fact] [Trait("Category", "scalar")] public void BruteForceTestScalar() { BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar); } - // TODO:Uncomment when SSE is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)] - // [Fact] - // [Trait("Category", "sse")] - // public void BruteForceTestSse() - // { - // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse); - // } - - // TODO:Uncomment when AVX512 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)] - // [Trait("Category", "avx512")] - // public void BruteForceTestAvx512() - // { - // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512); - // } - - // TODO:Uncomment when Arm64 is updated - // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] - // [Trait("Category", "arm64")] - // public void BruteForceTestArm64() - // { - // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); - // } - [Trait("Category", "avx")] [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)] - public void BruteForceTestAVX() + public void BruteForceTestAvx2() { BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2); } + [Trait("Category", "arm64")] + [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)] + public void BruteForceTestArm64() + { + BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64); + } + // credit: based on code from Google Fuchsia (Apache Licensed) public static bool ValidateUtf8Fuschia(byte[] data) { + if (data == null) return false; int pos = 0; int len = data.Length; uint codePoint; @@ -1405,7 +1103,7 @@ public static bool ValidateUtf8Fuschia(byte[] data) } // Check that all functions agree on the result when the input might be invalid. 
- private bool InvalidateUtf8(byte[] utf8, int badindex,Utf8ValidationDelegate utf8ValidationDelegate) + private bool InvalidateUtf8(byte[] utf8, int badindex, Utf8ValidationFunction utf8ValidationDelegate) { unsafe { @@ -1415,27 +1113,28 @@ private bool InvalidateUtf8(byte[] utf8, int badindex,Utf8ValidationDelegate utf int TailUtf16CodeUnitCountAdjustment = 0; int SIMDUtf16CodeUnitCountAdjustment, SIMDScalarCountAdjustment; - byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment); + byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length, out TailUtf16CodeUnitCountAdjustment, out TailScalarCodeUnitCountAdjustment); int scalarOffset = (int)(scalarResult - pInput); - byte* simdResult = utf8ValidationDelegate(pInput, utf8.Length,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCountAdjustment); + byte* simdResult = utf8ValidationDelegate(pInput, utf8.Length, out SIMDUtf16CodeUnitCountAdjustment, out SIMDScalarCountAdjustment); int simdOffset = (int)(simdResult - pInput); int utf16CodeUnitCountAdjustment, scalarCountAdjustment; byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, utf8.Length, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); int dotnetOffset = (int)(dotnetResult - pInput); + var message = "Suprisingly, scalarResult != simdResult, scalarResult is {0} != simdResult is {1}, badindex = {2}, length = {3}"; if (scalarOffset != simdOffset) { - Console.WriteLine("Suprisingly, scalarResult != simdResult {0} != {1}, badindex = {2}, length = {3}", scalarOffset, simdOffset, badindex, utf8.Length); + Console.WriteLine(message, scalarOffset, simdOffset, badindex, utf8.Length); } if (dotnetOffset != simdOffset) { - Console.WriteLine("Suprisingly, dotnetOffset != simdResult {0} != {1}, badindex = {2}, length = {3}", dotnetOffset, simdOffset, badindex, utf8.Length); + 
Console.WriteLine(message, dotnetOffset, simdOffset, badindex, utf8.Length); } return (scalarResult == simdResult) && (simdResult == dotnetResult); } } } - private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) + private bool ValidateUtf8(byte[] utf8, Utf8ValidationFunction utf8ValidationDelegate, Range range = default) { // Adjusted check for default Range var isDefaultRange = range.Equals(default(Range)); @@ -1453,14 +1152,12 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg if (dotnetResult != startPtr + length) { - // PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position"); return false; } byte* simdResult = utf8ValidationDelegate(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment); if (simdResult != startPtr + length) { - // PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position"); return false; } return true; @@ -1469,8 +1166,8 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg } } - // Helper method to calculate the actual offset and length from a Range - private (int offset, int length) GetOffsetAndLength(int totalLength, Range range) + // Helper method to calculate the actual offset and length from a Range + private static (int offset, int length) GetOffsetAndLength(int totalLength, Range range) { var start = range.Start.GetOffset(totalLength); var end = range.End.GetOffset(totalLength); @@ -1479,58 +1176,62 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg } -// Define a delegate that matches the signature of the methods you want to test - public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); + // Define a delegate that matches the signature of the methods you want to 
test + public unsafe delegate byte* Utf8ValidationFunction(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment); -public bool ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default) -{ - int dotnetUtf16Adjustment, dotnetScalarCountAdjustment; - int simdUnicodeUtf16Adjustment, simdUnicodeScalarCountAdjustment; + public bool ValidateCount(byte[] utf8, Utf8ValidationFunction utf8ValidationDelegate, Range range = default) + { + int dotnetUtf16Adjustment, dotnetScalarCountAdjustment; + int simdUnicodeUtf16Adjustment, simdUnicodeScalarCountAdjustment; + if (utf8 == null || utf8ValidationDelegate == null) + { + return false; + } - var isDefaultRange = range.Equals(default(Range)); - var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); + var isDefaultRange = range.Equals(default(Range)); + var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range); - unsafe - { - fixed (byte* pInput = utf8) + unsafe { - byte* startPtr = pInput + offset; + fixed (byte* pInput = utf8) + { + byte* startPtr = pInput + offset; - // Initialize adjustments - dotnetUtf16Adjustment = 0; - dotnetScalarCountAdjustment = 0; - DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out dotnetUtf16Adjustment, out dotnetScalarCountAdjustment); + // Initialize adjustments + dotnetUtf16Adjustment = 0; + dotnetScalarCountAdjustment = 0; + byte* Result = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out dotnetUtf16Adjustment, out dotnetScalarCountAdjustment); - simdUnicodeUtf16Adjustment = 0; - simdUnicodeScalarCountAdjustment = 0; - byte* simdResult = utf8ValidationDelegate(pInput, length, out simdUnicodeUtf16Adjustment, out simdUnicodeScalarCountAdjustment); + simdUnicodeUtf16Adjustment = 0; + simdUnicodeScalarCountAdjustment = 0; + byte* simdResult = utf8ValidationDelegate(pInput, length, 
out simdUnicodeUtf16Adjustment, out simdUnicodeScalarCountAdjustment); - // Check for discrepancies and report them in one combined message - bool adjustmentsMatch = true; - string errorMessage = "Error: Adjustments mismatch - "; + // Check for discrepancies and report them in one combined message + bool adjustmentsMatch = true; + if (Result != simdResult) + { + Console.WriteLine($"Expected error at location : {Result - pInput}, but got: {simdResult - pInput}. "); + adjustmentsMatch = false; + } - if (dotnetScalarCountAdjustment != simdUnicodeScalarCountAdjustment) - { - errorMessage += $"Expected Scalar Count Adjustment: {dotnetScalarCountAdjustment}, but got: {simdUnicodeScalarCountAdjustment}. "; - adjustmentsMatch = false; - } + if (dotnetScalarCountAdjustment != simdUnicodeScalarCountAdjustment) + { + Console.WriteLine($"Expected Scalar Count Adjustment: {dotnetScalarCountAdjustment}, but got: {simdUnicodeScalarCountAdjustment}. "); + adjustmentsMatch = false; + } + + if (dotnetUtf16Adjustment != simdUnicodeUtf16Adjustment) + { + Console.WriteLine($"Expected UTF16 Adjustment: {dotnetUtf16Adjustment}, but got: {simdUnicodeUtf16Adjustment}."); + adjustmentsMatch = false; + } - if (dotnetUtf16Adjustment != simdUnicodeUtf16Adjustment) - { - errorMessage += $"Expected UTF16 Adjustment: {dotnetUtf16Adjustment}, but got: {simdUnicodeUtf16Adjustment}."; - adjustmentsMatch = false; - } - if (!adjustmentsMatch) - { - Console.WriteLine(errorMessage); - return false; - } - return true; + return adjustmentsMatch; + } } } -} } diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs index 5b30cd0..7c0ff20 100644 --- a/test/helpers/randomutf8.cs +++ b/test/helpers/randomutf8.cs @@ -1,3 +1,4 @@ +namespace tests; using System; using System.Collections.Generic; using System.Linq; @@ -9,28 +10,13 @@ public class RandomUtf8 private double[] probabilities; private const int maxByteLength = 4; - public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int 
prob_3bytes, int prob_4bytes) + public RandomUtf8(uint seed, int prob1byte, int prob2bytes, int prob3bytes, int prob4bytes) { gen = new Random((int)seed); - probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes }; + probabilities = new double[maxByteLength] { prob1byte, prob2bytes, prob3bytes, prob4bytes }; } - // public byte[] Generate(int howManyUnits, int? byteCountInUnit = null) - // { - // var result = new List(); - // while (result.Count < howManyUnits) - // { - // int count = byteCountInUnit ?? PickRandomByteCount(); - // int codePoint = GenerateCodePoint(count); - // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - - // result.AddRange(utf8Bytes); - // if (result.Count + utf8Bytes.Length > howManyUnits) - // break; - // } - // return result.ToArray(); - // } - +#pragma warning disable CA1002 public List Generate(int howManyUnits, int? byteCountInUnit = null) { var result = new List(); @@ -47,67 +33,22 @@ public List Generate(int howManyUnits, int? byteCountInUnit = null) return result; } - // public List Generate(int howManyUnits, int? byteCountInUnit = null) - // { - // var result = new List(); - // var unitsAdded = 0; // Track the number of characters added. - - // while (unitsAdded < howManyUnits) - // { - // int count = byteCountInUnit ?? PickRandomByteCount(); - // int codePoint = GenerateCodePoint(count); - // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - - // // Ensure adding the new character won't exceed the howManyUnits limit. - // if (unitsAdded + 1 > howManyUnits) - // break; - - // result.AddRange(utf8Bytes); - // unitsAdded++; // Increment the units (characters) count. - // } - - // return result; - // } - - - // public object Generate(int howManyUnits, int? byteCountInUnit = null, bool returnAsList = false) - // { - // var result = new List(); - // while (result.Count < howManyUnits) - // { - // int count = byteCountInUnit ?? 
PickRandomByteCount(); - // int codePoint = GenerateCodePoint(count); - // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint)); - - // if (result.Count + utf8Bytes.Length > howManyUnits) - // break; - - // result.AddRange(utf8Bytes); - // } - - // if (returnAsList) - // { - // return result; - // } - // else - // { - // return result.ToArray(); - // } - // } - private int GenerateCodePoint(int byteCount) { switch (byteCount) { case 1: // Generate a code point for a 1-byte UTF-8 character (ASCII) +#pragma warning disable CA5394 return gen.Next(0x0000, 0x007F + 1);// +1 because gen.Next() excludes the upper bound case 2: // Generate a code point for a 2-byte UTF-8 character (Latin) +#pragma warning disable CA5394 return gen.Next(0x0080, 0x07FF + 1); case 3: // Generate a code point for a 3-byte UTF-8 character (Asiatic) // Note: This range skips over the surrogate pair range U+D800 to U+DFFF +#pragma warning disable CA5394 if (gen.NextDouble() < 0.5) { // Generate code point in U+0800 to U+D7FF range @@ -116,24 +57,27 @@ private int GenerateCodePoint(int byteCount) else { // Generate code point in U+E000 to U+FFFF range +#pragma warning disable CA5394 return gen.Next(0xE000, 0xFFFF + 1); } case 4: // Generate a code point for a 4-byte UTF-8 character (Supplementary) // The +1 is factored into the ConvertFromUtf32 method +#pragma warning disable CA5394 return gen.Next(0x010000, 0x10FFFF); default: throw new InvalidOperationException($"Invalid byte count: {byteCount}"); } } +#pragma warning disable CA1002 public List AppendContinuationByte(List utf8Bytes) => utf8Bytes.Concat(new byte[] { (byte)gen.Next(0x80, 0xBF + 1) }).ToList(); - - public void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex) +#pragma warning disable CA1062 + public static void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex) { // Calculate the start index for replacement int startIndex = original.Length - replacement.Length;