diff --git a/README.md b/README.md
index 6915db7..83245c7 100644
--- a/README.md
+++ b/README.md
@@ -6,15 +6,31 @@ This is a fast C# library to validate UTF-8 strings.
## Motivation
-We seek to speed up the `Utf8Utility.GetPointerToFirstInvalidByte` function. Using the algorithm used by Node.js, Oracle GraalVM and other important systems.
-
-- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice and Experience 51 (5), 2021
+We seek to speed up the `Utf8Utility.GetPointerToFirstInvalidByte` function from the C# runtime library.
+[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually.
-The algorithm in question is part of popular JavaScript runtimes such as Node.js and Bun, [by PHP](https://github.com/php/php-src/blob/90e0ce7f0db99767c58dc21e4213c0f8763f657a/ext/mbstring/mbstring.c#L5270), by Oracle GraalVM and many important systems.
+Specifically, we provide the function `SimdUnicode.UTF8.GetPointerToFirstInvalidByte` which is a faster
+drop-in replacement:
+```cs
+// Returns &inputBuffer[inputLength] if the input buffer is valid.
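+/// <summary>
+/// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
+/// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+/// The parameter <paramref name="Utf16CodeUnitCountAdjustment"/> is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 2-byte character, -2 for each 3-byte and 4-byte characters.
+/// The parameter <paramref name="ScalarCodeUnitCountAdjustment"/> is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 4-byte character.
+/// </summary>
+/// <remarks>
+/// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+/// </remarks>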
+public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment);
+```
-[The function is private in the Microsoft Runtime](https://github.com/dotnet/runtime/blob/4d709cd12269fcbb3d0fccfb2515541944475954/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs), but we can expose it manually.
+The function uses advanced instructions (SIMD) on 64-bit ARM and x64 processors, but falls back on a
+conventional implementation on other systems. We provide extensive tests and benchmarks.
+We apply the algorithm used by Node.js, Bun, Oracle GraalVM, the PHP interpreter, and other important systems. The algorithm has been described in the following article:
+- John Keiser, Daniel Lemire, [Validating UTF-8 In Less Than One Instruction Per Byte](https://arxiv.org/abs/2010.03090), Software: Practice and Experience 51 (5), 2021
## Requirements
@@ -30,6 +46,11 @@ dotnet test
To see which tests are running, we recommend setting the verbosity level:
+```
+dotnet test -v=normal
+```
+
+If you need more details, you can increase the verbosity further:
```
dotnet test -v d
```
@@ -44,7 +65,7 @@ To run specific tests, it is helpful to use the filter parameter:
```
-dotnet test --filter TooShortErrorAVX
+dotnet test --filter TooShortErrorAvx2
```
Or to target specific categories:
@@ -89,7 +110,6 @@ dotnet build
We recommend you use `dotnet format`. E.g.,
```
-cd test
dotnet format
```
@@ -115,6 +135,7 @@ You can print the content of a vector register like so:
## Performance tips
- Be careful: `Vector128.Shuffle` is not the same as `Ssse3.Shuffle` nor is `Vector128.Shuffle` the same as `Avx2.Shuffle`. Prefer the latter.
+- Similarly, `Vector128.Shuffle` is not the same as `AdvSimd.Arm64.VectorTableLookup`; use the latter.
## More reading
diff --git a/benchmark/Benchmark.cs b/benchmark/Benchmark.cs
index c48e075..0900059 100644
--- a/benchmark/Benchmark.cs
+++ b/benchmark/Benchmark.cs
@@ -27,13 +27,17 @@ public class Speed : IColumn
{
public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
{
+ if (summary is null || benchmarkCase is null || benchmarkCase.Parameters is null)
+ {
+ return "N/A";
+ }
var ourReport = summary.Reports.First(x => x.BenchmarkCase.Equals(benchmarkCase));
var fileName = (string)benchmarkCase.Parameters["FileName"];
- long length = new System.IO.FileInfo(fileName).Length;
- if (ourReport.ResultStatistics is null)
+ if (ourReport is null || ourReport.ResultStatistics is null)
{
return "N/A";
}
+ long length = new System.IO.FileInfo(fileName).Length;
var mean = ourReport.ResultStatistics.Mean;
return $"{(length / ourReport.ResultStatistics.Mean):#####.00}";
}
@@ -46,8 +50,8 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
public string ColumnName { get; } = "Speed (GB/s)";
public bool AlwaysShow { get; } = true;
public ColumnCategory Category { get; } = ColumnCategory.Custom;
- public int PriorityInCategory { get; } = 0;
- public bool IsNumeric { get; } = false;
+ public int PriorityInCategory { get; }
+ public bool IsNumeric { get; }
public UnitType UnitType { get; } = UnitType.Dimensionless;
public string Legend { get; } = "The speed in gigabytes per second";
}
@@ -57,8 +61,8 @@ public string GetValue(Summary summary, BenchmarkCase benchmarkCase)
[Config(typeof(Config))]
public class RealDataBenchmark
{
-
- private class Config : ManualConfig
+#pragma warning disable CA1812
+ private sealed class Config : ManualConfig
{
public Config()
{
@@ -67,6 +71,7 @@ public Config()
if (RuntimeInformation.ProcessArchitecture == Architecture.Arm64)
{
+#pragma warning disable CA1303
Console.WriteLine("ARM64 system detected.");
AddFilter(new AnyCategoriesFilter(["arm64", "scalar", "runtime"]));
@@ -75,21 +80,25 @@ public Config()
{
if (Vector512.IsHardwareAccelerated && System.Runtime.Intrinsics.X86.Avx512Vbmi.IsSupported)
{
+#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX-512 support.");
AddFilter(new AnyCategoriesFilter(["avx512", "avx", "sse", "scalar", "runtime"]));
}
else if (Avx2.IsSupported)
{
+#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) with AVX2 support.");
AddFilter(new AnyCategoriesFilter(["avx", "sse", "scalar", "runtime"]));
}
else if (Ssse3.IsSupported)
{
+#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) with Sse4.2 support.");
AddFilter(new AnyCategoriesFilter(["sse", "scalar", "runtime"]));
}
else
{
+#pragma warning disable CA1303
Console.WriteLine("X64 system detected (Intel, AMD,...) without relevant SIMD support.");
AddFilter(new AnyCategoriesFilter(["scalar", "runtime"]));
}
@@ -130,14 +139,15 @@ public Config()
@"data/thai.utf8.txt",
@"data/turkish.utf8.txt",
@"data/vietnamese.utf8.txt")]
+#pragma warning disable CA1051
public string? FileName;
- public byte[] allLinesUtf8 = new byte[0];
+ private byte[] allLinesUtf8 = Array.Empty();
public unsafe delegate byte* Utf8ValidationFunction(byte* pUtf8, int length);
public unsafe delegate byte* DotnetRuntimeUtf8ValidationFunction(byte* pUtf8, int length, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
- public void RunUtf8ValidationBenchmark(byte[] data, Utf8ValidationFunction validationFunction)
+ private void RunUtf8ValidationBenchmark(byte[] data, Utf8ValidationFunction validationFunction)
{
unsafe
{
@@ -146,13 +156,13 @@ public void RunUtf8ValidationBenchmark(byte[] data, Utf8ValidationFunction valid
var res = validationFunction(pUtf8, data.Length);
if (res != pUtf8 + data.Length)
{
- throw new Exception("Invalid UTF-8: I expected the pointer to be at the end of the buffer.");
+ throw new ArgumentException("Invalid UTF-8: I expected the pointer to be at the end of the buffer.");
}
}
}
}
- public void RunDotnetRuntimeUtf8ValidationBenchmark(byte[] data, DotnetRuntimeUtf8ValidationFunction validationFunction)
+ private void RunDotnetRuntimeUtf8ValidationBenchmark(byte[] data, DotnetRuntimeUtf8ValidationFunction validationFunction)
{
unsafe
{
@@ -183,20 +193,17 @@ public unsafe void SIMDUtf8ValidationRealData()
{
if (allLinesUtf8 != null)
{
- // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByte);
+ RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) =>
+ {
+ int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment;
+ // Call the method with additional out parameters within the lambda.
+ // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate.
+ return SimdUnicode.UTF8.GetPointerToFirstInvalidByte(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment);
+ });
}
}
[Benchmark]
- // [BenchmarkCategory("scalar")]
- // public unsafe void Utf8ValidationRealDataScalar()
- // {
- // if (allLinesUtf8 != null)
- // {
- // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
- // }
- // }
-
[BenchmarkCategory("scalar")]
public unsafe void Utf8ValidationRealDataScalar()
{
@@ -213,45 +220,48 @@ public unsafe void Utf8ValidationRealDataScalar()
}
}
-
[Benchmark]
[BenchmarkCategory("arm64")]
public unsafe void SIMDUtf8ValidationRealDataArm64()
{
if (allLinesUtf8 != null)
{
- RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) =>
+ {
+ int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment;
+ // Call the method with additional out parameters within the lambda.
+ // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate.
+ return SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment);
+ });
}
+
}
- // [Benchmark]
- // [BenchmarkCategory("avx")]
- // public unsafe void SIMDUtf8ValidationRealDataAvx2()
- // {
- // if (allLinesUtf8 != null)
- // {
- // RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
- // }
- // }
+
[Benchmark]
- [BenchmarkCategory("sse")]
- public unsafe void SIMDUtf8ValidationRealDataSse()
+ [BenchmarkCategory("avx")]
+ public unsafe void SIMDUtf8ValidationRealDataAvx2()
{
if (allLinesUtf8 != null)
{
- RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
+ RunUtf8ValidationBenchmark(allLinesUtf8, (byte* pInputBuffer, int inputLength) =>
+ {
+ int dummyUtf16CodeUnitCountAdjustment, dummyScalarCountAdjustment;
+ // Call the method with additional out parameters within the lambda.
+ // You must handle these additional out parameters inside the lambda, as they cannot be passed back through the delegate.
+ return SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out dummyUtf16CodeUnitCountAdjustment, out dummyScalarCountAdjustment);
+ });
}
}
- /*
- // TODO: enable this benchmark when the AVX-512 implementation is ready
+
[Benchmark]
- [BenchmarkCategory("avx512")]
- public unsafe void SIMDUtf8ValidationRealDataAvx512()
+ [BenchmarkCategory("sse")]
+ public unsafe void SIMDUtf8ValidationRealDataSse()
{
if (allLinesUtf8 != null)
{
- RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
+ RunUtf8ValidationBenchmark(allLinesUtf8, SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
}
- }*/
+ }
}
public class Program
diff --git a/src/Ascii.cs b/src/Ascii.cs
index e92845b..9eb94b5 100644
--- a/src/Ascii.cs
+++ b/src/Ascii.cs
@@ -25,6 +25,7 @@ public unsafe static class Ascii
public static bool IsAscii(this string s)
{
+ if (s == null) return true;
foreach (var c in s)
{
if (!c.IsAscii()) return false;
diff --git a/src/UTF8.cs b/src/UTF8.cs
index 6eef42d..0a96ece 100644
--- a/src/UTF8.cs
+++ b/src/UTF8.cs
@@ -10,8 +10,42 @@ namespace SimdUnicode
public static class UTF8
{
+ // Returns &inputBuffer[inputLength] if the input buffer is valid.
+ /// <summary>
+ /// Given an input buffer <paramref name="pInputBuffer"/> of byte length <paramref name="inputLength"/>,
+ /// returns a pointer to where the first invalid data appears in <paramref name="pInputBuffer"/>.
+ /// The parameter <paramref name="Utf16CodeUnitCountAdjustment"/> is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 2-byte character, -2 for each 3-byte and 4-byte characters.
+ /// The parameter <paramref name="ScalarCodeUnitCountAdjustment"/> is set according to the content of the valid UTF-8 characters encountered, counting -1 for each 4-byte character.
+ /// </summary>
+ /// <remarks>
+ /// Returns a pointer to the end of <paramref name="pInputBuffer"/> if the buffer is well-formed.
+ /// </remarks>
+ public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment)
+ {
+
+ if (AdvSimd.Arm64.IsSupported) // checked first: dispatch to the Arm64 implementation
+ {
+ return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
+ }
+ if (Avx2.IsSupported) // otherwise dispatch to the AVX2 implementation when available
+ {
+ return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
+ }
+ /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
+ {
+ return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength);
+ }*/
+ // if (Ssse3.IsSupported)
+ // {
+ // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength);
+ // }
+ // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength);
+
+ return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment); // fallback when neither check above succeeds
+
+ }
// prevents double counting in case there is a toolong error on the edge
- public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte)
+ private static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byte headerByte)
{
// Check if the header byte belongs to a 2-byte UTF-8 character
if ((headerByte & 0b11100000) == 0b11000000)
@@ -32,10 +66,141 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
return (0, 0);
}
-
- public unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
+ // Scans backward from buf (at most howFarBack bytes) for a non-continuation byte, then validates
+ // forward from that byte up to the end of the original len-byte range. Returns the end of that range
+ // if everything is valid, otherwise a pointer to the first byte of the first invalid sequence.
+ private unsafe static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len)
+ {
+ int extraLen = 0;
+ bool foundLeadingBytes = false;
+
+ for (int i = 0; i <= howFarBack; i++)
+ {
+ byte candidateByte = buf[0 - i]; // reads i bytes before buf: the caller must make that memory readable
+ foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; // true unless this is a 10xxxxxx continuation byte
+
+ if (foundLeadingBytes)
+ {
+ buf -= i; // validation restarts at the non-continuation byte that was found
+ extraLen = i;
+ break;
+ }
+ }
+
+ if (!foundLeadingBytes)
+ {
+ return buf - howFarBack; // only continuation bytes were seen: report the earliest byte examined
+ }
+
+ int pos = 0;
+ int nextPos;
+ uint codePoint = 0;
+
+ len += extraLen; // buf moved back by extraLen, so buf + len still designates the original end
+
+ while (pos < len)
+ {
+
+ byte firstByte = buf[pos];
+
+ while (firstByte < 0b10000000) // skip over ASCII bytes
+ {
+ if (++pos == len)
+ {
+ return buf + len;
+ }
+ firstByte = buf[pos];
+ }
+
+ if ((firstByte & 0b11100000) == 0b11000000) // 110xxxxx: leading byte of a 2-byte sequence
+ {
+ nextPos = pos + 2;
+ if (nextPos > len)
+ {
+ return buf + pos;
+ } // Too short
+ if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+ {
+ return buf + pos;
+ } // Second byte is not a continuation byte
+ // range check
+ codePoint = (uint)(firstByte & 0b00011111) << 6 | (uint)(buf[pos + 1] & 0b00111111);
+ if ((codePoint < 0x80) || (0x7ff < codePoint))
+ {
+ return buf + pos;
+ } // Overlong
+ }
+ else if ((firstByte & 0b11110000) == 0b11100000) // 1110xxxx: leading byte of a 3-byte sequence
+ {
+ nextPos = pos + 3;
+ if (nextPos > len)
+ {
+ return buf + pos;
+ } // Too short
+ // range check
+ codePoint = (uint)(firstByte & 0b00001111) << 12 |
+ (uint)(buf[pos + 1] & 0b00111111) << 6 |
+ (uint)(buf[pos + 2] & 0b00111111);
+ // Overlong (below 0x800), above 0xffff, or in the range 0xd800..0xdfff:
+ if ((codePoint < 0x800) || (0xffff < codePoint) ||
+ (0xd7ff < codePoint && codePoint < 0xe000))
+ {
+ return buf + pos;
+ }
+ if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+ {
+ return buf + pos;
+ } // Second byte is not a continuation byte
+ if ((buf[pos + 2] & 0b11000000) != 0b10000000)
+ {
+ return buf + pos;
+ } // Third byte is not a continuation byte
+ }
+ else if ((firstByte & 0b11111000) == 0b11110000) // 11110xxx: leading byte of a 4-byte sequence
+ {
+ nextPos = pos + 4;
+ if (nextPos > len)
+ {
+ return buf + pos;
+ }
+ if ((buf[pos + 1] & 0b11000000) != 0b10000000)
+ {
+ return buf + pos;
+ }
+ if ((buf[pos + 2] & 0b11000000) != 0b10000000)
+ {
+ return buf + pos;
+ }
+ if ((buf[pos + 3] & 0b11000000) != 0b10000000)
+ {
+ return buf + pos;
+ }
+ // range check
+ codePoint =
+ (uint)(firstByte & 0b00000111) << 18 | (uint)(buf[pos + 1] & 0b00111111) << 12 |
+ (uint)(buf[pos + 2] & 0b00111111) << 6 | (uint)(buf[pos + 3] & 0b00111111);
+ if (codePoint <= 0xffff || 0x10ffff < codePoint) // overlong, or beyond 0x10ffff
+ {
+ return buf + pos;
+ }
+ }
+ else
+ {
+ // we may have a continuation/too long error
+ return buf + pos;
+ }
+ pos = nextPos;
+ }
+
+ return buf + len; // no error
+ }
+
+ // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of
+ // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the
+ // pointer to the first invalid byte. Also updates utf16CodeUnitCountAdjustment and scalarCountAdjustment.
+ private unsafe static byte* RewindAndValidateWithErrors(int howFarBack, byte* buf, int len, ref int utf16CodeUnitCountAdjustment, ref int scalarCountAdjustment)
+ {
int extraLen = 0;
bool foundLeadingBytes = false;
@@ -252,7 +417,7 @@ public static (int utfAdjust, int scalarAdjust) GetFinalScalarUtfAdjustments(byt
// ... pInputBuffer[returnedvalue - 1] should be continuation bytes.
// Note that this function is unsafe, and it is the caller's responsibility
// to ensure that we can read at least 4 bytes before pInputBuffer.
- public unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii, int contbyte, int n4) adjustmentFactor(byte* pInputBuffer)
+ private unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii, int contbyte, int n4) adjustmentFactor(byte* pInputBuffer)
{
// Find the first non-continuation byte, working backward.
int i = 1;
@@ -281,7 +446,7 @@ public unsafe static (int totalbyteadjustment, int backedupByHowMuch, int ascii,
return (4 - i, i, 0, contbyteadjust, -1); // We have that i == 1 or i == 2 or i == 3 or i == 4, if i == 1, we are missing three bytes, if i == 2, we are missing two bytes, if i == 3, we are missing one byte.
}
- public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
+ private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustments(int asciibytes, int n4, int contbytes, int totalbyte)
{
int n3 = asciibytes - 2 * n4 + 2 * contbytes - totalbyte;
int n2 = -2 * asciibytes + n4 - 3 * contbytes + 2 * totalbyte;
@@ -291,7 +456,7 @@ public static (int utfadjust, int scalaradjust) CalculateN2N3FinalSIMDAdjustment
return (utfadjust, scalaradjust);
}
- public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes)
+ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(int start_point, int processedLength, byte* pInputBuffer, int asciibytes, int n4, int contbytes)
{
// Calculate the total bytes from start_point to processedLength
int totalbyte = processedLength - start_point;
@@ -461,16 +626,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
public unsafe static byte* GetPointerToFirstInvalidByteAvx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
int processedLength = 0;
- int TempUtf16CodeUnitCountAdjustment = 0;
- int TempScalarCountAdjustment = 0;
-
- int TailScalarCodeUnitCountAdjustment = 0;
- int TailUtf16CodeUnitCountAdjustment = 0;
-
if (pInputBuffer == null || inputLength <= 0)
{
- utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment;
- scalarCountAdjustment = TempScalarCountAdjustment;
+ utf16CodeUnitCountAdjustment = 0;
+ scalarCountAdjustment = 0;
return pInputBuffer;
}
if (inputLength > 128)
@@ -598,12 +757,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
int asciibytes = 0; // number of ascii bytes in the block (could also be called n1)
int contbytes = 0; // number of continuation bytes in the block
int n4 = 0; // number of 4-byte sequences that start in this block
-
for (; processedLength + 32 <= inputLength; processedLength += 32)
{
Vector256 currentBlock = Avx.LoadVector256(pInputBuffer + processedLength);
-
int mask = Avx2.MoveMask(currentBlock);
if (mask == 0)
{
@@ -612,14 +769,20 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
//
if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
{
- int totalbyteasciierror = processedLength - start_point;
- var (utfadjustasciierror, scalaradjustasciierror) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror);
-
- utf16CodeUnitCountAdjustment = utfadjustasciierror;
- scalarCountAdjustment = scalaradjustasciierror;
-
int off = processedLength >= 3 ? processedLength - 3 : processedLength;
- return SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + off, inputLength - off, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment);
+ byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+ // So the code is correct up to invalidBytePointer
+ if (invalidBytePointer < pInputBuffer + processedLength)
+ {
+ removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes);
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ int totalbyteasciierror = processedLength - start_point;
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror);
+ return invalidBytePointer;
}
prevIncomplete = Vector256.Zero;
}
@@ -646,31 +809,29 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
if (!Avx2.TestZ(error, error))
{
- int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32
- byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
- utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
- scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
-
- int totalbyteasciierror = processedLength - start_point;
- var (utfadjustasciierror, scalaradjustasciierror) = calculateErrorPathadjust(start_point, processedLength, pInputBuffer, asciibytes, n4, contbytes);
-
- utf16CodeUnitCountAdjustment += utfadjustasciierror;
- scalarCountAdjustment += scalaradjustasciierror;
-
+ byte* invalidBytePointer;
+ if (processedLength == 0)
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+ }
+ else
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+ }
+ if (invalidBytePointer < pInputBuffer + processedLength)
+ {
+ removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes);
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point));
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed);
return invalidBytePointer;
}
prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue);
-
- if (!Avx2.TestZ(prevIncomplete, prevIncomplete))
- {
- // We have an unterminated sequence.
- var (totalbyteadjustment, i, tempascii, tempcont, tempn4) = adjustmentFactor(pInputBuffer + processedLength + 32);
- processedLength -= i;
- n4 += tempn4;
- contbytes += tempcont;
- }
-
contbytes += (int)Popcnt.PopCount((uint)Avx2.MoveMask(byte_2_high));
// We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation.
n4 += (int)Popcnt.PopCount((uint)Avx2.MoveMask(Avx2.SubtractSaturate(currentBlock, fourthByte)));
@@ -681,50 +842,53 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
// and no expensive operation:
asciibytes += (int)(32 - Popcnt.PopCount((uint)mask));
}
+ // We may still have an error.
+ if (processedLength < inputLength || !Avx2.TestZ(prevIncomplete, prevIncomplete))
+ {
+ byte* invalidBytePointer;
+ if (processedLength == 0)
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+ }
+ else
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
- int totalbyte = processedLength - start_point;
- var (utf16adjust, scalaradjust) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyte);
-
- TempUtf16CodeUnitCountAdjustment = utf16adjust;
- TempScalarCountAdjustment = scalaradjust;
- }
-
-
- }
- // We have processed all the blocks using SIMD, we need to process the remaining bytes.
- // Process the remaining bytes with the scalar function
-
- // worst possible case is 4 bytes, where we need to backtrack 3 bytes
- // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
- if (processedLength < inputLength)
- {
-
- byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
- if (invalidBytePointer != pInputBuffer + inputLength)
- {
- utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
- scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;
-
- // An invalid byte was found by the scalar function
- return invalidBytePointer;
+ }
+ if (invalidBytePointer != pInputBuffer + inputLength)
+ {
+ if (invalidBytePointer < pInputBuffer + processedLength)
+ {
+ removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes);
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point));
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed);
+ return invalidBytePointer;
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ }
+ int final_total_bytes_processed = inputLength - start_point;
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, final_total_bytes_processed);
+ return pInputBuffer + inputLength;
}
}
- utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
- scalarCountAdjustment = TempScalarCountAdjustment + TailScalarCodeUnitCountAdjustment;
-
- return pInputBuffer + inputLength;
+ return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
}
- public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength)
+ public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
{
int processedLength = 0;
- int TempUtf16CodeUnitCountAdjustment = 0;
- int TempScalarCountAdjustment = 0;
-
- int utf16CodeUnitCountAdjustment = 0, scalarCountAdjustment = 0;
-
if (pInputBuffer == null || inputLength <= 0)
{
+ utf16CodeUnitCountAdjustment = 0;
+ scalarCountAdjustment = 0;
return pInputBuffer;
}
if (inputLength > 128)
@@ -793,18 +957,37 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
Vector128 v0f = Vector128.Create((byte)0x0F);
Vector128 v80 = Vector128.Create((byte)0x80);
// Performance note: we could process 64 bytes at a time for better speed in some cases.
+ int start_point = processedLength;
+
+ // The block goes from processedLength to processedLength/16*16.
+ int asciibytes = 0; // number of ascii bytes in the block (could also be called n1)
+ int contbytes = 0; // number of continuation bytes in the block
+ int n4 = 0; // number of 4-byte sequences that start in this block
for (; processedLength + 16 <= inputLength; processedLength += 16)
{
Vector128 currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);
- if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() > 127)
+ if (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127)
{
// We have an ASCII block, no need to process it, but
// we need to check if the previous block was incomplete.
if (AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
{
- return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment);
+ int off = processedLength >= 3 ? processedLength - 3 : processedLength;
+ byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+ // So the code is correct up to invalidBytePointer
+ if (invalidBytePointer < pInputBuffer + processedLength)
+ {
+ removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes);
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ int totalbyteasciierror = processedLength - start_point;
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, totalbyteasciierror);
+ return invalidBytePointer;
}
prevIncomplete = Vector128.Zero;
}
@@ -812,9 +995,10 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
{
// Contains non-ASCII characters, we need to do non-trivial processing
Vector128 prev1 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 1));
- Vector128 byte_1_high = Vector128.Shuffle(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
- Vector128 byte_1_low = Vector128.Shuffle(shuf2, (prev1 & v0f));
- Vector128 byte_2_high = Vector128.Shuffle(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
+ // Vector128.Shuffle vs AdvSimd.Arm64.VectorTableLookup: prefer the latter!!!
+ Vector128 byte_1_high = AdvSimd.Arm64.VectorTableLookup(shuf1, AdvSimd.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);
+ Vector128 byte_1_low = AdvSimd.Arm64.VectorTableLookup(shuf2, (prev1 & v0f));
+ Vector128 byte_2_high = AdvSimd.Arm64.VectorTableLookup(shuf3, AdvSimd.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f);
Vector128 sc = AdvSimd.And(AdvSimd.And(byte_1_high, byte_1_low), byte_2_high);
Vector128 prev2 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 2));
Vector128 prev3 = AdvSimd.ExtractVector128(prevInputBlock, currentBlock, (byte)(16 - 3));
@@ -829,68 +1013,116 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
// hardware:
if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)).ToScalar() != 0)
{
- return SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref utf16CodeUnitCountAdjustment, ref scalarCountAdjustment);
+ byte* invalidBytePointer;
+ if (processedLength == 0)
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+ }
+ else
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+ }
+ if (invalidBytePointer < pInputBuffer + processedLength)
+ {
+ removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes);
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point));
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed);
+ return invalidBytePointer;
}
prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
+ Vector128 largestcont = Vector128.Create((sbyte)-65); // -65 => 0b10111111
+ contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar();
+
+ // computing n4 is more expensive than we would like:
+ Vector128 fourthByteMinusOne = Vector128.Create((byte)(0b11110000u - 1));
+ Vector128 largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
+ byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
+ int negn4add = (int)(byte)-n4add;
+ n4 += negn4add;
}
+ asciibytes -= (sbyte)AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThan(currentBlock, v80)).ToScalar();
}
- }
- }
- // We have processed all the blocks using SIMD, we need to process the remaining bytes.
- // Process the remaining bytes with the scalar function
- if (processedLength < inputLength)
- {
- // We need to possibly backtrack to the start of the last code point
- // worst possible case is 4 bytes, where we need to backtrack 3 bytes
- // 11110xxxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
- if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
- {
- processedLength -= 1;
- if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
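+ // An error may remain: either unprocessed tail bytes are left, or the last block was flagged as incomplete (prevIncomplete is nonzero).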
+ if (processedLength < inputLength || AdvSimd.Arm64.MaxAcross(prevIncomplete).ToScalar() != 0)
{
- processedLength -= 1;
- if (processedLength > 0 && (sbyte)pInputBuffer[processedLength] <= -65)
+ byte* invalidBytePointer;
+ if (processedLength == 0)
{
- processedLength -= 1;
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength);
+ }
+ else
+ {
+ invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(processedLength - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
+ }
+ if (invalidBytePointer != pInputBuffer + inputLength)
+ {
+ if (invalidBytePointer < pInputBuffer + processedLength)
+ {
+ removeCounters(invalidBytePointer, pInputBuffer + processedLength, ref asciibytes, ref n4, ref contbytes);
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
+ }
+ int total_bytes_processed = (int)(invalidBytePointer - (pInputBuffer + start_point));
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, total_bytes_processed);
+ return invalidBytePointer;
+ }
+ else
+ {
+ addCounters(pInputBuffer + processedLength, invalidBytePointer, ref asciibytes, ref n4, ref contbytes);
}
}
+ int final_total_bytes_processed = inputLength - start_point;
+ (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSIMDAdjustments(asciibytes, n4, contbytes, final_total_bytes_processed);
+ return pInputBuffer + inputLength;
}
- int TailScalarCodeUnitCountAdjustment = 0;
- int TailUtf16CodeUnitCountAdjustment = 0;
- byte* invalidBytePointer = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out TailUtf16CodeUnitCountAdjustment, out TailScalarCodeUnitCountAdjustment);
- if (invalidBytePointer != pInputBuffer + inputLength)
+ }
+ return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
+ }
+
+ private static unsafe void removeCounters(byte* start, byte* end, ref int asciibytes, ref int n4, ref int contbytes)
+ {
+ for (byte* p = start; p < end; p++)
+ {
+ if ((*p & 0b10000000) == 0)
{
- // An invalid byte was found by the scalar function
- return invalidBytePointer;
+ asciibytes -= 1;
+ }
+ if ((*p & 0b11000000) == 0b10000000)
+ {
+ contbytes -= 1;
+ }
+ if ((*p & 0b11110000) == 0b11110000)
+ {
+ n4 -= 1;
}
}
-
- return pInputBuffer + inputLength;
}
- public unsafe static byte* GetPointerToFirstInvalidByte(byte* pInputBuffer, int inputLength, out int Utf16CodeUnitCountAdjustment, out int ScalarCodeUnitCountAdjustment)
- {
- // if (AdvSimd.Arm64.IsSupported)
- // {
- // return GetPointerToFirstInvalidByteArm64(pInputBuffer, inputLength);
- // }
- if (Avx2.IsSupported)
+ private static unsafe void addCounters(byte* start, byte* end, ref int asciibytes, ref int n4, ref int contbytes)
+ {
+ for (byte* p = start; p < end; p++)
{
- return GetPointerToFirstInvalidByteAvx2(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
+ if ((*p & 0b10000000) == 0)
+ {
+ asciibytes += 1;
+ }
+ if ((*p & 0b11000000) == 0b10000000)
+ {
+ contbytes += 1;
+ }
+ if ((*p & 0b11110000) == 0b11110000)
+ {
+ n4 += 1;
+ }
}
- /*if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
- {
- return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength);
- }*/
- // if (Ssse3.IsSupported)
- // {
- // return GetPointerToFirstInvalidByteSse(pInputBuffer, inputLength);
- // }
- // return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength);
-
- return GetPointerToFirstInvalidByteScalar(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
-
}
}
diff --git a/test/AsciiTest.cs b/test/AsciiTest.cs
index 6b04362..2928302 100644
--- a/test/AsciiTest.cs
+++ b/test/AsciiTest.cs
@@ -89,7 +89,7 @@ public void HardCodedSequencesTest()
}
[Fact]
- public void Test_ASCII_generator()
+ public void TestASCIIGenerator()
{
const int NUM_TRIALS = 1000;
const int MAX_LENGTH = 255;
@@ -112,7 +112,7 @@ public void Test_ASCII_generator()
// Assertion or check to ensure all sequences were valid ASCII
if (validSequencesCount != NUM_TRIALS)
{
- throw new Exception($"Invalid ASCII sequences were generated for {length}-byte sequences!");
+ throw new ArgumentException($"Invalid ASCII sequences were generated for {length}-byte sequences!");
}
}
}
@@ -136,7 +136,7 @@ public void TestNoErrorGetIndexOfFirstNonAsciiByte()
nuint result = SimdUnicode.Ascii.GetIndexOfFirstNonAsciiByte(pAscii, (nuint)ascii.Length);
if (result != (nuint)ascii.Length)
{
- throw new Exception($"Unexpected non-ASCII character found at index {result}");
+ throw new ArgumentException($"Unexpected non-ASCII character found at index {result}");
}
}
}
@@ -166,7 +166,7 @@ public void TestErrorGetIndexOfFirstNonAsciiByte()
nuint result = SimdUnicode.Ascii.GetIndexOfFirstNonAsciiByte(pAscii, (nuint)ascii.Length);
if (result != (nuint)i)
{
- throw new Exception($"Expected non-ASCII character at index {i}, but found at index {result}");
+ throw new ArgumentException($"Expected non-ASCII character at index {i}, but found at index {result}");
}
}
}
diff --git a/test/UTF8ValidationTests.cs b/test/UTF8ValidationTests.cs
index d30e6e1..eef06e1 100644
--- a/test/UTF8ValidationTests.cs
+++ b/test/UTF8ValidationTests.cs
@@ -13,12 +13,12 @@ public unsafe class Utf8SIMDValidationTests
{
- private const int NumTrials = 100;
+ private const int NumTrials = 400; // Number of trials for the brute-force tests; takes about a minute to run on a powerful server.
private static readonly RandomUtf8 generator = new RandomUtf8(1234, 1, 1, 1, 1);
- private static readonly Random rand = new Random();
+ // Never use non-deterministic random number generators in tests. E.g., never do new Random() in a test.
+ private static readonly Random rand = new Random(1245);
- // int[] outputLengths = { 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280, 1344, 1408, 1472, 1536, 1600, 1664, 1728, 1792, 1856, 1920, 1984, 2048, 2112, 2176, 2240, 2304, 2368, 2432, 2496, 2560, 2624, 2688, 2752, 2816, 2880, 2944, 3008, 3072, 3136, 3200, 3264, 3328, 3392, 3456, 3520, 3584, 3648, 3712, 3776, 3840, 3904, 3968, 4032, 4096, 4160, 4224, 4288, 4352, 4416, 4480, 4544, 4608, 4672, 4736, 4800, 4864, 4928, 4992, 5056, 5120, 5184, 5248, 5312, 5376, 5440, 5504, 5568, 5632, 5696, 5760, 5824, 5888, 5952, 6016, 6080, 6144, 6208, 6272, 6336, 6400, 6464, 6528, 6592, 6656, 6720, 6784, 6848, 6912, 6976, 7040, 7104, 7168, 7232, 7296, 7360, 7424, 7488, 7552, 7616, 7680, 7744, 7808, 7872, 7936, 8000, 8064, 8128, 8192, 8256, 8320, 8384, 8448, 8512, 8576, 8640, 8704, 8768, 8832, 8896, 8960, 9024, 9088, 9152, 9216, 9280, 9344, 9408, 9472, 9536, 9600, 9664, 9728, 9792, 9856, 9920, 9984, 10000 };
- static int[] outputLengths = { 128, 345, 1000 };
+ static int[] outputLengths = { 128, 129, 345, 512, 735, 1000, 2010 };
[Flags]
public enum TestSystemRequirements
@@ -31,7 +31,7 @@ public enum TestSystemRequirements
// Add more as needed
}
- public class FactOnSystemRequirementAttribute : FactAttribute
+ private sealed class FactOnSystemRequirementAttribute : FactAttribute
{
private TestSystemRequirements RequiredSystems;
@@ -45,7 +45,7 @@ public FactOnSystemRequirementAttribute(TestSystemRequirements requiredSystems)
}
}
- private bool IsSystemSupported(TestSystemRequirements requiredSystems)
+ private static bool IsSystemSupported(TestSystemRequirements requiredSystems)
{
switch (RuntimeInformation.ProcessArchitecture)
{
@@ -62,21 +62,27 @@ private bool IsSystemSupported(TestSystemRequirements requiredSystems)
}
- public class TestIfCondition : FactAttribute
+ private sealed class TestIfCondition : FactAttribute
{
public TestIfCondition(Func condition, string skipReason)
{
// Only set the Skip property if the condition evaluates to false
if (!condition.Invoke())
{
+ if(skipReason == null) {
+ throw new ArgumentNullException(nameof(skipReason), "skipReason cannot be null when condition is false.");
+ }
Skip = skipReason;
}
}
+
+ public Func? Condition { get; }
+ public string? SkipReason { get; }
}
-
- private void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate)
+
+ private void simpleGoodSequences(Utf8ValidationFunction utf8ValidationDelegate)
{
string[] goodSequences = {
"a",
@@ -96,10 +102,10 @@ private void simpleGoodSequences(Utf8ValidationDelegate utf8ValidationDelegate)
{
fixed (byte* pInput = input)
{
- Assert.True(ValidateUtf8(input,utf8ValidationDelegate),
+ Assert.True(ValidateUtf8(input, utf8ValidationDelegate),
$"Failure in Scalar function: SimdUnicode.UTF8.GetPointerToFirstInvalidByte.Sequence: {seq}");
- Assert.True(ValidateCount(input,utf8ValidationDelegate));
+ Assert.True(ValidateCount(input, utf8ValidationDelegate));
}
}
}
@@ -112,40 +118,21 @@ public void simpleGoodSequencesScalar()
simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void simpleGoodSequencesSse()
- // {
- // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void simpleGoodSequencesAvx512()
- // {
- // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void simpleGoodSequencesArm64()
- // {
- // simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void simpleGoodSequencesAVX()
+ public void simpleGoodSequencesAvx2()
{
simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void simpleGoodSequencesArm64()
+ {
+ simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
- private void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void BadSequences(Utf8ValidationFunction utf8ValidationDelegate)
{
string[] badSequences = {
"\xC3\x28",
@@ -186,8 +173,8 @@ private void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate)
{
fixed (byte* pInput = input)
{
- ValidateUtf8(input,utf8ValidationDelegate);
- Assert.True(ValidateCount(input,utf8ValidationDelegate));
+ ValidateUtf8(input, utf8ValidationDelegate);
+ Assert.True(ValidateCount(input, utf8ValidationDelegate));
}
}
}
@@ -200,59 +187,41 @@ public void BadSequencesScalar()
BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void BadSequencesSse()
- // {
- // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void BadSequencesAvx512()
- // {
- // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void BadSequencesArm64()
- // {
- // BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void BadSequencesAVX()
+ public void BadSequencesAvx2()
{
BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void BadSequencesArm64()
+ {
+ BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
// this was in the C++ code
- private void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void Node48995Test(Utf8ValidationFunction utf8ValidationDelegate)
{
byte[] bad = new byte[] { 0x80 };
- Assert.False(ValidateUtf8(bad,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(bad, utf8ValidationDelegate));
}
- private void NoError(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void NoError(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
for (int trial = 0; trial < NumTrials; trial++)
{
byte[] utf8 = generator.Generate(outputLength).ToArray();
- bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate);
- string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
+ bool isValidUtf8 = ValidateUtf8(utf8, utf8ValidationDelegate);
+ string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ", System.StringComparison.InvariantCulture);
try
{
Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}");
- Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, utf8.Length, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
catch (Xunit.Sdk.XunitException)
{
@@ -270,47 +239,29 @@ public void NoErrorScalar()
NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void NoErrorSse()
- // {
- // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void NoErrorAvx512()
- // {
- // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void NoErrorArm64()
- // {
- // NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void NoErrorAVX()
+ public void NoErrorAvx2()
{
NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- private void NoErrorSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void NoErrorArm64()
+ {
+ NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
+ private void NoErrorSpecificByteCount(Utf8ValidationFunction utf8ValidationDelegate)
{
- RunTestForByteLength(1,utf8ValidationDelegate);
- RunTestForByteLength(2,utf8ValidationDelegate);
- RunTestForByteLength(3,utf8ValidationDelegate);
- RunTestForByteLength(4,utf8ValidationDelegate);
+ RunTestForByteLength(1, utf8ValidationDelegate);
+ RunTestForByteLength(2, utf8ValidationDelegate);
+ RunTestForByteLength(3, utf8ValidationDelegate);
+ RunTestForByteLength(4, utf8ValidationDelegate);
}
- private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8ValidationDelegate)
+ private void RunTestForByteLength(int byteLength, Utf8ValidationFunction utf8ValidationDelegate)
{
// int[] outputLengths = { 128, 256, 512, 1024, 1000 }; // Example lengths
foreach (int outputLength in outputLengths)
@@ -318,11 +269,11 @@ private void RunTestForByteLength(int byteLength,Utf8ValidationDelegate utf8Vali
for (int trial = 0; trial < NumTrials; trial++)
{
byte[] utf8 = generator.Generate(outputLength, byteLength).ToArray();
- bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate);
+ bool isValidUtf8 = ValidateUtf8(utf8, utf8ValidationDelegate);
try
{
Assert.True(isValidUtf8, $"Failure NoErrorTest. ");
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
catch (Xunit.Sdk.XunitException)
{
@@ -341,144 +292,105 @@ public void NoErrorSpecificByteCountScalar()
NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void NoErrorSpecificByteCountSse()
- // {
- // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void NoErrorSpecificByteCountAvx512()
- // {
- // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void NoErrorSpecificByteCountArm64()
- // {
- // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void NoErrorSpecificByteCountAVX()
+ public void NoErrorSpecificByteCountAvx2()
{
NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
-private void NoErrorIncompleteThenASCII(Utf8ValidationDelegate utf8ValidationDelegate)
-{
- foreach (int outputLength in outputLengths){
- for (int trial = 0; trial < NumTrials; trial++)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void NoErrorSpecificByteCountArm64()
+ {
+ NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+ private void NoErrorIncompleteThenASCII(Utf8ValidationFunction utf8ValidationDelegate)
+ {
+ foreach (int outputLength in outputLengths)
{
- var allAscii = new List(Enumerable.Repeat((byte)0, outputLength));
- int firstCodeLength = rand.Next(2, 5);
- List singleBytes = generator.Generate(1, firstCodeLength);
-
- int incompleteLocation = 128 - rand.Next(1, firstCodeLength - 1);
- allAscii.InsertRange(incompleteLocation, singleBytes);
-
- var utf8 = allAscii.ToArray();
- int cutOffLength = 128;//utf8.Length - rand.Next(1, firstCodeLength);
- cutOffLength = Math.Min(cutOffLength, outputLength); // Ensure it doesn't exceed the length of truncatedUtf8
- byte[] truncatedUtf8 = new byte[outputLength]; // Initialized to zero
-
- Array.Copy(utf8, 0, truncatedUtf8, 0, cutOffLength);
-
- bool isValidUtf8 = ValidateUtf8(truncatedUtf8, utf8ValidationDelegate);
- // string utf8HexString = BitConverter.ToString(truncatedUtf8).Replace("-", " ");
- try
- {
- Assert.False(isValidUtf8);
- Assert.True(InvalidateUtf8(truncatedUtf8, truncatedUtf8.Length, utf8ValidationDelegate));
- Assert.True(ValidateCount(truncatedUtf8, utf8ValidationDelegate));
- }
- catch (Xunit.Sdk.XunitException)
+ for (int trial = 0; trial < NumTrials; trial++)
{
- PrintHexAndBinary(truncatedUtf8, incompleteLocation);
- throw;
+ var allAscii = new List(Enumerable.Repeat((byte)0, outputLength));
+#pragma warning disable CA5394
+ int firstCodeLength = rand.Next(2, 5);
+ List singleBytes = generator.Generate(1, firstCodeLength);
+
+ int incompleteLocation = 128 - rand.Next(1, firstCodeLength - 1);
+ allAscii.InsertRange(incompleteLocation, singleBytes);
+
+ var utf8 = allAscii.ToArray();
+ int cutOffLength = 128;//utf8.Length - rand.Next(1, firstCodeLength);
+ cutOffLength = Math.Min(cutOffLength, outputLength); // Ensure it doesn't exceed the length of truncatedUtf8
+ byte[] truncatedUtf8 = new byte[outputLength]; // Initialized to zero
+
+ Array.Copy(utf8, 0, truncatedUtf8, 0, cutOffLength);
+
+ bool isValidUtf8 = ValidateUtf8(truncatedUtf8, utf8ValidationDelegate);
+ // string utf8HexString = BitConverter.ToString(truncatedUtf8).Replace("-", " ");
+ try
+ {
+ Assert.False(isValidUtf8);
+ Assert.True(InvalidateUtf8(truncatedUtf8, truncatedUtf8.Length, utf8ValidationDelegate));
+ Assert.True(ValidateCount(truncatedUtf8, utf8ValidationDelegate));
+ }
+ catch (Xunit.Sdk.XunitException)
+ {
+ PrintHexAndBinary(truncatedUtf8, incompleteLocation);
+ throw;
+ }
}
}
}
-}
- [Fact]
+ [Fact]
[Trait("Category", "scalar")]
public void NoErrorIncompleteThenASCIIScalar()
{
NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void NoErrorIncompleteThenASCIISse()
- // {
- // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void NoErrorIncompleteThenASCIIAvx512()
- // {
- // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void NoErrorIncompleteThenASCIIArm64()
- // {
- // NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void NoErrorIncompleteThenASCIIAVX()
+ public void NoErrorIncompleteThenASCIIAvx2()
{
NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void NoErrorIncompleteThenASCIIArm64()
+ {
+ NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
- private void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void NoErrorIncompleteAt256Vector(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
for (int trial = 0; trial < NumTrials; trial++)
{
-
-
- // var allAscii = generator.Generate(outputLength,1);
var allAscii = new List(Enumerable.Repeat((byte)0, 256));
- int firstcodeLength = rand.Next(2,5);
- List singlebytes = generator.Generate(1,firstcodeLength);//recall:generate a utf8 code between 2 and 4 bytes
- int incompleteLocation = 128 - rand.Next(1,firstcodeLength - 1);
- allAscii.InsertRange(incompleteLocation,singlebytes);
+ int firstcodeLength = rand.Next(2, 5);
+ List singlebytes = generator.Generate(1, firstcodeLength); // Recall: generates a UTF-8 code point of 2 to 4 bytes.
+ int incompleteLocation = 128 - rand.Next(1, firstcodeLength - 1);
+ allAscii.InsertRange(incompleteLocation, singlebytes);
var utf8 = allAscii.ToArray();
- bool isValidUtf8 = ValidateUtf8(utf8,utf8ValidationDelegate);
- string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ");
+ bool isValidUtf8 = ValidateUtf8(utf8, utf8ValidationDelegate);
+ string utf8HexString = BitConverter.ToString(utf8).Replace("-", " ", System.StringComparison.InvariantCulture);
try
{
Assert.True(isValidUtf8, $"Failure NoErrorTest. Sequence: {utf8HexString}");
- Assert.True(InvalidateUtf8(utf8, utf8.Length,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, utf8.Length, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
catch (Xunit.Sdk.XunitException)
{
- PrintHexAndBinary(utf8,incompleteLocation);
+ PrintHexAndBinary(utf8, incompleteLocation);
throw; // Rethrow the exception to fail the test.
}
}
@@ -493,42 +405,24 @@ public void NoErrorIncompleteAt256VectorScalar()
NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void NoErrorIncompleteAt256VectorSse()
- // {
- // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void NoErrorIncompleteAt256VectorAvx512()
- // {
- // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void NoErrorIncompleteAt256VectorArm64()
- // {
- // NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void NoErrorIncompleteAt256VectorAVX()
+ public void NoErrorIncompleteAt256VectorAvx2()
{
NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- private void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void NoErrorIncompleteAt256VectorArm64()
+ {
+ NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
+ private void BadHeaderBits(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
- {
+ {
for (int trial = 0; trial < NumTrials; trial++)
{
@@ -541,9 +435,9 @@ private void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate)
utf8[i] = 0b11111000; // Forcing a header bits error
try
{
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
catch (Xunit.Sdk.XunitException)
{
@@ -566,47 +460,28 @@ public void BadHeaderBitsScalar()
{
BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
-
-
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void BadHeaderBitsSse()
- // {
- // BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void BadHeaderBitsAvx512()
- // {
- // BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void NoErrorSpecificByteCountArm64()
- // {
- // NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void BadHeaderBitsAVX()
+ public void BadHeaderBitsAvx2()
{
BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- private void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void BadHeaderBitsArm64()
+ {
+ BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
+ private void TooShortError(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
for (int trial = 0; trial < NumTrials; trial++)
{
- byte[] utf8 = generator.Generate(outputLength).ToArray();
+ byte[] utf8 = generator.Generate(outputLength).ToArray();
for (int i = 0; i < utf8.Length; i++)
{
@@ -614,24 +489,24 @@ private void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate)
{
byte oldByte = utf8[i];
utf8[i] = 0b11100000; // Forcing a too short error
- try
- {
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
- }
- catch (Xunit.Sdk.XunitException)
- {
- Console.WriteLine($"Assertion failed at index: {i}");
- PrintHexAndBinary(utf8, i);
- throw; // Rethrow the exception to fail the test.
- }
+ try
+ {
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
+ }
+ catch (Xunit.Sdk.XunitException)
+ {
+ Console.WriteLine($"Assertion failed at index: {i}");
+ PrintHexAndBinary(utf8, i);
+ throw; // Rethrow the exception to fail the test.
+ }
utf8[i] = oldByte; // Restore the original byte
}
}
}
}
-
+
}
[Fact]
@@ -641,39 +516,21 @@ public void TooShortErrorScalar()
TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void TooShortErrorSse()
- // {
- // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void TooShortErrorAvx512()
- // {
- // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void TooShortErrorArm64()
- // {
- // TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void TooShortErrorAVX()
+ public void TooShortErrorAvx2()
{
TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- private void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void TooShortErrorArm64()
+ {
+ TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
+ private void TooLongError(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
@@ -690,9 +547,9 @@ private void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate)
utf8[i] = 0b10000000; // Forcing a too long error
try
{
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
catch (Xunit.Sdk.XunitException)
{
@@ -714,39 +571,21 @@ public void TooLongErrorScalar()
TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void TooLongErrorSse()
- // {
- // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void TooLongErrorAvx512()
- // {
- // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void TooLongErrorArm64()
- // {
- // TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void TooLongErrorAVX()
+ public void TooLongErrorAvx2()
{
TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- private void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void TooLongErrorArm64()
+ {
+ TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
+ private void OverlongError(Utf8ValidationFunction utf8ValidationDelegate)
{
for (int trial = 0; trial < NumTrials; trial++)
{
@@ -777,9 +616,9 @@ private void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate)
utf8[i + 1] = (byte)(utf8[i + 1] & 0b11001111);
}
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
utf8[i] = old;
utf8[i + 1] = secondOld;
@@ -796,66 +635,44 @@ public void OverlongErrorScalar()
OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void OverlongErrorSse()
- // {
- // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void OverlongErrorAvx512()
- // {
- // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void OverlongErrorArm64()
- // {
- // OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
- [Trait("Category", "avx")]
- [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void OverlongErrorAVX()
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void OverlongErrorArm64()
{
- OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
+ OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}
- private void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void TooShortErrorAtEnd(Utf8ValidationFunction utf8ValidationDelegate)
{
for (int trial = 0; trial < NumTrials; trial++)
{
foreach (int outputLength in outputLengths)
{
- byte[] utf8 = generator.Generate(outputLength,byteCountInUnit: 1).ToArray();
-
+ byte[] utf8 = generator.Generate(outputLength, byteCountInUnit: 1).ToArray();
+
unsafe
{
fixed (byte* pInput = utf8)
{
for (int i = 0; i < utf8.Length; i++)
- {
- int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment;
- byte currentByte = utf8[i];
- int offset = 0;
+ {
+ int SimdUnicodeUtf16Adjustment, SimdUnicodeScalarCountAdjustment;
+ byte currentByte = utf8[i];
+ int offset = 0;
- if ((currentByte & 0b11100000) == 0b11000000) { // This is a header byte of a 2-byte sequence
+ if ((currentByte & 0b11100000) == 0b11000000)
+ { // This is a header byte of a 2-byte sequence
offset = 0;
- }
- if ((currentByte & 0b11110000) == 0b11100000) {
+ }
+ if ((currentByte & 0b11110000) == 0b11100000)
+ {
// This is a header byte of a 3-byte sequence
offset = rand.Next(0, 3);
- }
- if ((currentByte & 0b11111000) == 0b11110000) {
+ }
+ if ((currentByte & 0b11111000) == 0b11110000)
+ {
// This is a header byte of a 4-byte sequence
offset = rand.Next(0, 4);
}
@@ -866,10 +683,10 @@ private void TooShortErrorAtEnd(Utf8ValidationDelegate utf8ValidationDelegate)
byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, i + offset, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment);
Assert.True(dotnetResult == pInput + i + offset);
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
- }
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
+ }
- }
+ }
}
}
}
@@ -883,48 +700,23 @@ public void TooShortErrorAtEndScalar()
TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void TooShortErrorAtEndSse()
- // {
- // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void TooShortErrorAtEndAvx512()
- // {
- // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void TooShortErrorAtEndArm64()
- // {
- // TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void TooShortErrorAtEndAVX()
+ public void TooShortErrorAtEndAvx2()
{
TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- [Trait("Category", "avx")]
- [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void TooShortErrorAtEndAvx2()
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void TooShortErrorAtEndArm64()
{
- TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
+ TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}
-
//corresponds to condition 5.4.1 in the paper
- private void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void Invalid0xf50xff(Utf8ValidationFunction utf8ValidationDelegate)
{
var invalidBytes = Enumerable.Range(0xF5, 0x100 - 0xF5).Select(i => (byte)i).ToArray(); // 0xF5 to 0xFF
@@ -936,9 +728,9 @@ private void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate)
foreach (var invalidByte in invalidBytes)
{
utf8[position] = invalidByte;
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte
- Assert.True(InvalidateUtf8(utf8,position,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate)); // Expect the validation to fail due to the invalid byte
+ Assert.True(InvalidateUtf8(utf8, position, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
}
}
@@ -951,37 +743,6 @@ public void Invalid0xf50xffScalar()
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void Invalid0xf50xffSse()
- // {
- // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void Invalid0xf50xffAvx512()
- // {
- // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void Invalid0xf50xffArm64()
- // {
- // Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
- [Trait("Category", "avx")]
- [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void Invalid0xf50xffAVX()
- {
- Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
- }
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
@@ -990,71 +751,82 @@ public void Invalid0xf50xffAvx2()
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
-// helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
-static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
-{
- int chunkSize = 16; // 128 bits = 16 bytes
- // Process each chunk for hexadecimal
- Console.Write("Hex: ");
- for (int i = 0; i < bytes.Length; i++)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void Invalid0xf50xffArm64()
{
- if (i > 0 && i % chunkSize == 0)
- Console.WriteLine(); // New line after every 16 bytes
-
- if (i == highlightIndex)
- {
- Console.ForegroundColor = ConsoleColor.Red;
- Console.Write($"{bytes[i]:X2} ");
- Console.ResetColor();
- }
- else if (i % (chunkSize * 2) == 0) // print green every 256 bytes
- {
- Console.ForegroundColor = ConsoleColor.Green;
- Console.Write($"{bytes[i]:X2} ");
- Console.ResetColor();
- }
- else
- {
- Console.Write($"{bytes[i]:X2} ");
- }
-
- if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
+ Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}
- Console.WriteLine("\n"); // New line for readability and to separate hex from binary
- // Process each chunk for binary
- Console.Write("Binary: ");
- for (int i = 0; i < bytes.Length; i++)
+ // helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
+ static void PrintHexAndBinary(byte[] bytes, int highlightIndex = -1)
{
- if (i > 0 && i % chunkSize == 0)
- Console.WriteLine(); // New line after every 16 bytes
+ int chunkSize = 16; // 128 bits = 16 bytes
- string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
- if (i == highlightIndex)
+ // Process each chunk for hexadecimal
+#pragma warning disable CA1303
+ Console.Write("Hex: ");
+ for (int i = 0; i < bytes.Length; i++)
{
- Console.ForegroundColor = ConsoleColor.Red;
- Console.Write($"{binaryString} ");
- Console.ResetColor();
- }
- else if (i % (chunkSize * 2) == 0) // print green every 256 bytes
- {
- Console.ForegroundColor = ConsoleColor.Green;
- Console.Write($"{binaryString} ");
- Console.ResetColor();
+ if (i > 0 && i % chunkSize == 0)
+ Console.WriteLine(); // New line after every 16 bytes
+
+ if (i == highlightIndex)
+ {
+ Console.ForegroundColor = ConsoleColor.Red;
+ Console.Write($"{bytes[i]:X2} ");
+ Console.ResetColor();
+ }
+ else if (i % (chunkSize * 2) == 0) // print green every 32 bytes
+ {
+ Console.ForegroundColor = ConsoleColor.Green;
+ Console.Write($"{bytes[i]:X2} ");
+ Console.ResetColor();
+ }
+ else
+ {
+ Console.Write($"{bytes[i]:X2} ");
+ }
+#pragma warning disable CA1303
+ if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
}
- else
+#pragma warning disable CA1303
+ Console.WriteLine("\n"); // New line for readability and to separate hex from binary
+
+ // Process each chunk for binary
+#pragma warning disable CA1303
+ Console.Write("Binary: ");
+ for (int i = 0; i < bytes.Length; i++)
{
- Console.Write($"{binaryString} ");
- }
+ if (i > 0 && i % chunkSize == 0)
+ Console.WriteLine(); // New line after every 16 bytes
- if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
+ string binaryString = Convert.ToString(bytes[i], 2).PadLeft(8, '0');
+ if (i == highlightIndex)
+ {
+ Console.ForegroundColor = ConsoleColor.Red;
+ Console.Write($"{binaryString} ");
+ Console.ResetColor();
+ }
+ else if (i % (chunkSize * 2) == 0) // print green every 32 bytes
+ {
+ Console.ForegroundColor = ConsoleColor.Green;
+ Console.Write($"{binaryString} ");
+ Console.ResetColor();
+ }
+ else
+ {
+ Console.Write($"{binaryString} ");
+ }
+#pragma warning disable CA1303
+ if ((i + 1) % chunkSize != 0) Console.Write(" "); // Add space between bytes but not at the end of the line
+ }
+ Console.WriteLine(); // New line for readability
}
- Console.WriteLine(); // New line for readability
-}
- private void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void TooLargeError(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
@@ -1069,9 +841,9 @@ private void TooLargeError(Utf8ValidationDelegate utf8ValidationDelegate)
byte old = utf8[i];
utf8[i] += (byte)(((utf8[i] & 0b100) == 0b100) ? 0b10 : 0b100);
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(utf8, i+1,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, i + 1, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
utf8[i] = old;
}
}
@@ -1086,31 +858,6 @@ public void TooLargeErrorScalar()
TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void TooLargeErrorSse()
- // {
- // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void TooLargeErrorAvx512()
- // {
- // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void TooLargeErrorArm64()
- // {
- // TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void TooLargeErrorAvx()
@@ -1119,7 +866,14 @@ public void TooLargeErrorAvx()
}
- private void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8ValidationDelegate)
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void TooLargeErrorArm64()
+ {
+ TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
+ private void AsciiPlusContinuationAtEndError(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
@@ -1127,21 +881,21 @@ private void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8Validati
{
for (int i = 1; i <= 4; i++)
{
- byte[] filler = generator.Generate(outputLength,byteCountInUnit:1).ToArray();
- byte[] toolong = generator.AppendContinuationByte(generator.Generate(1,i)).ToArray();
+ byte[] filler = generator.Generate(outputLength, byteCountInUnit: 1).ToArray();
+ byte[] toolong = generator.AppendContinuationByte(generator.Generate(1, i)).ToArray();
- generator.ReplaceEndOfArray(filler,toolong);
+ RandomUtf8.ReplaceEndOfArray(filler, toolong);
- Assert.False(ValidateUtf8(filler,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(filler, filler.Length - 1,utf8ValidationDelegate));
- Assert.True(ValidateCount(filler,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(filler, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(filler, filler.Length - 1, utf8ValidationDelegate));
+ Assert.True(ValidateCount(filler, utf8ValidationDelegate));
}
}
}
}
-
+
[Fact]
[Trait("Category", "scalar")]
public void AsciiPlusContinuationAtEndErrorScalar()
@@ -1149,36 +903,11 @@ public void AsciiPlusContinuationAtEndErrorScalar()
AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void AsciiPlusContinuationAtEndErrorSse()
- // {
- // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void AsciiPlusContinuationAtEndErrorAvx512()
- // {
- // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void AsciiPlusContinuationAtEndErrorArm64()
- // {
- // AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
- [Trait("Category", "avx")]
- [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void AsciiPlusContinuationAtEndErrorAVX()
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void AsciiPlusContinuationAtEndErrorArm64()
{
- AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
+ AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}
[Trait("Category", "avx")]
@@ -1188,7 +917,7 @@ public void AsciiPlusContinuationAtEndErrorAvx2()
AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
- private void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void SurrogateErrorTest(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
@@ -1209,9 +938,9 @@ private void SurrogateErrorTest(Utf8ValidationDelegate utf8ValidationDelegate)
{
utf8[i + 1] = (byte)((utf8[i + 1] & 0b11000011) | (s << 2));
- Assert.False(ValidateUtf8(utf8,utf8ValidationDelegate));
- Assert.True(InvalidateUtf8(utf8, i,utf8ValidationDelegate));
- Assert.True(ValidateCount(utf8,utf8ValidationDelegate));
+ Assert.False(ValidateUtf8(utf8, utf8ValidationDelegate));
+ Assert.True(InvalidateUtf8(utf8, i, utf8ValidationDelegate));
+ Assert.True(ValidateCount(utf8, utf8ValidationDelegate));
}
utf8[i] = old;
@@ -1230,40 +959,21 @@ public void SurrogateErrorTestScalar()
SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void SurrogateErrorTestSse()
- // {
- // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void SurrogateErrorTestAvx512()
- // {
- // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void SurrogateErrorTestArm64()
- // {
- // SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void SurrogateErrorTestAVX()
+ public void SurrogateErrorTestAvx2()
{
SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void SurrogateErrorTestArm64()
+ {
+ SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
- private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate)
+ private void BruteForceTest(Utf8ValidationFunction utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
{
@@ -1273,7 +983,7 @@ private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate)
// Generate random UTF-8 sequence
byte[] utf8 = generator.Generate(rand.Next(outputLength)).ToArray();
- Assert.True(ValidateUtf8(utf8,utf8ValidationDelegate), "Initial UTF-8 validation (primary) failed.");
+ Assert.True(ValidateUtf8(utf8, utf8ValidationDelegate), "Initial UTF-8 validation (primary) failed.");
Assert.True(ValidateUtf8Fuschia(utf8), "Initial UTF-8 validation (Fuschia) failed.");
@@ -1291,68 +1001,56 @@ private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate)
modifiedUtf8[byteIndex] ^= (byte)bitFlip;
// Validate the modified sequence with both methods
- bool isValidPrimary = ValidateUtf8(modifiedUtf8,utf8ValidationDelegate);
+ bool isValidPrimary = ValidateUtf8(modifiedUtf8, utf8ValidationDelegate);
bool isValidFuschia = ValidateUtf8Fuschia(modifiedUtf8);
// Ensure both methods agree on the validation result
- try{ Assert.Equal(isValidPrimary, isValidFuschia);
- Assert.True(ValidateCount(modifiedUtf8,utf8ValidationDelegate));
- }
- catch (Xunit.Sdk.XunitException)
- {
- Console.WriteLine($"Assertion failed. Byte randomly changed at index: {byteIndex}");
- PrintHexAndBinary(utf8, byteIndex);
- throw; // Rethrow the exception to fail the test.
- }
-
+ try
+ {
+ Assert.Equal(isValidPrimary, isValidFuschia);
+ Assert.True(ValidateCount(modifiedUtf8, utf8ValidationDelegate));
+ }
+ catch (Xunit.Sdk.XunitException)
+ {
+ Console.WriteLine($"Fushia validation: {isValidFuschia}");
+ Console.WriteLine($"Testing validation: {isValidPrimary}");
+
+ Console.WriteLine($"Assertion failed. Byte randomly changed at index: {byteIndex}");
+ PrintHexAndBinary(utf8, byteIndex);
+ throw; // Rethrow the exception to fail the test.
+ }
+
}
}
}
}
- [Fact]
+ [Fact]
[Trait("Category", "scalar")]
public void BruteForceTestScalar()
{
BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}
- // TODO:Uncomment when SSE is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Sse)]
- // [Fact]
- // [Trait("Category", "sse")]
- // public void BruteForceTestSse()
- // {
- // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteSse);
- // }
-
- // TODO:Uncomment when AVX512 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx512)]
- // [Trait("Category", "avx512")]
- // public void BruteForceTestAvx512()
- // {
- // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx512);
- // }
-
- // TODO:Uncomment when Arm64 is updated
- // [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
- // [Trait("Category", "arm64")]
- // public void BruteForceTestArm64()
- // {
- // BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
- // }
-
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
- public void BruteForceTestAVX()
+ public void BruteForceTestAvx2()
{
BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}
+ [Trait("Category", "arm64")]
+ [FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
+ public void BruteForceTestArm64()
+ {
+ BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
+ }
+
// credit: based on code from Google Fuchsia (Apache Licensed)
public static bool ValidateUtf8Fuschia(byte[] data)
{
+ if (data == null) return false;
int pos = 0;
int len = data.Length;
uint codePoint;
@@ -1405,7 +1103,7 @@ public static bool ValidateUtf8Fuschia(byte[] data)
}
// Check that all functions agree on the result when the input might be invalid.
- private bool InvalidateUtf8(byte[] utf8, int badindex,Utf8ValidationDelegate utf8ValidationDelegate)
+ private bool InvalidateUtf8(byte[] utf8, int badindex, Utf8ValidationFunction utf8ValidationDelegate)
{
unsafe
{
@@ -1415,27 +1113,28 @@ private bool InvalidateUtf8(byte[] utf8, int badindex,Utf8ValidationDelegate utf
int TailUtf16CodeUnitCountAdjustment = 0;
int SIMDUtf16CodeUnitCountAdjustment, SIMDScalarCountAdjustment;
- byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length,out TailUtf16CodeUnitCountAdjustment,out TailScalarCodeUnitCountAdjustment);
+ byte* scalarResult = SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar(pInput, utf8.Length, out TailUtf16CodeUnitCountAdjustment, out TailScalarCodeUnitCountAdjustment);
int scalarOffset = (int)(scalarResult - pInput);
- byte* simdResult = utf8ValidationDelegate(pInput, utf8.Length,out SIMDUtf16CodeUnitCountAdjustment,out SIMDScalarCountAdjustment);
+ byte* simdResult = utf8ValidationDelegate(pInput, utf8.Length, out SIMDUtf16CodeUnitCountAdjustment, out SIMDScalarCountAdjustment);
int simdOffset = (int)(simdResult - pInput);
int utf16CodeUnitCountAdjustment, scalarCountAdjustment;
byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, utf8.Length, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
int dotnetOffset = (int)(dotnetResult - pInput);
+ var message = "Suprisingly, scalarResult != simdResult, scalarResult is {0} != simdResult is {1}, badindex = {2}, length = {3}";
if (scalarOffset != simdOffset)
{
- Console.WriteLine("Suprisingly, scalarResult != simdResult {0} != {1}, badindex = {2}, length = {3}", scalarOffset, simdOffset, badindex, utf8.Length);
+ Console.WriteLine(message, scalarOffset, simdOffset, badindex, utf8.Length);
}
if (dotnetOffset != simdOffset)
{
- Console.WriteLine("Suprisingly, dotnetOffset != simdResult {0} != {1}, badindex = {2}, length = {3}", dotnetOffset, simdOffset, badindex, utf8.Length);
+ Console.WriteLine(message, dotnetOffset, simdOffset, badindex, utf8.Length);
}
return (scalarResult == simdResult) && (simdResult == dotnetResult);
}
}
}
- private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDelegate, Range range = default)
+ private bool ValidateUtf8(byte[] utf8, Utf8ValidationFunction utf8ValidationDelegate, Range range = default)
{
// Adjusted check for default Range
var isDefaultRange = range.Equals(default(Range));
@@ -1453,14 +1152,12 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg
if (dotnetResult != startPtr + length)
{
- // PrintDebugInfo(dotnetResult, startPtr, utf8, "DotnetRuntime fails to return the correct invalid position");
return false;
}
byte* simdResult = utf8ValidationDelegate(startPtr, length, out SimdUnicodeUtf16Adjustment, out SimdUnicodeScalarCountAdjustment);
if (simdResult != startPtr + length)
{
- // PrintDebugInfo(simdResult, startPtr, utf8, "Our result fails to return the correct invalid position");
return false;
}
return true;
@@ -1469,8 +1166,8 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg
}
}
- // Helper method to calculate the actual offset and length from a Range
- private (int offset, int length) GetOffsetAndLength(int totalLength, Range range)
+ // Helper method to calculate the actual offset and length from a Range
+ private static (int offset, int length) GetOffsetAndLength(int totalLength, Range range)
{
var start = range.Start.GetOffset(totalLength);
var end = range.End.GetOffset(totalLength);
@@ -1479,58 +1176,62 @@ private bool ValidateUtf8(byte[] utf8,Utf8ValidationDelegate utf8ValidationDeleg
}
-// Define a delegate that matches the signature of the methods you want to test
- public unsafe delegate byte* Utf8ValidationDelegate(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
+ // Signature shared by the UTF-8 validators under test: buffer pointer and byte length in; a byte* result plus the UTF-16 and scalar count adjustments out.
+ public unsafe delegate byte* Utf8ValidationFunction(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment);
-public bool ValidateCount(byte[] utf8, Utf8ValidationDelegate utf8ValidationDelegate, Range range = default)
-{
- int dotnetUtf16Adjustment, dotnetScalarCountAdjustment;
- int simdUnicodeUtf16Adjustment, simdUnicodeScalarCountAdjustment;
+ public bool ValidateCount(byte[] utf8, Utf8ValidationFunction utf8ValidationDelegate, Range range = default)
+ {
+ int dotnetUtf16Adjustment, dotnetScalarCountAdjustment;
+ int simdUnicodeUtf16Adjustment, simdUnicodeScalarCountAdjustment;
+ if (utf8 == null || utf8ValidationDelegate == null) // nothing to compare without a buffer and a validator
+ {
+ return false;
+ }
- var isDefaultRange = range.Equals(default(Range));
- var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range);
+ var isDefaultRange = range.Equals(default(Range)); // a default range means "the whole array"
+ var (offset, length) = isDefaultRange ? (0, utf8.Length) : GetOffsetAndLength(utf8.Length, range);
- unsafe
- {
- fixed (byte* pInput = utf8)
+ unsafe
{
- byte* startPtr = pInput + offset;
+ fixed (byte* pInput = utf8)
+ {
+ byte* startPtr = pInput + offset; // first byte of the requested range; both validators must start here
- // Initialize adjustments
- dotnetUtf16Adjustment = 0;
- dotnetScalarCountAdjustment = 0;
- DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(pInput, length, out dotnetUtf16Adjustment, out dotnetScalarCountAdjustment);
+ // Reference run: the .NET runtime validator over the requested range
+ dotnetUtf16Adjustment = 0;
+ dotnetScalarCountAdjustment = 0;
+ byte* dotnetResult = DotnetRuntime.Utf8Utility.GetPointerToFirstInvalidByte(startPtr, length, out dotnetUtf16Adjustment, out dotnetScalarCountAdjustment);
- simdUnicodeUtf16Adjustment = 0;
- simdUnicodeScalarCountAdjustment = 0;
- byte* simdResult = utf8ValidationDelegate(pInput, length, out simdUnicodeUtf16Adjustment, out simdUnicodeScalarCountAdjustment);
+ simdUnicodeUtf16Adjustment = 0;
+ simdUnicodeScalarCountAdjustment = 0;
+ byte* simdResult = utf8ValidationDelegate(startPtr, length, out simdUnicodeUtf16Adjustment, out simdUnicodeScalarCountAdjustment);
- // Check for discrepancies and report them in one combined message
- bool adjustmentsMatch = true;
- string errorMessage = "Error: Adjustments mismatch - ";
+ // Check for discrepancies; each mismatch is reported on its own line and fails the check
+ bool adjustmentsMatch = true;
+ if (dotnetResult != simdResult)
+ {
+ Console.WriteLine($"Expected error at location : {dotnetResult - pInput}, but got: {simdResult - pInput}. ");
+ adjustmentsMatch = false;
+ }
- if (dotnetScalarCountAdjustment != simdUnicodeScalarCountAdjustment)
- {
- errorMessage += $"Expected Scalar Count Adjustment: {dotnetScalarCountAdjustment}, but got: {simdUnicodeScalarCountAdjustment}. ";
- adjustmentsMatch = false;
- }
+ if (dotnetScalarCountAdjustment != simdUnicodeScalarCountAdjustment)
+ {
+ Console.WriteLine($"Expected Scalar Count Adjustment: {dotnetScalarCountAdjustment}, but got: {simdUnicodeScalarCountAdjustment}. ");
+ adjustmentsMatch = false;
+ }
+
+ if (dotnetUtf16Adjustment != simdUnicodeUtf16Adjustment)
+ {
+ Console.WriteLine($"Expected UTF16 Adjustment: {dotnetUtf16Adjustment}, but got: {simdUnicodeUtf16Adjustment}.");
+ adjustmentsMatch = false;
+ }
- if (dotnetUtf16Adjustment != simdUnicodeUtf16Adjustment)
- {
- errorMessage += $"Expected UTF16 Adjustment: {dotnetUtf16Adjustment}, but got: {simdUnicodeUtf16Adjustment}.";
- adjustmentsMatch = false;
- }
- if (!adjustmentsMatch)
- {
- Console.WriteLine(errorMessage);
- return false;
- }
- return true;
+ return adjustmentsMatch;
+ }
}
}
-}
}
diff --git a/test/helpers/randomutf8.cs b/test/helpers/randomutf8.cs
index 5b30cd0..7c0ff20 100644
--- a/test/helpers/randomutf8.cs
+++ b/test/helpers/randomutf8.cs
@@ -1,3 +1,4 @@
+namespace tests;
using System;
using System.Collections.Generic;
using System.Linq;
@@ -9,28 +10,13 @@ public class RandomUtf8
private double[] probabilities;
private const int maxByteLength = 4;
- public RandomUtf8(uint seed, int prob_1byte, int prob_2bytes, int prob_3bytes, int prob_4bytes)
+ public RandomUtf8(uint seed, int prob1byte, int prob2bytes, int prob3bytes, int prob4bytes)
{
gen = new Random((int)seed);
- probabilities = new double[maxByteLength] { prob_1byte, prob_2bytes, prob_3bytes, prob_4bytes };
+ probabilities = new double[maxByteLength] { prob1byte, prob2bytes, prob3bytes, prob4bytes };
}
- // public byte[] Generate(int howManyUnits, int? byteCountInUnit = null)
- // {
- // var result = new List();
- // while (result.Count < howManyUnits)
- // {
- // int count = byteCountInUnit ?? PickRandomByteCount();
- // int codePoint = GenerateCodePoint(count);
- // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint));
-
- // result.AddRange(utf8Bytes);
- // if (result.Count + utf8Bytes.Length > howManyUnits)
- // break;
- // }
- // return result.ToArray();
- // }
-
+#pragma warning disable CA1002
public List Generate(int howManyUnits, int? byteCountInUnit = null)
{
var result = new List();
@@ -47,67 +33,22 @@ public List Generate(int howManyUnits, int? byteCountInUnit = null)
return result;
}
- // public List Generate(int howManyUnits, int? byteCountInUnit = null)
- // {
- // var result = new List();
- // var unitsAdded = 0; // Track the number of characters added.
-
- // while (unitsAdded < howManyUnits)
- // {
- // int count = byteCountInUnit ?? PickRandomByteCount();
- // int codePoint = GenerateCodePoint(count);
- // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint));
-
- // // Ensure adding the new character won't exceed the howManyUnits limit.
- // if (unitsAdded + 1 > howManyUnits)
- // break;
-
- // result.AddRange(utf8Bytes);
- // unitsAdded++; // Increment the units (characters) count.
- // }
-
- // return result;
- // }
-
-
- // public object Generate(int howManyUnits, int? byteCountInUnit = null, bool returnAsList = false)
- // {
- // var result = new List();
- // while (result.Count < howManyUnits)
- // {
- // int count = byteCountInUnit ?? PickRandomByteCount();
- // int codePoint = GenerateCodePoint(count);
- // byte[] utf8Bytes = Encoding.UTF8.GetBytes(char.ConvertFromUtf32(codePoint));
-
- // if (result.Count + utf8Bytes.Length > howManyUnits)
- // break;
-
- // result.AddRange(utf8Bytes);
- // }
-
- // if (returnAsList)
- // {
- // return result;
- // }
- // else
- // {
- // return result.ToArray();
- // }
- // }
-
private int GenerateCodePoint(int byteCount)
{
switch (byteCount)
{
case 1:
// Generate a code point for a 1-byte UTF-8 character (ASCII)
+#pragma warning disable CA5394
return gen.Next(0x0000, 0x007F + 1);// +1 because gen.Next() excludes the upper bound
case 2:
// Generate a code point for a 2-byte UTF-8 character (Latin)
+#pragma warning disable CA5394
return gen.Next(0x0080, 0x07FF + 1);
case 3:
// Generate a code point for a 3-byte UTF-8 character (Asiatic)
// Note: This range skips over the surrogate pair range U+D800 to U+DFFF
+#pragma warning disable CA5394
if (gen.NextDouble() < 0.5)
{
// Generate code point in U+0800 to U+D7FF range
@@ -116,24 +57,27 @@ private int GenerateCodePoint(int byteCount)
else
{
// Generate code point in U+E000 to U+FFFF range
+#pragma warning disable CA5394
return gen.Next(0xE000, 0xFFFF + 1);
}
case 4:
// Generate a code point for a 4-byte UTF-8 character (Supplementary)
// The +1 is factored into the ConvertFromUtf32 method
+#pragma warning disable CA5394
return gen.Next(0x010000, 0x10FFFF);
default:
throw new InvalidOperationException($"Invalid byte count: {byteCount}");
}
}
+#pragma warning disable CA1002
public List AppendContinuationByte(List utf8Bytes) =>
utf8Bytes.Concat(new byte[] { (byte)gen.Next(0x80, 0xBF + 1) }).ToList();
-
- public void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex)
+#pragma warning disable CA1062
+ public static void ReplaceEndOfArray(byte[] original, byte[] replacement)//, int startIndex)
{
// Calculate the start index for replacement
int startIndex = original.Length - replacement.Length;