From b4541aa834bb99aaa00dfc57e17556ac534c25a0 Mon Sep 17 00:00:00 2001 From: SteveRuble Date: Sat, 25 Sep 2021 09:10:19 -0400 Subject: [PATCH] initial commit --- .gitignore | 45 ++ .idea/.idea.Bloomn/.idea/.gitignore | 13 + .idea/.idea.Bloomn/.idea/vcs.xml | 6 + Bloomn.sln | 22 + Bloomn.sln.DotSettings | 2 + readme.md | 10 + src/Bloomn/AssemblyAttributes.cs | 3 + src/Bloomn/BloomFilterBuilder.cs | 169 ++++++ src/Bloomn/BloomFilterCheckRequest.cs | 40 ++ src/Bloomn/BloomFilterConstants.cs | 7 + src/Bloomn/BloomFilterDimensions.cs | 213 +++++++ src/Bloomn/BloomFilterEntry.cs | 35 ++ src/Bloomn/BloomFilterParameters.cs | 102 ++++ src/Bloomn/BloomFilterState.cs | 122 ++++ src/Bloomn/Bloomn.csproj | 15 + src/Bloomn/Callbacks.cs | 77 +++ src/Bloomn/ClassicBloomFilter.cs | 332 +++++++++++ src/Bloomn/DependencyInjection.cs | 59 ++ src/Bloomn/HashFunction.cs | 8 + src/Bloomn/HashRegistry.cs | 45 ++ src/Bloomn/IBloomFilter.cs | 46 ++ src/Bloomn/IBloomFilterManager.cs | 11 + src/Bloomn/IKeyHasherFactory.cs | 10 + src/Bloomn/Murmur3HasherFactory.cs | 89 +++ src/Bloomn/PreparedAdd.cs | 42 ++ src/Bloomn/ScalingBloomFilter.cs | 341 ++++++++++++ src/Bloomn/ScalingParameters.cs | 52 ++ tests/Bloomn.Tests/AssertionHelpers.cs | 15 + .../BloomFilterDimensionsTests.cs | 57 ++ tests/Bloomn.Tests/BloomFilterManagerTests.cs | 47 ++ tests/Bloomn.Tests/BloomFilterStateTests.cs | 7 + tests/Bloomn.Tests/BloomFilterTests.cs | 521 ++++++++++++++++++ tests/Bloomn.Tests/Bloomn.Tests.csproj | 31 ++ tests/Bloomn.Tests/ClassicBloomFilterTests.cs | 19 + tests/Bloomn.Tests/PerformanceExperiments.cs | 104 ++++ .../Bloomn.Tests/ScalableBloomFilterTests.cs | 89 +++ 36 files changed, 2806 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/.idea.Bloomn/.idea/.gitignore create mode 100644 .idea/.idea.Bloomn/.idea/vcs.xml create mode 100644 Bloomn.sln create mode 100644 Bloomn.sln.DotSettings create mode 100644 readme.md create mode 100644 src/Bloomn/AssemblyAttributes.cs create mode 100644 
src/Bloomn/BloomFilterBuilder.cs create mode 100644 src/Bloomn/BloomFilterCheckRequest.cs create mode 100644 src/Bloomn/BloomFilterConstants.cs create mode 100644 src/Bloomn/BloomFilterDimensions.cs create mode 100644 src/Bloomn/BloomFilterEntry.cs create mode 100644 src/Bloomn/BloomFilterParameters.cs create mode 100644 src/Bloomn/BloomFilterState.cs create mode 100644 src/Bloomn/Bloomn.csproj create mode 100644 src/Bloomn/Callbacks.cs create mode 100644 src/Bloomn/ClassicBloomFilter.cs create mode 100644 src/Bloomn/DependencyInjection.cs create mode 100644 src/Bloomn/HashFunction.cs create mode 100644 src/Bloomn/HashRegistry.cs create mode 100644 src/Bloomn/IBloomFilter.cs create mode 100644 src/Bloomn/IBloomFilterManager.cs create mode 100644 src/Bloomn/IKeyHasherFactory.cs create mode 100644 src/Bloomn/Murmur3HasherFactory.cs create mode 100644 src/Bloomn/PreparedAdd.cs create mode 100644 src/Bloomn/ScalingBloomFilter.cs create mode 100644 src/Bloomn/ScalingParameters.cs create mode 100644 tests/Bloomn.Tests/AssertionHelpers.cs create mode 100644 tests/Bloomn.Tests/BloomFilterDimensionsTests.cs create mode 100644 tests/Bloomn.Tests/BloomFilterManagerTests.cs create mode 100644 tests/Bloomn.Tests/BloomFilterStateTests.cs create mode 100644 tests/Bloomn.Tests/BloomFilterTests.cs create mode 100644 tests/Bloomn.Tests/Bloomn.Tests.csproj create mode 100644 tests/Bloomn.Tests/ClassicBloomFilterTests.cs create mode 100644 tests/Bloomn.Tests/PerformanceExperiments.cs create mode 100644 tests/Bloomn.Tests/ScalableBloomFilterTests.cs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..769e20b --- /dev/null +++ b/.gitignore @@ -0,0 +1,45 @@ +# Common IntelliJ Platform excludes + +# User specific +**/.idea/**/workspace.xml +**/.idea/**/tasks.xml +**/.idea/shelf/* +**/.idea/dictionaries +**/.idea/httpRequests + +# Sensitive or high-churn files +**/.idea/**/dataSources/ +**/.idea/**/dataSources.ids +**/.idea/**/dataSources.xml 
+**/.idea/**/dataSources.local.xml +**/.idea/**/sqlDataSources.xml +**/.idea/**/dynamic.xml +**/.idea/**/sqldialects.xml + +# Rider +# Rider auto-generates .iml files, and contentModel.xml +**/.idea/**/*.iml +**/.idea/**/contentModel.xml +**/.idea/**/modules.xml + +**/*.suo +**/*.user +**/.vs/ +**/[Bb]in/ +**/[Oo]bj/ +publish/ +_UpgradeReport_Files/ +[Pp]ackages/ + +Thumbs.db +Desktop.ini +.DS_Store + +*.blob + +coverage.json + +.vscode/ + +# Snapshotter mismatches +**/__mismatch__/ \ No newline at end of file diff --git a/.idea/.idea.Bloomn/.idea/.gitignore b/.idea/.idea.Bloomn/.idea/.gitignore new file mode 100644 index 0000000..063d8a5 --- /dev/null +++ b/.idea/.idea.Bloomn/.idea/.gitignore @@ -0,0 +1,13 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Rider ignored files +/contentModel.xml +/.idea.Bloomn.iml +/projectSettingsUpdater.xml +/modules.xml +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml +# Editor-based HTTP Client requests +/httpRequests/ diff --git a/.idea/.idea.Bloomn/.idea/vcs.xml b/.idea/.idea.Bloomn/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/.idea.Bloomn/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/Bloomn.sln b/Bloomn.sln new file mode 100644 index 0000000..4079333 --- /dev/null +++ b/Bloomn.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Bloomn", "src\Bloomn\Bloomn.csproj", "{7C864204-2CFC-4200-BCCF-0027A89C6CF3}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Bloomn.Tests", "tests\Bloomn.Tests\Bloomn.Tests.csproj", "{171BBD20-CB99-41CD-9F23-EA7BF99241E4}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + 
{7C864204-2CFC-4200-BCCF-0027A89C6CF3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {7C864204-2CFC-4200-BCCF-0027A89C6CF3}.Debug|Any CPU.Build.0 = Debug|Any CPU + {7C864204-2CFC-4200-BCCF-0027A89C6CF3}.Release|Any CPU.ActiveCfg = Release|Any CPU + {7C864204-2CFC-4200-BCCF-0027A89C6CF3}.Release|Any CPU.Build.0 = Release|Any CPU + {171BBD20-CB99-41CD-9F23-EA7BF99241E4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {171BBD20-CB99-41CD-9F23-EA7BF99241E4}.Debug|Any CPU.Build.0 = Debug|Any CPU + {171BBD20-CB99-41CD-9F23-EA7BF99241E4}.Release|Any CPU.ActiveCfg = Release|Any CPU + {171BBD20-CB99-41CD-9F23-EA7BF99241E4}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection +EndGlobal diff --git a/Bloomn.sln.DotSettings b/Bloomn.sln.DotSettings new file mode 100644 index 0000000..3e60cce --- /dev/null +++ b/Bloomn.sln.DotSettings @@ -0,0 +1,2 @@ + + True \ No newline at end of file diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..6e6370d --- /dev/null +++ b/readme.md @@ -0,0 +1,10 @@ +# Bloomn: Bloom Filter for .NET + +Bloomn provides a modern, high-performance bloom filter implementation. + +### Features + +- Provides a very low-allocation API for demanding scenarios +- Provides a simpler API for simpler scenarios +- Bloom filter state can be exported, serialized, and imported +- Integrates with the standard .NET dependency injection framework. 
diff --git a/src/Bloomn/AssemblyAttributes.cs b/src/Bloomn/AssemblyAttributes.cs
new file mode 100644
index 0000000..742dff9
--- /dev/null
+++ b/src/Bloomn/AssemblyAttributes.cs
@@ -0,0 +1,3 @@
+
+
+[assembly:System.Runtime.CompilerServices.InternalsVisibleTo("Bloomn.Tests")]
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterBuilder.cs b/src/Bloomn/BloomFilterBuilder.cs
new file mode 100644
index 0000000..96aa04f
--- /dev/null
+++ b/src/Bloomn/BloomFilterBuilder.cs
@@ -0,0 +1,185 @@
+using System;
+using System.Linq;
+using System.Text.Json;
+using Microsoft.Extensions.Options;
+
+namespace Bloomn
+{
+    public interface IBloomFilterOptionsBuilder
+    {
+        IBloomFilterBuilder WithCapacityAndErrorRate(int capacity, double errorRate);
+
+        IBloomFilterBuilder WithDimensions(BloomFilterDimensions dimensions);
+
+        IBloomFilterBuilder WithScaling(double capacityScaling = 2, double errorRateScaling = 0.8);
+
+        IBloomFilterBuilder WithHasher(IKeyHasherFactory hasherFactory);
+    }
+
+    public interface IBloomFilterBuilder: IBloomFilterOptionsBuilder
+    {
+        IBloomFilterBuilder WithOptions(BloomFilterOptions options);
+
+        IBloomFilterBuilder WithProfile(string profile);
+
+        IBloomFilterBuilder WithState(BloomFilterState state);
+
+        IBloomFilter Build();
+    }
+
+    /// <summary>
+    /// Fluent builder that collects options and optional imported state, then creates
+    /// a <see cref="ScalingBloomFilter"/> or a <see cref="ClassicBloomFilter"/> from them.
+    /// </summary>
+    internal class BloomFilterBuilder : IBloomFilterBuilder
+    {
+        private readonly IOptionsSnapshot<BloomFilterOptions>? _optionsSnapshot;
+        internal BloomFilterOptions Options { get; set; }
+
+        internal BloomFilterState? State { get; set; }
+
+        public BloomFilterBuilder(IOptionsSnapshot<BloomFilterOptions> options)
+        {
+            _optionsSnapshot = options;
+            Options = options.Value;
+        }
+
+        public BloomFilterBuilder(BloomFilterOptions options)
+        {
+            Options = options;
+        }
+
+        public IBloomFilterBuilder WithCapacityAndErrorRate(int capacity, double errorRate)
+        {
+            return WithDimensions(BloomFilterDimensions.ForCapacityAndErrorRate(capacity, errorRate));
+        }
+
+        public IBloomFilterBuilder WithDimensions(BloomFilterDimensions dimensions)
+        {
+            Options.Dimensions = dimensions;
+            return this;
+        }
+
+        public IBloomFilterBuilder WithScaling(double capacityScaling = 2, double errorRateScaling = 0.8)
+        {
+            Options.ScalingParameters = new ScalingParameters()
+            {
+                MaxCapacityBehavior = MaxCapacityBehavior.Scale,
+                CapacityScaling = capacityScaling,
+                ErrorRateScaling = errorRateScaling
+            };
+            return this;
+        }
+
+        public IBloomFilterBuilder WithHasher(IKeyHasherFactory hasherFactory)
+        {
+            Options.SetHasher(hasherFactory);
+            return this;
+        }
+
+        public IBloomFilterBuilder WithOptions(BloomFilterOptions options)
+        {
+            Options = options;
+            return this;
+        }
+
+        /// <summary>
+        /// Replaces the current options with the named options profile.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">The builder was created without an options snapshot.</exception>
+        public IBloomFilterBuilder WithProfile(string profile)
+        {
+            if (_optionsSnapshot == null)
+            {
+                throw new InvalidOperationException("This builder was not created from an options snapshot, so named profiles are not available.");
+            }
+            Options = _optionsSnapshot.Get(profile);
+            return this;
+        }
+
+        public IBloomFilterBuilder WithState(string? serializedState)
+        {
+            if (serializedState == null)
+            {
+                return this;
+            }
+
+            var state = JsonSerializer.Deserialize<BloomFilterState>(serializedState);
+            if (state == null)
+            {
+                throw new ArgumentException("Serialized state deserialized to null.", nameof(serializedState));
+            }
+
+            return WithState(state);
+        }
+
+        public IBloomFilterBuilder WithState(BloomFilterState state)
+        {
+            State = state;
+            return this;
+        }
+
+        /// <summary>
+        /// Creates the bloom filter described by the configured options and, when provided, the imported state.
+        /// </summary>
+        /// <exception cref="InvalidOperationException">
+        /// The imported state's parameters are inconsistent with the configured parameters and
+        /// <c>Options.DiscardInconsistentState</c> is not set, or the state has no parameters.
+        /// </exception>
+        public IBloomFilter Build()
+        {
+            var id = State?.Id ?? Guid.NewGuid().ToString();
+
+            var configuredParameters = new BloomFilterParameters(id)
+            {
+                Dimensions = Options.Dimensions,
+                ScalingParameters = Options.ScalingParameters,
+                HashAlgorithm = Options.GetHasher().Algorithm,
+            };
+
+            var state = State;
+            if (state != null)
+            {
+                var parametersFromState = state.Parameters;
+
+                if (parametersFromState != null)
+                {
+                    var inconsistencies = parametersFromState.Diff(configuredParameters);
+                    if (inconsistencies.Any())
+                    {
+                        if (!Options.DiscardInconsistentState)
+                        {
+                            throw new InvalidOperationException($"When state containing parameters are provided it must be consistent with the configured parameters. " +
+                                                                $"Configured parameters: {configuredParameters}; " +
+                                                                $"Parameters from state: {parametersFromState}; " +
+                                                                $"Inconsistencies: {string.Join(", ", inconsistencies)}");
+                        }
+
+                        // The caller opted in to discarding inconsistent state: start from an empty filter.
+                        state = null;
+                    }
+                }
+            }
+
+            if (state == null)
+            {
+                state = new BloomFilterState()
+                {
+                    Parameters = configuredParameters
+                };
+            }
+
+            if (state.Parameters == null)
+            {
+                throw new InvalidOperationException("State parameters not found.");
+            }
+
+            if (state.Parameters.ScalingParameters.MaxCapacityBehavior == MaxCapacityBehavior.Scale)
+            {
+                return new ScalingBloomFilter(Options, state);
+            }
+
+            return new ClassicBloomFilter(Options, state);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterCheckRequest.cs b/src/Bloomn/BloomFilterCheckRequest.cs
new file mode 100644
index 0000000..2cc9f97
--- /dev/null
+++ b/src/Bloomn/BloomFilterCheckRequest.cs
@@ -0,0 +1,40 @@
+using System;
+using System.Runtime.InteropServices;
+
+namespace Bloomn
+{
+    public readonly ref struct BloomFilterKey
+    {
+        public readonly ReadOnlySpan<byte> Bytes;
+
+        public BloomFilterKey(string key)
+        {
+            Bytes = MemoryMarshal.AsBytes(key.AsSpan());
+        }
+
+        public static implicit operator BloomFilterKey(string s) => new BloomFilterKey(s);
+    }
+
+    public enum BloomFilterCheckBehavior
+    {
+        CheckOnly,
+        PrepareForAdd,
+        AddImmediately
+    }
+
+    public readonly ref struct
BloomFilterCheckRequest
+    {
+        public readonly BloomFilterKey Key;
+        public readonly BloomFilterCheckBehavior Behavior;
+
+        public static BloomFilterCheckRequest CheckOnly(BloomFilterKey key) => new BloomFilterCheckRequest(key, BloomFilterCheckBehavior.CheckOnly);
+        public static BloomFilterCheckRequest PrepareForAdd(BloomFilterKey key) => new BloomFilterCheckRequest(key, BloomFilterCheckBehavior.PrepareForAdd);
+        public static BloomFilterCheckRequest AddImmediately(BloomFilterKey key) => new BloomFilterCheckRequest(key, BloomFilterCheckBehavior.AddImmediately);
+
+        public BloomFilterCheckRequest(BloomFilterKey key, BloomFilterCheckBehavior behavior)
+        {
+            Key = key;
+            Behavior = behavior;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterConstants.cs b/src/Bloomn/BloomFilterConstants.cs
new file mode 100644
index 0000000..5060843
--- /dev/null
+++ b/src/Bloomn/BloomFilterConstants.cs
@@ -0,0 +1,7 @@
+namespace Bloomn
+{
+    public class BloomFilterConstants
+    {
+        public const string Murmur3HasherKey = "murmur3";
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterDimensions.cs b/src/Bloomn/BloomFilterDimensions.cs
new file mode 100644
index 0000000..0ee89ce
--- /dev/null
+++ b/src/Bloomn/BloomFilterDimensions.cs
@@ -0,0 +1,219 @@
+using System;
+using System.Collections.Generic;
+using System.ComponentModel.DataAnnotations;
+using System.Diagnostics.CodeAnalysis;
+
+namespace Bloomn
+{
+    public interface IBloomFilterDimensions
+    {
+        double ErrorRate { get; }
+        int Capacity { get; }
+        int BitCount { get; }
+        int HashCount { get; }
+    }
+
+    public record BloomFilterDimensions : IBloomFilterDimensions
+    {
+        public class Computer
+        {
+            public double? FalsePositiveRate { get; set; }
+            public int? Capacity { get; set; }
+            public int? BitCount { get; set; }
+            public int? HashCount { get; set; }
+
+            [MemberNotNullWhen(true, nameof(FalsePositiveRate))]
+            [MemberNotNullWhen(true, nameof(Capacity))]
+            [MemberNotNullWhen(true, nameof(BitCount))]
+            [MemberNotNullWhen(true, nameof(HashCount))]
+            public bool FullySpecified => FalsePositiveRate != null && Capacity != null && BitCount != null && HashCount != null;
+
+            /// <summary>
+            /// True when enough values are set to derive the rest: the false positive rate
+            /// plus either the capacity or the bit count, or the capacity plus the bit count.
+            /// </summary>
+            public bool Computable =>
+                (Capacity.HasValue && FalsePositiveRate.HasValue)
+                || (FalsePositiveRate.HasValue && BitCount.HasValue)
+                || (Capacity.HasValue && BitCount.HasValue);
+
+            public BloomFilterDimensions Compute()
+            {
+                if (!Computable)
+                {
+                    throw new InvalidOperationException("Not enough parameters are set.");
+                }
+
+                var makingProgress = true;
+                while (!FullySpecified && makingProgress)
+                {
+                    makingProgress = false;
+                    if (!HashCount.HasValue && Capacity.HasValue && BitCount.HasValue)
+                    {
+                        HashCount = Equations.k(BitCount.Value, Capacity.Value);
+                        makingProgress = true;
+                        continue;
+                    }
+
+                    if (!HashCount.HasValue && FalsePositiveRate.HasValue)
+                    {
+                        HashCount = Equations.k(FalsePositiveRate.Value);
+                        makingProgress = true;
+                        continue;
+                    }
+
+                    if (!BitCount.HasValue && Capacity.HasValue && FalsePositiveRate.HasValue)
+                    {
+                        BitCount = Equations.m(Capacity.Value, FalsePositiveRate.Value);
+                        makingProgress = true;
+                        continue;
+                    }
+
+                    if (!Capacity.HasValue && BitCount.HasValue && HashCount.HasValue && FalsePositiveRate.HasValue)
+                    {
+                        Capacity = Equations.n(BitCount.Value, HashCount.Value, FalsePositiveRate.Value);
+                        makingProgress = true;
+                        continue;
+                    }
+
+                    if (!FalsePositiveRate.HasValue && BitCount.HasValue && Capacity.HasValue && HashCount.HasValue)
+                    {
+                        FalsePositiveRate = Equations.p(BitCount.Value, Capacity.Value, HashCount.Value);
+                        makingProgress = true;
+                        continue;
+                    }
+                }
+
+                if (!FullySpecified)
+                {
+                    throw new InvalidOperationException($"Could not compute dimensions using provided values: {this}");
+                }
+
+                return new BloomFilterDimensions(FalsePositiveRate.Value, Capacity.Value, BitCount.Value, HashCount.Value);
+            }
+
+            public override string ToString()
+            {
+                return $"{nameof(FalsePositiveRate)}: {FalsePositiveRate}, {nameof(Capacity)}: {Capacity}, {nameof(BitCount)}: {BitCount}, {nameof(HashCount)}: {HashCount}";
+            }
+        }
+
+        /// <summary>
+        /// These are the equations which relate the bloom filter parameters.
+        /// n => max items before invariants are broken
+        /// m => number of bits
+        /// k => number of hashes
+        /// p => false positive rate
+        /// </summary>
+        internal static class Equations
+        {
+            // ReSharper disable InconsistentNaming
+
+            // n = ceil(m / (-k / log(1 - exp(log(p) / k))))
+            // p = pow(1 - exp(-k / (m / n)), k)
+            // m = ceil((n * log(p)) / log(1 / pow(2, log(2))));
+            // k = round((m / n) * log(2));
+
+
+            public static int n(int m, int k, double p) => (int) Math.Ceiling(m / (-k / Math.Log(1 - Math.Exp(Math.Log(p) / k))));
+
+            public static double p(int m, int n, int k) => Math.Pow(1 - Math.Exp(-k / (m / (double) n)), k);
+
+            public static int m(int n, double p) => (int) Math.Ceiling((n * Math.Log(p)) / Math.Log(1 / Math.Pow(2, Math.Log(2))));
+
+            public static int k(int m, int n) => (int) Math.Round((m / (double) n) * Math.Log(2));
+
+            public static int k(double p) => (int)Math.Round(-Math.Log2(p));
+
+            // ReSharper restore InconsistentNaming
+        }
+
+        public double ErrorRate { get; init; }
+        public int Capacity { get; init; }
+        public int BitCount { get; init; }
+        public int HashCount { get; init; }
+
+        public BloomFilterDimensions(double errorRate = 0.01, int capacity = 10000, int bitCount = 95851, int hashCount = 7)
+        {
+            if (hashCount < 2)
+            {
+                throw new ArgumentOutOfRangeException(nameof(errorRate), errorRate, "Parameters resulted in a hash count of 1, which is pointless.");
+            }
+
+            ErrorRate = errorRate;
+            Capacity = capacity;
+            BitCount = bitCount;
+            HashCount = hashCount;
+        }
+
+        public static BloomFilterDimensions ForCapacityAndErrorRate(int capacity, double errorRate)
+        {
+            return new Computer()
+            {
+                Capacity = capacity,
+                FalsePositiveRate = errorRate
+            }.Compute();
+        }
+
+        /// <summary>
+        /// Checks that every dimension is within its supported range.
+        /// </summary>
+        /// <exception cref="ValidationException">A dimension is out of range.</exception>
+        public void Validate()
+        {
+            if (Capacity <= 0)
+            {
+                throw new ValidationException("Capacity must be greater than 0.");
+            }
+
+            if (ErrorRate is <= 0 or >= 1)
+            {
+                throw new ValidationException("ErrorRate must be between 0 and 1 exclusive.");
+            }
+
+            if (BitCount <= 0)
+            {
+                throw new ValidationException("BitCount must be greater than 0.");
+            }
+
+            if (HashCount is < 2 or > 100)
+            {
+                throw new ValidationException("HashCount must be between 2 and 100 inclusive.");
+            }
+        }
+
+        public List<string> Diff(BloomFilterDimensions other)
+        {
+            var diff = new List<string>();
+            if (BitCount != other.BitCount)
+            {
+                diff.Add($"{nameof(BitCount)}: {BitCount} != {other.BitCount}");
+            }
+
+            if (Capacity != other.Capacity)
+            {
+                diff.Add($"{nameof(Capacity)}: {Capacity} != {other.Capacity}");
+            }
+
+            if (HashCount != other.HashCount)
+            {
+                diff.Add($"{nameof(HashCount)}: {HashCount} != {other.HashCount}");
+            }
+
+            if (Math.Abs(ErrorRate - other.ErrorRate) > double.Epsilon)
+            {
+                diff.Add($"{nameof(ErrorRate)}: {ErrorRate} != {other.ErrorRate}");
+            }
+
+            return diff;
+        }
+
+        public void Deconstruct(out double errorRate, out int capacity , out int bitCount , out int hashCount)
+        {
+            errorRate = this.ErrorRate;
+            capacity = this.Capacity;
+            bitCount = this.BitCount;
+            hashCount = this.HashCount;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterEntry.cs b/src/Bloomn/BloomFilterEntry.cs
new file mode 100644
index 0000000..d1ef8aa
--- /dev/null
+++ b/src/Bloomn/BloomFilterEntry.cs
@@ -0,0 +1,35 @@
+using System;
+
+namespace Bloomn
+{
+    public readonly struct BloomFilterEntry : IDisposable
+    {
+        public static readonly BloomFilterEntry MaybePresent = new BloomFilterEntry(false, PreparedAdd.AlreadyAdded);
+        public static readonly BloomFilterEntry NotPresent = new BloomFilterEntry(true, PreparedAdd.AlreadyAdded);
+        public static
BloomFilterEntry Addable(PreparedAdd preparedAdd) => new BloomFilterEntry(true, preparedAdd);
+
+        public readonly bool IsNotPresent;
+        public readonly PreparedAdd PreparedAdd;
+
+        internal BloomFilterEntry(bool isNotPresent, PreparedAdd preparedAdd)
+        {
+            IsNotPresent = isNotPresent;
+            PreparedAdd = preparedAdd;
+        }
+
+        public bool Add()
+        {
+            if (PreparedAdd.CanAdd)
+            {
+                return PreparedAdd.Add();
+            }
+
+            return false;
+        }
+
+        public void Dispose()
+        {
+            PreparedAdd.Dispose();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterParameters.cs b/src/Bloomn/BloomFilterParameters.cs
new file mode 100644
index 0000000..4240c96
--- /dev/null
+++ b/src/Bloomn/BloomFilterParameters.cs
@@ -0,0 +1,102 @@
+using System.Collections.Generic;
+using System.ComponentModel.DataAnnotations;
+
+namespace Bloomn
+{
+    public record BloomFilterParameters (string Id)
+    {
+        public BloomFilterDimensions Dimensions { get; init; } = new BloomFilterDimensions();
+
+        public ScalingParameters ScalingParameters { get; init; } = new ScalingParameters()
+        {
+            MaxCapacityBehavior = MaxCapacityBehavior.Throw,
+        };
+
+        public string HashAlgorithm { get; init; } = "murmur3";
+
+        public BloomFilterParameters WithCapacityAndErrorRate(int capacity, double errorRate)
+        {
+            return this with
+            {
+                Dimensions = BloomFilterDimensions.ForCapacityAndErrorRate(capacity, errorRate)
+            };
+        }
+
+        public BloomFilterParameters WithScaling(double capacityScaling = 2, double errorRateScaling = 0.8)
+        {
+            return this with
+            {
+                ScalingParameters = new ScalingParameters()
+                {
+                    MaxCapacityBehavior = MaxCapacityBehavior.Scale,
+                    CapacityScaling = capacityScaling,
+                    ErrorRateScaling = errorRateScaling
+                }
+            };
+        }
+
+        public void Validate()
+        {
+            if (string.IsNullOrEmpty(HashAlgorithm))
+            {
+                throw new ValidationException($"{nameof(HashAlgorithm)} must be set.");
+            }
+
+            if (ScalingParameters == null)
+            {
+                throw new ValidationException($"{nameof(ScalingParameters)} must be set;");
+            }
+
+            if (Dimensions == null)
+            {
+                throw new ValidationException($"{nameof(Dimensions)} must be set;");
+            }
+
+            Dimensions.Validate();
+            ScalingParameters.Validate();
+        }
+
+        public List<string> Diff(BloomFilterParameters other)
+        {
+            var diff = Dimensions.Diff(other.Dimensions);
+            diff.AddRange(ScalingParameters.ValidateMigration(other.ScalingParameters));
+
+            if (HashAlgorithm != other.HashAlgorithm)
+            {
+                diff.Add($"{nameof(HashAlgorithm)}: {HashAlgorithm} != {other.HashAlgorithm}");
+            }
+
+            return diff;
+        }
+    }
+
+    public enum MaxCapacityBehavior
+    {
+        /// <summary>
+        /// The bloom filter will throw an exception when it hits capacity.
+        /// </summary>
+        Throw,
+
+        /// <summary>
+        /// The bloom filter will scale up when it reaches capacity, using the algorithm from "Scalable Bloom Filters".
+        /// Enabling scaling allows you to avoid over-allocating storage when you don't know how many items you'll
+        /// need to add to the filter. However, if you do know how many items you need to add you will get better performance
+        /// and storage efficiency by specifying the capacity initially.
+        /// <para>
+        /// https://doi.org/10.1016/j.ipl.2006.10.007
+        /// </para>
+        /// <para>
+        /// https://haslab.uminho.pt/cbm/files/dbloom.pdf
+        /// </para>
+        /// <para>
+        /// Almeida, P. S. et al. “Scalable Bloom Filters.” Inf. Process. Lett. 101 (2007): 255-261.
+        /// </para>
+        /// </summary>
+        Scale,
+
+        /// <summary>
+        /// The bloom filter will continue to add items even if it can no longer fulfil the requested error rate.
+        /// </summary>
+        Ignore
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/BloomFilterState.cs b/src/Bloomn/BloomFilterState.cs
new file mode 100644
index 0000000..7dd1d8f
--- /dev/null
+++ b/src/Bloomn/BloomFilterState.cs
@@ -0,0 +1,159 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Runtime.Serialization;
+using System.Text.Json;
+
+namespace Bloomn
+{
+    /// <summary>
+    /// Exportable snapshot of a bloom filter which can be serialized to JSON and imported again.
+    /// </summary>
+    public class BloomFilterState
+    {
+        public string? Id { get; set; }
+
+        public string ApiVersion { get; set; } = "v1";
+
+        public BloomFilterParameters? Parameters { get; set; }
+
+        public List<byte[]>? BitArrays { get; set; }
+
+        public List<BloomFilterState>? Children { get; set; }
+
+        public long Count { get; set; }
+
+        public string Serialize()
+        {
+            return System.Text.Json.JsonSerializer.Serialize(this, typeof(BloomFilterState), JsonSerializerOptions);
+        }
+
+        public static BloomFilterState Deserialize(string serialized)
+        {
+            var state = System.Text.Json.JsonSerializer.Deserialize<BloomFilterState>(serialized, JsonSerializerOptions);
+            if (state == null)
+            {
+                throw new SerializationException("Deserialization returned null.");
+            }
+
+            return state;
+        }
+
+        private static readonly JsonSerializerOptions JsonSerializerOptions = new JsonSerializerOptions()
+        {
+            Converters = { new StateJsonSerializer() }
+        };
+
+        private class StateJsonSerializer : System.Text.Json.Serialization.JsonConverter<BloomFilterState>
+        {
+            /// <summary>
+            /// Reads the object produced by <see cref="Write"/>; unknown properties are skipped.
+            /// </summary>
+            /// <exception cref="JsonException">The JSON is not an object or ends before the object is closed.</exception>
+            public override BloomFilterState Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
+            {
+                if (reader.TokenType != JsonTokenType.StartObject)
+                {
+                    throw new JsonException("Expected the start of a JSON object.");
+                }
+
+                var result = new BloomFilterState();
+
+                while (reader.Read())
+                {
+                    if (reader.TokenType == JsonTokenType.EndObject)
+                    {
+                        return result;
+                    }
+
+                    var propertyName = reader.GetString();
+
+                    // Advance from the property name to its value before reading it.
+                    reader.Read();
+
+                    switch (propertyName)
+                    {
+                        case "id":
+                            result.Id = reader.GetString();
+                            break;
+                        case "apiVersion":
+                            result.ApiVersion = reader.GetString() ?? result.ApiVersion;
+                            break;
+                        case "count":
+                            result.Count = reader.GetInt64();
+                            break;
+                        case "parameters":
+                            result.Parameters = JsonSerializer.Deserialize<BloomFilterParameters>(ref reader, options);
+                            break;
+                        case "bits":
+                            if (reader.TokenType == JsonTokenType.Null)
+                            {
+                                break;
+                            }
+
+                            // The reader is on StartArray; consume elements until the matching EndArray.
+                            result.BitArrays = new List<byte[]>();
+                            while (reader.Read() && reader.TokenType != JsonTokenType.EndArray)
+                            {
+                                result.BitArrays.Add(reader.GetBytesFromBase64());
+                            }
+
+                            break;
+                        case "children":
+                            result.Children = JsonSerializer.Deserialize<List<BloomFilterState>>(ref reader, options);
+                            break;
+                        default:
+                            // Ignore properties this version does not know about.
+                            if (!reader.TrySkip())
+                            {
+                                throw new JsonException($"Could not skip the value of property '{propertyName}'.");
+                            }
+
+                            break;
+                    }
+                }
+
+                throw new JsonException("Unexpected end of JSON while reading bloom filter state.");
+            }
+
+            public override void Write(Utf8JsonWriter writer, BloomFilterState value, JsonSerializerOptions options)
+            {
+                writer.WriteStartObject();
+                writer.WriteString("id", value.Id);
+                writer.WriteString("apiVersion", value.ApiVersion);
+                writer.WriteNumber("count", value.Count);
+                writer.WritePropertyName("parameters");
+                var parameters = value.Parameters ?? new BloomFilterParameters(value.Id ?? "unknown");
+
+                JsonSerializer.Serialize(writer, parameters);
+
+                if (value.BitArrays != null && value.BitArrays.Any())
+                {
+                    writer.WritePropertyName("bits");
+                    writer.WriteStartArray();
+                    foreach (var bitArray in value.BitArrays)
+                    {
+                        writer.WriteBase64StringValue(bitArray);
+                    }
+
+                    writer.WriteEndArray();
+                }
+
+                if (value.Children != null && value.Children.Any())
+                {
+                    writer.WritePropertyName("children");
+                    writer.WriteStartArray();
+                    foreach (var child in value.Children)
+                    {
+                        JsonSerializer.Serialize(writer, child, options);
+                    }
+
+                    writer.WriteEndArray();
+                }
+
+                writer.WriteEndObject();
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/Bloomn.csproj b/src/Bloomn/Bloomn.csproj
new file mode 100644
index 0000000..e9ae3e2
--- /dev/null
+++ b/src/Bloomn/Bloomn.csproj
@@ -0,0 +1,15 @@
+
+
+
+ net5.0
+ enable
+
+
+
+
+
+
+
+
+
+
diff --git a/src/Bloomn/Callbacks.cs b/src/Bloomn/Callbacks.cs
new file mode 100644
index 0000000..8c869b1
--- /dev/null
+++ b/src/Bloomn/Callbacks.cs
@@ -0,0 +1,77 @@
+using System;
+using System.Threading;
+
+namespace Bloomn
+{
+    public class Callbacks
+    {
+        public Action<string, int>? OnCapacityChanged { get; set; }
+        public Action<string, long>? OnCountChanged { get; set; }
+        public Action<string, int>? OnBitCountChanged { get; set; }
+        public Action<string, BloomFilterParameters>? OnScaled { get; set; }
+        public Action<string>? OnHit { get; set; }
+        public Action<string>? OnMiss { get; set; }
+        public Action<string>? OnFalsePositive { get; set; }
+    }
+
+    internal class StateMetrics: IBloomFilterDimensions
+    {
+        private long _count;
+        private int _setBitCount;
+
+        private readonly Callbacks _callbacks;
+        public string Id { get; }
+        public double ErrorRate { get; }
+        public int Capacity { get; private set; }
+
+        public long Count => _count;
+        public int BitCount { get; private set; }
+        public int HashCount { get; private set; }
+
+        public int SetBitCount => _setBitCount;
+
+        public StateMetrics(BloomFilterParameters parameters, Callbacks callbacks)
+        {
+            _callbacks = callbacks;
+            Id = parameters.Id;
+            ErrorRate = parameters.Dimensions.ErrorRate;
+            Capacity = parameters.Dimensions.Capacity;
+            BitCount = parameters.Dimensions.BitCount;
+            HashCount = parameters.Dimensions.HashCount;
+        }
+
+        public void OnCapacityChanged(int value)
+        {
+            Capacity = value;
+            _callbacks.OnCapacityChanged?.Invoke(Id, value);
+        }
+
+        public void IncrementCount(int amount)
+        {
+            var value = Interlocked.Add(ref _count, amount);
+            _callbacks.OnCountChanged?.Invoke(Id, value);
+        }
+
+        public void IncrementSetBitCount(int amount)
+        {
+            Interlocked.Add(ref _setBitCount, amount);
+        }
+
+        public void OnCountChanged(long value)
+        {
+            _count = value;
+            _callbacks.OnCountChanged?.Invoke(Id, _count);
+        }
+
+        public void OnBitCountChanged(int value)
+        {
+            BitCount = value;
+            _callbacks.OnBitCountChanged?.Invoke(Id, value);
+        }
+
+        public void OnScaled(BloomFilterParameters parameters) => _callbacks.OnScaled?.Invoke(Id, parameters);
+        public void OnHit() => _callbacks.OnHit?.Invoke(Id);
+        public void OnMiss() => _callbacks.OnMiss?.Invoke(Id);
+        public void OnFalsePositive() => _callbacks.OnFalsePositive?.Invoke(Id);
+    }
+}
\ No newline at end of file
diff --git a/src/Bloomn/ClassicBloomFilter.cs b/src/Bloomn/ClassicBloomFilter.cs
new file mode 100644
index 0000000..8741143
--- /dev/null
+++ b/src/Bloomn/ClassicBloomFilter.cs
@@ -0,0 +1,332 @@
+using System;
+using System.Buffers;
+using System.Collections;
+using
/// <summary>
/// Fixed-size Bloom filter that keeps one <see cref="BitArray"/> ("slice") per hash function.
/// Reads take the read side of a <see cref="ReaderWriterLockSlim"/>; mutations take the write side.
/// </summary>
public class ClassicBloomFilter : IBloomFilter
{
    /// <summary>
    /// Seed used for second hash.
    /// </summary>
    private const int Hash2Seed = 1234567;

    private readonly ReaderWriterLockSlim _lock = new ReaderWriterLockSlim();

    private readonly int _bitCount;
    private readonly int _hashCount;
    private readonly StateMetrics _metrics;
    private readonly ArrayPool<int> _indexPool;
    private readonly IKeyHasherFactory _keyHasherFactory;
    private readonly BitArray[] _bitArrays;

    public readonly BloomFilterParameters Parameters;
    private readonly int _bitsPerSlice;
    private readonly int _actualBitCount;

    /// <summary>
    /// Creates a filter from <paramref name="state"/>. When the state carries bit arrays they are
    /// restored (together with the entry count and the set-bit count); otherwise empty slices are allocated.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// <c>state.Parameters</c> is null, or the restored bit arrays are too few or too short
    /// for the dimensions in <c>state.Parameters</c>.
    /// </exception>
    public ClassicBloomFilter(BloomFilterOptions options, BloomFilterState state)
    {
        if (state.Parameters == null)
        {
            throw new ArgumentException("BloomFilterState.Parameters must not be null.");
        }

        state.Parameters.Validate();

        Parameters = state.Parameters;

        _keyHasherFactory = options.GetHasher();

        _metrics = new StateMetrics(Parameters, options.Callbacks);

        _bitCount = Parameters.Dimensions.BitCount;
        _hashCount = Parameters.Dimensions.HashCount;
        _bitsPerSlice = ComputeBitsPerSlice(_bitCount, _hashCount);
        _actualBitCount = _bitsPerSlice * _hashCount;
        _indexPool = ArrayPool<int>.Create(_hashCount, 10);

        if (state.BitArrays != null && state.BitArrays.Count > 0)
        {
            _bitArrays = state.BitArrays.Select(x => new BitArray(x)).ToArray();

            // Fail here rather than with an index error on the first lookup.
            if (_bitArrays.Length < _hashCount || _bitArrays.Any(slice => slice.Length < _bitsPerSlice))
            {
                throw new ArgumentException(
                    $"BloomFilterState.BitArrays does not match the filter dimensions ({_hashCount} slices of at least {_bitsPerSlice} bits expected).",
                    nameof(state));
            }

            // Without this a reloaded filter reports Saturation == 0 and Count == 0,
            // and the next GetState would persist those wrong values.
            var restoredSetBits = 0;
            for (var slice = 0; slice < _hashCount; slice++)
            {
                restoredSetBits += CountSetBits(_bitArrays[slice], _bitsPerSlice);
            }

            if (restoredSetBits > 0)
            {
                _metrics.IncrementSetBitCount(restoredSetBits);
            }

            if (state.Count > 0)
            {
                _metrics.OnCountChanged(state.Count);
            }
        }
        else
        {
            _bitArrays = Enumerable.Range(0, _hashCount).Select(_ => new BitArray(_bitsPerSlice)).ToArray();
        }
    }

    /// <summary>
    /// Returns the first odd number at or above <c>bitCount / hashCount</c> that has no odd divisor
    /// between 3 and its square root; this is used as the size of every slice.
    /// </summary>
    internal static int ComputeBitsPerSlice(int bitCount, int hashCount)
    {
        var n = bitCount / hashCount;
        // Hash distribution is best when modded by a prime number
        if (n % 2 == 0)
        {
            n++;
        }

        // The maximum prime gap at 1,346,294,310,749 is 582 so we should never hit it
        var safety = n + 582;
        int i, j;
        for (i = n; i < safety; i += 2)
        {
            var limit = Math.Sqrt(i);
            for (j = 3; j <= limit; j += 2)
            {
                if (i % j == 0)
                {
                    break;
                }
            }

            if (j > limit)
            {
                return i;
            }
        }

        throw new Exception($"Prime above {n} not found in a reasonable time (your filter must be unreasonably large).");
    }

    /// <summary>Fraction of the allocated bits that are currently set.</summary>
    public double Saturation => _metrics.SetBitCount / (double) _actualBitCount;

    public long Count => _metrics.Count;

    public string Id => Parameters.Id;

    public IBloomFilterDimensions Dimensions => Parameters.Dimensions;

    /// <summary>
    /// Dispatches on <c>checkRequest.Behavior</c>: check only, check and prepare an add, or add immediately.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The behavior is not one of the three handled values.</exception>
    public BloomFilterEntry Check(BloomFilterCheckRequest checkRequest)
    {
        switch (checkRequest.Behavior)
        {
            case BloomFilterCheckBehavior.CheckOnly:
                return IsNotPresent(checkRequest) ? BloomFilterEntry.NotPresent : BloomFilterEntry.MaybePresent;

            case BloomFilterCheckBehavior.PrepareForAdd:
                var preparedAdd = PrepareAdd(checkRequest);
                if (preparedAdd.CanAdd)
                {
                    return new BloomFilterEntry(true, preparedAdd);
                }

                return BloomFilterEntry.MaybePresent;

            case BloomFilterCheckBehavior.AddImmediately:
                var wasNotPresent = TryAdd(checkRequest.Key);
                if (wasNotPresent)
                {
                    return new BloomFilterEntry(wasNotPresent, PreparedAdd.AlreadyAdded);
                }

                return BloomFilterEntry.MaybePresent;

            default:
                throw new ArgumentOutOfRangeException(nameof(checkRequest));
        }
    }

    /// <summary>
    /// Returns true as soon as one of the key's bits is found unset; false when every bit is set.
    /// The second hash is only computed if the first slice already matched.
    /// </summary>
    public bool IsNotPresent(BloomFilterCheckRequest checkRequest)
    {
        _lock.EnterReadLock();
        try
        {
            var bytes = checkRequest.Key.Bytes;

            var hash1 = _keyHasherFactory.Hash(bytes, 0);
            if (!GetBit(0, AdaptHash(hash1)))
            {
                return true;
            }

            // A single-hash filter has no slice 1; touching it used to throw.
            if (_hashCount < 2)
            {
                return false;
            }

            var hash2 = _keyHasherFactory.Hash(bytes, Hash2Seed);
            if (!GetBit(1, AdaptHash(hash2)))
            {
                return true;
            }

            for (var i = 2; i < _hashCount; i++)
            {
                if (!GetBit(i, AdaptHash(CombineHashes(hash1, hash2, i))))
                {
                    return true;
                }
            }

            return false;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }

    /// <summary>
    /// Computes every slice index for the key into a pooled buffer. Returns
    /// <see cref="PreparedAdd.AlreadyAdded"/> (and gives the buffer back) when all bits are already set;
    /// otherwise returns a <see cref="PreparedAdd"/> that owns the buffer until it is released.
    /// </summary>
    private PreparedAdd PrepareAdd(BloomFilterCheckRequest checkRequest)
    {
        _lock.EnterReadLock();
        try
        {
            var indexes = _indexPool.Rent(_hashCount);
            var handedOff = false;
            try
            {
                var bytes = checkRequest.Key.Bytes;

                var hash1 = _keyHasherFactory.Hash(bytes, 0);
                var index = AdaptHash(hash1);
                var maybePresent = GetBit(0, index);
                indexes[0] = index;

                if (_hashCount > 1)
                {
                    var hash2 = _keyHasherFactory.Hash(bytes, Hash2Seed);
                    index = AdaptHash(hash2);
                    // Once one bit is known to be unset the remaining bits need not be read,
                    // but every index must still be recorded for the later add.
                    maybePresent = maybePresent && GetBit(1, index);
                    indexes[1] = index;

                    for (var i = 2; i < _hashCount; i++)
                    {
                        index = AdaptHash(CombineHashes(hash1, hash2, i));
                        maybePresent = maybePresent && GetBit(i, index);
                        indexes[i] = index;
                    }
                }

                if (maybePresent)
                {
                    return PreparedAdd.AlreadyAdded;
                }

                handedOff = true;
                return new PreparedAdd(Id, indexes, ApplyPreparedAdd, Release);
            }
            finally
            {
                // Covers both the "already present" path and any exception thrown while hashing.
                if (!handedOff)
                {
                    _indexPool.Return(indexes);
                }
            }
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }

    /// <summary>
    /// Sets every bit for <paramref name="key"/>. Returns true (and increments the count)
    /// when at least one of those bits was previously unset.
    /// </summary>
    private bool TryAdd(BloomFilterKey key)
    {
        _lock.EnterWriteLock();
        try
        {
            var bytes = key.Bytes;

            var hash1 = _keyHasherFactory.Hash(bytes, 0);
            var wasPresent = SetBitAndReturnPreviousState(0, AdaptHash(hash1));

            if (_hashCount > 1)
            {
                var hash2 = _keyHasherFactory.Hash(bytes, Hash2Seed);
                // Non-short-circuiting &= on purpose: every bit has to be set.
                wasPresent &= SetBitAndReturnPreviousState(1, AdaptHash(hash2));

                for (var i = 2; i < _hashCount; i++)
                {
                    wasPresent &= SetBitAndReturnPreviousState(i, AdaptHash(CombineHashes(hash1, hash2, i)));
                }
            }

            if (!wasPresent)
            {
                _metrics.IncrementCount(1);
            }

            return !wasPresent;
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Sets the bits recorded in <paramref name="preparedAdd"/>. Returns true (and increments the count)
    /// when at least one bit changed.
    /// </summary>
    public bool ApplyPreparedAdd(PreparedAdd preparedAdd)
    {
        _lock.EnterWriteLock();
        try
        {
            var madeChange = false;
            if (preparedAdd.Indexes != null)
            {
                for (var i = 0; i < _hashCount; i++)
                {
                    var index = preparedAdd.Indexes[i];
                    madeChange |= !SetBitAndReturnPreviousState(i, index);
                }
            }

            if (madeChange)
            {
                _metrics.IncrementCount(1);
            }

            return madeChange;
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>Returns the index buffer held by <paramref name="preparedAdd"/> to the pool.</summary>
    public void Release(PreparedAdd preparedAdd)
    {
        if (preparedAdd.Indexes != null)
        {
            _indexPool.Return(preparedAdd.Indexes);
        }
    }

    /// <summary>
    /// Snapshots the filter. Each slice is packed eight bits per byte, so a slice needs
    /// <c>ceil(Length / 8)</c> bytes (the previous code allocated one byte per bit).
    /// </summary>
    public BloomFilterState GetState()
    {
        // Hold the read lock so the slices are copied without a concurrent writer.
        _lock.EnterReadLock();
        try
        {
            return new BloomFilterState
            {
                Id = Parameters.Id,
                Parameters = Parameters,
                Count = Count,
                BitArrays = _bitArrays.Select(x =>
                {
                    var bytes = new byte[(x.Length + 7) / 8];
                    x.CopyTo(bytes, 0);
                    return bytes;
                }).ToList(),
            };
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }

    private bool SetBitAndReturnPreviousState(int slice, int index)
    {
        if (GetBit(slice, index))
        {
            return true;
        }

        _metrics.IncrementSetBitCount(1);
        _bitArrays[slice][index] = true;
        return false;
    }

    private bool GetBit(int slice, int index)
    {
        return _bitArrays[slice][index];
    }

    /// <summary>
    /// Derives the hash for slice <paramref name="i"/> (i &gt;= 2) from the two base hashes.
    /// uint + int promotes to long; the 64-bit product may wrap, which is intended, so it is
    /// marked unchecked to give the same result in checked builds.
    /// </summary>
    private static long CombineHashes(uint hash1, uint hash2, int i)
    {
        return unchecked((hash1 + i) * hash2);
    }

    /// <summary>Maps a hash to a bit index inside one slice.</summary>
    private int AdaptHash(long hash)
    {
        // Math.Abs(long.MinValue) throws OverflowException, and the wrapped product from
        // CombineHashes can be exactly that value. Every other input maps as before.
        var magnitude = hash == long.MinValue ? 1UL << 63 : (ulong) Math.Abs(hash);
        return (int) (magnitude % (ulong) _bitsPerSlice);
    }

    /// <summary>Counts the set bits among the first <paramref name="limit"/> bits of <paramref name="bits"/>.</summary>
    private static int CountSetBits(BitArray bits, int limit)
    {
        var end = Math.Min(limit, bits.Length);
        var count = 0;
        for (var i = 0; i < end; i++)
        {
            if (bits[i])
            {
                count++;
            }
        }

        return count;
    }
}
/// <summary>
/// Settings consumed when filters are constructed: which hasher to use, default dimensions,
/// scaling parameters and the metric callbacks.
/// </summary>
public class BloomFilterOptions
{
    // Set directly by SetHasher, or created on first use by GetHasher from HasherType.
    private IKeyHasherFactory? _keyHasher;

    public static BloomFilterOptions DefaultOptions { get; set; } = new BloomFilterOptions();

    public string HasherType { get; set; } = typeof(Murmur3HasherFactory).AssemblyQualifiedName!;

    public BloomFilterDimensions Dimensions { get; set; } = new BloomFilterDimensions();

    public ScalingParameters ScalingParameters { get; set; } = new ScalingParameters();

    public Callbacks Callbacks { get; set; } = new Callbacks();

    public bool DiscardInconsistentState { get; set; }

    /// <summary>
    /// Uses <paramref name="hasherFactory"/> as the hasher and records its assembly-qualified
    /// type name in <see cref="HasherType"/>.
    /// </summary>
    public void SetHasher(IKeyHasherFactory hasherFactory)
    {
        HasherType = hasherFactory.GetType().AssemblyQualifiedName!;
        _keyHasher = hasherFactory;
    }

    /// <summary>
    /// Returns the configured hasher, instantiating the type named by <see cref="HasherType"/>
    /// on the first call when none was set explicitly.
    /// </summary>
    /// <exception cref="Exception">The created instance is not an <see cref="IKeyHasherFactory"/>.</exception>
    public IKeyHasherFactory GetHasher()
    {
        if (_keyHasher != null)
        {
            return _keyHasher;
        }

        // throwOnError: true, so an unresolvable type name fails here instead of returning null.
        var resolvedType = Type.GetType(HasherType, true)!;
        _keyHasher = Activator.CreateInstance(resolvedType) as IKeyHasherFactory;

        return _keyHasher
               ?? throw new Exception($"HasherType {HasherType} does not implement {typeof(IKeyHasherFactory)}");
    }
}
/// <summary>
/// Convenience wrappers that build the matching <see cref="BloomFilterCheckRequest"/>
/// and pass it to <see cref="IBloomFilter.Check"/>.
/// </summary>
public static class BloomFilterExtensions
{
    /// <summary>Runs a check with the "prepare for add" request and returns the resulting entry.</summary>
    public static BloomFilterEntry CheckAndPrepareAdd(this IBloomFilter bloomFilter, BloomFilterKey key)
    {
        var request = BloomFilterCheckRequest.PrepareForAdd(key);
        return bloomFilter.Check(request);
    }

    /// <summary>
    /// Runs a check with the "add immediately" request.
    /// Returns the <c>IsNotPresent</c> flag of the resulting entry.
    /// </summary>
    public static bool Add(this IBloomFilter bloomFilter, BloomFilterKey key)
        => bloomFilter.Check(BloomFilterCheckRequest.AddImmediately(key)).IsNotPresent;

    /// <summary>
    /// Runs a check with the "check only" request.
    /// Returns the <c>IsNotPresent</c> flag of the resulting entry.
    /// </summary>
    public static bool IsNotPresent(this IBloomFilter bloomFilter, BloomFilterKey key)
        => bloomFilter.Check(BloomFilterCheckRequest.CheckOnly(key)).IsNotPresent;
}
/// <summary>
/// 32-bit MurmurHash3 (x86 variant) exposed through <see cref="IKeyHasherFactory"/>.
/// </summary>
public class Murmur3HasherFactory : IKeyHasherFactory
{
    public string Algorithm => "murmur3";

    /// <summary>Hashes the whole of <paramref name="key"/> with the given seed.</summary>
    public uint Hash(ReadOnlySpan<byte> key, int seed)
    {
        // unchecked: a negative seed is reinterpreted as its bit pattern, also in checked builds.
        return Compute(key, (uint) key.Length, unchecked((uint) seed));
    }

    /// <summary>
    /// Computes MurmurHash3_x86_32 over the first <paramref name="length"/> bytes of <paramref name="data"/>.
    /// </summary>
    /// <param name="data">Bytes to hash.</param>
    /// <param name="length">Number of bytes to consume; must not exceed <c>data.Length</c>.</param>
    /// <param name="seed">Initial hash state.</param>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is larger than the span.</exception>
    public static uint Compute(ReadOnlySpan<byte> data, uint length, uint seed)
    {
        if (length > (uint) data.Length)
        {
            throw new ArgumentOutOfRangeException(nameof(length), length, "length must not exceed the number of bytes in data.");
        }

        const uint c1 = 0xcc9e2d51;
        const uint c2 = 0x1b873593;

        uint h1 = seed;

        //----------
        // body: whole 4-byte blocks

        // Safe cast: length <= data.Length <= int.MaxValue after the check above.
        int tailStart = (int) (length & ~3u);

        for (int offset = 0; offset < tailStart; offset += 4)
        {
            // Blocks are read little-endian regardless of the host CPU so the same key
            // produces the same hash everywhere; BitConverter would follow host endianness.
            uint k1l = System.Buffers.Binary.BinaryPrimitives.ReadUInt32LittleEndian(data.Slice(offset, 4));

            k1l *= c1;
            k1l = Rotl32(k1l, 15);
            k1l *= c2;

            h1 ^= k1l;
            h1 = Rotl32(h1, 13);
            h1 = h1 * 5 + 0xe6546b64;
        }

        //----------
        // tail: the remaining 0-3 bytes

        uint k1 = 0;
        uint tailLength = length & 3;

        if (tailLength == 3)
        {
            k1 ^= (uint) data[tailStart + 2] << 16;
        }

        if (tailLength >= 2)
        {
            k1 ^= (uint) data[tailStart + 1] << 8;
        }

        if (tailLength >= 1)
        {
            k1 ^= data[tailStart];
            k1 *= c1;
            k1 = Rotl32(k1, 15);
            k1 *= c2;
            h1 ^= k1;
        }

        //----------
        // finalization

        h1 ^= length;

        return Fmix32(h1);
    }

    // Final avalanche step of MurmurHash3.
    private static uint Fmix32(uint h)
    {
        h ^= h >> 16;
        h *= 0x85ebca6b;
        h ^= h >> 13;
        h *= 0xc2b2ae35;
        h ^= h >> 16;

        return h;
    }

    // 32-bit rotate left.
    private static uint Rotl32(uint x, byte r)
    {
        return (x << r) | (x >> (32 - r));
    }
}
/// <summary>
/// Bloom filter that grows by appending a new <see cref="ClassicBloomFilter"/> whenever the
/// recorded count reaches the combined capacity. The most recently added child is the "active"
/// filter and receives new entries; lookups consult every child.
/// </summary>
public sealed class ScalingBloomFilter : IBloomFilter
{
    private readonly List<ClassicBloomFilter> _filters = new();
    private readonly ReaderWriterLockSlim _lock = new();
    private readonly BloomFilterOptions _options;
    private readonly BloomFilterParameters _parameters;
    private readonly ScalingParameters _scalingParameters;
    private readonly StateMetrics _metrics;
    private ClassicBloomFilter _activeFilter;

    public ScalingBloomFilter(BloomFilterParameters parameters) : this(BloomFilterOptions.DefaultOptions, new BloomFilterState {Parameters = parameters})
    {
    }

    public ScalingBloomFilter(BloomFilterState state) : this(BloomFilterOptions.DefaultOptions, state)
    {
    }

    public ScalingBloomFilter(BloomFilterOptions options, BloomFilterParameters parameters) : this(options, new BloomFilterState {Parameters = parameters})
    {
    }

    /// <summary>
    /// Restores the child filters from <c>state.Children</c> when present, otherwise creates the first child.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// <c>state.Parameters</c> or its <c>Id</c> is null, the scaling behavior is not
    /// <see cref="MaxCapacityBehavior.Scale"/>, or a child state has no parameters.
    /// </exception>
    public ScalingBloomFilter(BloomFilterOptions options, BloomFilterState state)
    {
        if (state.Parameters == null)
        {
            throw new ArgumentException("BloomFilterState.Parameters must not be null.");
        }

        state.Parameters.Validate();

        if (state.Parameters.ScalingParameters.MaxCapacityBehavior != MaxCapacityBehavior.Scale)
        {
            // ArgumentException takes (message, paramName); the two were previously swapped.
            throw new ArgumentException($"Parameters.ScalingParameters.MaxCapacityBehavior was not set to {MaxCapacityBehavior.Scale}", nameof(state));
        }

        // Checked before anything reads the id.
        if (state.Parameters.Id == null)
        {
            throw new ArgumentException("state.Parameters.Id must not be null", nameof(state));
        }

        _options = options;
        _parameters = state.Parameters;
        _scalingParameters = state.Parameters.ScalingParameters;

        _metrics = new StateMetrics(_parameters, options.Callbacks);

        // The guard above guarantees the Scale behavior, so the former non-scaling
        // branches could never run and have been dropped.
        if (state.Children?.Count > 0)
        {
            _filters = state.Children.Select((childState, i) =>
            {
                if (childState.Parameters == null)
                {
                    throw new ArgumentException($"Invalid state: child filter {i} was missing parameters");
                }

                return new ClassicBloomFilter(options, childState);
            }).ToList();
            _activeFilter = _filters.Last();
            _metrics.OnCapacityChanged(_filters.Sum(x => x.Parameters.Dimensions.Capacity));
            _metrics.OnBitCountChanged(_filters.Sum(x => x.Parameters.Dimensions.BitCount));
        }
        else
        {
            Scale();
        }

        _metrics.OnCountChanged(state.Count);
    }

    public string Id => _parameters.Id;

    public long Count => _metrics.Count;

    public IBloomFilterDimensions Dimensions => _metrics;

    // NOTE(review): this is the sum of the children's saturations, so it can exceed 1 once
    // the filter has scaled — confirm that is the intended meaning.
    public double Saturation => _filters.Sum(x => x.Saturation);

    /// <summary>
    /// Dispatches on <c>checkRequest.Behavior</c>: check only, check and prepare an add, or add immediately.
    /// </summary>
    /// <exception cref="ArgumentOutOfRangeException">The behavior is not one of the three handled values.</exception>
    public BloomFilterEntry Check(BloomFilterCheckRequest checkRequest)
    {
        switch (checkRequest.Behavior)
        {
            case BloomFilterCheckBehavior.CheckOnly:
                return IsNotPresent(checkRequest) ? BloomFilterEntry.NotPresent : BloomFilterEntry.MaybePresent;

            case BloomFilterCheckBehavior.PrepareForAdd:
                var preparedAdd = PrepareAdd(checkRequest);
                if (preparedAdd.CanAdd)
                {
                    return new BloomFilterEntry(true, preparedAdd);
                }

                return BloomFilterEntry.MaybePresent;

            case BloomFilterCheckBehavior.AddImmediately:

                using (var entry = PrepareAdd(new BloomFilterCheckRequest(checkRequest.Key, BloomFilterCheckBehavior.PrepareForAdd)))
                {
                    // Prepare and apply run under separate lock acquisitions, so another thread may
                    // add the same key in between. Report "not present" only if this call changed a bit.
                    if (entry.CanAdd && ApplyPreparedAdd(entry))
                    {
                        return BloomFilterEntry.NotPresent;
                    }
                }

                return BloomFilterEntry.MaybePresent;

            default:
                throw new ArgumentOutOfRangeException();
        }
    }

    /// <summary>
    /// Applies a prepared add to the child filter it was prepared against, bumps the count when
    /// a bit changed, and scales when the count has reached the capacity.
    /// </summary>
    private bool ApplyPreparedAdd(PreparedAdd preparedAdd)
    {
        // Acquire outside the try: if the acquire throws, the finally must not try to release.
        _lock.EnterWriteLock();
        try
        {
            var added = false;

            if (_activeFilter.Id == preparedAdd.FilterId)
            {
                added = _activeFilter.ApplyPreparedAdd(preparedAdd);
            }
            else
            {
                // The filter scaled after the add was prepared; find the child it belongs to.
                foreach (var filter in _filters)
                {
                    if (filter.Id == preparedAdd.FilterId)
                    {
                        added = filter.ApplyPreparedAdd(preparedAdd);
                        break;
                    }
                }
            }

            if (added)
            {
                _metrics.IncrementCount(1);
            }

            if (_metrics.Count >= _metrics.Capacity)
            {
                Scale();
            }

            return added;
        }
        finally
        {
            _lock.ExitWriteLock();
        }
    }

    /// <summary>
    /// Returns false (and records a hit) as soon as any child reports the key as possibly present;
    /// returns true (and records a miss) when every child reports it absent. Newest child is asked first.
    /// </summary>
    public bool IsNotPresent(BloomFilterCheckRequest checkRequest)
    {
        _lock.EnterReadLock();
        try
        {
            for (var i = _filters.Count - 1; i >= 0; i--)
            {
                if (!_filters[i].IsNotPresent(checkRequest))
                {
                    _metrics.OnHit();
                    return false;
                }
            }

            _metrics.OnMiss();
            return true;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }

    /// <summary>
    /// Checks the key against every child and, when none contains it, returns a
    /// <see cref="PreparedAdd"/> bound to the active filter so the add can be applied later without
    /// hashing again. Dispose the returned struct so its index buffer goes back to the pool.
    /// </summary>
    /// <returns>
    /// A prepared add with <c>CanAdd == true</c> when the key was absent everywhere; otherwise
    /// <see cref="PreparedAdd.AlreadyAdded"/>.
    /// </returns>
    private PreparedAdd PrepareAdd(BloomFilterCheckRequest checkRequest)
    {
        _lock.EnterReadLock();
        try
        {
            // Check active filter because it's the largest and if we get
            // a hit anywhere it's most likely to be in the active filter
            var entry = _activeFilter.Check(checkRequest);
            if (entry.IsNotPresent)
            {
                // Check the rest of the filters (the active one is last in the list).
                for (var i = _filters.Count - 2; i >= 0; i--)
                {
                    var isNotPresent = _filters[i].Check(BloomFilterCheckRequest.CheckOnly(checkRequest.Key)).IsNotPresent;
                    if (!isNotPresent)
                    {
                        // The active filter already rented an index buffer for this key;
                        // hand it back, otherwise it is never returned to the pool.
                        entry.PreparedAdd.Dispose();
                        _metrics.OnHit();
                        return PreparedAdd.AlreadyAdded;
                    }
                }

                _metrics.OnMiss();
            }
            else
            {
                _metrics.OnHit();
            }

            if (entry.PreparedAdd.CanAdd)
            {
                // Re-wrap so that Add() goes through this filter (count + scaling) instead of the child.
                return new PreparedAdd(entry.PreparedAdd.FilterId, entry.PreparedAdd.Indexes, ApplyPreparedAdd, entry.PreparedAdd.Release);
            }

            return PreparedAdd.AlreadyAdded;
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }

    /// <summary>
    /// Reports that a hit was a false positive, to allow unified metrics reporting.
    /// </summary>
    public void ReportFalsePositive()
    {
        _metrics.OnFalsePositive();
    }

    /// <summary>Snapshots this filter and all of its children under the read lock.</summary>
    public BloomFilterState GetState()
    {
        _lock.EnterReadLock();
        try
        {
            return new BloomFilterState
            {
                Id = _parameters.Id,
                Parameters = _parameters,
                Count = Count,
                Children = _filters.Select(x => x.GetState()).ToList()
            };
        }
        finally
        {
            _lock.ExitReadLock();
        }
    }

    /// <summary>
    /// Appends a new child filter and makes it the active one, then republishes the combined
    /// capacity and bit count. Callers other than the constructor must hold the write lock.
    /// </summary>
    [MemberNotNull(nameof(_activeFilter))]
    private void Scale()
    {
        BloomFilterDimensions nextDimensions;

        if (_activeFilter == null)
        {
            nextDimensions = _parameters.Dimensions;
            if (_parameters.ScalingParameters.MaxCapacityBehavior == MaxCapacityBehavior.Scale)
            {
                // We need to create the filter with a lower error rate so that the compounded
                // error rate for all filters will stay below the requested rate.
                var rescaledErrorRate = _parameters.Dimensions.ErrorRate / (1 / (1 - _parameters.ScalingParameters.ErrorRateScaling));
                nextDimensions = new BloomFilterDimensions.Computer()
                {
                    Capacity = nextDimensions.Capacity,
                    FalsePositiveRate = rescaledErrorRate
                }.Compute();
            }
        }
        else
        {
            var current = _activeFilter.Parameters;
            var nextBitCount = (int) Math.Round(current.Dimensions.BitCount * _scalingParameters.CapacityScaling);
            var nextErrorRate = current.Dimensions.ErrorRate * _scalingParameters.ErrorRateScaling;
            // NOTE(review): this adds i * log2(1/r) to the *previous* child's hash count rather than
            // to the first child's — confirm the compounding growth is intended.
            var nextHashCount = (int) Math.Ceiling(current.Dimensions.HashCount + _filters.Count * Math.Log2(Math.Pow(current.ScalingParameters.ErrorRateScaling, -1)));

            nextDimensions = new BloomFilterDimensions.Computer()
            {
                BitCount = nextBitCount,
                FalsePositiveRate = nextErrorRate,
                HashCount = nextHashCount
            }.Compute();
        }

        // Child ids are "<parent id>[<position>]".
        var nextParameters = _parameters with
        {
            Id = $"{_parameters.Id}[{_filters.Count}]",
            Dimensions = nextDimensions
        };

        _activeFilter = new ClassicBloomFilter(_options, new BloomFilterState()
        {
            Parameters = nextParameters
        });
        _filters.Add(_activeFilter);

        _metrics.OnCapacityChanged(_filters.Sum(x => x.Parameters.Dimensions.Capacity));
        _metrics.OnBitCountChanged(_filters.Sum(x => x.Parameters.Dimensions.BitCount));
        _metrics.OnScaled(_activeFilter.Parameters);
    }
}
/// <summary>
/// Settings that control what a filter does at capacity and, when it scales,
/// the factors applied to each new child filter.
/// </summary>
public record ScalingParameters
{
    public MaxCapacityBehavior MaxCapacityBehavior { get; init; } = MaxCapacityBehavior.Throw;

    public double CapacityScaling { get; init; } = 2;

    public double ErrorRateScaling { get; init; } = 0.8;

    /// <summary>
    /// Checks the scaling factors. Nothing is validated unless the behavior is
    /// <see cref="MaxCapacityBehavior.Scale"/>.
    /// </summary>
    /// <exception cref="ValidationException">A scaling factor is outside its allowed range.</exception>
    public void Validate()
    {
        if (MaxCapacityBehavior != MaxCapacityBehavior.Scale)
        {
            return;
        }

        if (CapacityScaling <= 1)
        {
            throw new ValidationException("CapacityScaling must be greater than 1.");
        }

        if (ErrorRateScaling <= 0 || ErrorRateScaling >= 1)
        {
            throw new ValidationException("ErrorRateScaling must be between 0 and 1 exclusive.");
        }
    }

    /// <summary>
    /// Compares this instance with <paramref name="other"/> and returns one message per property
    /// that differs. The result is empty when the two agree.
    /// </summary>
    public IEnumerable<string> ValidateMigration(ScalingParameters other)
    {
        var differences = new List<string>();

        var behaviorDiffers = MaxCapacityBehavior != other.MaxCapacityBehavior;
        if (behaviorDiffers)
        {
            differences.Add($"{nameof(MaxCapacityBehavior)}: {MaxCapacityBehavior} != {other.MaxCapacityBehavior}");
        }

        var capacityGap = Math.Abs(CapacityScaling - other.CapacityScaling);
        if (capacityGap > double.Epsilon)
        {
            differences.Add($"{nameof(CapacityScaling)}: {CapacityScaling} != {other.CapacityScaling}");
        }

        var errorRateGap = Math.Abs(ErrorRateScaling - other.ErrorRateScaling);
        if (errorRateGap > double.Epsilon)
        {
            differences.Add($"{nameof(ErrorRateScaling)}: {ErrorRateScaling} != {other.ErrorRateScaling}");
        }

        return differences;
    }
}
100644 index 0000000..b897aa5 --- /dev/null +++ b/tests/Bloomn.Tests/BloomFilterDimensionsTests.cs @@ -0,0 +1,57 @@ +using FluentAssertions; +using Xunit; + +namespace Bloomn.Tests +{ + + public class BloomFilterDimensionsTests + { + [Theory] + [InlineData(6550, 62783, 7, 0.01)] + [InlineData(4000, 38341, 7, 0.01)] + [InlineData(20000000, 124704485, 4, 0.05)] + public void CanComputeParametersFromCapacityAndErrorRate(int capacity, int bitCount, int hashCount, double errorRate) + { + BloomFilterDimensions.ForCapacityAndErrorRate(capacity, errorRate) + .Should().BeEquivalentTo(new BloomFilterDimensions() + { + Capacity = capacity, + ErrorRate = errorRate, + BitCount = bitCount, + HashCount = hashCount, + }, o => o.ComparingByMembers()); + } + + [Theory] + [InlineData(null, 100001, 8, 0.03, 12950, 100001, 8, 0.03)] + [InlineData(12345, 123456, 5, null, 12345, 123456, 5, 0.009429163)] + [InlineData(12345, 123456, null, null, 12345, 123456, 7, 0.008191797)] + public void CanComputeParameters( + int? capacity, + int? bitCount, + int? hashCount, + double? 
falsePositiveRate, + int expectedCapacity, + int expectedBitCount, + int expectedHashCount, + double expectedFalsePositiveRate + + ) + { + new BloomFilterDimensions.Computer() + { + Capacity = capacity, + BitCount = bitCount, + HashCount = hashCount, + FalsePositiveRate = falsePositiveRate + }.Compute().Should().BeEquivalentTo(new BloomFilterDimensions( + expectedFalsePositiveRate, + expectedCapacity, + expectedBitCount, + expectedHashCount), o => o.ComparingByMembers() + .Using(t => t.Subject.Should().BeApproximately(t.Expectation, 0.0001) + ).WhenTypeIs()); + + } + } +} \ No newline at end of file diff --git a/tests/Bloomn.Tests/BloomFilterManagerTests.cs b/tests/Bloomn.Tests/BloomFilterManagerTests.cs new file mode 100644 index 0000000..9afe44a --- /dev/null +++ b/tests/Bloomn.Tests/BloomFilterManagerTests.cs @@ -0,0 +1,47 @@ +#pragma warning disable 8618 +using System; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Threading.Tasks; +using FluentAssertions; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Moq; +using Xunit; + +namespace Bloomn.Tests +{ + + public class BloomFilterBuilderTests + { + public BloomFilterOptions DefaultOptions { get; set; } + public Mock> OptionsMonitor { get; set; } + + public BloomFilterBuilderTests() + { + DefaultOptions = new BloomFilterOptions(); + + OptionsMonitor = new Mock>(); + OptionsMonitor.SetupGet(x => x.CurrentValue).Returns(DefaultOptions); + } + + [Fact] + public void BuilderCanCreateDefaultInstance() + { + var sut = new ServiceCollection() + .AddBloomFilters() + .BuildServiceProvider() + .GetRequiredService(); + + var actual = sut.Build(); + actual.Dimensions.Should().BeEquivalentTo(new BloomFilterDimensions()); + + var state = actual.GetState(); + state.Parameters.ShouldNotBeNull(); + state.Parameters.HashAlgorithm.Should().Be(new Murmur3HasherFactory().Algorithm); + + } + } + +} \ No 
newline at end of file diff --git a/tests/Bloomn.Tests/BloomFilterStateTests.cs b/tests/Bloomn.Tests/BloomFilterStateTests.cs new file mode 100644 index 0000000..4d2f3ce --- /dev/null +++ b/tests/Bloomn.Tests/BloomFilterStateTests.cs @@ -0,0 +1,7 @@ +namespace Bloomn.Tests +{ + public class BloomFilterStateTests + { + + } +} \ No newline at end of file diff --git a/tests/Bloomn.Tests/BloomFilterTests.cs b/tests/Bloomn.Tests/BloomFilterTests.cs new file mode 100644 index 0000000..2ca4521 --- /dev/null +++ b/tests/Bloomn.Tests/BloomFilterTests.cs @@ -0,0 +1,521 @@ +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Text; +using System.Threading; +using FluentAssertions; +using MathNet.Numerics.Statistics; +using Microsoft.VisualBasic; +using Xunit; +using Xunit.Abstractions; + +namespace Bloomn.Tests +{ + public abstract class BloomFilterTestsBase + { + private readonly ITestOutputHelper _testOutputHelper; + + protected static IEnumerable PredictableStrings(int count) + { + var rand = new Random(1234); + var buffer = new byte[16]; + return Enumerable.Range(0, count).Select((_, i) => + { + rand.NextBytes(buffer); + return i + "-" + new Guid(buffer); + }); + } + + protected static IEnumerable RandomStrings(int count) + { + return Enumerable.Range(0, count).Select((_, i) => i + "-" + Guid.NewGuid()); + } + + public void TearDown() + { + } + + public BloomFilterOptions Options { get; set; } + + private const bool AddLoggingCallbacks = false; + + protected BloomFilterTestsBase(ITestOutputHelper testOutputHelper) + { + _testOutputHelper = testOutputHelper; + + + Options = new BloomFilterOptions() + { + Callbacks = AddLoggingCallbacks + ? 
new Callbacks + { + OnCapacityChanged = (x, i) => testOutputHelper.WriteLine($"OnCapacityChanged({x}, {i})"), + OnCountChanged = (x, i) => testOutputHelper.WriteLine($"OnCountChanged({x}, {i})"), + OnBitCountChanged = (x, i) => testOutputHelper.WriteLine($"OnBitCountChanged({x}, {i})"), + OnScaled = (x, p) => testOutputHelper.WriteLine($"OnScaled({x}, {p})"), + OnHit = (x) => testOutputHelper.WriteLine($"OnHit({x})"), + OnMiss = (x) => testOutputHelper.WriteLine($"OnMiss({x})"), + OnFalsePositive = (x) => testOutputHelper.WriteLine($"OnFalsePositive({x})"), + } + : new Callbacks() + { + // OnHit = (x) => testOutputHelper.WriteLine($"OnHit({x})"), + OnScaled = (x, p) => testOutputHelper.WriteLine($"OnScaled({x}, {p})"), + } + }; + } + + public abstract IBloomFilter Create(BloomFilterOptions options, BloomFilterParameters parameters); + + [Fact] + public void CanAddAndCheckSingleItem() + { + var sut = Create(Options, new BloomFilterParameters("test").WithCapacityAndErrorRate(100, 0.1)); + var key = "test string"; + sut.Add(key).Should().BeTrue("the string hasn't been added before"); + sut.IsNotPresent(key).Should().BeFalse("the string has been added before"); + sut.Add(key).Should().BeFalse("the string hasn't been added before"); + sut.IsNotPresent(key).Should().BeFalse("the string has been added"); + } + + [Fact] + public void CanPrepareAndCommitSingleItem() + { + var sut = Create(Options, new BloomFilterParameters("test").WithCapacityAndErrorRate(100, 0.1)); + + var key = "test string"; + using (var entry = sut.CheckAndPrepareAdd(key)) + { + entry.IsNotPresent.Should().BeTrue("the key has not been added"); + sut.IsNotPresent(key).Should().BeTrue("the key has not been added"); + + entry.Add().Should().BeTrue("the key had not been added previously"); + + sut.IsNotPresent(key).Should().BeFalse("the key has been added"); + } + } + + /// + /// These tests are intended to be more debuggable and to provide a baseline for correct behavior. 
+        /// They use the same set of keys on every run and log events.
+        ///
+        ///
+        ///
+        ///
+        ///
+        ///
+        [Theory]
+        [InlineData(1000, 0.1, 1)]
+        [InlineData(10000, 0.01, 1)]
+        [InlineData(10000, 0.01, 2)]
+        [InlineData(10000, 0.01, 4)]
+        [InlineData(10000, 0.01, 8)]
+        public void PredictableStringsTests(int count, double errorRate, int threads)
+        {
+            var parameters = new BloomFilterParameters("test").WithCapacityAndErrorRate(count, errorRate);
+
+            VerifyContracts(
+                parameters,
+                () => Create(Options, parameters),
+                PredictableStrings,
+                1,
+                count,
+                threads
+            );
+        }
+
+        /// <summary>
+        /// Charts the false positive rate of a filter dimensioned for <paramref name="count"/> keys
+        /// while that many random keys are added to it.
+        /// </summary>
+        /// <param name="count">Filter capacity and number of keys to insert.</param>
+        /// <param name="errorRate">Target false-positive rate the filter is dimensioned for.</param>
+        /// <param name="sampleSize">Number of probe keys checked at each sample point.</param>
+        /// <param name="sampleInterval">Number of inserts between sample points.</param>
+        [Theory]
+        [InlineData(11000, 0.01, 1000, 1000)]
+        public void FalsePositiveDistributionIsCorrect(int count, double errorRate, int sampleSize, int sampleInterval)
+        {
+            var parameters = new BloomFilterParameters("test").WithCapacityAndErrorRate(count, errorRate);
+
+            ChartFalsePositiveRates(
+                parameters,
+                () => Create(Options, parameters),
+                RandomStrings,
+                count,
+                sampleSize,
+                sampleInterval
+            );
+        }
+
+        /// <summary>
+        /// Adds <paramref name="numberToInsert"/> keys to a new filter and, every <paramref name="sampleInterval"/>
+        /// inserts (starting with the first), probes it with a sample of keys taken from a separate
+        /// <paramref name="keyFactory"/> call. Afterwards the filled filter is probed 100 more times with fresh samples.
+        /// The observed rates are logged; the average rate while adding must be below the configured error rate and
+        /// the average rate of the filled filter must be below the error rate plus 10%.
+        /// </summary>
+        /// <remarks>
+        /// Every probe key reported as present is counted as a false positive, which assumes the probe keys
+        /// do not overlap the inserted keys (TODO confirm against the key factories).
+        /// </remarks>
+        /// <param name="parameters">Supplies <c>Dimensions.ErrorRate</c> for the log output and assertions.</param>
+        /// <param name="factory">Creates the filter under test.</param>
+        /// <param name="keyFactory">Called with the number of keys wanted; returns the keys.</param>
+        /// <param name="numberToInsert">Number of keys to add to the filter.</param>
+        /// <param name="sampleSize">Number of probe keys per sample; also the divisor for each rate.</param>
+        /// <param name="sampleInterval">Number of inserts between sample points.</param>
+        public void ChartFalsePositiveRates(BloomFilterParameters parameters, Func<IBloomFilter> factory, Func<int, IEnumerable<string>> keyFactory, int numberToInsert, int sampleSize, int sampleInterval)
+        {
+            var incrementalFalsePositiveCounts = new Dictionary<int, int>();
+            var maxCapacityFalsePositiveCounts = new List<int>();
+            var sut = factory();
+
+            // Column width for the logged insert index. Truncated Log10 alone is one short of the digit count
+            // (Log10(11000) truncates to 4 while index 10999 has 5 digits), which left the chart misaligned.
+            var indexWidth = (int) Math.Log10(numberToInsert) + 1;
+
+            var sampleKeys = keyFactory(sampleSize).ToList();
+
+            var keys = keyFactory(numberToInsert).ToList();
+            for (int i = 0; i < keys.Count; i++)
+            {
+                var key = keys[i];
+                sut.Add(key);
+
+                if (i % sampleInterval == 0)
+                {
+                    var falsePositiveCount = 0;
+                    foreach (var sampleKey in sampleKeys)
+                    {
+                        if (!sut.IsNotPresent(sampleKey))
+                        {
+                            falsePositiveCount++;
+                        }
+                    }
+
+                    var fpr = falsePositiveCount / (double) sampleSize;
+                    incrementalFalsePositiveCounts[i] = falsePositiveCount;
+                    _testOutputHelper.WriteLine($"{i.ToString().PadLeft(indexWidth, ' ')}: saturation:{sut.Saturation:F4} {fpr:F4} {new string('X', falsePositiveCount)}");
+                }
+            }
+
+            // Probe the filled filter with 100 fresh samples.
+            for (int i = 0; i < 100; i++)
+            {
+                var keySample = keyFactory(sampleSize).ToList();
+                var falsePositiveCount = 0;
+                foreach (var sampleKey in keySample)
+                {
+                    if (!sut.IsNotPresent(sampleKey))
+                    {
+                        falsePositiveCount++;
+                    }
+                }
+                maxCapacityFalsePositiveCounts.Add(falsePositiveCount);
+            }
+
+
+            var averageIncrementalFpr = incrementalFalsePositiveCounts.Values.Select(x => x / (double) sampleSize).Average();
+            _testOutputHelper.WriteLine($"Average false positive rate while adding: {averageIncrementalFpr} (expected < {parameters.Dimensions.ErrorRate})");
+
+            var averageMaxedFpr = maxCapacityFalsePositiveCounts.Select(x => x / (double) sampleSize).Average();
+            _testOutputHelper.WriteLine($"Average false positive rate while at max capacity: {averageMaxedFpr} (expected < {parameters.Dimensions.ErrorRate})");
+
+            averageIncrementalFpr.Should().BeLessThan(parameters.Dimensions.ErrorRate);
+            var maxAcceptableErrorRate = parameters.Dimensions.ErrorRate + parameters.Dimensions.ErrorRate / 10;
+            averageMaxedFpr.Should().BeLessThan(maxAcceptableErrorRate, $"the false positive rate should be close to or less than the max acceptable rate {parameters.Dimensions.ErrorRate}" );
+
+
+        }
+
+        /// <summary>
+        /// These tests focus on performance and statistical correctness. They use random values on each run
+        /// and run many reps of the same parameters to build a statistical picture of the behavior of the implementation.
+        /// </summary>
+        /// <param name="reps">Number of measured runs.</param>
+        /// <param name="count">Number of keys per run; also used as the filter capacity.</param>
+        /// <param name="errorRate">Target false-positive rate the filter is dimensioned for.</param>
+        /// <param name="threads">Degree of parallelism used while checking and adding keys.</param>
+        [Theory]
+        [InlineData(10, 10000, 0.01, 1)]
+        [InlineData(10, 10000, 0.05, 4)]
+        public void RandomStringsTests(int reps, int count, double errorRate, int threads)
+        {
+            var parameters = new BloomFilterParameters("test").WithCapacityAndErrorRate(count, errorRate);
+
+            VerifyContracts(
+                parameters,
+                () => Create(Options, parameters),
+                RandomStrings,
+                reps,
+                count,
+                threads
+            );
+        }
+
+        /// <summary>
+        /// Measures the false positive rate and the duration of filling a filter over one warmup run plus
+        /// <paramref name="reps"/> measured runs, logs the statistics, and asserts the observed rate against
+        /// the configured error rate.
+        /// </summary>
+        /// <remarks>
+        /// Each run creates a new filter and, for every key, calls <c>IsNotPresent</c> before <c>Add</c>; a key that is
+        /// reported as present before its own add is counted as a false positive.
+        /// With a single rep the mean rate must be below three times the error rate; otherwise the mean minus one
+        /// standard deviation must be below the error rate.
+        /// </remarks>
+        /// <param name="parameters">Supplies <c>Dimensions.ErrorRate</c> for the precondition, log output and assertions.</param>
+        /// <param name="factory">Creates the filter under test; called once per run.</param>
+        /// <param name="keyFactory">Called with <paramref name="sampleSize"/>; returns the keys for one run.</param>
+        /// <param name="reps">Number of measured runs (the warmup run is extra).</param>
+        /// <param name="sampleSize">Number of keys per run; also the divisor for each rate.</param>
+        /// <param name="threads">Degree of parallelism for the check-and-add loop.</param>
+        public void VerifyContracts(BloomFilterParameters parameters, Func<IBloomFilter> factory, Func<int, IEnumerable<string>> keyFactory, int reps, int sampleSize, int threads)
+        {
+            // Precondition: the sample must be at least an order of magnitude larger than 10 / errorRate.
+            var minimumCapacityForErrorRate = (1d / parameters.Dimensions.ErrorRate) * 10;
+            var logOfMinimumCapacity = Math.Ceiling(Math.Log10(minimumCapacityForErrorRate));
+            var logOfCapacity = Math.Ceiling(Math.Log10(sampleSize));
+            logOfMinimumCapacity.Should().BeLessThan(logOfCapacity, "you can't get meaningful stats if the inverse of the error rate is within an order of magnitude of the sample size");
+
+            var falsePositiveRates = new List<double>();
+
+            var times = new List<double>();
+
+            // warmup run:
+            var timer = Stopwatch.StartNew();
+            var warmupResult = GetFalsePositiveCount();
+            timer.Stop();
+            _testOutputHelper.WriteLine($"Warmup run completed in {timer.Elapsed.TotalMilliseconds}ms, with {warmupResult} false positives for an error rate of: {warmupResult / (double) sampleSize}");
+
+
+            for (int i = 0; i < reps; i++)
+            {
+                timer.Restart();
+                var falsePositiveCount = GetFalsePositiveCount();
+                timer.Stop();
+                times.Add(timer.Elapsed.TotalMilliseconds);
+
+                var falsePositiveRate = (double) falsePositiveCount / (double) sampleSize;
+                falsePositiveRates.Add(falsePositiveRate);
+            }
+
+            var falsePositiveStats = new DescriptiveStatistics(falsePositiveRates);
+            var timeStats = new DescriptiveStatistics(times);
+
+            _testOutputHelper.WriteLine($"Expected error rate: {parameters.Dimensions.ErrorRate}");
+            _testOutputHelper.WriteLine($"Reps: {reps}");
+            _testOutputHelper.WriteLine($"Sample size: {sampleSize}");
+            _testOutputHelper.WriteLine($"Observed error rate stats:");
+            _testOutputHelper.WriteLine($" Mean: {falsePositiveStats.Mean}");
+            _testOutputHelper.WriteLine($" Min: {falsePositiveStats.Minimum}");
+            _testOutputHelper.WriteLine($" Max: {falsePositiveStats.Maximum}");
+            _testOutputHelper.WriteLine($" σ: {falsePositiveStats.StandardDeviation}");
+
+            _testOutputHelper.WriteLine($"Duration stats:");
+            _testOutputHelper.WriteLine($" Mean: {timeStats.Mean}ms");
+            _testOutputHelper.WriteLine($" Min: {timeStats.Minimum}ms");
+            _testOutputHelper.WriteLine($" Max: {timeStats.Maximum}ms");
+            _testOutputHelper.WriteLine($" σ: {timeStats.StandardDeviation}ms");
+
+            if (reps == 1)
+            {
+                // A single rep has no usable standard deviation, so fall back to a fixed multiple.
+                falsePositiveStats.Mean.Should().BeLessThan(3 * parameters.Dimensions.ErrorRate, "the actual false positive rate should be less than triple the expected rate");
+            }
+            else
+            {
+                var minusOneStandardDeviation = falsePositiveStats.Mean - falsePositiveStats.StandardDeviation;
+                minusOneStandardDeviation.Should().BeLessThan(parameters.Dimensions.ErrorRate, "the actual false positive rate should be within 1 standard deviation of the expected false positive rate");
+            }
+
+            // One run: fill a new filter and count keys that look present before they are added.
+            int GetFalsePositiveCount()
+            {
+                var sut = factory();
+                var falsePositiveCount = 0;
+                keyFactory(sampleSize).AsParallel()
+                    .WithDegreeOfParallelism(threads)
+                    .ForAll(s =>
+                    {
+                        if (!sut.IsNotPresent(s))
+                        {
+                            // Incremented from several worker threads, hence Interlocked.
+                            Interlocked.Increment(ref falsePositiveCount);
+                        }
+
+                        sut.Add(s);
+                    });
+                return falsePositiveCount;
+            }
+        }
+
+
+
+        // [Fact]
+        // public void CanAddAndCheckBloomFilterConcurrently()
+        // {
+        //     // We split the capacity between the training set and the verification set.
+ // var strings = Strings.Value; + // + // var count = strings.Count; + // var initialCapacity = count / 10; + // var errorRate = 0.01; + // var acceptableErrorRate = errorRate * 10; + // var acceptableErrorCount = (int) Math.Ceiling(count * acceptableErrorRate); + // + // var parameters = new BloomFilterParameters("test", initialCapacity, errorRate); + // var sut = new ScalingBloomFilter(parameters); + // + // var hitOnAddCount = 0; + // + // strings.AsParallel().WithDegreeOfParallelism(8) + // .ForAll(s => + // { + // if (!sut.Add(s)) + // { + // Interlocked.Increment(ref hitOnAddCount); + // } + // }); + // + // hitOnAddCount.Should().BeLessThan(acceptableErrorCount, "The hit rate on adds should be close to the false positive rate"); + // } + + // + // [Theory] + // [InlineData(100, 0.01)] + // [InlineData(1000, 0.01)] + // [InlineData(100, 0.5)] + // [InlineData(1000, 0.001)] + // public void CanAddAndCheckBloomFilterWithScaling(int count, double errorRate) + // { + // // We split the capacity between the training set and the verification set. 
+ // var strings = Strings.Value.Take(count).ToList(); + // + // // Start out at a small capacity to exercise scaling + // var initialCapacity = count / 10; + // + // var acceptableErrorRate = errorRate * 10; + // var acceptableErrorCount = (int) Math.Ceiling(count * acceptableErrorRate); + // + // var parameters = new BloomFilterParameters("test", initialCapacity, errorRate); + // var sut = new ScalingBloomFilter(LoggingOptions, parameters); + // + // var hitOnAddCount = 0; + // + // foreach (var s in strings) + // { + // var isNotPresent = sut.IsNotPresent(s); + // var preparedAdd = sut.PrepareAdd(s); + // + // preparedAdd.IsNotPresent.Should().Be(isNotPresent, "both methods of checking should have the same result"); + // + // if (!preparedAdd.Add()) + // { + // hitOnAddCount++; + // } + // } + // + // hitOnAddCount.Should().BeLessOrEqualTo(acceptableErrorCount, "The hit rate on adds should be close to the false positive rate"); + // + // foreach (var s in strings) + // { + // var result = sut.PrepareAdd(s); + // result.IsNotPresent.Should().BeFalse($"every string added should be known including '{s}'"); + // } + // + // var stringsAlt = StringsAlt.Value.Take(count).ToList(); + // + // var hitsOnCheck = 0; + // foreach (var s in stringsAlt) + // { + // var isNotPresent = sut.IsNotPresent(s); + // var preparedAdd = sut.PrepareAdd(s); + // + // preparedAdd.IsNotPresent.Should().Be(isNotPresent, "both methods of checking should have the same result"); + // + // if (!isNotPresent) + // { + // hitsOnCheck++; + // } + // } + // + // hitsOnCheck.Should().BeLessOrEqualTo(acceptableErrorCount, "The hit rate on strings not added to the filter should be close to the false positive rate"); + // } + // + // [Fact] + // public void CanAddAndCheckBloomFilterConcurrently() + // { + // // We split the capacity between the training set and the verification set. 
+ // var strings = Strings.Value; + // + // var count = strings.Count; + // var initialCapacity = count / 10; + // var errorRate = 0.01; + // var acceptableErrorRate = errorRate * 10; + // var acceptableErrorCount = (int) Math.Ceiling(count * acceptableErrorRate); + // + // var parameters = new BloomFilterParameters("test", initialCapacity, errorRate); + // var sut = new ScalingBloomFilter(parameters); + // + // var hitOnAddCount = 0; + // + // strings.AsParallel().WithDegreeOfParallelism(8) + // .ForAll(s => + // { + // if (!sut.Add(s)) + // { + // Interlocked.Increment(ref hitOnAddCount); + // } + // }); + // + // hitOnAddCount.Should().BeLessThan(acceptableErrorCount, "The hit rate on adds should be close to the false positive rate"); + // } + // + // [Theory] + // [InlineData(10, true)] + // [InlineData(100, false)] + // public void StateCanBeExportedAndImported(int initialCapacity, bool scalable) + // { + // const int count = 100; + // const double falsePositiveRate = 0.01; + // var strings = Strings.Value.Take(count).ToList(); + // + // var parameters = new BloomFilterParameters("test", initialCapacity, falsePositiveRate) + // { + // AllowScaling = scalable + // }; + // var sut = new ScalingBloomFilter(parameters); + // + // var hitOnAddCount = 0; + // + // foreach (var s in strings) + // { + // if (!sut.Add(s)) + // { + // hitOnAddCount++; + // } + // } + // + // var expectedHitCount = (int) Math.Ceiling(strings.Count * falsePositiveRate) + 1; + // hitOnAddCount.Should().BeLessOrEqualTo(expectedHitCount); + // + // foreach (var s in strings) + // { + // var key = new BloomFilterCheckRequest(s); + // var added = sut.Add(key); + // added.Should().BeFalse($"'{s} has been added before"); + // } + // + // var state = sut.ExportState(); + // state.Parameters.Should().BeEquivalentTo(parameters); + // state.Id.Should().Be(parameters.Id); + // state.Count.Should().Be(sut.Count); + // + // if (scalable) + // { + // state.Children.Should().HaveCountGreaterThan(1); + // 
}
+        //     else
+        //     {
+        //         state.Base64BitArray.Should().NotBeNullOrEmpty();
+        //     }
+        //
+        //     var json = JsonSerializer.Serialize(state, new JsonSerializerOptions() {WriteIndented = true});
+        //     _testOutputHelper.WriteLine(json);
+        //     ;
+        //
+        //
+        //     state.Count.Should().BeCloseTo(strings.Count, 10);
+        //
+        //     var sut2 = new ScalingBloomFilter(LoggingOptions, state);
+        //
+        //     var state2 = sut.ExportState();
+        //
+        //     state2.Should().BeEquivalentTo(state, "state should be applied correctly");
+        //
+        //     foreach (var s in strings)
+        //     {
+        //         var key = new BloomFilterCheckRequest(s);
+        //         var added = sut2.PrepareAdd(key);
+        //         added.IsNotPresent.Should().BeFalse($"'{s} has been added in previous instance");
+        //     }
+        //
+        //     var stringsAlt = StringsAlt.Value.Take(count).ToList();
+        //
+        //     var hitsOnCheck = 0;
+        //     foreach (var s in stringsAlt)
+        //     {
+        //         if (sut2.IsNotPresent(s) == false)
+        //         {
+        //             hitsOnCheck++;
+        //         }
+        //     }
+        //
+        //     hitsOnCheck.Should().BeCloseTo(expectedHitCount, 10, "The hit rate on strings not added to the filter should be close to the false positive rate");
+        // }
+    }
+}
\ No newline at end of file
diff --git a/tests/Bloomn.Tests/Bloomn.Tests.csproj b/tests/Bloomn.Tests/Bloomn.Tests.csproj
new file mode 100644
index 0000000..90fc9b1
--- /dev/null
+++ b/tests/Bloomn.Tests/Bloomn.Tests.csproj
@@ -0,0 +1,31 @@
+
+
+
+ net5.0
+
+ false
+
+
+
+
+
+
+
+
+
+
+
+ runtime; build; native; contentfiles; analyzers; buildtransitive
+ all
+
+
+ runtime; build; native; contentfiles; analyzers; buildtransitive
+ all
+
+
+
+
+
+
+
+
diff --git a/tests/Bloomn.Tests/ClassicBloomFilterTests.cs b/tests/Bloomn.Tests/ClassicBloomFilterTests.cs
new file mode 100644
index 0000000..80a53a5
--- /dev/null
+++ b/tests/Bloomn.Tests/ClassicBloomFilterTests.cs
@@ -0,0 +1,25 @@
+using Xunit.Abstractions;
+
+namespace Bloomn.Tests
+{
+    /// <summary>
+    /// Runs the tests inherited from <see cref="BloomFilterTestsBase"/> against <see cref="ClassicBloomFilter"/>.
+    /// </summary>
+    public class ClassicBloomFilterTests : BloomFilterTestsBase
+    {
+        public ClassicBloomFilterTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+        }
+
+        /// <summary>
+        /// Creates a <see cref="ClassicBloomFilter"/> from a new state that carries <paramref name="parameters"/>.
+        /// </summary>
+        public override IBloomFilter Create(BloomFilterOptions options, BloomFilterParameters parameters)
+        {
+            return new ClassicBloomFilter(options, new BloomFilterState()
+            {
+                Parameters = parameters
+            });
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/Bloomn.Tests/PerformanceExperiments.cs b/tests/Bloomn.Tests/PerformanceExperiments.cs
new file mode 100644
index 0000000..b86f40c
--- /dev/null
+++ b/tests/Bloomn.Tests/PerformanceExperiments.cs
@@ -0,0 +1,117 @@
+using System;
+using System.Diagnostics;
+using System.Timers;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Bloomn.Tests
+{
+    /// <summary>
+    /// Informal timing comparison of the same addition called through an interface, an abstract base class
+    /// and a delegate. The timings are written to the test output and never asserted.
+    /// </summary>
+    public class PerformanceExperiments
+    {
+        private readonly ITestOutputHelper _testOutputHelper;
+
+        public interface ITestAdder
+        {
+            int Add(int a, int b);
+        }
+
+        public class TestAdder: ITestAdder
+        {
+            public int Add(int a, int b)
+            {
+                return a + b;
+            }
+        }
+
+        public abstract class TestAdderBase
+        {
+            public abstract int Add(int a, int b);
+        }
+
+        public class TestAdderDerived: TestAdderBase
+        {
+            public override int Add(int a, int b)
+            {
+                return a + b;
+            }
+        }
+
+        public PerformanceExperiments(ITestOutputHelper testOutputHelper)
+        {
+            _testOutputHelper = testOutputHelper;
+        }
+
+        /// <summary>Times a loop whose counter is advanced by calling <c>Add(1, i)</c> through <see cref="ITestAdder"/>.</summary>
+        /// <param name="reps">Loop bound; each call raises the counter by one.</param>
+        /// <returns>The stopped stopwatch that covered the loop.</returns>
+        public Stopwatch RunInterface(int reps)
+        {
+            ITestAdder viaInterface = new TestAdder();
+            var timer = Stopwatch.StartNew();
+            for (int i = 0; i < reps;)
+            {
+                i = viaInterface.Add(1, i);
+            }
+            timer.Stop();
+            return timer;
+        }
+
+        /// <summary>Times a loop whose counter is advanced by calling <c>Add(1, i)</c> through <see cref="TestAdderBase"/>.</summary>
+        /// <param name="reps">Loop bound; each call raises the counter by one.</param>
+        /// <returns>The stopped stopwatch that covered the loop.</returns>
+        public Stopwatch RunDerived(int reps)
+        {
+            TestAdderBase sut = new TestAdderDerived();
+            var timer = Stopwatch.StartNew();
+            for (int i = 0; i < reps;)
+            {
+                i = sut.Add(1, i);
+            }
+            timer.Stop();
+            return timer;
+        }
+
+        /// <summary>Times a loop whose counter is advanced by invoking an addition delegate as <c>sut(1, i)</c>.</summary>
+        /// <param name="reps">Loop bound; each call raises the counter by one.</param>
+        /// <returns>The stopped stopwatch that covered the loop.</returns>
+        public Stopwatch RunFunc(int reps)
+        {
+            Func<int, int, int> sut = (a, b) => a + b;
+
+            var timer = Stopwatch.StartNew();
+            for (int i = 0; i < reps;)
+            {
+                i = sut(1, i);
+            }
+            timer.Stop();
+            return timer;
+        }
+
+        /// <summary>
+        /// Runs each variant once as a warmup, then times 100000 iterations of each and logs the elapsed ticks.
+        /// </summary>
+        [Fact]
+        public void InterfaceVsFuncTime()
+        {
+            // warmup
+            RunFunc(1);
+            RunDerived(1);
+            RunInterface(1);
+
+            // run
+            var reps = 100000;
+
+            var funcTimer = RunFunc(reps);
+            var derivedTimer = RunDerived(reps);
+            var interfaceTimer = RunInterface(reps);
+
+            _testOutputHelper.WriteLine($"Interface time with {reps}: {interfaceTimer.ElapsedTicks}");
+            _testOutputHelper.WriteLine($"Derived time with {reps}: {derivedTimer.ElapsedTicks}");
+            _testOutputHelper.WriteLine($"Func time with {reps}: {funcTimer.ElapsedTicks}");
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/Bloomn.Tests/ScalableBloomFilterTests.cs b/tests/Bloomn.Tests/ScalableBloomFilterTests.cs
new file mode 100644
index 0000000..88bd1dd
--- /dev/null
+++ b/tests/Bloomn.Tests/ScalableBloomFilterTests.cs
@@ -0,0 +1,93 @@
+using System.Collections.Concurrent;
+using System.Linq;
+using FluentAssertions;
+using Xunit;
+using Xunit.Abstractions;
+
+namespace Bloomn.Tests
+{
+    /// <summary>
+    /// Runs the tests inherited from <see cref="BloomFilterTestsBase"/> against <see cref="ScalingBloomFilter"/> and adds scaling cases.
+    /// </summary>
+    public class ScalableBloomFilterTests : BloomFilterTestsBase
+    {
+        public ScalableBloomFilterTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
+        {
+        }
+
+        /// <summary>
+        /// Creates a <see cref="ScalingBloomFilter"/> after applying <c>WithScaling(4, 0.8)</c> to <paramref name="parameters"/>.
+        /// </summary>
+        public override IBloomFilter Create(BloomFilterOptions options, BloomFilterParameters parameters)
+        {
+            parameters = parameters.WithScaling(4, 0.8);
+
+            return new ScalingBloomFilter(options, new BloomFilterState()
+            {
+                Parameters = parameters
+            });
+        }
+
+        /// <summary>
+        /// These tests are intended to be more debuggable and to provide a baseline for correct behavior.
+        /// They use the same set of keys on every run and log events.
+        /// </summary>
+        /// <param name="count">Number of keys pushed through the filter in the run.</param>
+        /// <param name="capacity">Capacity the filter is initially dimensioned for.</param>
+        /// <param name="errorRate">Target false-positive rate the filter is dimensioned for.</param>
+        /// <param name="threads">Degree of parallelism passed to <c>VerifyContracts</c>.</param>
+        [Theory]
+        [InlineData(10000, 1000, 0.01, 1)]
+        [InlineData(10000, 1000, 0.01, 8)]
+        public void PredictableStringsTestsWithScaling(int count, int capacity, double errorRate, int threads)
+        {
+            var parameters = new BloomFilterParameters("test")
+                .WithCapacityAndErrorRate(capacity, errorRate);
+
+            VerifyContracts(
+                parameters,
+                () => Create(Options, parameters),
+                PredictableStrings,
+                1,
+                count,
+                threads
+            );
+        }
+
+        /// <summary>
+        /// These tests focus on performance and statistical correctness. They use random values on each run
+        /// and run many reps of the same parameters to build a statistical picture of the behavior of the implementation.
+        /// </summary>
+        /// <param name="reps">Number of measured runs passed to <c>VerifyContracts</c>.</param>
+        /// <param name="count">Number of keys inserted by the chart and pushed through each measured run.</param>
+        /// <param name="capacity">Capacity the filter is initially dimensioned for.</param>
+        /// <param name="errorRate">Target false-positive rate the filter is dimensioned for.</param>
+        /// <param name="threads">Degree of parallelism passed to <c>VerifyContracts</c>.</param>
+        [Theory]
+        [InlineData(4, 10000, 1000, 0.01, 1)]
+        [InlineData(4, 10000, 1000, 0.01, 4)]
+        public void RandomStringsTestsWithScaling(int reps, int count, int capacity, double errorRate, int threads)
+        {
+            var parameters = new BloomFilterParameters("test")
+                .WithCapacityAndErrorRate(capacity, errorRate);
+
+            ChartFalsePositiveRates(
+                parameters,
+                () => Create(Options, parameters),
+                RandomStrings,
+                count,
+                1000,
+                100
+            );
+
+            VerifyContracts(
+                parameters,
+                () => Create(Options, parameters),
+                RandomStrings,
+                reps,
+                count,
+                threads
+            );
+        }
+    }
+}
\ No newline at end of file