New stable public API version of PipelineDP4j

C++: * Fix sensitivity in bounded sum when bounds provider returns 0 bounds. * Fix sensitivity issue in Mean when BoundsProvider returns equal bounds. PoB: * Remove flag enableShardedPublicPartitions. * Remove unused internal methods related to public partitions. PipelineDP4j: * Release new stable public API replacing the experimental API. * Update examples to the new API. Other: * Fix GitHub Actions by installing libreadline-dev. Change-Id: I64b49c2ba8284a0cbb0b4f814e5919ba35f04715 GitOrigin-RevId: 4c30df09c61fc11eb638bf8c46a549ae3cb669b6
google · Feb 6, 2025 · 38c0813 · 38c0813
1 parent 31123a5
commit 38c0813
Show file tree

Hide file tree

Showing 46 changed files with 7,318 additions and 1,747 deletions.
diff --git a/cc/algorithms/bounded-mean.h b/cc/algorithms/bounded-mean.h
@@ -382,6 +382,25 @@ class BoundedMeanWithApproxBounds : public BoundedMean<T> {
   absl::StatusOr<Output> GenerateResult(double noise_interval_level) override {
     ASSIGN_OR_RETURN(BoundsResult<T> bounds,
                      bounds_provider_->FinalizeAndCalculateBounds());
+
+    if (bounds.lower_bound == bounds.upper_bound) {
+      // When the bounds provider returns equal bounds, sensitivity is 0, so we
+      // need to slightly widen the bounds.  This is a quick fix that works with
+      // BoundsProvider returning powers of two.
+      //
+      // TODO: Find a better solution for this quick fix.
+      if (std::round(bounds.lower_bound) == -1 ||
+          std::round(bounds.lower_bound) == 0) {
+        bounds.upper_bound += 1;
+      } else if (std::round(bounds.upper_bound) == 1) {
+        bounds.lower_bound = 0;
+      } else if (bounds.lower_bound < 0) {
+        bounds.upper_bound = bounds.lower_bound / 2;
+      } else {
+        bounds.lower_bound = bounds.upper_bound / 2;
+      }
+    }
+
     RETURN_IF_ERROR(
         BoundedMean<T>::CheckBounds(bounds.lower_bound, bounds.upper_bound));
 

diff --git a/cc/algorithms/bounded-sum.h b/cc/algorithms/bounded-sum.h
@@ -375,6 +375,17 @@ class BoundedSumWithApproxBounds : public BoundedSum<T> {
   absl::StatusOr<Output> GenerateResult(double noise_interval_level) override {
     ASSIGN_OR_RETURN(BoundsResult<T> bounds_result,
                      bounds_provider_->FinalizeAndCalculateBounds());
+
+    if (bounds_result.lower_bound == 0 && bounds_result.upper_bound == 0) {
+      // In case both bounds are 0, we need to enlarge the bounds range as we
+      // would otherwise have 0 sensitivity.  This is a quick fix that works
+      // with BoundsProvider returning powers of two.
+      //
+      // TODO: Find a better solution for this quick fix.
+      bounds_result.upper_bound = 1;
+      bounds_result.lower_bound = -1;
+    }
+
     bounds_result = RelaxBoundsSameSumSensitivity(bounds_result);
 
     // Construct NumericalMechanism.

diff --git a/cc/algorithms/numerical-mechanisms_test.cc b/cc/algorithms/numerical-mechanisms_test.cc
@@ -412,36 +412,6 @@ TEST(NumericalMechanismsTest, LaplaceNoisedValueAboveThreshold) {
   }
 }
 
-TEST(NumericalMechanismsTest, LaplaceMechanismAddNoiseOverflowFromTypeCast) {
-  LaplaceMechanism mechanism(1);
-
-  // The noise should eventually be positive, which, when added to the numeric
-  // limit, will cause an overflow and return a negative result.
-  int i;
-  for (i = 0; i < 100; ++i) {
-    if (mechanism.AddNoise(std::numeric_limits<int64_t>::max()) < 0) {
-      // An overflow has happened, so return to end the test as a success.
-      return;
-    }
-  }
-  FAIL() << "No overflow occurred after " << i << " iterations.";
-}
-
-TEST(NumericalMechanismsTest, LaplaceMechanismAddNoiseUnderflowFromTypeCast) {
-  LaplaceMechanism mechanism(1);
-
-  // The noise should eventually be negative, which, when added to the numeric
-  // limit, will cause an underflow and return a positive result.
-  int i;
-  for (i = 0; i < 100; ++i) {
-    if (mechanism.AddNoise(std::numeric_limits<int64_t>::lowest()) > 0) {
-      // An underflow has happened, so return to end the test as a success.
-      return;
-    }
-  }
-  FAIL() << "No overflow occurred after " << i << " iterations.";
-}
-
 TEST(NumericalMechanismsTest, LaplaceDiversityCorrect) {
   LaplaceMechanism mechanism(1.0, 1.0);
   EXPECT_EQ(mechanism.GetDiversity(), 1.0);
@@ -1081,26 +1051,6 @@ TEST(NumericalMechanismsTest, GaussianMechanismNoisedValueAboveThreshold) {
   }
 }
 
-TEST(NumericalMechanismsTest, GaussianMechanismAddNoiseOverflowFromTypeCast) {
-  auto mechanism = GaussianMechanism::Builder()
-                       .SetL2Sensitivity(1)
-                       .SetEpsilon(1)
-                       .SetDelta(0.5)
-                       .Build();
-  ASSERT_OK(mechanism);
-
-  // The noise should eventually be positive, which, when added to the numeric
-  // limit, will cause an overflow and return a negative result.
-  int i;
-  for (i = 0; i < 100; ++i) {
-    if ((*mechanism)->AddNoise(std::numeric_limits<int64_t>::max()) < 0) {
-      // An overflow has happened, so return to end the test as a success.
-      return;
-    }
-  }
-  FAIL() << "No overflow occurred after " << i << " iterations.";
-}
-
 TEST(NumericalMechanismsTest, GaussianBuilderClone) {
   GaussianMechanism::Builder test_builder;
   auto clone =

diff --git a/examples/pipelinedp4j/README.md b/examples/pipelinedp4j/README.md
@@ -1,4 +1,4 @@
-# Running code walkthrough of BeamExample
+# Running code walkthrough of the examples
 
 Begin by reviewing the [Problem statement](#problem-statement) to understand the
 statistics we aim to calculate and the data involved.
@@ -27,10 +27,21 @@ Using this data, we want to compute the following statistics:
 *   Number of users who watched a certain movie (`privacy_id_count` metric)
 *   Average rating of a certain movie (`mean` metric)
 
-The output is a TXT file in this format:
+If you used a DataFrame API then the output is a CSV file in the following format:
 
 ```
-movieId=<value>, numberOfViews=<value>, averageOfRatings=<value>
+movieId, numberOfViewers, numberOfViews, averageOfRatings
+value, value, value, value
+value, value, value, value
+...
+```
+
+If you used a row-based API then the output will be in the following format:
+
+```
+movieId=<value>, numberOfViewers=<value>, numberOfViews=<value>, averageOfRatings=<value>
+movieId=<value>, numberOfViewers=<value>, numberOfViews=<value>, averageOfRatings=<value>
+...
 ```
 
 ## Running
@@ -87,12 +98,16 @@ Here's are the steps to build and run the example assuming you are in the
     bazelisk build ...
     ```
 
-1.  Run the program:
+1.  Run the program (change `BeamExample` to `SparkExample` or `SparkDatasetExample` if you need):
 
     ```shell
     bazel-bin/src/main/java/com/google/privacy/differentialprivacy/pipelinedp4j/examples/BeamExample --inputFilePath=netflix_data.csv --outputFilePath=output.txt
     ```
 
+    If you run examples on Apache Spark then `outputFilePath` should be changed
+    to `outputFolder` and will be a directory where output is written to.
+    The result will be stored in a file whose name starts with `part-00000`.
+
 1.  View the results:
 
     ```shell

diff --git a/...4j/src/main/java/com/google/privacy/differentialprivacy/pipelinedp4j/examples/BUILD.bazel b/...4j/src/main/java/com/google/privacy/differentialprivacy/pipelinedp4j/examples/BUILD.bazel
@@ -56,3 +56,26 @@ java_binary(
         "@maven//:org_scala_lang_scala_library",
     ],
 )
+
+java_binary(
+    name = "SparkDataFrameExample",
+    srcs = [
+        "MovieMetrics.java",
+        "MovieView.java",
+        "SparkDataFrameExample.java",
+    ],
+    main_class = "com.google.privacy.differentialprivacy.pipelinedp4j.examples.SparkDataFrameExample",
+    deps = [
+        "@com_google_privacy_differentialprivacy_pipielinedp4j//main/com/google/privacy/differentialprivacy/pipelinedp4j/api",
+        "@maven//:com_fasterxml_jackson_core_jackson_databind",
+        "@maven//:com_fasterxml_jackson_module_jackson_module_paranamer",
+        "@maven//:com_google_guava_guava",
+        "@maven//:info_picocli_picocli",
+        "@maven//:org_apache_spark_spark_catalyst_2_13",
+        "@maven//:org_apache_spark_spark_core_2_13",
+        "@maven//:org_apache_spark_spark_mllib_2_13",
+        "@maven//:org_apache_spark_spark_sql_2_13",
+        "@maven//:org_jetbrains_kotlin_kotlin_stdlib",
+        "@maven//:org_scala_lang_scala_library",
+    ],
+)
diff --git a/...c/main/java/com/google/privacy/differentialprivacy/pipelinedp4j/examples/BeamExample.java b/...c/main/java/com/google/privacy/differentialprivacy/pipelinedp4j/examples/BeamExample.java
@@ -24,10 +24,15 @@
 import static java.lang.Math.round;
 import static java.util.stream.Collectors.toCollection;
 
+import com.google.privacy.differentialprivacy.pipelinedp4j.api.BeamQueryBuilder;
+import com.google.privacy.differentialprivacy.pipelinedp4j.api.Bounds;
+import com.google.privacy.differentialprivacy.pipelinedp4j.api.ContributionBoundingLevel;
+import com.google.privacy.differentialprivacy.pipelinedp4j.api.ContributionBounds;
+import com.google.privacy.differentialprivacy.pipelinedp4j.api.GroupsType;
 import com.google.privacy.differentialprivacy.pipelinedp4j.api.NoiseKind;
-import com.google.privacy.differentialprivacy.pipelinedp4j.api.QueryBuilder;
 import com.google.privacy.differentialprivacy.pipelinedp4j.api.QueryPerGroupResult;
 import com.google.privacy.differentialprivacy.pipelinedp4j.api.TotalBudget;
+import com.google.privacy.differentialprivacy.pipelinedp4j.api.ValueAggregationsBuilder;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.stream.IntStream;
@@ -100,29 +105,33 @@ static void runBeamExample(BeamExampleOptions options) {
     PCollection<MovieView> data = readData(pipeline, options.getInputFilePath());
 
     // Define the query
+    var groupsType =
+        options.getUsePublicGroups()
+            ? GroupsType.PublicGroups.create(publiclyKnownMovieIds(pipeline))
+            : new GroupsType.PrivateGroups();
     var query =
-        QueryBuilder.from(data, /* privacyIdExtractor= */ new UserIdExtractor())
-            .groupBy(
-                /* groupKeyExtractor= */ new MovieIdExtractor(),
-                /* maxGroupsContributed= */ 3,
-                /* maxContributionsPerGroup= */ 1,
-                options.getUsePublicGroups() ? publiclyKnownMovieIds(pipeline) : null)
-            .countDistinctPrivacyUnits("numberOfViewers")
+        BeamQueryBuilder.from(
+                data,
+                /* privacyUnitExtractor= */ MovieView::getUserId,
+                new ContributionBoundingLevel.DATASET_LEVEL(
+                    /* maxGroupsContributed= */ 3, /* maxContributionsPerGroup= */ 1))
+            .groupBy(/* groupKeyExtractor= */ MovieView::getMovieId, groupsType)
+            .countDistinctPrivacyUnits(/* outputColumnName= */ "numberOfViewers")
             .count(/* outputColumnName= */ "numberOfViews")
-            .mean(
-                new RatingExtractor(),
-                /* minValue= */ 1.0,
-                /* maxValue= */ 5.0,
-                /* outputColumnName= */ "averageOfRatings",
-                /* budget= */ null)
-            .build();
+            .aggregateValue(
+                /* valueExtractor= */ new RatingExtractor(),
+                /* valueAggregations= */ new ValueAggregationsBuilder()
+                    .mean(/* outputColumnName= */ "averageOfRatings"),
+                /* contributionBounds= */ new ContributionBounds(
+                    /* totalValueBounds= */ null,
+                    /* valueBounds= */ new Bounds(/* minValue= */ 1.0, /* maxValue= */ 5.0)))
+            .build(new TotalBudget(/* epsilon= */ 1.1, /* delta= */ 1e-10), NoiseKind.LAPLACE);
     // Run the query with DP parameters.
-    PCollection<QueryPerGroupResult> result =
-        query.run(new TotalBudget(/* epsilon= */ 1.1, /* delta= */ 1e-10), NoiseKind.LAPLACE);
+    PCollection<QueryPerGroupResult<String>> result = query.run();
 
     // Convert the result to better representation, i.e. to MovieMetrics.
     var movieMetricsCoder = AvroCoder.of(MovieMetrics.class);
-    SerializableFunction<QueryPerGroupResult, MovieMetrics> mapToMovieMetricsFn =
+    SerializableFunction<QueryPerGroupResult<String>, MovieMetrics> mapToMovieMetricsFn =
         perGroupResult -> {
           String movieId = perGroupResult.getGroupKey();
           long numberOfViewers =
@@ -148,24 +157,12 @@ static void runBeamExample(BeamExampleOptions options) {
     System.out.println("Finished calculations.");
   }
 
-  // Data extractors. They always have to implement Function1 and Serializable interfaces. If it
-  // doesn't implement Serializable interface, it will fail on Beam. If it doesn't implement
-  // Function1, it will fail at compile time due to types mismatch. Do not use lambdas for data
-  // extractors as they won't be serializable.
-  private static class UserIdExtractor implements Function1<MovieView, String>, Serializable {
-    @Override
-    public String invoke(MovieView movieView) {
-      return movieView.getUserId();
-    }
-  }
-
-  private static class MovieIdExtractor implements Function1<MovieView, String>, Serializable {
-    @Override
-    public String invoke(MovieView movieView) {
-      return movieView.getMovieId();
-    }
-  }
-
+  /**
+   * Static extractor for rating extraction.
+   *
+   * <p>Rating extractor must be serializable. In Java, we can't use lambdas, method references or
+   * anonymous classes because they capture `this` and therefore are not serializable.
+   */
   private static class RatingExtractor implements Function1<MovieView, Double>, Serializable {
     @Override
     public Double invoke(MovieView movieView) {