Skip to content

Commit

Permalink
New stable public API version of PipelineDP4j
Browse files Browse the repository at this point in the history
C++:
* Fix sensitivity in bounded sum when bounds provider returns 0 bounds.
* Fix sensitivity issue in Mean when BoundsProvider returns equal bounds.

PoB:
* Remove flag enableShardedPublicPartitions.
* Remove unused internal methods related to public partitions.

PipelineDP4j:
* Release new stable public API replacing the experimental API.
* Update examples to the new API.

Other:
* Fix GitHub Actions by installing libreadline-dev.

Change-Id: I64b49c2ba8284a0cbb0b4f814e5919ba35f04715
GitOrigin-RevId: 4c30df09c61fc11eb638bf8c46a549ae3cb669b6
  • Loading branch information
Differential Privacy Team authored and RamSaw committed Feb 6, 2025
1 parent 31123a5 commit 38c0813
Show file tree
Hide file tree
Showing 46 changed files with 7,318 additions and 1,747 deletions.
19 changes: 19 additions & 0 deletions cc/algorithms/bounded-mean.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,25 @@ class BoundedMeanWithApproxBounds : public BoundedMean<T> {
absl::StatusOr<Output> GenerateResult(double noise_interval_level) override {
ASSIGN_OR_RETURN(BoundsResult<T> bounds,
bounds_provider_->FinalizeAndCalculateBounds());

if (bounds.lower_bound == bounds.upper_bound) {
// When the bounds provider returns equal bounds, sensitivity is 0, so we
// need to slightly widen the bounds. This is a quick fix that works with
// BoundsProvider returning powers of two.
//
// TODO: Find a better solution for this quick fix.
if (std::round(bounds.lower_bound) == -1 ||
std::round(bounds.lower_bound) == 0) {
bounds.upper_bound += 1;
} else if (std::round(bounds.upper_bound) == 1) {
bounds.lower_bound = 0;
} else if (bounds.lower_bound < 0) {
bounds.upper_bound = bounds.lower_bound / 2;
} else {
bounds.lower_bound = bounds.upper_bound / 2;
}
}

RETURN_IF_ERROR(
BoundedMean<T>::CheckBounds(bounds.lower_bound, bounds.upper_bound));

Expand Down
11 changes: 11 additions & 0 deletions cc/algorithms/bounded-sum.h
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,17 @@ class BoundedSumWithApproxBounds : public BoundedSum<T> {
absl::StatusOr<Output> GenerateResult(double noise_interval_level) override {
ASSIGN_OR_RETURN(BoundsResult<T> bounds_result,
bounds_provider_->FinalizeAndCalculateBounds());

if (bounds_result.lower_bound == 0 && bounds_result.upper_bound == 0) {
// In case both bounds are 0, we need to enlarge the bounds range as we
// would otherwise have 0 sensitivity. This is a quick fix that works
// with BoundsProvider returning powers of two.
//
// TODO: Find a better solution for this quick fix.
bounds_result.upper_bound = 1;
bounds_result.lower_bound = -1;
}

bounds_result = RelaxBoundsSameSumSensitivity(bounds_result);

// Construct NumericalMechanism.
Expand Down
50 changes: 0 additions & 50 deletions cc/algorithms/numerical-mechanisms_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -412,36 +412,6 @@ TEST(NumericalMechanismsTest, LaplaceNoisedValueAboveThreshold) {
}
}

TEST(NumericalMechanismsTest, LaplaceMechanismAddNoiseOverflowFromTypeCast) {
LaplaceMechanism mechanism(1);

// The noise should eventually be positive, which, when added to the numeric
// limit, will cause an overflow and return a negative result.
int i;
for (i = 0; i < 100; ++i) {
if (mechanism.AddNoise(std::numeric_limits<int64_t>::max()) < 0) {
// An overflow has happened, so return to end the test as a success.
return;
}
}
FAIL() << "No overflow occurred after " << i << " iterations.";
}

TEST(NumericalMechanismsTest, LaplaceMechanismAddNoiseUnderflowFromTypeCast) {
LaplaceMechanism mechanism(1);

// The noise should eventually be negative, which, when added to the numeric
// limit, will cause an underflow and return a positive result.
int i;
for (i = 0; i < 100; ++i) {
if (mechanism.AddNoise(std::numeric_limits<int64_t>::lowest()) > 0) {
// An underflow has happened, so return to end the test as a success.
return;
}
}
FAIL() << "No overflow occurred after " << i << " iterations.";
}

TEST(NumericalMechanismsTest, LaplaceDiversityCorrect) {
LaplaceMechanism mechanism(1.0, 1.0);
EXPECT_EQ(mechanism.GetDiversity(), 1.0);
Expand Down Expand Up @@ -1081,26 +1051,6 @@ TEST(NumericalMechanismsTest, GaussianMechanismNoisedValueAboveThreshold) {
}
}

TEST(NumericalMechanismsTest, GaussianMechanismAddNoiseOverflowFromTypeCast) {
auto mechanism = GaussianMechanism::Builder()
.SetL2Sensitivity(1)
.SetEpsilon(1)
.SetDelta(0.5)
.Build();
ASSERT_OK(mechanism);

// The noise should eventually be positive, which, when added to the numeric
// limit, will cause an overflow and return a negative result.
int i;
for (i = 0; i < 100; ++i) {
if ((*mechanism)->AddNoise(std::numeric_limits<int64_t>::max()) < 0) {
// An overflow has happened, so return to end the test as a success.
return;
}
}
FAIL() << "No overflow occurred after " << i << " iterations.";
}

TEST(NumericalMechanismsTest, GaussianBuilderClone) {
GaussianMechanism::Builder test_builder;
auto clone =
Expand Down
23 changes: 19 additions & 4 deletions examples/pipelinedp4j/README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Running code walkthrough of BeamExample
# Running code walkthrough of the examples

Begin by reviewing the [Problem statement](#problem-statement) to understand the
statistics we aim to calculate and the data involved.
Expand Down Expand Up @@ -27,10 +27,21 @@ Using this data, we want to compute the following statistics:
* Number of users who watched a certain movie (`privacy_id_count` metric)
* Average rating of a certain movie (`mean` metric)

The output is a TXT file in this format:
If you used a DataFrame API then the output is a CSV file in the following format:

```
movieId=<value>, numberOfViews=<value>, averageOfRatings=<value>
movieId, numberOfViewers, numberOfViews, averageOfRatings
value, value, value, value
value, value, value, value
...
```

If you used a row-based API then the output will be in the following format:

```
movieId=<value>, numberOfViewers=<value>, numberOfViews=<value>, averageOfRatings=<value>
movieId=<value>, numberOfViewers=<value>, numberOfViews=<value>, averageOfRatings=<value>
...
```

## Running
Expand Down Expand Up @@ -87,12 +98,16 @@ Here's are the steps to build and run the example assuming you are in the
bazelisk build ...
```
1. Run the program:
1. Run the program (change `BeamExample` to `SparkExample` or `SparkDatasetExample` if you need):
```shell
bazel-bin/src/main/java/com/google/privacy/differentialprivacy/pipelinedp4j/examples/BeamExample --inputFilePath=netflix_data.csv --outputFilePath=output.txt
```
If you run examples on Apache Spark then `outputFilePath` should be changed
to `outputFolder` and will be a directory where output is written to.
The result will be stored in a file whose name starts with `part-00000`.
1. View the results:
```shell
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,3 +56,26 @@ java_binary(
"@maven//:org_scala_lang_scala_library",
],
)

java_binary(
name = "SparkDataFrameExample",
srcs = [
"MovieMetrics.java",
"MovieView.java",
"SparkDataFrameExample.java",
],
main_class = "com.google.privacy.differentialprivacy.pipelinedp4j.examples.SparkDataFrameExample",
deps = [
"@com_google_privacy_differentialprivacy_pipielinedp4j//main/com/google/privacy/differentialprivacy/pipelinedp4j/api",
"@maven//:com_fasterxml_jackson_core_jackson_databind",
"@maven//:com_fasterxml_jackson_module_jackson_module_paranamer",
"@maven//:com_google_guava_guava",
"@maven//:info_picocli_picocli",
"@maven//:org_apache_spark_spark_catalyst_2_13",
"@maven//:org_apache_spark_spark_core_2_13",
"@maven//:org_apache_spark_spark_mllib_2_13",
"@maven//:org_apache_spark_spark_sql_2_13",
"@maven//:org_jetbrains_kotlin_kotlin_stdlib",
"@maven//:org_scala_lang_scala_library",
],
)
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,15 @@
import static java.lang.Math.round;
import static java.util.stream.Collectors.toCollection;

import com.google.privacy.differentialprivacy.pipelinedp4j.api.BeamQueryBuilder;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.Bounds;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.ContributionBoundingLevel;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.ContributionBounds;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.GroupsType;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.NoiseKind;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.QueryBuilder;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.QueryPerGroupResult;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.TotalBudget;
import com.google.privacy.differentialprivacy.pipelinedp4j.api.ValueAggregationsBuilder;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.stream.IntStream;
Expand Down Expand Up @@ -100,29 +105,33 @@ static void runBeamExample(BeamExampleOptions options) {
PCollection<MovieView> data = readData(pipeline, options.getInputFilePath());

// Define the query
var groupsType =
options.getUsePublicGroups()
? GroupsType.PublicGroups.create(publiclyKnownMovieIds(pipeline))
: new GroupsType.PrivateGroups();
var query =
QueryBuilder.from(data, /* privacyIdExtractor= */ new UserIdExtractor())
.groupBy(
/* groupKeyExtractor= */ new MovieIdExtractor(),
/* maxGroupsContributed= */ 3,
/* maxContributionsPerGroup= */ 1,
options.getUsePublicGroups() ? publiclyKnownMovieIds(pipeline) : null)
.countDistinctPrivacyUnits("numberOfViewers")
BeamQueryBuilder.from(
data,
/* privacyUnitExtractor= */ MovieView::getUserId,
new ContributionBoundingLevel.DATASET_LEVEL(
/* maxGroupsContributed= */ 3, /* maxContributionsPerGroup= */ 1))
.groupBy(/* groupKeyExtractor= */ MovieView::getMovieId, groupsType)
.countDistinctPrivacyUnits(/* outputColumnName= */ "numberOfViewers")
.count(/* outputColumnName= */ "numberOfViews")
.mean(
new RatingExtractor(),
/* minValue= */ 1.0,
/* maxValue= */ 5.0,
/* outputColumnName= */ "averageOfRatings",
/* budget= */ null)
.build();
.aggregateValue(
/* valueExtractor= */ new RatingExtractor(),
/* valueAggregations= */ new ValueAggregationsBuilder()
.mean(/* outputColumnName= */ "averageOfRatings"),
/* contributionBounds= */ new ContributionBounds(
/* totalValueBounds= */ null,
/* valueBounds= */ new Bounds(/* minValue= */ 1.0, /* maxValue= */ 5.0)))
.build(new TotalBudget(/* epsilon= */ 1.1, /* delta= */ 1e-10), NoiseKind.LAPLACE);
// Run the query with DP parameters.
PCollection<QueryPerGroupResult> result =
query.run(new TotalBudget(/* epsilon= */ 1.1, /* delta= */ 1e-10), NoiseKind.LAPLACE);
PCollection<QueryPerGroupResult<String>> result = query.run();

// Convert the result to better representation, i.e. to MovieMetrics.
var movieMetricsCoder = AvroCoder.of(MovieMetrics.class);
SerializableFunction<QueryPerGroupResult, MovieMetrics> mapToMovieMetricsFn =
SerializableFunction<QueryPerGroupResult<String>, MovieMetrics> mapToMovieMetricsFn =
perGroupResult -> {
String movieId = perGroupResult.getGroupKey();
long numberOfViewers =
Expand All @@ -148,24 +157,12 @@ static void runBeamExample(BeamExampleOptions options) {
System.out.println("Finished calculations.");
}

// Data extractors. They always have to implement Function1 and Serializable interfaces. If it
// doesn't implement Serializable interface, it will fail on Beam. If it doesn't implement
// Function1, it will fail at compile time due to types mismatch. Do not use lambdas for data
// extractors as they won't be serializable.
private static class UserIdExtractor implements Function1<MovieView, String>, Serializable {
@Override
public String invoke(MovieView movieView) {
return movieView.getUserId();
}
}

private static class MovieIdExtractor implements Function1<MovieView, String>, Serializable {
@Override
public String invoke(MovieView movieView) {
return movieView.getMovieId();
}
}

/**
* Static extractor for rating extraction.
*
* <p>Rating extractor must be serializable. In Java, we can't use lambdas, method references or
* anonymous classes because they capture `this` and therefore are not serializable.
*/
private static class RatingExtractor implements Function1<MovieView, Double>, Serializable {
@Override
public Double invoke(MovieView movieView) {
Expand Down
Loading

0 comments on commit 38c0813

Please sign in to comment.