Merge pull request #49 from msamsami/add-var-smoothing-gwnb
feat: add `var_smoothing` parameter to `GaussianWNB`
msamsami authored Feb 13, 2025
2 parents 47b5422 + 4eb242a commit 1bb8693
Showing 4 changed files with 68 additions and 9 deletions.
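Not part of the commit itself, but for orientation, here is a minimal usage sketch of the new parameter. It assumes the `wnb` package exposes `GaussianWNB`, the `var_smoothing` keyword, and the fitted `epsilon_` attribute exactly as the diffs below describe; the toy data is borrowed from the new test.

```python
# Hypothetical usage sketch (not repository code).
import numpy as np
from wnb import GaussianWNB

X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]])  # second feature is constant
y = np.array([1, 1, 2, 2, 2])

clf = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-9)
clf.fit(X, y)

# epsilon_ = var_smoothing * largest feature variance (here 1e-9 * 2.0); it is
# added to every per-class variance so that constant features cannot zero out
# the standard deviations used in the likelihood.
print(clf.epsilon_)
print(clf.predict(np.array([[2.5, 0]])))
```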
2 changes: 1 addition & 1 deletion README.md
@@ -130,7 +130,7 @@ We conducted benchmarks on four datasets, [Wine](https://scikit-learn.org/stable
| Wine | GaussianNB | 0.9749 | GeneralNB | **0.9812** |
| Iris | GaussianNB | 0.9556 | GeneralNB | **0.9602** |
| Digits | GaussianNB | 0.8372 | GeneralNB | **0.8905** |
| Breast Cancer | GaussianNB | 0.9389 | GaussianWNB | **0.9512** |
| Breast Cancer | GaussianNB | 0.9389 | GaussianWNB | **0.9519** |

These benchmarks highlight the potential of WNB classifiers to provide better performance in certain scenarios by allowing more flexibility in the choice of distributions and incorporating weighting strategies.

2 changes: 1 addition & 1 deletion tests/benchmarks/main.py
@@ -45,7 +45,7 @@ def benchmark_digits() -> None:
def benchmark_breast_cancer() -> None:
X, y = load_breast_cancer(return_X_y=True)

clf_wnb = GaussianWNB(max_iter=20, step_size=0.01, C=1.5)
clf_wnb = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-12)
clf_sklearn = GaussianNB()
score_wnb, score_sklearn = benchmark(X, y, clf_wnb, clf_sklearn, MAX_ITER)

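The `benchmark(...)` helper called above is not part of this diff, so its splitting strategy is not shown here. As a rough, assumed equivalent, the breast-cancer comparison boils down to averaging held-out accuracy of `GaussianWNB` against scikit-learn's `GaussianNB` over repeated splits, roughly like this sketch (split sizes, seeds, and the repeat count stand in for `MAX_ITER` and are assumptions):

```python
# Hedged sketch of what the breast-cancer benchmark measures; not the actual
# tests/benchmarks/main.py logic.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from wnb import GaussianWNB

X, y = load_breast_cancer(return_X_y=True)
scores_wnb, scores_sk = [], []
for seed in range(20):  # stands in for MAX_ITER repetitions (value assumed)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=seed)
    clf_wnb = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-12).fit(X_tr, y_tr)
    clf_sk = GaussianNB().fit(X_tr, y_tr)
    scores_wnb.append(accuracy_score(y_te, clf_wnb.predict(X_te)))
    scores_sk.append(accuracy_score(y_te, clf_sk.predict(X_te)))
print(np.mean(scores_wnb), np.mean(scores_sk))
```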
34 changes: 34 additions & 0 deletions tests/test_gwnb.py
@@ -116,6 +116,40 @@ def test_gwnb_prior_large_bias():
assert clf.predict(np.array([[-0.1, -0.1]])) == np.array([2])


def test_gwnb_var_smoothing():
"""
Test whether var_smoothing parameter properly affects the variances.
"""
X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]]) # First feature has variance 2.0
y = np.array([1, 1, 2, 2, 2])

clf1 = GaussianWNB(var_smoothing=0.0)
clf1.fit(X, y)

clf2 = GaussianWNB(var_smoothing=1.0)
clf2.fit(X, y)

test_point = np.array([[2.5, 0]])
prob1 = clf1.predict_proba(test_point)
prob2 = clf2.predict_proba(test_point)

assert not np.allclose(prob1, prob2)
assert clf1.epsilon_ == 0.0
assert clf2.epsilon_ > clf1.epsilon_


def test_gwnb_neg_var_smoothing():
"""
Test whether an error is raised in case of negative var_smoothing.
"""
clf = GaussianWNB(var_smoothing=-1.0)

msg_1 = "Variance smoothing parameter must be a non-negative real number"
msg_2 = "'var_smoothing' parameter of GaussianWNB must be a float in the range \[0.0, inf\)"
with pytest.raises(ValueError, match=rf"{msg_1}|{msg_2}"):
clf.fit(X, y)


def test_gwnb_non_binary():
"""
Test if an error is raised when given non-binary targets.
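The two new tests condense into a quick interactive check (a sketch, not repository code; `X` and `y` are the small arrays from `test_gwnb_var_smoothing`, and which of the two error messages appears depends on whether scikit-learn's parameter validation is active, as the alternation in `test_gwnb_neg_var_smoothing` reflects):

```python
# Hypothetical REPL-style check mirroring the new tests.
import numpy as np
from wnb import GaussianWNB

X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]])
y = np.array([1, 1, 2, 2, 2])
x_new = np.array([[2.5, 0]])

p0 = GaussianWNB(var_smoothing=0.0).fit(X, y).predict_proba(x_new)
p1 = GaussianWNB(var_smoothing=1.0).fit(X, y).predict_proba(x_new)
print(np.allclose(p0, p1))  # False: the smoothing term changes the effective variances

try:
    GaussianWNB(var_smoothing=-1.0).fit(X, y)
except ValueError as exc:
    print(exc)  # raised either by sklearn's constraint check or by _init_parameters
```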
39 changes: 32 additions & 7 deletions wnb/gwnb.py
@@ -42,6 +42,7 @@ def _get_parameter_constraints() -> dict[str, list[Any]]:
"step_size": [Interval(Real, 0.0, None, closed="neither")],
"penalty": [StrOptions({"l1", "l2"})],
"C": [Interval(Real, 0.0, None, closed="left")],
"var_smoothing": [Interval(Real, 0, None, closed="left")],
"learning_hist": ["boolean"],
}
except (ImportError, ModuleNotFoundError):
@@ -73,6 +74,10 @@ class GaussianWNB(_BaseNB):
C : float, default=1.0
Regularization strength. Must be strictly positive.
var_smoothing : float, default=1e-9
Portion of the largest variance of all features that is added to
variances for calculation stability.
learning_hist : bool, default=False
Whether to record the learning history, i.e., the value of cost function
in each learning iteration.
@@ -91,6 +96,9 @@ class GaussianWNB(_BaseNB):
n_classes_ : int
Number of classes seen during :term:`fit`.
epsilon_ : float
Absolute additive value to variances.
n_features_in_ : int
Number of features seen during :term:`fit`.
@@ -151,6 +159,7 @@ def __init__(
step_size: Float = 1e-4,
penalty: str = "l2",
C: Float = 1.0,
var_smoothing: Float = 1e-9,
learning_hist: bool = False,
) -> None:
self.priors = priors
@@ -159,6 +168,7 @@ def __init__(
self.step_size = step_size
self.penalty = penalty
self.C = C
self.var_smoothing = var_smoothing
self.learning_hist = learning_hist

if SKLEARN_V1_6_OR_LATER:
@@ -269,6 +279,13 @@ def _init_parameters(self) -> None:
% self.max_iter
)

# Ensure variance smoothing is a non-negative real number
if not isinstance(self.var_smoothing, Real) or self.var_smoothing < 0:
raise ValueError(
"Variance smoothing parameter must be a non-negative real number; got (var_smoothing=%r) instead."
% self.var_smoothing
)

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
"""Fits Gaussian Binary MLD-WNB classifier according to X, y.
@@ -299,6 +316,8 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:

self._init_parameters()

self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()

self.theta_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
self.std_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
self.var_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
@@ -325,18 +344,22 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:

return self

def _get_std(self) -> np.ndarray:
return np.sqrt(self.var_ + self.epsilon_)

def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list[Float]]:
_lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(X.shape[0])]

if learning_hist:
std = self._get_std()
_cost = 0.0
for i in range(X.shape[0]):
_sum = np.log(self.class_prior_[1] / self.class_prior_[0])
x = X[i, :]
for j in range(self.n_features_in_):
_sum += self.coef_[j] * (
np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 1], self.std_[j, 1]))
- np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 0], self.std_[j, 0]))
np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 1], std[j, 1]))
- np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 0], std[j, 0]))
)
_cost += _lambda[i] * _sum
else:
@@ -345,24 +368,25 @@ def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list
return _cost, _lambda

def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray:
std = self._get_std()
_grad = np.repeat(
np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1),
np.log(std[:, 0] / std[:, 1]).reshape(1, -1),
X.shape[0],
axis=0,
)
_grad += (
0.5
* (
(X - np.repeat(self.theta_[:, 0].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(self.std_[:, 0].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(std[:, 0].reshape(1, -1), X.shape[0], axis=0))
)
** 2
)
_grad -= (
0.5
* (
(X - np.repeat(self.theta_[:, 1].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(self.std_[:, 1].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(std[:, 1].reshape(1, -1), X.shape[0], axis=0))
)
** 2
)
@@ -376,10 +400,11 @@ def _predict(self, X: MatrixLike) -> np.ndarray:
return np.argmax(jll, axis=1)

def _joint_log_likelihood(self, X) -> np.ndarray:
std = self._get_std()
log_priors = np.tile(np.log(self.class_prior_), (X.shape[0], 1))
w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_))
term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * self.std_)))
var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(self.std_, self.std_))
term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * std)))
var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(std, std))
mu_by_var = np.multiply(self.theta_, var_inv)
term2 = -0.5 * (
np.matmul(np.multiply(X, X), var_inv)
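Taken together, the gwnb.py changes implement one mechanism: `fit` stores `epsilon_ = var_smoothing * max(feature variances)`, and the new `_get_std` helper adds that floor to every per-class variance before the standard deviation enters the cost, gradient, and joint log-likelihood. A standalone numpy sketch of that arithmetic (assumed shapes only; not the class itself):

```python
# Standalone sketch of the smoothing arithmetic added in gwnb.py; var_ is assumed
# to have shape (n_features, n_classes), as in GaussianWNB.
import numpy as np

def smoothed_std(var_, X, var_smoothing=1e-9):
    # fit(): epsilon_ = var_smoothing * largest feature variance in the training data
    epsilon = var_smoothing * np.var(X, axis=0).max()
    # _get_std(): the same additive floor applied to every per-feature, per-class variance
    return np.sqrt(var_ + epsilon)

X = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0], [4.0, 0.0], [5.0, 0.0]])
var_ = np.array([[0.25, 1.0],   # feature 0: per-class variances
                 [0.0, 0.0]])   # feature 1 is constant, so both per-class variances are zero
print(smoothed_std(var_, X, var_smoothing=0.0))   # zeros survive: downstream 1/std**2 terms become non-finite
print(smoothed_std(var_, X, var_smoothing=1e-9))  # strictly positive everywhere
```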
