diff --git a/README.md b/README.md
index e4e4644..80f176e 100644
--- a/README.md
+++ b/README.md
@@ -130,7 +130,7 @@ We conducted benchmarks on four datasets, [Wine](https://scikit-learn.org/stable
 | Wine | GaussianNB | 0.9749 | GeneralNB | **0.9812** |
 | Iris | GaussianNB | 0.9556 | GeneralNB | **0.9602** |
 | Digits | GaussianNB | 0.8372 | GeneralNB | **0.8905** |
-| Breast Cancer | GaussianNB | 0.9389 | GaussianWNB | **0.9512** |
+| Breast Cancer | GaussianNB | 0.9389 | GaussianWNB | **0.9519** |
 
 These benchmarks highlight the potential of WNB classifiers to provide better performance in certain scenarios by allowing more flexibility in the choice of distributions and incorporating weighting strategies.
diff --git a/tests/benchmarks/main.py b/tests/benchmarks/main.py
index 6d4b2a3..278478a 100644
--- a/tests/benchmarks/main.py
+++ b/tests/benchmarks/main.py
@@ -45,7 +45,7 @@ def benchmark_digits() -> None:
 
 def benchmark_breast_cancer() -> None:
     X, y = load_breast_cancer(return_X_y=True)
-    clf_wnb = GaussianWNB(max_iter=20, step_size=0.01, C=1.5)
+    clf_wnb = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-12)
     clf_sklearn = GaussianNB()
 
     score_wnb, score_sklearn = benchmark(X, y, clf_wnb, clf_sklearn, MAX_ITER)
diff --git a/tests/test_gwnb.py b/tests/test_gwnb.py
index ac611a2..fc89b89 100644
--- a/tests/test_gwnb.py
+++ b/tests/test_gwnb.py
@@ -116,6 +116,40 @@ def test_gwnb_prior_large_bias():
     assert clf.predict(np.array([[-0.1, -0.1]])) == np.array([2])
 
 
+def test_gwnb_var_smoothing():
+    """
+    Test whether the var_smoothing parameter properly affects the variances.
+    """
+    X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]])  # First feature has variance 2.0
+    y = np.array([1, 1, 2, 2, 2])
+
+    clf1 = GaussianWNB(var_smoothing=0.0)
+    clf1.fit(X, y)
+
+    clf2 = GaussianWNB(var_smoothing=1.0)
+    clf2.fit(X, y)
+
+    test_point = np.array([[2.5, 0]])
+    prob1 = clf1.predict_proba(test_point)
+    prob2 = clf2.predict_proba(test_point)
+
+    assert not np.allclose(prob1, prob2)
+    assert clf1.epsilon_ == 0.0
+    assert clf2.epsilon_ > clf1.epsilon_
+
+
+def test_gwnb_neg_var_smoothing():
+    """
+    Test whether an error is raised in case of negative var_smoothing.
+    """
+    clf = GaussianWNB(var_smoothing=-1.0)
+
+    msg_1 = "Variance smoothing parameter must be a non-negative real number"
+    msg_2 = r"'var_smoothing' parameter of GaussianWNB must be a float in the range \[0.0, inf\)"
+    with pytest.raises(ValueError, match=rf"{msg_1}|{msg_2}"):
+        clf.fit(X, y)
+
+
 def test_gwnb_non_binary():
     """
     Test if an error is raised when given non-binary targets.
diff --git a/wnb/gwnb.py b/wnb/gwnb.py
index d1e76e2..219bdb6 100644
--- a/wnb/gwnb.py
+++ b/wnb/gwnb.py
@@ -42,6 +42,7 @@ def _get_parameter_constraints() -> dict[str, list[Any]]:
             "step_size": [Interval(Real, 0.0, None, closed="neither")],
             "penalty": [StrOptions({"l1", "l2"})],
             "C": [Interval(Real, 0.0, None, closed="left")],
+            "var_smoothing": [Interval(Real, 0, None, closed="left")],
             "learning_hist": ["boolean"],
         }
     except (ImportError, ModuleNotFoundError):
@@ -73,6 +74,10 @@ class GaussianWNB(_BaseNB):
     C : float, default=1.0
         Regularization strength. Must be strictly positive.
 
+    var_smoothing : float, default=1e-9
+        Portion of the largest variance of all features that is added to
+        variances for calculation stability.
+
     learning_hist : bool, default=False
         Whether to record the learning history, i.e., the value of cost
         function in each learning iteration.
@@ -91,6 +96,9 @@ class GaussianWNB(_BaseNB):
     n_classes_ : int
         Number of classes seen during :term:`fit`.
 
+    epsilon_ : float
+        Absolute additive value to variances.
+
     n_features_in_ : int
         Number of features seen during :term:`fit`.
@@ -151,6 +159,7 @@ def __init__(
         step_size: Float = 1e-4,
         penalty: str = "l2",
         C: Float = 1.0,
+        var_smoothing: Float = 1e-9,
         learning_hist: bool = False,
     ) -> None:
         self.priors = priors
@@ -159,6 +168,7 @@ def __init__(
         self.step_size = step_size
         self.penalty = penalty
         self.C = C
+        self.var_smoothing = var_smoothing
         self.learning_hist = learning_hist
 
         if SKLEARN_V1_6_OR_LATER:
@@ -269,6 +279,13 @@ def _init_parameters(self) -> None:
                 % self.max_iter
             )
 
+        # Ensure variance smoothing is a non-negative real number
+        if not isinstance(self.var_smoothing, Real) or self.var_smoothing < 0:
+            raise ValueError(
+                "Variance smoothing parameter must be a non-negative real number; got (var_smoothing=%r) instead."
+                % self.var_smoothing
+            )
+
     @_fit_context(prefer_skip_nested_validation=True)
     def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
         """Fits Gaussian Binary MLD-WNB classifier according to X, y.
@@ -299,6 +316,8 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
 
         self._init_parameters()
 
+        self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()
+
         self.theta_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
         self.std_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
         self.var_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
@@ -325,18 +344,22 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
 
         return self
 
+    def _get_std(self) -> np.ndarray:
+        return np.sqrt(self.var_ + self.epsilon_)
+
     def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list[Float]]:
         _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(X.shape[0])]
 
         if learning_hist:
+            std = self._get_std()
             _cost = 0.0
             for i in range(X.shape[0]):
                 _sum = np.log(self.class_prior_[1] / self.class_prior_[0])
                 x = X[i, :]
                 for j in range(self.n_features_in_):
                     _sum += self.coef_[j] * (
-                        np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 1], self.std_[j, 1]))
-                        - np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 0], self.std_[j, 0]))
+                        np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 1], std[j, 1]))
+                        - np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 0], std[j, 0]))
                     )
                 _cost += _lambda[i] * _sum
         else:
@@ -345,8 +368,9 @@ def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list
         return _cost, _lambda
 
     def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray:
+        std = self._get_std()
         _grad = np.repeat(
-            np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1),
+            np.log(std[:, 0] / std[:, 1]).reshape(1, -1),
             X.shape[0],
             axis=0,
         )
@@ -354,7 +378,7 @@ def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray:
             0.5
             * (
                 (X - np.repeat(self.theta_[:, 0].reshape(1, -1), X.shape[0], axis=0))
-                / (np.repeat(self.std_[:, 0].reshape(1, -1), X.shape[0], axis=0))
+                / (np.repeat(std[:, 0].reshape(1, -1), X.shape[0], axis=0))
             )
             ** 2
         )
@@ -362,7 +386,7 @@ def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray:
             0.5
             * (
                 (X - np.repeat(self.theta_[:, 1].reshape(1, -1), X.shape[0], axis=0))
-                / (np.repeat(self.std_[:, 1].reshape(1, -1), X.shape[0], axis=0))
+                / (np.repeat(std[:, 1].reshape(1, -1), X.shape[0], axis=0))
             )
             ** 2
         )
@@ -376,10 +400,11 @@ def _predict(self, X: MatrixLike) -> np.ndarray:
         return np.argmax(jll, axis=1)
 
     def _joint_log_likelihood(self, X) -> np.ndarray:
+        std = self._get_std()
         log_priors = np.tile(np.log(self.class_prior_), (X.shape[0], 1))
         w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_))
-        term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * self.std_)))
-        var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(self.std_, self.std_))
+        term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * std)))
+        var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(std, std))
         mu_by_var = np.multiply(self.theta_, var_inv)
         term2 = -0.5 * (
             np.matmul(np.multiply(X, X), var_inv)
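
The core of this change is the new smoothing attribute: `epsilon_ = var_smoothing * np.var(X, axis=0).max()` is computed once in `fit`, and every place that previously read `self.std_` now goes through `_get_std()`, i.e. `sqrt(var_ + epsilon_)`, so zero or near-zero feature variances no longer feed divisions in the likelihood terms. Below is a minimal usage sketch, assuming the package's public import path `from wnb import GaussianWNB`; the dataset and hyperparameter values mirror the benchmark script above and are illustrative, not prescribed by this change.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from wnb import GaussianWNB

X, y = load_breast_cancer(return_X_y=True)

# var_smoothing scales the largest per-feature variance; the resulting epsilon_
# is added to all class-conditional variances before square roots are taken.
clf = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-12)
clf.fit(X, y)

print(clf.epsilon_)                        # absolute additive value applied to the variances
print(accuracy_score(y, clf.predict(X)))   # training-set accuracy as a quick sanity check
```

With `var_smoothing=0.0`, `epsilon_` is zero and `_get_std()` reduces to `sqrt(var_)`, i.e. the unsmoothed standard deviations.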