Merge pull request #49 from msamsami/add-var-smoothing-gwnb
feat: add `var_smoothing` parameter to `GaussianWNB`
msamsami authored Feb 13, 2025
2 parents 47b5422 + 4eb242a commit 1bb8693
Showing 4 changed files with 68 additions and 9 deletions.
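Not part of the commit itself, but for orientation, here is a minimal usage sketch of the new parameter. It assumes the `wnb` package exposes `GaussianWNB`, the `var_smoothing` keyword, and the fitted `epsilon_` attribute exactly as the diffs below describe; the toy data is borrowed from the new test.

```python
# Hypothetical usage sketch (not repository code).
import numpy as np
from wnb import GaussianWNB

X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]])  # second feature is constant
y = np.array([1, 1, 2, 2, 2])

clf = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-9)
clf.fit(X, y)

# epsilon_ = var_smoothing * largest feature variance (here 1e-9 * 2.0); it is
# added to every per-class variance so that constant features cannot zero out
# the standard deviations used in the likelihood.
print(clf.epsilon_)
print(clf.predict(np.array([[2.5, 0]])))
```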
2 changes: 1 addition & 1 deletion README.md
@@ -130,7 +130,7 @@ We conducted benchmarks on four datasets, [Wine](https://scikit-learn.org/stable
| Wine | GaussianNB | 0.9749 | GeneralNB | **0.9812** |
| Iris | GaussianNB | 0.9556 | GeneralNB | **0.9602** |
| Digits | GaussianNB | 0.8372 | GeneralNB | **0.8905** |
| Breast Cancer | GaussianNB | 0.9389 | GaussianWNB | **0.9512** |
| Breast Cancer | GaussianNB | 0.9389 | GaussianWNB | **0.9519** |

These benchmarks highlight the potential of WNB classifiers to provide better performance in certain scenarios by allowing more flexibility in the choice of distributions and incorporating weighting strategies.

2 changes: 1 addition & 1 deletion tests/benchmarks/main.py
@@ -45,7 +45,7 @@ def benchmark_digits() -> None:
def benchmark_breast_cancer() -> None:
X, y = load_breast_cancer(return_X_y=True)

clf_wnb = GaussianWNB(max_iter=20, step_size=0.01, C=1.5)
clf_wnb = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-12)
clf_sklearn = GaussianNB()
score_wnb, score_sklearn = benchmark(X, y, clf_wnb, clf_sklearn, MAX_ITER)

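The `benchmark(...)` helper called above is not part of this diff, so its splitting strategy is not shown here. As a rough, assumed equivalent, the breast-cancer comparison boils down to averaging held-out accuracy of `GaussianWNB` against scikit-learn's `GaussianNB` over repeated splits, roughly like this sketch (split sizes, seeds, and the repeat count stand in for `MAX_ITER` and are assumptions):

```python
# Hedged sketch of what the breast-cancer benchmark measures; not the actual
# tests/benchmarks/main.py logic.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from wnb import GaussianWNB

X, y = load_breast_cancer(return_X_y=True)
scores_wnb, scores_sk = [], []
for seed in range(20):  # stands in for MAX_ITER repetitions (value assumed)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=seed)
    clf_wnb = GaussianWNB(max_iter=30, step_size=0.01, C=1.5, var_smoothing=1e-12).fit(X_tr, y_tr)
    clf_sk = GaussianNB().fit(X_tr, y_tr)
    scores_wnb.append(accuracy_score(y_te, clf_wnb.predict(X_te)))
    scores_sk.append(accuracy_score(y_te, clf_sk.predict(X_te)))
print(np.mean(scores_wnb), np.mean(scores_sk))
```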
34 changes: 34 additions & 0 deletions tests/test_gwnb.py
@@ -116,6 +116,40 @@ def test_gwnb_prior_large_bias():
assert clf.predict(np.array([[-0.1, -0.1]])) == np.array([2])


def test_gwnb_var_smoothing():
"""
Test whether var_smoothing parameter properly affects the variances.
"""
X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]]) # First feature has variance 2.0
y = np.array([1, 1, 2, 2, 2])

clf1 = GaussianWNB(var_smoothing=0.0)
clf1.fit(X, y)

clf2 = GaussianWNB(var_smoothing=1.0)
clf2.fit(X, y)

test_point = np.array([[2.5, 0]])
prob1 = clf1.predict_proba(test_point)
prob2 = clf2.predict_proba(test_point)

assert not np.allclose(prob1, prob2)
assert clf1.epsilon_ == 0.0
assert clf2.epsilon_ > clf1.epsilon_


def test_gwnb_neg_var_smoothing():
"""
Test whether an error is raised in case of negative var_smoothing.
"""
clf = GaussianWNB(var_smoothing=-1.0)

msg_1 = "Variance smoothing parameter must be a non-negative real number"
msg_2 = "'var_smoothing' parameter of GaussianWNB must be a float in the range \[0.0, inf\)"
with pytest.raises(ValueError, match=rf"{msg_1}|{msg_2}"):
clf.fit(X, y)


def test_gwnb_non_binary():
"""
Test if an error is raised when given non-binary targets.
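The two new tests condense into a quick interactive check (a sketch, not repository code; `X` and `y` are the small arrays from `test_gwnb_var_smoothing`, and which of the two error messages appears depends on whether scikit-learn's parameter validation is active, as the alternation in `test_gwnb_neg_var_smoothing` reflects):

```python
# Hypothetical REPL-style check mirroring the new tests.
import numpy as np
from wnb import GaussianWNB

X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [5, 0]])
y = np.array([1, 1, 2, 2, 2])
x_new = np.array([[2.5, 0]])

p0 = GaussianWNB(var_smoothing=0.0).fit(X, y).predict_proba(x_new)
p1 = GaussianWNB(var_smoothing=1.0).fit(X, y).predict_proba(x_new)
print(np.allclose(p0, p1))  # False: the smoothing term changes the effective variances

try:
    GaussianWNB(var_smoothing=-1.0).fit(X, y)
except ValueError as exc:
    print(exc)  # raised either by sklearn's constraint check or by _init_parameters
```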
39 changes: 32 additions & 7 deletions wnb/gwnb.py
@@ -42,6 +42,7 @@ def _get_parameter_constraints() -> dict[str, list[Any]]:
"step_size": [Interval(Real, 0.0, None, closed="neither")],
"penalty": [StrOptions({"l1", "l2"})],
"C": [Interval(Real, 0.0, None, closed="left")],
"var_smoothing": [Interval(Real, 0, None, closed="left")],
"learning_hist": ["boolean"],
}
except (ImportError, ModuleNotFoundError):
@@ -73,6 +74,10 @@ class GaussianWNB(_BaseNB):
C : float, default=1.0
Regularization strength. Must be strictly positive.
var_smoothing : float, default=1e-9
Portion of the largest variance of all features that is added to
variances for calculation stability.
learning_hist : bool, default=False
Whether to record the learning history, i.e., the value of cost function
in each learning iteration.
@@ -91,6 +96,9 @@ class GaussianWNB(_BaseNB):
n_classes_ : int
Number of classes seen during :term:`fit`.
epsilon_ : float
Absolute additive value to variances.
n_features_in_ : int
Number of features seen during :term:`fit`.
@@ -151,6 +159,7 @@ def __init__(
step_size: Float = 1e-4,
penalty: str = "l2",
C: Float = 1.0,
var_smoothing: Float = 1e-9,
learning_hist: bool = False,
) -> None:
self.priors = priors
@@ -159,6 +168,7 @@ def __init__(
self.step_size = step_size
self.penalty = penalty
self.C = C
self.var_smoothing = var_smoothing
self.learning_hist = learning_hist

if SKLEARN_V1_6_OR_LATER:
@@ -269,6 +279,13 @@ def _init_parameters(self) -> None:
% self.max_iter
)

# Ensure variance smoothing is a non-negative real number
if not isinstance(self.var_smoothing, Real) or self.var_smoothing < 0:
raise ValueError(
"Variance smoothing parameter must be a non-negative real number; got (var_smoothing=%r) instead."
% self.var_smoothing
)

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
"""Fits Gaussian Binary MLD-WNB classifier according to X, y.
@@ -299,6 +316,8 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:

self._init_parameters()

self.epsilon_ = self.var_smoothing * np.var(X, axis=0).max()

self.theta_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
self.std_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
self.var_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_))
@@ -325,18 +344,22 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:

return self

def _get_std(self) -> np.ndarray:
return np.sqrt(self.var_ + self.epsilon_)

def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list[Float]]:
_lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(X.shape[0])]

if learning_hist:
std = self._get_std()
_cost = 0.0
for i in range(X.shape[0]):
_sum = np.log(self.class_prior_[1] / self.class_prior_[0])
x = X[i, :]
for j in range(self.n_features_in_):
_sum += self.coef_[j] * (
np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 1], self.std_[j, 1]))
- np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 0], self.std_[j, 0]))
np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 1], std[j, 1]))
- np.log(1e-20 + norm.pdf(x[j], self.theta_[j, 0], std[j, 0]))
)
_cost += _lambda[i] * _sum
else:
@@ -345,24 +368,25 @@ def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list
return _cost, _lambda

def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray:
std = self._get_std()
_grad = np.repeat(
np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1),
np.log(std[:, 0] / std[:, 1]).reshape(1, -1),
X.shape[0],
axis=0,
)
_grad += (
0.5
* (
(X - np.repeat(self.theta_[:, 0].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(self.std_[:, 0].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(std[:, 0].reshape(1, -1), X.shape[0], axis=0))
)
** 2
)
_grad -= (
0.5
* (
(X - np.repeat(self.theta_[:, 1].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(self.std_[:, 1].reshape(1, -1), X.shape[0], axis=0))
/ (np.repeat(std[:, 1].reshape(1, -1), X.shape[0], axis=0))
)
** 2
)
@@ -376,10 +400,11 @@ def _predict(self, X: MatrixLike) -> np.ndarray:
return np.argmax(jll, axis=1)

def _joint_log_likelihood(self, X) -> np.ndarray:
std = self._get_std()
log_priors = np.tile(np.log(self.class_prior_), (X.shape[0], 1))
w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_))
term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * self.std_)))
var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(self.std_, self.std_))
term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * std)))
var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(std, std))
mu_by_var = np.multiply(self.theta_, var_inv)
term2 = -0.5 * (
np.matmul(np.multiply(X, X), var_inv)
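Taken together, the gwnb.py changes implement one mechanism: `fit` stores `epsilon_ = var_smoothing * max(feature variances)`, and the new `_get_std` helper adds that floor to every per-class variance before the standard deviation enters the cost, gradient, and joint log-likelihood. A standalone numpy sketch of that arithmetic (assumed shapes only; not the class itself):

```python
# Standalone sketch of the smoothing arithmetic added in gwnb.py; var_ is assumed
# to have shape (n_features, n_classes), as in GaussianWNB.
import numpy as np

def smoothed_std(var_, X, var_smoothing=1e-9):
    # fit(): epsilon_ = var_smoothing * largest feature variance in the training data
    epsilon = var_smoothing * np.var(X, axis=0).max()
    # _get_std(): the same additive floor applied to every per-feature, per-class variance
    return np.sqrt(var_ + epsilon)

X = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0], [4.0, 0.0], [5.0, 0.0]])
var_ = np.array([[0.25, 1.0],   # feature 0: per-class variances
                 [0.0, 0.0]])   # feature 1 is constant, so both per-class variances are zero
print(smoothed_std(var_, X, var_smoothing=0.0))   # zeros survive: downstream 1/std**2 terms become non-finite
print(smoothed_std(var_, X, var_smoothing=1e-9))  # strictly positive everywhere
```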
