Merge pull request #11 from msamsami/refactor-gwnb
Refactor GaussianWNB classifier
msamsami authored Jun 3, 2023
2 parents e870bee + b259463 commit 7df204f
Showing 5 changed files with 168 additions and 20 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
# WNB: General and weighted naive Bayes classifiers

-![](https://img.shields.io/badge/version-v0.1.11-green)
+![](https://img.shields.io/badge/version-v0.1.12-green)
![](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue)
![](https://github.com/msamsami/weighted-naive-bayes/actions/workflows/python-publish.yml/badge.svg)
[![](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
setup.py: 2 changes (1 addition, 1 deletion)
@@ -5,7 +5,7 @@

setup(
    name='wnb',
-    version='0.1.11',
+    version='0.1.12',
    description='Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.',
    keywords=['python', 'bayes', 'naivebayes', 'classifier', 'probabilistic'],
    author='Mehdi Samsami',
tests/test_gwnb.py: 142 changes (142 additions, 0 deletions)
@@ -0,0 +1,142 @@
import numpy as np

import pytest
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import is_classifier
from sklearn.utils._testing import assert_array_equal, assert_array_almost_equal

from wnb import GaussianWNB

# Data is just 6 separable points in the plane
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
y = np.array([1, 1, 1, 2, 2, 2])


@pytest.fixture
def global_random_seed():
    return np.random.randint(0, 1000)


def get_random_normal_x_binary_y(global_random_seed):
    # Generate slightly more random test data
    rng = np.random.RandomState(global_random_seed)
    X1 = rng.normal(size=(10, 3))
    y1 = (rng.normal(size=10) > 0).astype(int)
    return X1, y1


def test_gwnb():
"""Binary Gaussian MLD-WNB classification
Checks that GaussianWNB implements fit and predict and returns correct values for a simple toy dataset.
"""
clf = GaussianWNB()
y_pred = clf.fit(X, y).predict(X)
assert_array_equal(y_pred, y)

y_pred_proba = clf.predict_proba(X)
y_pred_log_proba = clf.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)


def test_gwnb_estimator():
"""
Test whether GaussianWNB estimator adheres to scikit-learn conventions.
"""
check_estimator(GaussianWNB())
assert is_classifier(GaussianWNB)


def test_gwnb_prior(global_random_seed):
"""
Test whether class priors are properly set.
"""
clf = GaussianWNB().fit(X, y)
assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)

X1, y1 = get_random_normal_x_binary_y(global_random_seed)
clf = GaussianWNB().fit(X1, y1)

# Check that the class priors sum to 1
assert_array_almost_equal(clf.class_prior_.sum(), 1)


def test_gwnb_neg_priors():
"""
Test whether an error is raised in case of negative priors.
"""
clf = GaussianWNB(priors=np.array([-1.0, 2.0]))

msg = "Priors must be non-negative"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)


def test_gwnb_priors():
"""
Test whether the class priors override is properly used.
"""
clf = GaussianWNB(priors=np.array([0.3, 0.7])).fit(X, y)
assert_array_almost_equal(
clf.predict_proba([[-0.1, -0.1]]),
np.array([[0.823571, 0.176429]]),
8,
)
assert_array_almost_equal(clf.class_prior_, np.array([0.3, 0.7]))


def test_gwnb_wrong_nb_priors():
"""
Test whether an error is raised if the number of priors is different from the number of classes.
"""
clf = GaussianWNB(priors=np.array([0.25, 0.25, 0.25, 0.25]))

msg = "Number of priors must match the number of classes"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)


def test_gwnb_prior_greater_one():
"""
Test if an error is raised if the sum of priors greater than one.
"""
clf = GaussianWNB(priors=np.array([2.0, 1.0]))

msg = "The sum of the priors should be 1"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)


def test_gwnb_prior_large_bias():
"""
Test if good prediction when class priors favor largely one class.
"""
clf = GaussianWNB(priors=np.array([0.01, 0.99]))
clf.fit(X, y)
assert clf.predict(np.array([[-0.1, -0.1]])) == np.array([2])


def test_gwnb_non_binary():
"""
Test if an error is raised when given non-binary targets.
"""
X_ = np.array(
[
[-1, -1],
[-2, -1],
[-3, -2],
[-4, -5],
[-5, -4],
[1, 1],
[2, 1],
[3, 2],
[4, 4],
[5, 5],
]
)
y_ = np.array([1, 2, 3, 4, 4, 3, 2, 1, 1, 2])
clf = GaussianWNB()

msg = "Unknown label type: non-binary"
with pytest.raises(ValueError, match=msg):
clf.fit(X_, y_)
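
The new suite can be run on its own with pytest; a typical invocation (assuming the package and its test dependencies are installed):

    pytest tests/test_gwnb.py -q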
wnb/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
-__version__ = "0.1.11"
+__version__ = "0.1.12"
__author__ = "Mehdi Samsami"

__all__ = [
wnb/gwnb.py: 40 changes (23 additions, 17 deletions)
@@ -1,6 +1,6 @@
from abc import ABCMeta
import numbers
-from typing import Union, Optional
+from typing import Union, Optional, Sequence
import warnings

import numpy as np
@@ -10,7 +10,7 @@

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import DataConversionWarning
-from sklearn.utils import check_array, as_float_array
+from sklearn.utils import as_float_array, check_array, deprecated
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import type_of_target

@@ -20,12 +20,14 @@ class GaussianWNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    Binary Gaussian Minimum Log-likelihood Difference Weighted Naive Bayes (MLD-WNB) Classifier
    """

-    def __init__(self, priors: Optional[Union[list, np.ndarray]] = None, error_weights: Optional[np.ndarray] = None,
+    def __init__(self, *,
+                 priors: Optional[Union[Sequence[float], np.ndarray]] = None,
+                 error_weights: Optional[np.ndarray] = None,
                  max_iter: int = 25, step_size: float = 1e-4, penalty: str = 'l2', C: float = 1.0) -> None:
        """Initializes an object of the class.
        Args:
-            priors (Optional[Union[list, np.ndarray]]): Prior probabilities. Defaults to None.
+            priors (Optional[Union[Sequence[float], np.ndarray]]): Prior probabilities. Defaults to None.
            error_weights (Optional[np.ndarray]): Matrix of error weights (n_classes * n_classes). Defaults to None.
            max_iter (int): Maximum number of gradient descent iterations. Defaults to 25.
            step_size (float): Step size of weight update (i.e., learning rate). Defaults to 1e-4.
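
Note: the bare * in the new signature makes every constructor parameter keyword-only, in line with scikit-learn's own estimators. A minimal sketch of the effect on callers (illustrative, not part of the diff):

    GaussianWNB(priors=[0.5, 0.5], max_iter=50)  # OK: keyword arguments
    GaussianWNB([0.5, 0.5])                      # TypeError: positional arguments are rejected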
@@ -48,7 +50,7 @@ def _more_tags(self):
            'requires_y': True
        }

-    def __check_inputs(self, X, y):
+    def _check_inputs(self, X, y):
        # Check that the dataset has only two unique labels
        if type_of_target(y) != 'binary':
            warnings.warn('This version of MLD-WNB only supports binary classification.')
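
Dropping the double-underscore prefix also drops Python's name mangling, which would otherwise store the method as _GaussianWNB__check_inputs and complicate subclassing and testing. A quick illustration with hypothetical classes (not from the diff):

    class Base:
        def __mangled(self): pass   # stored as _Base__mangled
        def _plain(self): pass      # stored under its own name

    class Sub(Base): pass

    hasattr(Sub(), '_Base__mangled')  # True: the defining class is baked into the name
    hasattr(Sub(), '_plain')          # True: plain name, overridable in subclasses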
@@ -117,7 +119,10 @@ def __check_inputs(self, X, y):
                % self.max_iter
            )

-    def __prepare_X_y(self, X=None, y=None):
+    def _prepare_X_y(self, X=None, y=None, from_fit=False):
+        if from_fit and y is None:
+            raise ValueError("requires y to be passed, but the target y is None")
+
        if X is not None:
            # Convert to NumPy array if X is Pandas DataFrame
            if isinstance(X, pd.DataFrame):
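
The from_fit guard makes a missing target fail fast with the exact message scikit-learn's check_estimator expects from an estimator whose tags set 'requires_y': True (see the _more_tags hunk above). A hedged usage sketch, assuming fit forwards y=None to _prepare_X_y:

    clf = GaussianWNB()
    clf.fit(X, None)
    # ValueError: requires y to be passed, but the target y is None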
@@ -141,7 +146,7 @@ def __prepare_X_y(self, X=None, y=None):
        output = output[0] if len(output) == 1 else output
        return output

-    def __prepare_parameters(self, X, y):
+    def _prepare_parameters(self, X, y):
        # Calculate mean and standard deviation of features for each class
        for c in range(self.n_classes_):
            self.mu_[:, c] = np.mean(X[y == c, :], axis=0)  # Calculate mean of features for class c
@@ -179,21 +184,21 @@ def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFr
        Returns:
            self: The instance itself.
        """
-        X, y = self.__prepare_X_y(X, y)
+        X, y = self._prepare_X_y(X, y, from_fit=True)

        self.classes_, y_ = np.unique(y, return_inverse=True)  # Unique class labels and their indices
        self.n_classes_ = len(self.classes_)  # Number of classes
        self.__n_samples, self.n_features_in_ = X.shape  # Number of samples and features

-        self.__check_inputs(X, y)
+        self._check_inputs(X, y)
        y = y_

        self.mu_ = np.zeros((self.n_features_in_, self.n_classes_))  # Mean of features (n_features x 1)
        self.std_ = np.zeros((self.n_features_in_, self.n_classes_))  # Standard deviation of features (n_features x 1)
        self.coef_ = np.ones((self.n_features_in_,))  # WNB coefficients (n_features x 1)
        self.cost_hist_ = np.array([np.nan for _ in range(self.max_iter)])  # To store cost value in each iteration

-        self.__prepare_parameters(X, y)
+        self._prepare_parameters(X, y)

        # Learn the weights using gradient descent
        self.n_iter_ = 0
@@ -202,10 +207,10 @@

            y_hat = self.__predict(X)

            # Calculate cost
-            self.cost_hist_[self.n_iter_], _lambda = self.__calculate_cost(X, y, y_hat, learning_hist)
+            self.cost_hist_[self.n_iter_], _lambda = self._calculate_cost(X, y, y_hat, learning_hist)

            # Calculate gradients (most time-consuming)
-            _grad = self.__calculate_grad(X, _lambda)
+            _grad = self._calculate_grad(X, _lambda)

            # Add regularization
            if self.penalty == 'l1':
@@ -218,7 +223,7 @@ def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFr

        return self

-    def __calculate_cost(self, X, y, y_hat, learning_hist):
+    def _calculate_cost(self, X, y, y_hat, learning_hist):
        _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(self.__n_samples)]

        if learning_hist:
@@ -229,14 +234,14 @@ def __calculate_cost(self, X, y, y_hat, learning_hist):
                x = X[i, :]
                for j in range(self.n_features_in_):
                    _sum += self.coef_[j] * (np.log(1e-20 + norm.pdf(x[j], self.mu_[j, 1], self.std_[j, 1]))
-                                            - np.log(1e-20 + norm.pdf(x[j], self.mu_[j, 0], self.std_[j, 0])))
+                                             - np.log(1e-20 + norm.pdf(x[j], self.mu_[j, 0], self.std_[j, 0])))
                _cost += _lambda[i] * _sum
        else:
            _cost = None

        return _cost, _lambda

-    def __calculate_grad(self, X, _lambda):
+    def _calculate_grad(self, X, _lambda):
        _grad = np.repeat(np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1), self.__n_samples, axis=0)
        _grad += 0.5 * ((X - np.repeat(self.mu_[:, 0].reshape(1, -1), self.__n_samples, axis=0)) /
                        (np.repeat(self.std_[:, 0].reshape(1, -1), self.__n_samples, axis=0))) ** 2
@@ -247,7 +252,8 @@ def __calculate_grad(self, X, _lambda):

        return _grad

-    def __calculate_grad_slow(self, X, _lambda):
+    @deprecated()
+    def _calculate_grad_slow(self, X, _lambda):
        _grad = np.zeros((self.n_features_in_,))
        for i in range(self.__n_samples):
            x = X[i, :]
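
deprecated here is the decorator imported from sklearn.utils in the import hunk above; it wraps the method so each call emits a deprecation warning (a FutureWarning in recent scikit-learn releases) while leaving the behavior intact. An illustrative check, assuming clf is a fitted GaussianWNB:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        clf._calculate_grad_slow(X, _lambda)  # hypothetical call
    assert any('deprecated' in str(w.message) for w in caught)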
@@ -304,7 +310,7 @@ def predict_log_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:

        n_samples = X.shape[0]

-        X = self.__prepare_X_y(X=X)
+        X = self._prepare_X_y(X=X)

        log_priors = np.tile(np.log(self.class_prior_), (n_samples, 1))
        w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_))
