Merge pull request #11 from msamsami/refactor-gwnb
Refactor GaussianWNB classifier
msamsami authored Jun 3, 2023
2 parents e870bee + b259463 commit 7df204f
Showing 5 changed files with 168 additions and 20 deletions.
README.md: 2 changes (1 addition, 1 deletion)
@@ -1,6 +1,6 @@
# WNB: General and weighted naive Bayes classifiers

-![](https://img.shields.io/badge/version-v0.1.11-green)
+![](https://img.shields.io/badge/version-v0.1.12-green)
![](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue)
![](https://github.com/msamsami/weighted-naive-bayes/actions/workflows/python-publish.yml/badge.svg)
[![](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
setup.py: 2 changes (1 addition, 1 deletion)
@@ -5,7 +5,7 @@

setup(
    name='wnb',
-    version='0.1.11',
+    version='0.1.12',
    description='Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.',
    keywords=['python', 'bayes', 'naivebayes', 'classifier', 'probabilistic'],
    author='Mehdi Samsami',
tests/test_gwnb.py: 142 changes (142 additions, 0 deletions)
@@ -0,0 +1,142 @@
import numpy as np

import pytest
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import is_classifier
from sklearn.utils._testing import assert_array_equal, assert_array_almost_equal

from wnb import GaussianWNB

# Data is just 6 separable points in the plane
X = np.array([[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]])
y = np.array([1, 1, 1, 2, 2, 2])


@pytest.fixture
def global_random_seed():
    return np.random.randint(0, 1000)


def get_random_normal_x_binary_y(global_random_seed):
    # Generate slightly more random test data
    rng = np.random.RandomState(global_random_seed)
    X1 = rng.normal(size=(10, 3))
    y1 = (rng.normal(size=10) > 0).astype(int)
    return X1, y1


def test_gwnb():
"""Binary Gaussian MLD-WNB classification
Checks that GaussianWNB implements fit and predict and returns correct values for a simple toy dataset.
"""
clf = GaussianWNB()
y_pred = clf.fit(X, y).predict(X)
assert_array_equal(y_pred, y)

y_pred_proba = clf.predict_proba(X)
y_pred_log_proba = clf.predict_log_proba(X)
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)


def test_gwnb_estimator():
"""
Test whether GaussianWNB estimator adheres to scikit-learn conventions.
"""
check_estimator(GaussianWNB())
assert is_classifier(GaussianWNB)


def test_gwnb_prior(global_random_seed):
"""
Test whether class priors are properly set.
"""
clf = GaussianWNB().fit(X, y)
assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8)

X1, y1 = get_random_normal_x_binary_y(global_random_seed)
clf = GaussianWNB().fit(X1, y1)

# Check that the class priors sum to 1
assert_array_almost_equal(clf.class_prior_.sum(), 1)


def test_gwnb_neg_priors():
"""
Test whether an error is raised in case of negative priors.
"""
clf = GaussianWNB(priors=np.array([-1.0, 2.0]))

msg = "Priors must be non-negative"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)


def test_gwnb_priors():
"""
Test whether the class priors override is properly used.
"""
clf = GaussianWNB(priors=np.array([0.3, 0.7])).fit(X, y)
assert_array_almost_equal(
clf.predict_proba([[-0.1, -0.1]]),
np.array([[0.823571, 0.176429]]),
8,
)
assert_array_almost_equal(clf.class_prior_, np.array([0.3, 0.7]))


def test_gwnb_wrong_nb_priors():
"""
Test whether an error is raised if the number of priors is different from the number of classes.
"""
clf = GaussianWNB(priors=np.array([0.25, 0.25, 0.25, 0.25]))

msg = "Number of priors must match the number of classes"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)


def test_gwnb_prior_greater_one():
"""
Test if an error is raised if the sum of priors greater than one.
"""
clf = GaussianWNB(priors=np.array([2.0, 1.0]))

msg = "The sum of the priors should be 1"
with pytest.raises(ValueError, match=msg):
clf.fit(X, y)


def test_gwnb_prior_large_bias():
"""
Test if good prediction when class priors favor largely one class.
"""
clf = GaussianWNB(priors=np.array([0.01, 0.99]))
clf.fit(X, y)
assert clf.predict(np.array([[-0.1, -0.1]])) == np.array([2])


def test_gwnb_non_binary():
"""
Test if an error is raised when given non-binary targets.
"""
X_ = np.array(
[
[-1, -1],
[-2, -1],
[-3, -2],
[-4, -5],
[-5, -4],
[1, 1],
[2, 1],
[3, 2],
[4, 4],
[5, 5],
]
)
y_ = np.array([1, 2, 3, 4, 4, 3, 2, 1, 1, 2])
clf = GaussianWNB()

msg = "Unknown label type: non-binary"
with pytest.raises(ValueError, match=msg):
clf.fit(X_, y_)
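
The new suite can be run on its own with pytest; a typical invocation (assuming the package and its test dependencies are installed):

    pytest tests/test_gwnb.py -q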
wnb/__init__.py: 2 changes (1 addition, 1 deletion)
@@ -1,4 +1,4 @@
-__version__ = "0.1.11"
+__version__ = "0.1.12"
__author__ = "Mehdi Samsami"

__all__ = [
wnb/gwnb.py: 40 changes (23 additions, 17 deletions)
@@ -1,6 +1,6 @@
from abc import ABCMeta
import numbers
-from typing import Union, Optional
+from typing import Union, Optional, Sequence
import warnings

import numpy as np
@@ -10,7 +10,7 @@

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import DataConversionWarning
-from sklearn.utils import check_array, as_float_array
+from sklearn.utils import as_float_array, check_array, deprecated
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import type_of_target

@@ -20,12 +20,14 @@ class GaussianWNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    Binary Gaussian Minimum Log-likelihood Difference Weighted Naive Bayes (MLD-WNB) Classifier
    """

-    def __init__(self, priors: Optional[Union[list, np.ndarray]] = None, error_weights: Optional[np.ndarray] = None,
+    def __init__(self, *,
+                 priors: Optional[Union[Sequence[float], np.ndarray]] = None,
+                 error_weights: Optional[np.ndarray] = None,
                  max_iter: int = 25, step_size: float = 1e-4, penalty: str = 'l2', C: float = 1.0) -> None:
        """Initializes an object of the class.
        Args:
-            priors (Optional[Union[list, np.ndarray]]): Prior probabilities. Defaults to None.
+            priors (Optional[Union[Sequence[float], np.ndarray]]): Prior probabilities. Defaults to None.
            error_weights (Optional[np.ndarray]): Matrix of error weights (n_classes * n_classes). Defaults to None.
            max_iter (int): Maximum number of gradient descent iterations. Defaults to 25.
            step_size (float): Step size of weight update (i.e., learning rate). Defaults to 1e-4.
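
Note: the bare * in the new signature makes every constructor parameter keyword-only, in line with scikit-learn's own estimators. A minimal sketch of the effect on callers (illustrative, not part of the diff):

    GaussianWNB(priors=[0.5, 0.5], max_iter=50)  # OK: keyword arguments
    GaussianWNB([0.5, 0.5])                      # TypeError: positional arguments are rejected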
@@ -48,7 +50,7 @@ def _more_tags(self):
            'requires_y': True
        }

-    def __check_inputs(self, X, y):
+    def _check_inputs(self, X, y):
        # Check that the dataset has only two unique labels
        if type_of_target(y) != 'binary':
            warnings.warn('This version of MLD-WNB only supports binary classification.')
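
Dropping the double-underscore prefix also drops Python's name mangling, which would otherwise store the method as _GaussianWNB__check_inputs and complicate subclassing and testing. A quick illustration with hypothetical classes (not from the diff):

    class Base:
        def __mangled(self): pass   # stored as _Base__mangled
        def _plain(self): pass      # stored under its own name

    class Sub(Base): pass

    hasattr(Sub(), '_Base__mangled')  # True: the defining class is baked into the name
    hasattr(Sub(), '_plain')          # True: plain name, overridable in subclasses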
@@ -117,7 +119,10 @@ def __check_inputs(self, X, y):
                % self.max_iter
            )

-    def __prepare_X_y(self, X=None, y=None):
+    def _prepare_X_y(self, X=None, y=None, from_fit=False):
+        if from_fit and y is None:
+            raise ValueError("requires y to be passed, but the target y is None")
+
        if X is not None:
            # Convert to NumPy array if X is Pandas DataFrame
            if isinstance(X, pd.DataFrame):
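
The from_fit guard makes a missing target fail fast with the exact message scikit-learn's check_estimator expects from an estimator whose tags set 'requires_y': True (see the _more_tags hunk above). A hedged usage sketch, assuming fit forwards y=None to _prepare_X_y:

    clf = GaussianWNB()
    clf.fit(X, None)
    # ValueError: requires y to be passed, but the target y is None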
@@ -141,7 +146,7 @@ def __prepare_X_y(self, X=None, y=None):
        output = output[0] if len(output) == 1 else output
        return output

-    def __prepare_parameters(self, X, y):
+    def _prepare_parameters(self, X, y):
        # Calculate mean and standard deviation of features for each class
        for c in range(self.n_classes_):
            self.mu_[:, c] = np.mean(X[y == c, :], axis=0)  # Calculate mean of features for class c
@@ -179,21 +184,21 @@ def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFr
        Returns:
            self: The instance itself.
        """
-        X, y = self.__prepare_X_y(X, y)
+        X, y = self._prepare_X_y(X, y, from_fit=True)

        self.classes_, y_ = np.unique(y, return_inverse=True)  # Unique class labels and their indices
        self.n_classes_ = len(self.classes_)  # Number of classes
        self.__n_samples, self.n_features_in_ = X.shape  # Number of samples and features

-        self.__check_inputs(X, y)
+        self._check_inputs(X, y)
        y = y_

        self.mu_ = np.zeros((self.n_features_in_, self.n_classes_))  # Mean of features (n_features x 1)
        self.std_ = np.zeros((self.n_features_in_, self.n_classes_))  # Standard deviation of features (n_features x 1)
        self.coef_ = np.ones((self.n_features_in_,))  # WNB coefficients (n_features x 1)
        self.cost_hist_ = np.array([np.nan for _ in range(self.max_iter)])  # To store cost value in each iteration

-        self.__prepare_parameters(X, y)
+        self._prepare_parameters(X, y)

        # Learn the weights using gradient descent
        self.n_iter_ = 0
@@ -202,10 +207,10 @@

            y_hat = self.__predict(X)

            # Calculate cost
-            self.cost_hist_[self.n_iter_], _lambda = self.__calculate_cost(X, y, y_hat, learning_hist)
+            self.cost_hist_[self.n_iter_], _lambda = self._calculate_cost(X, y, y_hat, learning_hist)

            # Calculate gradients (most time-consuming)
-            _grad = self.__calculate_grad(X, _lambda)
+            _grad = self._calculate_grad(X, _lambda)

            # Add regularization
            if self.penalty == 'l1':
@@ -218,7 +223,7 @@ def fit(self, X: Union[np.ndarray, pd.DataFrame], y: Union[np.ndarray, pd.DataFr

        return self

-    def __calculate_cost(self, X, y, y_hat, learning_hist):
+    def _calculate_cost(self, X, y, y_hat, learning_hist):
        _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(self.__n_samples)]

        if learning_hist:
@@ -229,14 +234,14 @@ def __calculate_cost(self, X, y, y_hat, learning_hist):
                x = X[i, :]
                for j in range(self.n_features_in_):
                    _sum += self.coef_[j] * (np.log(1e-20 + norm.pdf(x[j], self.mu_[j, 1], self.std_[j, 1]))
-                                            - np.log(1e-20 + norm.pdf(x[j], self.mu_[j, 0], self.std_[j, 0])))
+                                             - np.log(1e-20 + norm.pdf(x[j], self.mu_[j, 0], self.std_[j, 0])))
                _cost += _lambda[i] * _sum
        else:
            _cost = None

        return _cost, _lambda

-    def __calculate_grad(self, X, _lambda):
+    def _calculate_grad(self, X, _lambda):
        _grad = np.repeat(np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1), self.__n_samples, axis=0)
        _grad += 0.5 * ((X - np.repeat(self.mu_[:, 0].reshape(1, -1), self.__n_samples, axis=0)) /
                        (np.repeat(self.std_[:, 0].reshape(1, -1), self.__n_samples, axis=0))) ** 2
@@ -247,7 +252,8 @@ def __calculate_grad(self, X, _lambda):

        return _grad

-    def __calculate_grad_slow(self, X, _lambda):
+    @deprecated()
+    def _calculate_grad_slow(self, X, _lambda):
        _grad = np.zeros((self.n_features_in_,))
        for i in range(self.__n_samples):
            x = X[i, :]
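
deprecated here is the decorator imported from sklearn.utils in the import hunk above; it wraps the method so each call emits a deprecation warning (a FutureWarning in recent scikit-learn releases) while leaving the behavior intact. An illustrative check, assuming clf is a fitted GaussianWNB:

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        clf._calculate_grad_slow(X, _lambda)  # hypothetical call
    assert any('deprecated' in str(w.message) for w in caught)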
@@ -304,7 +310,7 @@ def predict_log_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:

        n_samples = X.shape[0]

-        X = self.__prepare_X_y(X=X)
+        X = self._prepare_X_y(X=X)

        log_priors = np.tile(np.log(self.class_prior_), (n_samples, 1))
        w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_))
