Merge pull request #13 from msamsami/upgrade-dist
Upgrade the dist module, with bug fixes and improvements
msamsami authored Jun 9, 2023
2 parents 32cbde3 + 02456bf commit cd17ee9
Showing 8 changed files with 133 additions and 40 deletions.
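
At a glance, this release lets `GeneralNB` run end-to-end with non-Gaussian likelihoods. A minimal sketch of the pattern the new tests below exercise (all names come from the diff; the data is made up):

```python
import numpy as np
from wnb import GeneralNB, Distribution as D

# Synthetic binary features, as in the new Bernoulli test further down
rng = np.random.RandomState(0)
X = rng.randint(2, size=(60, 10))
y = rng.randint(1, 3, size=(60,))

# Declare one likelihood per feature; here, all ten are Bernoulli
clf = GeneralNB(distributions=[D.BERNOULLI for _ in range(10)])
clf.fit(X, y)
print(clf.predict(X[:3]))
print(clf.predict_proba(X[:3]))
```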
13 changes: 12 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# WNB: General and weighted naive Bayes classifiers

![](https://img.shields.io/badge/version-v0.1.13-green)
![](https://img.shields.io/badge/version-v0.1.14-green)
![](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue)
![](https://github.com/msamsami/weighted-naive-bayes/actions/workflows/python-publish.yml/badge.svg)
[![](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
@@ -90,3 +90,14 @@ wnb.fit(X, y)
```python
wnb.predict(x_test)
```

## Tests
To run the tests, install development requirements:
```
pip install -r requirements_dev.txt
```

Then, run pytest:
```
pytest
```
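
To run only the suite touched by this PR, pytest can also be pointed at a single file (the path appears later in this diff):
```
pytest tests/test_gnb.py
```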
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name='wnb',
version='0.1.13',
version='0.1.14',
description='Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.',
keywords=['python', 'bayes', 'naivebayes', 'classifier', 'probabilistic'],
author='Mehdi Samsami',
73 changes: 70 additions & 3 deletions tests/test_gnb.py
@@ -4,7 +4,7 @@
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import is_classifier
from sklearn.utils._testing import assert_array_equal, assert_array_almost_equal
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB

from wnb import GeneralNB, Distribution as D

@@ -40,10 +40,10 @@ def test_gnb():
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)


def test_gnb_vs_sklearn():
def test_gnb_vs_sklearn_gaussian():
"""General Naive Bayes classification vs sklearn Gaussian Naive Bayes classification.
Test GeneralNB with gaussian likelihoods returns the same outputs as the sklearn GaussianNB.
Test GeneralNB with gaussian likelihoods returns the same outputs as the sklearn MultinomialNB.
"""
clf1 = GeneralNB()
clf1.fit(X, y)
@@ -64,6 +64,73 @@ def test_gnb_vs_sklearn():
assert_array_almost_equal(y_pred_log_proba1, y_pred_log_proba2, 5)


def test_gnb_vs_sklearn_bernoulli():
"""General Naive Bayes classification vs sklearn Bernoulli Naive Bayes classification.
Test GeneralNB with bernoulli likelihoods returns the same outputs as the sklearn BernoulliNB.
"""
rng = np.random.RandomState(1)
X_ = rng.randint(2, size=(150, 100))
y_ = rng.randint(1, 5, size=(150, ))

clf1 = GeneralNB(distributions=[D.BERNOULLI for _ in range(100)])
clf1.fit(X_, y_)

clf2 = BernoulliNB(alpha=1e-10, force_alpha=True)
clf2.fit(X_, y_)

y_pred1 = clf1.predict(X_[2:3])
y_pred2 = clf2.predict(X_[2:3])
assert_array_equal(y_pred1, y_pred2)

y_pred_proba1 = clf1.predict_proba(X_[2:3])
y_pred_proba2 = clf2.predict_proba(X_[2:3])
assert_array_almost_equal(y_pred_proba1, y_pred_proba2, 6)

y_pred_log_proba1 = clf1.predict_log_proba(X_[2:3])
y_pred_log_proba2 = clf2.predict_log_proba(X_[2:3])
assert_array_almost_equal(y_pred_log_proba1, y_pred_log_proba2, 5)


def test_gnb_vs_sklearn_categorical():
"""General Naive Bayes classification vs sklearn Categorical Naive Bayes classification.
Test GeneralNB with categorical likelihoods returns the same outputs as the sklearn CategoricalNB.
"""
categorical_values = [
["cat", "dog"],
["morning", "noon", "afternoon", "evening"],
["apple", "orange", "watermelon"],
["black", "white"]
]
rng = np.random.RandomState(24)
X_str_ = np.empty((150, 4)).astype("str")
X_ = np.zeros((150, 4))
for i, options in enumerate(categorical_values):
rnd_values = rng.randint(len(options), size=(150, ))
X_str_[:, i] = np.array(options)[rnd_values]
X_[:, i] = rnd_values
y_ = rng.randint(1, 4, size=(150, ))

clf1 = GeneralNB(distributions=[D.CATEGORICAL for _ in range(len(categorical_values))])
clf1.fit(X_str_, y_)

clf2 = CategoricalNB(alpha=1e-10, force_alpha=True)
clf2.fit(X_, y_)

y_pred1 = clf1.predict(X_str_[2:3])
y_pred2 = clf2.predict(X_[2:3])
assert_array_equal(y_pred1, y_pred2)

y_pred_proba1 = clf1.predict_proba(X_str_[2:3])
y_pred_proba2 = clf2.predict_proba(X_[2:3])
assert_array_almost_equal(y_pred_proba1, y_pred_proba2, 6)

y_pred_log_proba1 = clf1.predict_log_proba(X_str_[2:3])
y_pred_log_proba2 = clf2.predict_log_proba(X_[2:3])
assert_array_almost_equal(y_pred_log_proba1, y_pred_log_proba2, 5)


def test_gnb_estimator():
"""
Test whether GeneralNB estimator adheres to scikit-learn conventions.
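
The body of `test_gnb_estimator` is collapsed here; a sketch of what a scikit-learn conventions check typically looks like, built only from the imports at the top of this file (not necessarily the committed body):

```python
from sklearn.base import is_classifier
from sklearn.utils.estimator_checks import check_estimator

from wnb import GeneralNB


def test_gnb_estimator():
    # check_estimator runs sklearn's battery of API-conformance checks
    check_estimator(GeneralNB())
    assert is_classifier(GeneralNB())
```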
2 changes: 1 addition & 1 deletion wnb/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.1.13"
__version__ = "0.1.14"
__author__ = "Mehdi Samsami"

__all__ = [
3 changes: 1 addition & 2 deletions wnb/_base.py
@@ -93,8 +93,7 @@ def get_params(self) -> dict:
def support(self) -> Union[List[float], Tuple[float, float]]:
"""Returns the support of the probability distribution.
If support is a list, the support is a limited number of discrete values. If it is a tuple, it indicates a
limited set/range of continuous values.
If support is a list, it represents a limited number of discrete values. If it is a tuple, it indicates a limited or unlimited range of continuous values.
"""
return self._support
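
Concretely, the two shapes of `support` described above look like this (a sketch; the import path is assumed from the file layout, and the printed value is illustrative):

```python
from wnb.dist import BernoulliDist

bern = BernoulliDist(p=0.3)
# Discrete distribution: support is a list of admissible values
print(bern.support)  # expected: [0, 1]
# A continuous likelihood would instead expose a tuple such as
# (-inf, inf), i.e. a (possibly unbounded) range of values.
```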
2 changes: 1 addition & 1 deletion wnb/_enums.py
@@ -18,6 +18,6 @@ class Distribution(str, Enum):
GAMMA = "Gamma"
BERNOULLI = "Bernoulli"
CATEGORICAL = "Categorical"
MULTINOMIAL = "Multinomial"
# MULTINOMIAL = "Multinomial"
GEOMETRIC = "Geometric"
POISSON = "Poisson"
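
Since `Distribution` subclasses both `str` and `Enum`, its members compare equal to their plain string values — a quick sketch:

```python
from wnb import Distribution as D

assert D.BERNOULLI == "Bernoulli"   # str subclass: equal to its value
assert D.POISSON.value == "Poisson"
print(list(D))  # every supported likelihood; MULTINOMIAL no longer appears
```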
57 changes: 30 additions & 27 deletions wnb/dist.py
@@ -16,7 +16,7 @@
'GammaDist',
'BernoulliDist',
'CategoricalDist',
'MultinomialDist',
# 'MultinomialDist',
'GeometricDist',
'PoissonDist'
]
@@ -140,10 +140,10 @@ def __init__(self, p: float):

@classmethod
def from_data(cls, data):
return cls(p=(np.array(data) == 1).sum() / len(data))
return cls(p=((np.array(data) == 1).sum() + 1e-10) / len(data)) # TODO: use alpha instead of 1e-10

def pmf(self, x: int) -> float:
return 0.0 if x not in [0, 1] else self.p if x == 1 else 1 - self.p
return 0.0 if x not in self._support else self.p if x == 1 else 1 - self.p


class CategoricalDist(DiscreteDistMixin):
@@ -157,32 +157,32 @@ def __init__(self, prob: Mapping[Any, float]):
@classmethod
def from_data(cls, data):
values, counts = np.unique(data, return_counts=True)
return cls(prob={v: c/len(data) for v, c in zip(values, counts)})
return cls(prob={v: (c + 1e-10)/len(data) for v, c in zip(values, counts)}) # TODO: use alpha instead of 1e-10

def pmf(self, x: Any) -> float:
return self.prob.get(x)


class MultinomialDist(DiscreteDistMixin):
name = D.MULTINOMIAL

def __init__(self, n: int, prob: Mapping[Any, float]):
self.n = n
self.prob = prob
self._support = [i for i in range(self.n+1)]
super().__init__()

@classmethod
def from_data(cls, data: Sequence[int]):
values, counts = np.unique(data, return_counts=True)
return cls(n=int(np.sum(values)), prob={v: c / len(data) for v, c in zip(values, counts)})

def pmf(self, x: Sequence[int]) -> float:
if sum(x) != self.n:
return 0.0
else:
return np.math.factorial(self.n) * np.product([p**v for v, p in self.prob.items()]) / \
np.product([np.math.factorial(v) for v in self.prob.keys()])
return self.prob.get(x, 0.0)


# class MultinomialDist(DiscreteDistMixin):
# name = D.MULTINOMIAL
#
# def __init__(self, n: int, prob: Mapping[int, float]):
# self.n = n
# self.prob = prob
# self._support = [i for i in range(self.n+1)]
# super().__init__()
#
# @classmethod
# def from_data(cls, data: Sequence[int]):
# values, counts = np.unique(data, return_counts=True)
# return cls(n=int(np.sum(values)), prob={v: c / len(data) for v, c in zip(values, counts)})
#
# def pmf(self, x: Sequence[int]) -> float:
# if sum(x) != self.n:
# return 0.0
# else:
# return np.math.factorial(self.n) * np.product([self.prob.get(v, 0.0)**v for v in x]) / \
# np.product([np.math.factorial(v) for v in x])


class GeometricDist(DiscreteDistMixin):
@@ -221,3 +221,6 @@ def pmf(self, x: int) -> float:
eval(cls).name: eval(cls)
for cls in __all__
}


NonNumericDistributions = [D.CATEGORICAL, ]
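
To make the smoothing and lookup changes above concrete, here is a sketch of the `from_data` → `pmf` round trip (import path assumed from the file layout; printed values are approximate):

```python
import numpy as np
from wnb import Distribution as D
from wnb.dist import AllDistributions, BernoulliDist, CategoricalDist

bern = BernoulliDist.from_data(np.array([1, 1, 0, 1]))
print(bern.pmf(1), bern.pmf(0))  # ~0.75, ~0.25 (plus the 1e-10 term)
print(bern.pmf(2))               # 0.0: outside the [0, 1] support

cat = CategoricalDist.from_data(["cat", "dog", "cat"])
print(cat.pmf("cat"))   # ~2/3
print(cat.pmf("fish"))  # 0.0, now that .get(x, 0.0) supplies a default

# AllDistributions maps each Distribution member to its class
assert AllDistributions[D.BERNOULLI] is BernoulliDist
```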
21 changes: 17 additions & 4 deletions wnb/gnb.py
@@ -13,7 +13,7 @@

from ._base import ContinuousDistMixin, DiscreteDistMixin
from ._enums import Distribution
from .dist import AllDistributions
from .dist import AllDistributions, NonNumericDistributions


class GeneralNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
@@ -44,6 +44,13 @@ def _more_tags(self):
'requires_y': True
}

def _get_distributions(self):
try:
if self.distributions_ is not None:
return self.distributions_
except:
return self.distributions if self.distributions is not None else []

def _check_inputs(self, X, y):
# Check if only one class is present in label vector
if self.n_classes_ == 1:
@@ -53,7 +60,7 @@ def _check_inputs(self, X, y):
array=X,
accept_sparse=False,
accept_large_sparse=False,
dtype='numeric',
dtype=None if any(d in self._get_distributions() for d in NonNumericDistributions) else 'numeric',
force_all_finite=True,
ensure_2d=True,
ensure_min_samples=1,
Expand All @@ -76,7 +83,7 @@ def _prepare_X_y(self, X=None, y=None):
# Convert to NumPy array if X is Pandas DataFrame
if isinstance(X, pd.DataFrame):
X = X.values
X = as_float_array(X)
X = X if any(d in self._get_distributions() for d in NonNumericDistributions) else as_float_array(X)

if y is not None:
# Convert to a NumPy array
@@ -210,7 +217,13 @@ def predict_log_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
check_is_fitted(self)

# Input validation
X = check_array(array=X, accept_large_sparse=False, force_all_finite=True, estimator=self)
X = check_array(
array=X,
accept_large_sparse=False,
force_all_finite=True,
dtype=None if any(d in self._get_distributions() for d in NonNumericDistributions) else 'numeric',
estimator=self
)

# Check if the number of input features matches the data seen during fit
if not X.shape[1] == self.n_features_in_:
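
Taken together, `_get_distributions` and the relaxed `dtype` checks let string-valued features flow through `fit`/`predict` once a categorical likelihood is declared — a sketch mirroring the new categorical test above:

```python
import numpy as np
from wnb import GeneralNB, Distribution as D

rng = np.random.RandomState(24)
colors = np.array(["black", "white"])[rng.randint(2, size=(150,))]
animals = np.array(["cat", "dog"])[rng.randint(2, size=(150,))]
X = np.column_stack([colors, animals])  # dtype is str, not numeric
y = rng.randint(1, 4, size=(150,))

clf = GeneralNB(distributions=[D.CATEGORICAL, D.CATEGORICAL])
clf.fit(X, y)  # previously rejected by the dtype='numeric' check
print(clf.predict(X[:5]))
```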
