Merge pull request #13 from msamsami/upgrade-dist
Upgrade the dist module, with bug fixes and improvements
msamsami authored Jun 9, 2023
2 parents 32cbde3 + 02456bf commit cd17ee9
Showing 8 changed files with 133 additions and 40 deletions.
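
At a glance, this release lets `GeneralNB` run end-to-end with non-Gaussian likelihoods. A minimal sketch of the pattern the new tests below exercise (all names come from the diff; the data is made up):

```python
import numpy as np
from wnb import GeneralNB, Distribution as D

# Synthetic binary features, as in the new Bernoulli test further down
rng = np.random.RandomState(0)
X = rng.randint(2, size=(60, 10))
y = rng.randint(1, 3, size=(60,))

# Declare one likelihood per feature; here, all ten are Bernoulli
clf = GeneralNB(distributions=[D.BERNOULLI for _ in range(10)])
clf.fit(X, y)
print(clf.predict(X[:3]))
print(clf.predict_proba(X[:3]))
```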
13 changes: 12 additions & 1 deletion README.md
@@ -1,6 +1,6 @@
# WNB: General and weighted naive Bayes classifiers

![](https://img.shields.io/badge/version-v0.1.13-green)
![](https://img.shields.io/badge/version-v0.1.14-green)
![](https://img.shields.io/badge/python-3.7%20%7C%203.8%20%7C%203.9-blue)
![](https://github.com/msamsami/weighted-naive-bayes/actions/workflows/python-publish.yml/badge.svg)
[![](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
@@ -90,3 +90,14 @@ wnb.fit(X, y)
```python
wnb.predict(x_test)
```

## Tests
To run the tests, install development requirements:
```
pip install -r requirements_dev.txt
```

Then, run pytest:
```
pytest
```
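
To run only the suite touched by this PR, pytest can also be pointed at a single file (the path appears later in this diff):
```
pytest tests/test_gnb.py
```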
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setup(
name='wnb',
version='0.1.13',
version='0.1.14',
description='Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.',
keywords=['python', 'bayes', 'naivebayes', 'classifier', 'probabilistic'],
author='Mehdi Samsami',
73 changes: 70 additions & 3 deletions tests/test_gnb.py
@@ -4,7 +4,7 @@
from sklearn.utils.estimator_checks import check_estimator
from sklearn.base import is_classifier
from sklearn.utils._testing import assert_array_equal, assert_array_almost_equal
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB

from wnb import GeneralNB, Distribution as D

@@ -40,10 +40,10 @@ def test_gnb():
assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8)


def test_gnb_vs_sklearn():
def test_gnb_vs_sklearn_gaussian():
"""General Naive Bayes classification vs sklearn Gaussian Naive Bayes classification.
Test GeneralNB with gaussian likelihoods returns the same outputs as the sklearn GaussianNB.
Test GeneralNB with gaussian likelihoods returns the same outputs as the sklearn MultinomialNB.
"""
clf1 = GeneralNB()
clf1.fit(X, y)
@@ -64,6 +64,73 @@ def test_gnb_vs_sklearn():
assert_array_almost_equal(y_pred_log_proba1, y_pred_log_proba2, 5)


def test_gnb_vs_sklearn_bernoulli():
"""General Naive Bayes classification vs sklearn Bernoulli Naive Bayes classification.
Test GeneralNB with bernoulli likelihoods returns the same outputs as the sklearn BernoulliNB.
"""
rng = np.random.RandomState(1)
X_ = rng.randint(2, size=(150, 100))
y_ = rng.randint(1, 5, size=(150, ))

clf1 = GeneralNB(distributions=[D.BERNOULLI for _ in range(100)])
clf1.fit(X_, y_)

clf2 = BernoulliNB(alpha=1e-10, force_alpha=True)
clf2.fit(X_, y_)

y_pred1 = clf1.predict(X_[2:3])
y_pred2 = clf2.predict(X_[2:3])
assert_array_equal(y_pred1, y_pred2)

y_pred_proba1 = clf1.predict_proba(X_[2:3])
y_pred_proba2 = clf2.predict_proba(X_[2:3])
assert_array_almost_equal(y_pred_proba1, y_pred_proba2, 6)

y_pred_log_proba1 = clf1.predict_log_proba(X_[2:3])
y_pred_log_proba2 = clf2.predict_log_proba(X_[2:3])
assert_array_almost_equal(y_pred_log_proba1, y_pred_log_proba2, 5)


def test_gnb_vs_sklearn_categorical():
"""General Naive Bayes classification vs sklearn Categorical Naive Bayes classification.
Test GeneralNB with categorical likelihoods returns the same outputs as the sklearn CategoricalNB.
"""
categorical_values = [
["cat", "dog"],
["morning", "noon", "afternoon", "evening"],
["apple", "orange", "watermelon"],
["black", "white"]
]
rng = np.random.RandomState(24)
X_str_ = np.empty((150, 4)).astype("str")
X_ = np.zeros((150, 4))
for i, options in enumerate(categorical_values):
rnd_values = rng.randint(len(options), size=(150, ))
X_str_[:, i] = np.array(options)[rnd_values]
X_[:, i] = rnd_values
y_ = rng.randint(1, 4, size=(150, ))

clf1 = GeneralNB(distributions=[D.CATEGORICAL for _ in range(len(categorical_values))])
clf1.fit(X_str_, y_)

clf2 = CategoricalNB(alpha=1e-10, force_alpha=True)
clf2.fit(X_, y_)

y_pred1 = clf1.predict(X_str_[2:3])
y_pred2 = clf2.predict(X_[2:3])
assert_array_equal(y_pred1, y_pred2)

y_pred_proba1 = clf1.predict_proba(X_str_[2:3])
y_pred_proba2 = clf2.predict_proba(X_[2:3])
assert_array_almost_equal(y_pred_proba1, y_pred_proba2, 6)

y_pred_log_proba1 = clf1.predict_log_proba(X_str_[2:3])
y_pred_log_proba2 = clf2.predict_log_proba(X_[2:3])
assert_array_almost_equal(y_pred_log_proba1, y_pred_log_proba2, 5)


def test_gnb_estimator():
"""
Test whether GeneralNB estimator adheres to scikit-learn conventions.
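
The body of `test_gnb_estimator` is collapsed here; a sketch of what a scikit-learn conventions check typically looks like, built only from the imports at the top of this file (not necessarily the committed body):

```python
from sklearn.base import is_classifier
from sklearn.utils.estimator_checks import check_estimator

from wnb import GeneralNB


def test_gnb_estimator():
    # check_estimator runs sklearn's battery of API-conformance checks
    check_estimator(GeneralNB())
    assert is_classifier(GeneralNB())
```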
2 changes: 1 addition & 1 deletion wnb/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.1.13"
__version__ = "0.1.14"
__author__ = "Mehdi Samsami"

__all__ = [
3 changes: 1 addition & 2 deletions wnb/_base.py
@@ -93,8 +93,7 @@ def get_params(self) -> dict:
def support(self) -> Union[List[float], Tuple[float, float]]:
"""Returns the support of the probability distribution.
If support is a list, the support is a limited number of discrete values. If it is a tuple, it indicates a
limited set/range of continuous values.
If support is a list, it represents a limited number of discrete values. If it is a tuple, it indicates a limited or unlimited range of continuous values.
"""
return self._support
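
Concretely, the two shapes of `support` described above look like this (a sketch; the import path is assumed from the file layout, and the printed value is illustrative):

```python
from wnb.dist import BernoulliDist

bern = BernoulliDist(p=0.3)
# Discrete distribution: support is a list of admissible values
print(bern.support)  # expected: [0, 1]
# A continuous likelihood would instead expose a tuple such as
# (-inf, inf), i.e. a (possibly unbounded) range of values.
```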
2 changes: 1 addition & 1 deletion wnb/_enums.py
@@ -18,6 +18,6 @@ class Distribution(str, Enum):
GAMMA = "Gamma"
BERNOULLI = "Bernoulli"
CATEGORICAL = "Categorical"
MULTINOMIAL = "Multinomial"
# MULTINOMIAL = "Multinomial"
GEOMETRIC = "Geometric"
POISSON = "Poisson"
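
Since `Distribution` subclasses both `str` and `Enum`, its members compare equal to their plain string values — a quick sketch:

```python
from wnb import Distribution as D

assert D.BERNOULLI == "Bernoulli"   # str subclass: equal to its value
assert D.POISSON.value == "Poisson"
print(list(D))  # every supported likelihood; MULTINOMIAL no longer appears
```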
57 changes: 30 additions & 27 deletions wnb/dist.py
@@ -16,7 +16,7 @@
'GammaDist',
'BernoulliDist',
'CategoricalDist',
'MultinomialDist',
# 'MultinomialDist',
'GeometricDist',
'PoissonDist'
]
@@ -140,10 +140,10 @@ def __init__(self, p: float):

@classmethod
def from_data(cls, data):
return cls(p=(np.array(data) == 1).sum() / len(data))
return cls(p=((np.array(data) == 1).sum() + 1e-10) / len(data)) # TODO: use alpha instead of 1e-10

def pmf(self, x: int) -> float:
return 0.0 if x not in [0, 1] else self.p if x == 1 else 1 - self.p
return 0.0 if x not in self._support else self.p if x == 1 else 1 - self.p


class CategoricalDist(DiscreteDistMixin):
@@ -157,32 +157,32 @@ def __init__(self, prob: Mapping[Any, float]):
@classmethod
def from_data(cls, data):
values, counts = np.unique(data, return_counts=True)
return cls(prob={v: c/len(data) for v, c in zip(values, counts)})
return cls(prob={v: (c + 1e-10)/len(data) for v, c in zip(values, counts)}) # TODO: use alpha instead of 1e-10

def pmf(self, x: Any) -> float:
return self.prob.get(x)


class MultinomialDist(DiscreteDistMixin):
name = D.MULTINOMIAL

def __init__(self, n: int, prob: Mapping[Any, float]):
self.n = n
self.prob = prob
self._support = [i for i in range(self.n+1)]
super().__init__()

@classmethod
def from_data(cls, data: Sequence[int]):
values, counts = np.unique(data, return_counts=True)
return cls(n=int(np.sum(values)), prob={v: c / len(data) for v, c in zip(values, counts)})

def pmf(self, x: Sequence[int]) -> float:
if sum(x) != self.n:
return 0.0
else:
return np.math.factorial(self.n) * np.product([p**v for v, p in self.prob.items()]) / \
np.product([np.math.factorial(v) for v in self.prob.keys()])
return self.prob.get(x, 0.0)


# class MultinomialDist(DiscreteDistMixin):
# name = D.MULTINOMIAL
#
# def __init__(self, n: int, prob: Mapping[int, float]):
# self.n = n
# self.prob = prob
# self._support = [i for i in range(self.n+1)]
# super().__init__()
#
# @classmethod
# def from_data(cls, data: Sequence[int]):
# values, counts = np.unique(data, return_counts=True)
# return cls(n=int(np.sum(values)), prob={v: c / len(data) for v, c in zip(values, counts)})
#
# def pmf(self, x: Sequence[int]) -> float:
# if sum(x) != self.n:
# return 0.0
# else:
# return np.math.factorial(self.n) * np.product([self.prob.get(v, 0.0)**v for v in x]) / \
# np.product([np.math.factorial(v) for v in x])


class GeometricDist(DiscreteDistMixin):
@@ -221,3 +221,6 @@ def pmf(self, x: int) -> float:
eval(cls).name: eval(cls)
for cls in __all__
}


NonNumericDistributions = [D.CATEGORICAL, ]
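
To make the smoothing and lookup changes above concrete, here is a sketch of the `from_data` → `pmf` round trip (import path assumed from the file layout; printed values are approximate):

```python
import numpy as np
from wnb import Distribution as D
from wnb.dist import AllDistributions, BernoulliDist, CategoricalDist

bern = BernoulliDist.from_data(np.array([1, 1, 0, 1]))
print(bern.pmf(1), bern.pmf(0))  # ~0.75, ~0.25 (plus the 1e-10 term)
print(bern.pmf(2))               # 0.0: outside the [0, 1] support

cat = CategoricalDist.from_data(["cat", "dog", "cat"])
print(cat.pmf("cat"))   # ~2/3
print(cat.pmf("fish"))  # 0.0, now that .get(x, 0.0) supplies a default

# AllDistributions maps each Distribution member to its class
assert AllDistributions[D.BERNOULLI] is BernoulliDist
```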
21 changes: 17 additions & 4 deletions wnb/gnb.py
@@ -13,7 +13,7 @@

from ._base import ContinuousDistMixin, DiscreteDistMixin
from ._enums import Distribution
from .dist import AllDistributions
from .dist import AllDistributions, NonNumericDistributions


class GeneralNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
@@ -44,6 +44,13 @@ def _more_tags(self):
'requires_y': True
}

def _get_distributions(self):
try:
if self.distributions_ is not None:
return self.distributions_
except:
return self.distributions if self.distributions is not None else []

def _check_inputs(self, X, y):
# Check if only one class is present in label vector
if self.n_classes_ == 1:
@@ -53,7 +60,7 @@ def _check_inputs(self, X, y):
array=X,
accept_sparse=False,
accept_large_sparse=False,
dtype='numeric',
dtype=None if any(d in self._get_distributions() for d in NonNumericDistributions) else 'numeric',
force_all_finite=True,
ensure_2d=True,
ensure_min_samples=1,
Expand All @@ -76,7 +83,7 @@ def _prepare_X_y(self, X=None, y=None):
# Convert to NumPy array if X is Pandas DataFrame
if isinstance(X, pd.DataFrame):
X = X.values
X = as_float_array(X)
X = X if any(d in self._get_distributions() for d in NonNumericDistributions) else as_float_array(X)

if y is not None:
# Convert to a NumPy array
@@ -210,7 +217,13 @@ def predict_log_proba(self, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
check_is_fitted(self)

# Input validation
X = check_array(array=X, accept_large_sparse=False, force_all_finite=True, estimator=self)
X = check_array(
array=X,
accept_large_sparse=False,
force_all_finite=True,
dtype=None if any(d in self._get_distributions() for d in NonNumericDistributions) else 'numeric',
estimator=self
)

# Check if the number of input features matches the data seen during fit
if not X.shape[1] == self.n_features_in_:
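
Taken together, `_get_distributions` and the relaxed `dtype` checks let string-valued features flow through `fit`/`predict` once a categorical likelihood is declared — a sketch mirroring the new categorical test above:

```python
import numpy as np
from wnb import GeneralNB, Distribution as D

rng = np.random.RandomState(24)
colors = np.array(["black", "white"])[rng.randint(2, size=(150,))]
animals = np.array(["cat", "dog"])[rng.randint(2, size=(150,))]
X = np.column_stack([colors, animals])  # dtype is str, not numeric
y = rng.randint(1, 4, size=(150,))

clf = GeneralNB(distributions=[D.CATEGORICAL, D.CATEGORICAL])
clf.fit(X, y)  # previously rejected by the dtype='numeric' check
print(clf.predict(X[:5]))
```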
