Merge pull request #34 from msamsami/fix-class-attrs

maint: fix class attributes, reformat the code
msamsami · Aug 6, 2024 · 3701e91 · 3701e91
2 parents 542a90c + bbcf431
commit 3701e91
Show file tree

Hide file tree

Showing 11 changed files with 82 additions and 191 deletions.
diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
 
 <div align="center">
 
-![Latest Release](https://img.shields.io/badge/release-v0.2.5-green)
+![Latest Release](https://img.shields.io/badge/release-v0.2.6-green)
 [![PyPI Version](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
 ![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)<br>
 ![GitHub Workflow Status (build)](https://github.com/msamsami/weighted-naive-bayes/actions/workflows/python-publish.yml/badge.svg)
@@ -83,7 +83,7 @@ wnb.fit(X, y)
 wnb.predict(x_test)
 ```
 
-## Compatibility with Scikit-learn
+## Compatibility with Scikit-learn 🤝
 
 The **wnb** library fully adheres to the Scikit-learn API, ensuring seamless integration with other Scikit-learn components and workflows. This means that users familiar with Scikit-learn will find the WNB classifiers intuitive to use.
 
@@ -100,7 +100,7 @@ Both Scikit-learn classifiers and WNB classifiers share these well-known methods
 
 By maintaining this consistency, WNB classifiers can be easily incorporated into existing machine learning pipelines and processes.
 
-## Benchmarks
+## Benchmarks 📊
 We conducted benchmarks on three datasets, [Breast Cancer](https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-wisconsin-diagnostic-dataset), [Digits](https://scikit-learn.org/stable/datasets/toy_dataset.html#optical-recognition-of-handwritten-digits-dataset), and [Wine](https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-recognition-dataset), to evaluate the performance of WNB classifiers and compare them with their Scikit-learn counterpart, `GaussianNB`. The results show that WNB classifiers perform better in certain cases.
 
 | Dataset          | Scikit-learn Classifier | Accuracy | WNB Classifier | Accuracy  |
@@ -124,7 +124,7 @@ Then, run pytest:
 pytest
 ```
 
-## Support us 🤝
+## Support us 💡
 You can support the project in the following ways:
 
 ⭐ Star WNB on GitHub (click the star button in the top right corner)

diff --git a/examples/gwnb_breast_cancer.py b/examples/gwnb_breast_cancer.py
@@ -10,9 +10,7 @@
 y = breast_cancer["target"]
 
 # Split the data into training and test sets
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.33, random_state=0
-)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
 
 # Train and score sklearn GaussianNB classifier
 gnb = GaussianNB()

diff --git a/pyproject.toml b/pyproject.toml
@@ -66,6 +66,9 @@ include = ["/README.md", "/wnb"]
 testpaths = ["tests"]
 filterwarnings = ["ignore"]
 
+[tool.black]
+line-length = 110
+
 [tool.isort]
 atomic = true
 profile = "black"

diff --git a/tests/benchmarks/utils.py b/tests/benchmarks/utils.py
@@ -17,13 +17,9 @@
 __all__ = ("benchmark",)
 
 
-def compare_score(
-    X, y, wnb, sklearn, random_state: int, test_size: float
-) -> tuple[float, float]:
+def compare_score(X, y, wnb, sklearn, random_state: int, test_size: float) -> tuple[float, float]:
     # Split the data into training and test sets
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=test_size, random_state=random_state
-    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
 
     # Train and score wnb classifier
     clf_wnb = clone(wnb)
@@ -36,14 +32,10 @@ def compare_score(
     return clf_wnb.score(X_test, y_test), clf_sklearn.score(X_test, y_test)
 
 
-def benchmark(
-    X, y, wnb, sklearn, max_iter: int = 50, test_size: float = 0.33
-) -> tuple[float, float]:
+def benchmark(X, y, wnb, sklearn, max_iter: int = 50, test_size: float = 0.33) -> tuple[float, float]:
     results = Parallel(n_jobs=-1, prefer="processes")(
         delayed(compare_score)(*param)
-        for param in tqdm(
-            [(X, y, wnb, sklearn, i, test_size) for i in range(max_iter)], ncols=80
-        )
+        for param in tqdm([(X, y, wnb, sklearn, i, test_size) for i in range(max_iter)], ncols=80)
     )
 
     return np.mean([r[0] for r in results]), np.mean([r[1] for r in results])
diff --git a/tests/test_gnb.py b/tests/test_gnb.py
@@ -107,9 +107,7 @@ def test_gnb_vs_sklearn_categorical():
         X_[:, i] = rnd_values
     y_ = rng.randint(1, 4, size=(150,))
 
-    clf1 = GeneralNB(
-        distributions=[D.CATEGORICAL for _ in range(len(categorical_values))]
-    )
+    clf1 = GeneralNB(distributions=[D.CATEGORICAL for _ in range(len(categorical_values))])
     clf1.fit(X_str_, y_)
 
     clf2 = CategoricalNB(alpha=1e-10, force_alpha=True)

diff --git a/wnb/__init__.py b/wnb/__init__.py
@@ -2,7 +2,7 @@
 Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.
 """
 
-__version__ = "0.2.5"
+__version__ = "0.2.6"
 __author__ = "Mehdi Samsami"
 
 

diff --git a/wnb/_base.py b/wnb/_base.py
@@ -21,9 +21,7 @@ def vectorize(otypes=None, excluded=None, signature=None):
     """
 
     def decorator(func):
-        vectorized = np.vectorize(
-            func, otypes=otypes, excluded=excluded, signature=signature
-        )
+        vectorized = np.vectorize(func, otypes=otypes, excluded=excluded, signature=signature)
 
         @wraps(func)
         def wrapper(*args):
@@ -65,9 +63,7 @@ def _get_param_names(cls) -> list[str]:
 
         init_signature = inspect.signature(init)
         parameters = [
-            p
-            for p in init_signature.parameters.values()
-            if p.name != "self" and p.kind != p.VAR_KEYWORD
+            p for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD
         ]
 
         for p in parameters:
@@ -105,8 +101,7 @@ def support(self) -> list[float] | tuple[float, float]:
 
     def _check_support(self, x) -> None:
         if (isinstance(self.support, list) and x not in self.support) or (
-            isinstance(self.support, tuple)
-            and (x < self.support[0] or x > self.support[1])
+            isinstance(self.support, tuple) and (x < self.support[0] or x > self.support[1])
         ):
             warnings.warn(
                 "Value doesn't lie within the support of the distribution",

diff --git a/wnb/dist.py b/wnb/dist.py
@@ -39,9 +39,7 @@ def from_data(cls, data: np.ndarray, **kwargs):
         return cls(mu=np.average(data), sigma=np.std(data))
 
     def pdf(self, x: float) -> float:
-        return (1.0 / np.sqrt(2 * np.pi * self.sigma**2)) * np.exp(
-            -0.5 * (((x - self.mu) / self.sigma) ** 2)
-        )
+        return (1.0 / np.sqrt(2 * np.pi * self.sigma**2)) * np.exp(-0.5 * (((x - self.mu) / self.sigma) ** 2))
 
 
 class LognormalDist(ContinuousDistMixin):
@@ -114,11 +112,7 @@ def from_data(cls, data, **kwargs):
         return cls(x_m=x_m, alpha=len(data) / np.sum(np.log(data / x_m)))
 
     def pdf(self, x: float) -> float:
-        return (
-            (self.alpha * self.x_m**self.alpha) / x ** (self.alpha + 1)
-            if x >= self.x_m
-            else 0.0
-        )
+        return (self.alpha * self.x_m**self.alpha) / x ** (self.alpha + 1) if x >= self.x_m else 0.0
 
 
 class GammaDist(ContinuousDistMixin):
@@ -134,19 +128,12 @@ def __init__(self, k: float, theta: float):
     def from_data(cls, data, **kwargs):
         n = len(data)
         return cls(
-            k=n
-            * np.sum(data)
-            / (n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))),
-            theta=(
-                n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))
-            )
-            / n**2,
+            k=n * np.sum(data) / (n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))),
+            theta=(n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))) / n**2,
         )
 
     def pdf(self, x: float) -> float:
-        return (x ** (self.k - 1) * np.exp(-x / self.theta)) / (
-            gamma(self.k) * self.theta**self.k
-        )
+        return (x ** (self.k - 1) * np.exp(-x / self.theta)) / (gamma(self.k) * self.theta**self.k)
 
 
 class BetaDist(ContinuousDistMixin):
@@ -169,9 +156,7 @@ def from_data(cls, data, **kwargs):
         )
 
     def pdf(self, x: float) -> float:
-        return ((x ** (self.alpha - 1)) * (1 - x) ** (self.beta - 1)) / beta(
-            self.alpha, self.beta
-        )
+        return ((x ** (self.alpha - 1)) * (1 - x) ** (self.beta - 1)) / beta(self.alpha, self.beta)
 
 
 class ChiSquaredDist(ContinuousDistMixin):
@@ -203,9 +188,9 @@ def from_data(cls, data, **kwargs):
         return cls(df=len(data) - 1)
 
     def pdf(self, x: float) -> float:
-        return (
-            gamma((self.df + 1) / 2) / (np.sqrt(self.df * np.pi) * gamma(self.df / 2))
-        ) * (1 + (x**2 / self.df)) ** (-(self.df + 1) / 2)
+        return (gamma((self.df + 1) / 2) / (np.sqrt(self.df * np.pi) * gamma(self.df / 2))) * (
+            1 + (x**2 / self.df)
+        ) ** (-(self.df + 1) / 2)
 
 
 class RayleighDist(ContinuousDistMixin):
@@ -222,11 +207,7 @@ def from_data(cls, data, **kwargs):
         return cls(sigma=sigma)
 
     def pdf(self, x: float) -> float:
-        return (
-            (x / self.sigma**2) * np.exp(-(x**2) / (2 * self.sigma**2))
-            if x >= 0
-            else 0.0
-        )
+        return (x / self.sigma**2) * np.exp(-(x**2) / (2 * self.sigma**2)) if x >= 0 else 0.0
 
 
 class BernoulliDist(DiscreteDistMixin):
@@ -281,11 +262,7 @@ def from_data(cls, data, **kwargs):
         return cls(p=len(data) / np.sum(data))
 
     def pmf(self, x: int) -> float:
-        return (
-            self.p * (1 - self.p) ** (x - 1)
-            if x >= self._support[0] and x - int(x) == 0
-            else 0.0
-        )
+        return self.p * (1 - self.p) ** (x - 1) if x >= self._support[0] and x - int(x) == 0 else 0.0
 
 
 class PoissonDist(DiscreteDistMixin):

diff --git a/wnb/gnb.py b/wnb/gnb.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import warnings
 from abc import ABCMeta
 from typing import Optional, Sequence
@@ -12,6 +14,7 @@
 from sklearn.utils.validation import check_is_fitted
 from typing_extensions import Self
 
+from ._base import DistMixin
 from ._typing import ArrayLike, DistibutionLike, Float, MatrixLike
 from .dist import NonNumericDistributions
 from .enums import Distribution
@@ -65,15 +68,6 @@ class GeneralNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
         A mapping from class labels to their fitted likelihood distributions.
     """
 
-    class_count_: np.ndarray
-    class_prior_: np.ndarray
-    classes_: np.ndarray
-    n_classes_: int
-    n_features_in_: int
-    feature_names_in_: np.ndarray
-    distributions_: list
-    likelihood_params_: dict
-
     def __init__(
         self,
         *,
@@ -108,9 +102,7 @@ def _check_inputs(self, X, y):
             accept_sparse=False,
             accept_large_sparse=False,
             dtype=(
-                None
-                if any(d in self._get_distributions() for d in NonNumericDistributions)
-                else "numeric"
+                None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric"
             ),
             force_all_finite=True,
             ensure_2d=True,
@@ -125,10 +117,7 @@ def _check_inputs(self, X, y):
 
         # Check that the number of samples and labels are compatible
         if X.shape[0] != y.shape[0]:
-            raise ValueError(
-                "X.shape[0]=%d and y.shape[0]=%d are incompatible."
-                % (X.shape[0], y.shape[0])
-            )
+            raise ValueError("X.shape[0]=%d and y.shape[0]=%d are incompatible." % (X.shape[0], y.shape[0]))
 
     def _prepare_X_y(self, X=None, y=None, from_fit=False):
         if from_fit and y is None:
@@ -164,6 +153,8 @@ def _prepare_X_y(self, X=None, y=None, from_fit=False):
         return output[0] if len(output) == 1 else output
 
     def _prepare_parameters(self):
+        self.class_prior_: np.ndarray
+
         # Set priors if not specified
         if self.priors is None:
             self.class_prior_ = (
@@ -189,8 +180,7 @@ def _prepare_parameters(self):
 
         # Set distributions if not specified
         if self.distributions is None:
-            self.distributions_ = [Distribution.NORMAL] * self.n_features_in_
-
+            self.distributions_: list[DistibutionLike] = [Distribution.NORMAL] * self.n_features_in_
         else:
             # Check if the number of distributions matches the number of features
             if len(self.distributions) != self.n_features_in_:
@@ -202,11 +192,9 @@ def _prepare_parameters(self):
             # Check that all specified distributions are supported
             for i, dist in enumerate(self.distributions):
                 if not is_dist_supported(dist):
-                    raise ValueError(
-                        f"Distribution '{dist}' at index {i} is not supported."
-                    )
+                    raise ValueError(f"Distribution '{dist}' at index {i} is not supported.")
 
-            self.distributions_ = self.distributions
+            self.distributions_: list[DistibutionLike] = list(self.distributions)
 
     def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
         """Fits general Naive Bayes classifier according to X, y.
@@ -225,26 +213,28 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
         self : object
             Returns the instance itself.
         """
+        self.n_features_in_: int
+        self.feature_names_in_: np.ndarray
         self._check_n_features(X=X, reset=True)
         self._check_feature_names(X=X, reset=True)
 
         X, y = self._prepare_X_y(X, y, from_fit=True)
 
+        self.classes_: np.ndarray
+        self.class_count_: np.ndarray
         self.classes_, y_, self.class_count_ = np.unique(
             y, return_counts=True, return_inverse=True
         )  # Unique class labels, their indices, and class counts
-        self.n_classes_ = len(self.classes_)  # Number of classes
+        self.n_classes_: int = len(self.classes_)  # Number of classes
 
         self._check_inputs(X, y)
 
         y = y_
         self._prepare_parameters()
 
-        self.likelihood_params_ = {
+        self.likelihood_params_: dict[int, list[DistMixin]] = {
             c: [
-                get_dist_class(self.distributions_[i]).from_data(
-                    X[y == c, i], alpha=self.alpha
-                )
+                get_dist_class(self.distributions_[i]).from_data(X[y == c, i], alpha=self.alpha)
                 for i in range(self.n_features_in_)
             ]
             for c in range(self.n_classes_)
@@ -293,18 +283,15 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray:
             accept_large_sparse=False,
             force_all_finite=True,
             dtype=(
-                None
-                if any(d in self._get_distributions() for d in NonNumericDistributions)
-                else "numeric"
+                None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric"
             ),
             estimator=self,
         )
 
         # Check if the number of input features matches the data seen during fit
         if X.shape[1] != self.n_features_in_:
             raise ValueError(
-                "Expected input with %d features, got %d instead."
-                % (self.n_features_in_, X.shape[1])
+                "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1])
             )
 
         n_samples = X.shape[0]
@@ -314,17 +301,12 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray:
         log_joint = np.zeros((n_samples, self.n_classes_))
         for c in range(self.n_classes_):
             log_joint[:, c] = np.log(self.class_prior_[c]) + np.sum(
-                [
-                    np.log(likelihood(X[:, i]))
-                    for i, likelihood in enumerate(self.likelihood_params_[c])
-                ],
+                [np.log(likelihood(X[:, i])) for i, likelihood in enumerate(self.likelihood_params_[c])],
                 axis=0,
             )
 
         log_proba = log_joint - np.transpose(
-            np.repeat(
-                logsumexp(log_joint, axis=1).reshape(1, -1), self.n_classes_, axis=0
-            )
+            np.repeat(logsumexp(log_joint, axis=1).reshape(1, -1), self.n_classes_, axis=0)
         )
         return log_proba