Skip to content

Commit

Permalink
Merge pull request #34 from msamsami/fix-class-attrs
Browse files Browse the repository at this point in the history
maint: fix class attributes, reformat the code
  • Loading branch information
msamsami authored Aug 6, 2024
2 parents 542a90c + bbcf431 commit 3701e91
Show file tree
Hide file tree
Showing 11 changed files with 82 additions and 191 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

<div align="center">

![Latest Release](https://img.shields.io/badge/release-v0.2.5-green)
![Latest Release](https://img.shields.io/badge/release-v0.2.6-green)
[![PyPI Version](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12-blue)<br>
![GitHub Workflow Status (build)](https://github.com/msamsami/weighted-naive-bayes/actions/workflows/python-publish.yml/badge.svg)
Expand Down Expand Up @@ -83,7 +83,7 @@ wnb.fit(X, y)
wnb.predict(x_test)
```

## Compatibility with Scikit-learn
## Compatibility with Scikit-learn 🤝

The **wnb** library fully adheres to the Scikit-learn API, ensuring seamless integration with other Scikit-learn components and workflows. This means that users familiar with Scikit-learn will find the WNB classifiers intuitive to use.

Expand All @@ -100,7 +100,7 @@ Both Scikit-learn classifiers and WNB classifiers share these well-known methods

By maintaining this consistency, WNB classifiers can be easily incorporated into existing machine learning pipelines and processes.

## Benchmarks
## Benchmarks 📊
We conducted benchmarks on three datasets, [Breast Cancer](https://scikit-learn.org/stable/datasets/toy_dataset.html#breast-cancer-wisconsin-diagnostic-dataset), [Digits](https://scikit-learn.org/stable/datasets/toy_dataset.html#optical-recognition-of-handwritten-digits-dataset), and [Wine](https://scikit-learn.org/stable/datasets/toy_dataset.html#wine-recognition-dataset), to evaluate the performance of WNB classifiers and compare them with their Scikit-learn counterpart, `GaussianNB`. The results show that WNB classifiers outperform `GaussianNB` in certain cases.

| Dataset | Scikit-learn Classifier | Accuracy | WNB Classifier | Accuracy |
Expand All @@ -124,7 +124,7 @@ Then, run pytest:
pytest
```

## Support us 🤝
## Support us 💡
You can support the project in the following ways:

⭐ Star WNB on GitHub (click the star button in the top right corner)
Expand Down
4 changes: 1 addition & 3 deletions examples/gwnb_breast_cancer.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@
y = breast_cancer["target"]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=0
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Train and score sklearn GaussianNB classifier
gnb = GaussianNB()
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ include = ["/README.md", "/wnb"]
testpaths = ["tests"]
filterwarnings = ["ignore"]

[tool.black]
line-length = 110

[tool.isort]
atomic = true
profile = "black"
Expand Down
16 changes: 4 additions & 12 deletions tests/benchmarks/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,9 @@
__all__ = ("benchmark",)


def compare_score(
X, y, wnb, sklearn, random_state: int, test_size: float
) -> tuple[float, float]:
def compare_score(X, y, wnb, sklearn, random_state: int, test_size: float) -> tuple[float, float]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=test_size, random_state=random_state
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

# Train and score wnb classifier
clf_wnb = clone(wnb)
Expand All @@ -36,14 +32,10 @@ def compare_score(
return clf_wnb.score(X_test, y_test), clf_sklearn.score(X_test, y_test)


def benchmark(
X, y, wnb, sklearn, max_iter: int = 50, test_size: float = 0.33
) -> tuple[float, float]:
def benchmark(X, y, wnb, sklearn, max_iter: int = 50, test_size: float = 0.33) -> tuple[float, float]:
results = Parallel(n_jobs=-1, prefer="processes")(
delayed(compare_score)(*param)
for param in tqdm(
[(X, y, wnb, sklearn, i, test_size) for i in range(max_iter)], ncols=80
)
for param in tqdm([(X, y, wnb, sklearn, i, test_size) for i in range(max_iter)], ncols=80)
)

return np.mean([r[0] for r in results]), np.mean([r[1] for r in results])
4 changes: 1 addition & 3 deletions tests/test_gnb.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,7 @@ def test_gnb_vs_sklearn_categorical():
X_[:, i] = rnd_values
y_ = rng.randint(1, 4, size=(150,))

clf1 = GeneralNB(
distributions=[D.CATEGORICAL for _ in range(len(categorical_values))]
)
clf1 = GeneralNB(distributions=[D.CATEGORICAL for _ in range(len(categorical_values))])
clf1.fit(X_str_, y_)

clf2 = CategoricalNB(alpha=1e-10, force_alpha=True)
Expand Down
2 changes: 1 addition & 1 deletion wnb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Python library for the implementations of general and weighted naive Bayes (WNB) classifiers.
"""

__version__ = "0.2.5"
__version__ = "0.2.6"
__author__ = "Mehdi Samsami"


Expand Down
11 changes: 3 additions & 8 deletions wnb/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,7 @@ def vectorize(otypes=None, excluded=None, signature=None):
"""

def decorator(func):
vectorized = np.vectorize(
func, otypes=otypes, excluded=excluded, signature=signature
)
vectorized = np.vectorize(func, otypes=otypes, excluded=excluded, signature=signature)

@wraps(func)
def wrapper(*args):
Expand Down Expand Up @@ -65,9 +63,7 @@ def _get_param_names(cls) -> list[str]:

init_signature = inspect.signature(init)
parameters = [
p
for p in init_signature.parameters.values()
if p.name != "self" and p.kind != p.VAR_KEYWORD
p for p in init_signature.parameters.values() if p.name != "self" and p.kind != p.VAR_KEYWORD
]

for p in parameters:
Expand Down Expand Up @@ -105,8 +101,7 @@ def support(self) -> list[float] | tuple[float, float]:

def _check_support(self, x) -> None:
if (isinstance(self.support, list) and x not in self.support) or (
isinstance(self.support, tuple)
and (x < self.support[0] or x > self.support[1])
isinstance(self.support, tuple) and (x < self.support[0] or x > self.support[1])
):
warnings.warn(
"Value doesn't lie within the support of the distribution",
Expand Down
45 changes: 11 additions & 34 deletions wnb/dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,7 @@ def from_data(cls, data: np.ndarray, **kwargs):
return cls(mu=np.average(data), sigma=np.std(data))

def pdf(self, x: float) -> float:
return (1.0 / np.sqrt(2 * np.pi * self.sigma**2)) * np.exp(
-0.5 * (((x - self.mu) / self.sigma) ** 2)
)
return (1.0 / np.sqrt(2 * np.pi * self.sigma**2)) * np.exp(-0.5 * (((x - self.mu) / self.sigma) ** 2))


class LognormalDist(ContinuousDistMixin):
Expand Down Expand Up @@ -114,11 +112,7 @@ def from_data(cls, data, **kwargs):
return cls(x_m=x_m, alpha=len(data) / np.sum(np.log(data / x_m)))

def pdf(self, x: float) -> float:
return (
(self.alpha * self.x_m**self.alpha) / x ** (self.alpha + 1)
if x >= self.x_m
else 0.0
)
return (self.alpha * self.x_m**self.alpha) / x ** (self.alpha + 1) if x >= self.x_m else 0.0


class GammaDist(ContinuousDistMixin):
Expand All @@ -134,19 +128,12 @@ def __init__(self, k: float, theta: float):
def from_data(cls, data, **kwargs):
n = len(data)
return cls(
k=n
* np.sum(data)
/ (n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))),
theta=(
n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))
)
/ n**2,
k=n * np.sum(data) / (n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))),
theta=(n * np.sum(data * np.log(data)) - np.sum(data * np.sum(np.log(data)))) / n**2,
)

def pdf(self, x: float) -> float:
return (x ** (self.k - 1) * np.exp(-x / self.theta)) / (
gamma(self.k) * self.theta**self.k
)
return (x ** (self.k - 1) * np.exp(-x / self.theta)) / (gamma(self.k) * self.theta**self.k)


class BetaDist(ContinuousDistMixin):
Expand All @@ -169,9 +156,7 @@ def from_data(cls, data, **kwargs):
)

def pdf(self, x: float) -> float:
return ((x ** (self.alpha - 1)) * (1 - x) ** (self.beta - 1)) / beta(
self.alpha, self.beta
)
return ((x ** (self.alpha - 1)) * (1 - x) ** (self.beta - 1)) / beta(self.alpha, self.beta)


class ChiSquaredDist(ContinuousDistMixin):
Expand Down Expand Up @@ -203,9 +188,9 @@ def from_data(cls, data, **kwargs):
return cls(df=len(data) - 1)

def pdf(self, x: float) -> float:
return (
gamma((self.df + 1) / 2) / (np.sqrt(self.df * np.pi) * gamma(self.df / 2))
) * (1 + (x**2 / self.df)) ** (-(self.df + 1) / 2)
return (gamma((self.df + 1) / 2) / (np.sqrt(self.df * np.pi) * gamma(self.df / 2))) * (
1 + (x**2 / self.df)
) ** (-(self.df + 1) / 2)


class RayleighDist(ContinuousDistMixin):
Expand All @@ -222,11 +207,7 @@ def from_data(cls, data, **kwargs):
return cls(sigma=sigma)

def pdf(self, x: float) -> float:
return (
(x / self.sigma**2) * np.exp(-(x**2) / (2 * self.sigma**2))
if x >= 0
else 0.0
)
return (x / self.sigma**2) * np.exp(-(x**2) / (2 * self.sigma**2)) if x >= 0 else 0.0


class BernoulliDist(DiscreteDistMixin):
Expand Down Expand Up @@ -281,11 +262,7 @@ def from_data(cls, data, **kwargs):
return cls(p=len(data) / np.sum(data))

def pmf(self, x: int) -> float:
return (
self.p * (1 - self.p) ** (x - 1)
if x >= self._support[0] and x - int(x) == 0
else 0.0
)
return self.p * (1 - self.p) ** (x - 1) if x >= self._support[0] and x - int(x) == 0 else 0.0


class PoissonDist(DiscreteDistMixin):
Expand Down
60 changes: 21 additions & 39 deletions wnb/gnb.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import warnings
from abc import ABCMeta
from typing import Optional, Sequence
Expand All @@ -12,6 +14,7 @@
from sklearn.utils.validation import check_is_fitted
from typing_extensions import Self

from ._base import DistMixin
from ._typing import ArrayLike, DistibutionLike, Float, MatrixLike
from .dist import NonNumericDistributions
from .enums import Distribution
Expand Down Expand Up @@ -65,15 +68,6 @@ class GeneralNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
A mapping from class labels to their fitted likelihood distributions.
"""

class_count_: np.ndarray
class_prior_: np.ndarray
classes_: np.ndarray
n_classes_: int
n_features_in_: int
feature_names_in_: np.ndarray
distributions_: list
likelihood_params_: dict

def __init__(
self,
*,
Expand Down Expand Up @@ -108,9 +102,7 @@ def _check_inputs(self, X, y):
accept_sparse=False,
accept_large_sparse=False,
dtype=(
None
if any(d in self._get_distributions() for d in NonNumericDistributions)
else "numeric"
None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric"
),
force_all_finite=True,
ensure_2d=True,
Expand All @@ -125,10 +117,7 @@ def _check_inputs(self, X, y):

# Check that the number of samples and labels are compatible
if X.shape[0] != y.shape[0]:
raise ValueError(
"X.shape[0]=%d and y.shape[0]=%d are incompatible."
% (X.shape[0], y.shape[0])
)
raise ValueError("X.shape[0]=%d and y.shape[0]=%d are incompatible." % (X.shape[0], y.shape[0]))

def _prepare_X_y(self, X=None, y=None, from_fit=False):
if from_fit and y is None:
Expand Down Expand Up @@ -164,6 +153,8 @@ def _prepare_X_y(self, X=None, y=None, from_fit=False):
return output[0] if len(output) == 1 else output

def _prepare_parameters(self):
self.class_prior_: np.ndarray

# Set priors if not specified
if self.priors is None:
self.class_prior_ = (
Expand All @@ -189,8 +180,7 @@ def _prepare_parameters(self):

# Set distributions if not specified
if self.distributions is None:
self.distributions_ = [Distribution.NORMAL] * self.n_features_in_

self.distributions_: list[DistibutionLike] = [Distribution.NORMAL] * self.n_features_in_
else:
# Check if the number of distributions matches the number of features
if len(self.distributions) != self.n_features_in_:
Expand All @@ -202,11 +192,9 @@ def _prepare_parameters(self):
# Check that all specified distributions are supported
for i, dist in enumerate(self.distributions):
if not is_dist_supported(dist):
raise ValueError(
f"Distribution '{dist}' at index {i} is not supported."
)
raise ValueError(f"Distribution '{dist}' at index {i} is not supported.")

self.distributions_ = self.distributions
self.distributions_: list[DistibutionLike] = list(self.distributions)

def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
"""Fits general Naive Bayes classifier according to X, y.
Expand All @@ -225,26 +213,28 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self:
self : object
Returns the instance itself.
"""
self.n_features_in_: int
self.feature_names_in_: np.ndarray
self._check_n_features(X=X, reset=True)
self._check_feature_names(X=X, reset=True)

X, y = self._prepare_X_y(X, y, from_fit=True)

self.classes_: np.ndarray
self.class_count_: np.ndarray
self.classes_, y_, self.class_count_ = np.unique(
y, return_counts=True, return_inverse=True
) # Unique class labels, their indices, and class counts
self.n_classes_ = len(self.classes_) # Number of classes
self.n_classes_: int = len(self.classes_) # Number of classes

self._check_inputs(X, y)

y = y_
self._prepare_parameters()

self.likelihood_params_ = {
self.likelihood_params_: dict[int, list[DistMixin]] = {
c: [
get_dist_class(self.distributions_[i]).from_data(
X[y == c, i], alpha=self.alpha
)
get_dist_class(self.distributions_[i]).from_data(X[y == c, i], alpha=self.alpha)
for i in range(self.n_features_in_)
]
for c in range(self.n_classes_)
Expand Down Expand Up @@ -293,18 +283,15 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray:
accept_large_sparse=False,
force_all_finite=True,
dtype=(
None
if any(d in self._get_distributions() for d in NonNumericDistributions)
else "numeric"
None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric"
),
estimator=self,
)

# Check if the number of input features matches the data seen during fit
if X.shape[1] != self.n_features_in_:
raise ValueError(
"Expected input with %d features, got %d instead."
% (self.n_features_in_, X.shape[1])
"Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1])
)

n_samples = X.shape[0]
Expand All @@ -314,17 +301,12 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray:
log_joint = np.zeros((n_samples, self.n_classes_))
for c in range(self.n_classes_):
log_joint[:, c] = np.log(self.class_prior_[c]) + np.sum(
[
np.log(likelihood(X[:, i]))
for i, likelihood in enumerate(self.likelihood_params_[c])
],
[np.log(likelihood(X[:, i])) for i, likelihood in enumerate(self.likelihood_params_[c])],
axis=0,
)

log_proba = log_joint - np.transpose(
np.repeat(
logsumexp(log_joint, axis=1).reshape(1, -1), self.n_classes_, axis=0
)
np.repeat(logsumexp(log_joint, axis=1).reshape(1, -1), self.n_classes_, axis=0)
)
return log_proba

Expand Down
Loading

0 comments on commit 3701e91

Please sign in to comment.