From 187a31e9fcc4c7d317b5a7481b7202d0337bf631 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:30:40 +0330 Subject: [PATCH 1/6] remove test section from readme, remove manifest file --- MANIFEST.in | 4 ---- README.md | 12 ------------ 2 files changed, 16 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index dca749c..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include README.md -include LICENSE -recursive-include wnb *.py -recursive-include tests *.py diff --git a/README.md b/README.md index ba484e7..45cb00e 100644 --- a/README.md +++ b/README.md @@ -123,18 +123,6 @@ These benchmarks highlight the potential of WNB classifiers to provide better pe The scripts used to generate these benchmark results are available in the _tests/benchmarks/_ directory. -## Tests -To run the tests, make sure to clone the repository and install the development requirements in addition to base requirements: -```bash -pip install -r requirements.txt -pip install -r requirements-dev.txt -``` - -Then, run pytest: -```bash -pytest -``` - ## Support us 💡 You can support the project in the following ways: From ac9302f905c6e8f62414642823c1749908ac44db Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:32:02 +0330 Subject: [PATCH 2/6] major internal refactor in general naive bayes class --- tests/test_gnb.py | 21 ++++++ wnb/_utils.py | 10 +++ wnb/gnb.py | 179 ++++++++-------------------------------------- 3 files changed, 61 insertions(+), 149 deletions(-) diff --git a/tests/test_gnb.py b/tests/test_gnb.py index d6f2801..03dda03 100644 --- a/tests/test_gnb.py +++ b/tests/test_gnb.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from sklearn.base import is_classifier from sklearn.naive_bayes import BernoulliNB, CategoricalNB, GaussianNB @@ -281,3 +282,23 @@ def test_gnb_var_smoothing_non_numeric(): clf = GeneralNB(distributions=[D.CATEGORICAL, D.CATEGORICAL], var_smoothing=1e-6) clf.fit(X, y) assert clf.epsilon_ == 0 + + +def test_gnb_attrs(): + """ + Test whether the attributes are properly set. 
+ """ + clf = GeneralNB().fit(X, y) + assert np.array_equal(clf.class_count_, np.array([3, 3])) + assert np.array_equal(clf.class_prior_, np.array([0.5, 0.5])) + assert np.array_equal(clf.classes_, np.array([1, 2])) + assert clf.n_classes_ == 2 + assert clf.epsilon_ > 0 + assert clf.n_features_in_ == 2 + assert not hasattr(clf, "feature_names_in_") + assert clf.distributions_ == [D.NORMAL, D.NORMAL] + assert len(clf.likelihood_params_) == 2 + + feature_names = [f"x{i}" for i in range(X.shape[1])] + clf = GeneralNB().fit(pd.DataFrame(X, columns=feature_names), y) + assert np.array_equal(clf.feature_names_in_, np.array(feature_names)) diff --git a/wnb/_utils.py b/wnb/_utils.py index 0686d0d..27b568f 100644 --- a/wnb/_utils.py +++ b/wnb/_utils.py @@ -7,6 +7,7 @@ __all__ = [ "SKLEARN_V1_6_OR_LATER", "validate_data", + "check_X_y", "_check_n_features", "_check_feature_names", ] @@ -17,6 +18,7 @@ if SKLEARN_V1_6_OR_LATER: from sklearn.utils.validation import _check_feature_names, _check_n_features + from sklearn.utils.validation import check_X_y as _check_X_y from sklearn.utils.validation import validate_data as _validate_data def validate_data(*args, **kwargs): @@ -24,12 +26,20 @@ def validate_data(*args, **kwargs): kwargs["ensure_all_finite"] = kwargs.pop("force_all_finite") return _validate_data(*args, **kwargs) + def check_X_y(*args, **kwargs): + if kwargs.get("force_all_finite"): + kwargs["ensure_all_finite"] = kwargs.pop("force_all_finite") + return _check_X_y(*args, **kwargs) + else: def validate_data(estimator, X, **kwargs: Any): kwargs.pop("reset", None) return check_array(X, estimator=estimator, **kwargs) + def check_X_y(*args, **kwargs): + return _check_X_y(*args, **kwargs) + def _check_n_features(estimator, X, reset): return estimator._check_n_features(X, reset=reset) diff --git a/wnb/gnb.py b/wnb/gnb.py index 5a0b2fc..6313324 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -1,18 +1,11 @@ from __future__ import annotations import sys -import warnings -from abc import ABCMeta from typing import Optional, Sequence import numpy as np -import pandas as pd -from scipy.special import logsumexp -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.exceptions import DataConversionWarning -from sklearn.utils import as_float_array +from sklearn.naive_bayes import _BaseNB from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import _ensure_no_complex_data, check_is_fitted if sys.version_info >= (3, 11): from typing import Self @@ -28,6 +21,7 @@ SKLEARN_V1_6_OR_LATER, _check_feature_names, _check_n_features, + check_X_y, validate_data, ) from .typing import ArrayLike, Float, MatrixLike @@ -35,7 +29,7 @@ __all__ = ["GeneralNB"] -class GeneralNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): +class GeneralNB(_BaseNB): """A General Naive Bayes classifier that supports distinct likelihood distributions for individual features, enabling more tailored modeling beyond the standard single-distribution approaches such as GaussianNB and BernoulliNB. 
@@ -117,15 +111,8 @@ def _get_distributions(self) -> Sequence[DistributionLike]: except Exception: return self.distributions or [] - def _check_inputs(self, X, y) -> None: - # Check if the targets are suitable for classification - check_classification_targets(y) - - # Check if only one class is present in label vector - if self.n_classes_ == 1: - raise ValueError("Classifier can't train when only one class is present") - - X = validate_data( + def _check_X(self, X) -> np.ndarray: + return validate_data( self, X, accept_sparse=False, @@ -134,50 +121,25 @@ def _check_inputs(self, X, y) -> None: None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" ), force_all_finite=True, - ensure_2d=True, - ensure_min_samples=1, - ensure_min_features=1, + reset=False, ) - # Check that the number of samples and labels are compatible - if X.shape[0] != y.shape[0]: - raise ValueError("X.shape[0]=%d and y.shape[0]=%d are incompatible." % (X.shape[0], y.shape[0])) - - def _prepare_X_y(self, X=None, y=None, from_fit: bool = False): - if from_fit and y is None: - raise ValueError("requires y to be passed, but the target y is None.") - - if X is not None: - # Convert to NumPy array if X is Pandas DataFrame - if isinstance(X, pd.DataFrame): - X = X.values - _ensure_no_complex_data(X) - X = ( - X - if any(d in self._get_distributions() for d in NonNumericDistributions) - else as_float_array(X) - ) - - if y is not None: - # Convert to a NumPy array - if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series): - y = y.values - else: - y = np.array(y) - - # Warning in case of y being 2d - if y.ndim > 1: - warnings.warn( - "A column-vector y was passed when a 1d array was expected.", - DataConversionWarning, - ) - - y = y.flatten() - - output = tuple(item for item in [X, y] if item is not None) - return output[0] if len(output) == 1 else output + def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: + X, y = check_X_y( + X, + y, + accept_sparse=False, + accept_large_sparse=False, + dtype=( + None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" + ), + force_all_finite=True, + estimator=self, + ) + check_classification_targets(y) + return X, y - def _prepare_parameters(self) -> None: + def _init_parameters(self) -> None: self.class_prior_: np.ndarray # Set priors if not specified @@ -243,18 +205,12 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: _check_n_features(self, X=X, reset=True) _check_feature_names(self, X=X, reset=True) - X, y = self._prepare_X_y(X, y, from_fit=True) + X, y = self._check_X_y(X, y) - self.classes_: np.ndarray - self.class_count_: np.ndarray - self.classes_, y_, self.class_count_ = np.unique( - y, return_counts=True, return_inverse=True - ) # Unique class labels, their indices, and class counts - self.n_classes_: int = len(self.classes_) # Number of classes + self.classes_, y_, self.class_count_ = np.unique(y, return_counts=True, return_inverse=True) + self.n_classes_: int = len(self.classes_) - self._check_inputs(X, y) - y = y_ - self._prepare_parameters() + self._init_parameters() self.epsilon_ = 0.0 if np.all(np.isreal(X)): @@ -263,7 +219,7 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: self.likelihood_params_: dict[int, list[DistMixin]] = { c: [ get_dist_class(self.distributions_[i]).from_data( - X[y == c, i], alpha=self.alpha, epsilon=self.epsilon_ + X[y_ == c, i], alpha=self.alpha, epsilon=self.epsilon_ ) for i in range(self.n_features_in_) ] @@ -272,86 +228,11 @@ def fit(self, X: MatrixLike, y: 
ArrayLike) -> Self: return self - def predict(self, X: MatrixLike) -> np.ndarray: - """Performs classification on an array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : ndarray of shape (n_samples,) - Predicted target values for X. - """ - p_hat = self.predict_log_proba(X) - return self.classes_[np.argmax(p_hat, axis=1)] - - def predict_log_proba(self, X: MatrixLike) -> np.ndarray: - """Returns log-probability estimates for the array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the log-probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. - """ - # Check is fit had been called - check_is_fitted(self) - - # Input validation - X = validate_data( - self, - X, - accept_large_sparse=False, - force_all_finite=True, - dtype=( - None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" - ), - reset=False, - ) - - # Check if the number of input features matches the data seen during fit - if X.shape[1] != self.n_features_in_: - raise ValueError( - "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) - ) - - n_samples = X.shape[0] - X = self._prepare_X_y(X=X) - - log_joint = np.zeros((n_samples, self.n_classes_)) + def _joint_log_likelihood(self, X) -> np.ndarray: + jll = np.zeros((X.shape[0], self.n_classes_)) for c in range(self.n_classes_): - log_joint[:, c] = np.log(self.class_prior_[c]) + np.sum( + jll[:, c] = np.log(self.class_prior_[c]) + np.sum( [np.log(likelihood(X[:, i])) for i, likelihood in enumerate(self.likelihood_params_[c])], axis=0, ) - - log_proba = log_joint - np.transpose( - np.repeat(logsumexp(log_joint, axis=1).reshape(1, -1), self.n_classes_, axis=0) - ) - return log_proba - - def predict_proba(self, X: MatrixLike) -> np.ndarray: - """Returns probability estimates for the array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. 
- """ - return np.exp(self.predict_log_proba(X)) + return jll From d7381810d036ab664d1668d16f736a5dfc26b84e Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:36:34 +0330 Subject: [PATCH 3/6] fix import bug --- wnb/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wnb/_utils.py b/wnb/_utils.py index 27b568f..3fe76b2 100644 --- a/wnb/_utils.py +++ b/wnb/_utils.py @@ -2,7 +2,8 @@ import sklearn from packaging import version -from sklearn.utils import check_array +from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_X_y as _check_X_y __all__ = [ "SKLEARN_V1_6_OR_LATER", @@ -18,7 +19,6 @@ if SKLEARN_V1_6_OR_LATER: from sklearn.utils.validation import _check_feature_names, _check_n_features - from sklearn.utils.validation import check_X_y as _check_X_y from sklearn.utils.validation import validate_data as _validate_data def validate_data(*args, **kwargs): From 1bf3803f78abcd679162efb928e63d82118483a7 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:40:07 +0330 Subject: [PATCH 4/6] add extra validation for compatibility with earlier sklearn versions --- wnb/gnb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wnb/gnb.py b/wnb/gnb.py index 6313324..c6d9be6 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -112,7 +112,7 @@ def _get_distributions(self) -> Sequence[DistributionLike]: return self.distributions or [] def _check_X(self, X) -> np.ndarray: - return validate_data( + X = validate_data( self, X, accept_sparse=False, @@ -123,6 +123,11 @@ def _check_X(self, X) -> np.ndarray: force_all_finite=True, reset=False, ) + if X.shape[1] != self.n_features_in_: + raise ValueError( + "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) + ) + return X def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: X, y = check_X_y( From 00b36a19204dc6db149e239170571e2f7352909d Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 05:52:46 +0330 Subject: [PATCH 5/6] major core refactor in gaussian wnb classifier --- tests/test_gwnb.py | 23 ++++ wnb/gnb.py | 10 +- wnb/gwnb.py | 303 +++++++++++++-------------------------------- 3 files changed, 111 insertions(+), 225 deletions(-) diff --git a/tests/test_gwnb.py b/tests/test_gwnb.py index d4ab622..6c09b4f 100644 --- a/tests/test_gwnb.py +++ b/tests/test_gwnb.py @@ -1,6 +1,7 @@ import re import numpy as np +import pandas as pd import pytest from sklearn.base import is_classifier from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal @@ -194,3 +195,25 @@ def test_gwnb_no_cost_hist(): clf = GaussianWNB(max_iter=10) clf.fit(X, y) assert clf.cost_hist_ is None + + +def test_gwnb_attrs(): + """ + Test whether the attributes are properly set. 
+ """ + clf = GaussianWNB().fit(X, y) + assert np.array_equal(clf.class_count_, np.array([3, 3])) + assert np.array_equal(clf.class_prior_, np.array([0.5, 0.5])) + assert np.array_equal(clf.classes_, np.array([1, 2])) + assert clf.n_classes_ == 2 + assert clf.n_features_in_ == 2 + assert not hasattr(clf, "feature_names_in_") + assert np.array_equal(clf.error_weights_, np.array([[0, 1], [-1, 0]])) + assert clf.theta_.shape == (2, 2) + assert clf.std_.shape == (2, 2) + assert clf.var_.shape == (2, 2) + assert clf.coef_.shape == (2,) + + feature_names = [f"x{i}" for i in range(X.shape[1])] + clf = GaussianWNB().fit(pd.DataFrame(X, columns=feature_names), y) + assert np.array_equal(clf.feature_names_in_, np.array(feature_names)) diff --git a/wnb/gnb.py b/wnb/gnb.py index c6d9be6..c992699 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -145,14 +145,10 @@ def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: return X, y def _init_parameters(self) -> None: - self.class_prior_: np.ndarray - # Set priors if not specified if self.priors is None: - self.class_prior_ = ( - self.class_count_ / self.class_count_.sum() - ) # Calculate empirical prior probabilities - + # Calculate empirical prior probabilities + self.class_prior_ = self.class_count_ / self.class_count_.sum() else: # Check that the provided priors match the number of classes if len(self.priors) != self.n_classes_: @@ -213,7 +209,7 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: X, y = self._check_X_y(X, y) self.classes_, y_, self.class_count_ = np.unique(y, return_counts=True, return_inverse=True) - self.n_classes_: int = len(self.classes_) + self.n_classes_ = len(self.classes_) self._init_parameters() diff --git a/wnb/gwnb.py b/wnb/gwnb.py index 7ed510d..8ea8d78 100644 --- a/wnb/gwnb.py +++ b/wnb/gwnb.py @@ -2,19 +2,12 @@ import numbers import sys -import warnings -from abc import ABCMeta from typing import Optional import numpy as np -import pandas as pd -from scipy.special import logsumexp from scipy.stats import norm -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.exceptions import DataConversionWarning -from sklearn.utils import as_float_array +from sklearn.naive_bayes import _BaseNB from sklearn.utils.multiclass import check_classification_targets, type_of_target -from sklearn.utils.validation import _ensure_no_complex_data, check_is_fitted if sys.version_info >= (3, 11): from typing import Self @@ -25,6 +18,7 @@ SKLEARN_V1_6_OR_LATER, _check_feature_names, _check_n_features, + check_X_y, validate_data, ) from .typing import ArrayLike, Float, Int, MatrixLike @@ -32,8 +26,8 @@ __all__ = ["GaussianWNB"] -class GaussianWNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): - """Binary Gaussian Minimum Log-likelihood Difference Weighted Naive Bayes (MLD-WNB) classifier +class GaussianWNB(_BaseNB): + """Binary Gaussian Minimum Log-likelihood Difference Weighted Naive Bayes (MLD-WNB) classifier. Parameters ---------- @@ -134,22 +128,7 @@ def __sklearn_tags__(self): def _more_tags(self) -> dict[str, bool]: return {"binary_only": True, "requires_y": True} - def _check_inputs(self, X, y) -> None: - # Check if the targets are suitable for classification - check_classification_targets(y) - - # Check that the dataset has only two unique labels - if (y_type := type_of_target(y)) != "binary": - if SKLEARN_V1_6_OR_LATER: - msg = f"Only binary classification is supported. The type of the target is {y_type}." 
- else: - msg = "Unknown label type: non-binary" - raise ValueError(msg) - - # Check if only one class is present in label vector - if self.n_classes_ == 1: - raise ValueError("Classifier can't train when only one class is present.") - + def _check_X(self, X) -> np.ndarray: X = validate_data( self, X, @@ -157,16 +136,43 @@ def _check_inputs(self, X, y) -> None: accept_large_sparse=False, dtype="numeric", force_all_finite=True, - ensure_2d=True, - ensure_min_samples=1, - ensure_min_features=1, + reset=False, + ) + if X.shape[1] != self.n_features_in_: + raise ValueError( + "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) + ) + return X + + def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: + X, y = check_X_y( + X, + y, + accept_sparse=False, + accept_large_sparse=False, + dtype="numeric", + force_all_finite=True, + estimator=self, ) + check_classification_targets(y) - # Check that the number of samples and labels are compatible - if self.__n_samples != y.shape[0]: - raise ValueError("X.shape[0]=%d and y.shape[0]=%d are incompatible." % (X.shape[0], y.shape[0])) + if np.unique(y).shape[0] == 1: + raise ValueError("Classifier can't train when only one class is present") - if self.priors is not None: + if (y_type := type_of_target(y)) != "binary": + if SKLEARN_V1_6_OR_LATER: + msg = f"Only binary classification is supported. The type of the target is {y_type}." + else: + msg = "Unknown label type: non-binary" + raise ValueError(msg) + + return X, y + + def _init_parameters(self) -> None: + if self.priors is None: + # Calculate empirical prior probabilities + self.class_prior_ = self.class_count_ / self.class_count_.sum() + else: # Check that the provided priors match the number of classes if len(self.priors) != self.n_classes_: raise ValueError("Number of priors must match the number of classes.") @@ -177,16 +183,26 @@ def _check_inputs(self, X, y) -> None: if (self.priors < 0).any(): raise ValueError("Priors must be non-negative.") - if self.error_weights is not None: + self.class_prior_ = self.priors + + # Convert to NumPy array if input priors is in a list/tuple/set + if isinstance(self.class_prior_, (list, tuple, set)): + self.class_prior_ = np.array(list(self.class_prior_)) + + if self.error_weights is None: + # Assign equal weight to the errors of both classes + self.error_weights_ = np.array([[0, 1], [-1, 0]]) + else: # Check that the size of error weights matrix matches number of classes if self.error_weights.shape != (self.n_classes_, self.n_classes_): raise ValueError( "The shape of error weights matrix does not match the number of classes, " "must be (n_classes, n_classes)." ) + self.error_weights_ = self.error_weights # Check that the regularization type is either 'l1' or 'l2' - if self.penalty not in ["l1", "l2"]: + if self.penalty not in ("l1", "l2"): raise ValueError("Regularization type must be either 'l1' or 'l2'.") # Check that the regularization parameter is a positive integer @@ -199,62 +215,6 @@ def _check_inputs(self, X, y) -> None: "Maximum number of iteration must be a positive integer; got (max_iter=%r)." 
% self.max_iter ) - def _prepare_X_y(self, X=None, y=None, from_fit: bool = False): - if from_fit and y is None: - raise ValueError("requires y to be passed, but the target y is None.") - - if X is not None: - # Convert to NumPy array if X is Pandas DataFrame - if isinstance(X, pd.DataFrame): - X = X.values - _ensure_no_complex_data(X) - X = as_float_array(X) - - if y is not None: - # Convert to a NumPy array - if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series): - y = y.values - else: - y = np.array(y) - - # Warning in case of y being 2d - if y.ndim > 1: - warnings.warn( - "A column-vector y was passed when a 1d array was expected.", - DataConversionWarning, - ) - - y = y.flatten() - - output = tuple(item for item in [X, y] if item is not None) - return output[0] if len(output) == 1 else output - - def _prepare_parameters(self, X, y) -> None: - # Calculate mean and standard deviation of features for each class - for c in range(self.n_classes_): - self.theta_[:, c] = np.mean(X[y == c, :], axis=0) # Calculate mean of features for class c - self.std_[:, c] = np.std(X[y == c, :], axis=0) # Calculate std of features for class c - self.var_ = np.square(self.std_) # Calculate variance of features using std - - self.class_prior_: np.ndarray - # Update if no priors is provided - if self.priors is None: - self.class_prior_ = ( - self.class_count_ / self.class_count_.sum() - ) # Calculate empirical prior probabilities - else: - self.class_prior_ = self.priors - - # Convert to NumPy array if input priors is in a list/tuple/set - if isinstance(self.class_prior_, (list, tuple, set)): - self.class_prior_ = np.array(list(self.class_prior_)) - - # Update if no error weights is provided - if self.error_weights is None: - self.error_weights_ = np.array([[0, 1], [-1, 0]]) - else: - self.error_weights_ = self.error_weights - def fit(self, X: MatrixLike, y: ArrayLike) -> Self: """Fits Gaussian Binary MLD-WNB classifier according to X, y. 
@@ -277,56 +237,33 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: _check_n_features(self, X=X, reset=True) _check_feature_names(self, X=X, reset=True) - X, y = self._prepare_X_y(X, y, from_fit=True) - - self.classes_: np.ndarray - self.class_count_: np.ndarray - self.classes_, y_, self.class_count_ = np.unique( - y, return_counts=True, return_inverse=True - ) # Unique class labels, their indices, and class counts - self.n_classes_: int = len(self.classes_) # Number of classes - - self.__n_samples = X.shape[0] # Number of samples (for internal use) - - self._check_inputs(X, y) - y = y_ - - self.theta_: np.ndarray = np.zeros( - (self.n_features_in_, self.n_classes_) - ) # Mean of each feature per class (n_features x n_classes) - self.std_: np.ndarray = np.zeros( - (self.n_features_in_, self.n_classes_) - ) # Standard deviation of each feature per class (n_features x n_classes) - self.var_: np.ndarray = np.zeros( - (self.n_features_in_, self.n_classes_) - ) # Variance of each feature per class (n_features x n_classes) - self.coef_: np.ndarray = np.ones((self.n_features_in_,)) # WNB coefficients (n_features x 1) - self.cost_hist_: np.ndarray = np.array( - [np.nan for _ in range(self.max_iter)] - ) # Cost value in each iteration - - self._prepare_parameters(X, y) - - # Learn the weights using gradient descent - self.n_iter_: int = 0 - for self.n_iter_ in range(self.max_iter): - # Predict on X - y_hat = self._predict(X) + X, y = self._check_X_y(X, y) - # Calculate cost - self.cost_hist_[self.n_iter_], _lambda = self._calculate_cost(X, y, y_hat, self.learning_hist) + self.classes_, y_, self.class_count_ = np.unique(y, return_counts=True, return_inverse=True) + self.n_classes_ = len(self.classes_) - # Calculate gradients (most time-consuming) - _grad = self._calculate_grad(X, _lambda) + self._init_parameters() - # Add regularization + self.theta_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_)) + self.std_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_)) + self.var_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_)) + for c in range(self.n_classes_): + self.theta_[:, c] = np.mean(X[y_ == c, :], axis=0) + self.std_[:, c] = np.std(X[y_ == c, :], axis=0) + self.var_ = np.square(self.std_) + + self.n_iter_: int = 0 + self.coef_: np.ndarray = np.ones((self.n_features_in_,)) + self.cost_hist_: np.ndarray = np.array([np.nan for _ in range(self.max_iter)]) + for self.n_iter_ in range(self.max_iter): + y_hat = self._predict(X) + self.cost_hist_[self.n_iter_], _lambda = self._calculate_cost(X, y_, y_hat, self.learning_hist) + grad = self._calculate_grad(X, _lambda) if self.penalty == "l1": - _grad += self.C * np.sign(self.coef_) + grad += self.C * np.sign(self.coef_) elif self.penalty == "l2": - _grad += 2 * self.C * self.coef_ - - # Update weights - self.coef_ = self.coef_ - self.step_size * _grad + grad += 2 * self.C * self.coef_ + self.coef_ = self.coef_ - self.step_size * grad self.n_iter_ += 1 self.cost_hist_ = None if not self.learning_hist else self.cost_hist_ @@ -334,12 +271,11 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: return self def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list[Float]]: - _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(self.__n_samples)] + _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(X.shape[0])] if learning_hist: - # Calculate cost _cost = 0.0 - for i in range(self.__n_samples): + for i in range(X.shape[0]): _sum = np.log(self.class_prior_[1] / 
self.class_prior_[0]) x = X[i, :] for j in range(self.n_features_in_): @@ -356,22 +292,22 @@ def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray: _grad = np.repeat( np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1), - self.__n_samples, + X.shape[0], axis=0, ) _grad += ( 0.5 * ( - (X - np.repeat(self.theta_[:, 0].reshape(1, -1), self.__n_samples, axis=0)) - / (np.repeat(self.std_[:, 0].reshape(1, -1), self.__n_samples, axis=0)) + (X - np.repeat(self.theta_[:, 0].reshape(1, -1), X.shape[0], axis=0)) + / (np.repeat(self.std_[:, 0].reshape(1, -1), X.shape[0], axis=0)) ) ** 2 ) _grad -= ( 0.5 * ( - (X - np.repeat(self.theta_[:, 1].reshape(1, -1), self.__n_samples, axis=0)) - / (np.repeat(self.std_[:, 1].reshape(1, -1), self.__n_samples, axis=0)) + (X - np.repeat(self.theta_[:, 1].reshape(1, -1), X.shape[0], axis=0)) + / (np.repeat(self.std_[:, 1].reshape(1, -1), X.shape[0], axis=0)) ) ** 2 ) @@ -381,58 +317,11 @@ def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray: return _grad def _predict(self, X: MatrixLike) -> np.ndarray: - p_hat = self.predict_log_proba(X) - return np.argmax(p_hat, axis=1) - - def predict(self, X: MatrixLike) -> np.ndarray: - """Performs classification on an array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : ndarray of shape (n_samples,) - Predicted target values for X. - """ - p_hat = self.predict_log_proba(X) - y_hat = self.classes_[np.argmax(p_hat, axis=1)] - return y_hat - - def predict_log_proba(self, X: MatrixLike) -> np.ndarray: - """Returns log-probability estimates for the array of test vectors X. + jll = self._joint_log_likelihood(X) + return np.argmax(jll, axis=1) - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the log-probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. - """ - # Check is fit had been called - check_is_fitted(self) - - # Input validation - X = validate_data(self, X, accept_large_sparse=False, force_all_finite=True, reset=False) - - # Check if the number of input features matches the data seen during fit - if X.shape[1] != self.n_features_in_: - raise ValueError( - "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) - ) - - n_samples = X.shape[0] - - X = self._prepare_X_y(X=X) - - log_priors = np.tile(np.log(self.class_prior_), (n_samples, 1)) + def _joint_log_likelihood(self, X) -> np.ndarray: + log_priors = np.tile(np.log(self.class_prior_), (X.shape[0], 1)) w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_)) term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * self.std_))) var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(self.std_, self.std_)) @@ -442,26 +331,4 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray: - 2.0 * np.matmul(X, mu_by_var) + np.sum(self.theta_.conj() * mu_by_var, axis=0) ) - log_proba = log_priors + term1 + term2 - - log_proba -= np.transpose( - np.repeat(logsumexp(log_proba, axis=1).reshape(1, -1), self.n_classes_, axis=0) - ) - return log_proba - - def predict_proba(self, X: MatrixLike) -> np.ndarray: - """Returns probability estimates for the array of test vectors X. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. - """ - return np.exp(self.predict_log_proba(X)) + return log_priors + term1 + term2 From 5710ea9a6bcbf02cb0c41dea93fffe0741c921be Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 06:12:14 +0330 Subject: [PATCH 6/6] bump version -> v0.6.0 --- README.md | 3 ++- uv.lock | 2 +- wnb/__init__.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 45cb00e..f8bc0cc 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@
 
-![Lastest Release](https://img.shields.io/badge/release-v0.5.1-green)
+![Latest Release](https://img.shields.io/badge/release-v0.6.0-green)
 [![PyPI Version](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
 ![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
![GitHub Workflow Status (build)](https://github.com/msamsami/wnb/actions/workflows/build.yml/badge.svg) @@ -102,6 +102,7 @@ Both Scikit-learn classifiers and WNB classifiers share these well-known methods - `predict(X)` - `predict_proba(X)` - `predict_log_proba(X)` +- `predict_joint_log_proba(X)` - `score(X, y)` - `get_params()` - `set_params(**params)` diff --git a/uv.lock b/uv.lock index c9eb14f..0041729 100644 --- a/uv.lock +++ b/uv.lock @@ -1008,7 +1008,7 @@ wheels = [ [[package]] name = "wnb" -version = "0.5.1" +version = "0.6.0" source = { editable = "." } dependencies = [ { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, diff --git a/wnb/__init__.py b/wnb/__init__.py index 3523c4e..54f9482 100644 --- a/wnb/__init__.py +++ b/wnb/__init__.py @@ -2,7 +2,7 @@ Python library for the implementations of general and weighted naive Bayes (WNB) classifiers. """ -__version__ = "0.5.1" +__version__ = "0.6.0" __author__ = "Mehdi Samsami"
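
The refactors in patches 2 and 5 make `GeneralNB` and `GaussianWNB` inherit from scikit-learn's `_BaseNB`, so `predict`, `predict_proba`, `predict_log_proba`, and the `predict_joint_log_proba` entry added to the README's method list are all driven by each class's `_joint_log_likelihood`. The sketch below is not part of the patch series; it is a minimal usage example assuming wnb 0.6.0 (these patches applied), a recent scikit-learn (1.2+ for `predict_joint_log_proba`), and numpy/pandas/scipy, with a toy dataset and feature names invented purely for illustration.

```python
# Minimal usage sketch, assuming wnb 0.6.0 (these patches applied) plus
# numpy, pandas, scipy, and scikit-learn >= 1.2. The toy data and feature
# names below are invented for this example.
import numpy as np
import pandas as pd
from scipy.special import logsumexp

from wnb import GaussianWNB, GeneralNB

# Small two-class dataset so that the binary-only GaussianWNB also applies.
X = pd.DataFrame(
    {
        "x0": [1.0, 1.2, 0.9, 3.1, 3.0, 2.8],
        "x1": [0.2, 0.1, 0.3, 1.9, 2.1, 2.0],
    }
)
y = np.array([1, 1, 1, 2, 2, 2])

# With no `distributions` argument, GeneralNB falls back to normal
# likelihoods for every feature (see test_gnb_attrs in patch 2).
gnb = GeneralNB().fit(X, y)
print(gnb.classes_, gnb.class_prior_, gnb.n_features_in_, gnb.feature_names_in_)

# After the refactor, prediction goes through _BaseNB: predict_log_proba is
# the joint log likelihood normalized with logsumexp over the classes.
jll = gnb.predict_joint_log_proba(X)
assert np.allclose(
    gnb.predict_log_proba(X), jll - logsumexp(jll, axis=1, keepdims=True)
)
print(gnb.predict(X))

# The MLD-WNB classifier refactored in patch 5 exposes the same interface.
gwnb = GaussianWNB(max_iter=10).fit(X, y)
print(gwnb.coef_.shape, gwnb.predict(X))
```

Routing prediction through `_joint_log_likelihood` is what lets both classifiers drop their hand-rolled `predict`, `predict_proba`, and `predict_log_proba` methods in patches 2 and 5 while keeping the same public behavior.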