From 187a31e9fcc4c7d317b5a7481b7202d0337bf631 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:30:40 +0330 Subject: [PATCH 1/6] remove test section from readme, remove manifest file --- MANIFEST.in | 4 ---- README.md | 12 ------------ 2 files changed, 16 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index dca749c..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,4 +0,0 @@ -include README.md -include LICENSE -recursive-include wnb *.py -recursive-include tests *.py diff --git a/README.md b/README.md index ba484e7..45cb00e 100644 --- a/README.md +++ b/README.md @@ -123,18 +123,6 @@ These benchmarks highlight the potential of WNB classifiers to provide better pe The scripts used to generate these benchmark results are available in the _tests/benchmarks/_ directory. -## Tests -To run the tests, make sure to clone the repository and install the development requirements in addition to base requirements: -```bash -pip install -r requirements.txt -pip install -r requirements-dev.txt -``` - -Then, run pytest: -```bash -pytest -``` - ## Support us 💡 You can support the project in the following ways: From ac9302f905c6e8f62414642823c1749908ac44db Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:32:02 +0330 Subject: [PATCH 2/6] major internal refactor in general naive bayes class --- tests/test_gnb.py | 21 ++++++ wnb/_utils.py | 10 +++ wnb/gnb.py | 179 ++++++++-------------------------------------- 3 files changed, 61 insertions(+), 149 deletions(-) diff --git a/tests/test_gnb.py b/tests/test_gnb.py index d6f2801..03dda03 100644 --- a/tests/test_gnb.py +++ b/tests/test_gnb.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from sklearn.base import is_classifier from sklearn.naive_bayes import BernoulliNB, CategoricalNB, GaussianNB @@ -281,3 +282,23 @@ def test_gnb_var_smoothing_non_numeric(): clf = GeneralNB(distributions=[D.CATEGORICAL, D.CATEGORICAL], var_smoothing=1e-6) clf.fit(X, y) assert clf.epsilon_ == 0 + + +def test_gnb_attrs(): + """ + Test whether the attributes are properly set. 
+ """ + clf = GeneralNB().fit(X, y) + assert np.array_equal(clf.class_count_, np.array([3, 3])) + assert np.array_equal(clf.class_prior_, np.array([0.5, 0.5])) + assert np.array_equal(clf.classes_, np.array([1, 2])) + assert clf.n_classes_ == 2 + assert clf.epsilon_ > 0 + assert clf.n_features_in_ == 2 + assert not hasattr(clf, "feature_names_in_") + assert clf.distributions_ == [D.NORMAL, D.NORMAL] + assert len(clf.likelihood_params_) == 2 + + feature_names = [f"x{i}" for i in range(X.shape[1])] + clf = GeneralNB().fit(pd.DataFrame(X, columns=feature_names), y) + assert np.array_equal(clf.feature_names_in_, np.array(feature_names)) diff --git a/wnb/_utils.py b/wnb/_utils.py index 0686d0d..27b568f 100644 --- a/wnb/_utils.py +++ b/wnb/_utils.py @@ -7,6 +7,7 @@ __all__ = [ "SKLEARN_V1_6_OR_LATER", "validate_data", + "check_X_y", "_check_n_features", "_check_feature_names", ] @@ -17,6 +18,7 @@ if SKLEARN_V1_6_OR_LATER: from sklearn.utils.validation import _check_feature_names, _check_n_features + from sklearn.utils.validation import check_X_y as _check_X_y from sklearn.utils.validation import validate_data as _validate_data def validate_data(*args, **kwargs): @@ -24,12 +26,20 @@ def validate_data(*args, **kwargs): kwargs["ensure_all_finite"] = kwargs.pop("force_all_finite") return _validate_data(*args, **kwargs) + def check_X_y(*args, **kwargs): + if kwargs.get("force_all_finite"): + kwargs["ensure_all_finite"] = kwargs.pop("force_all_finite") + return _check_X_y(*args, **kwargs) + else: def validate_data(estimator, X, **kwargs: Any): kwargs.pop("reset", None) return check_array(X, estimator=estimator, **kwargs) + def check_X_y(*args, **kwargs): + return _check_X_y(*args, **kwargs) + def _check_n_features(estimator, X, reset): return estimator._check_n_features(X, reset=reset) diff --git a/wnb/gnb.py b/wnb/gnb.py index 5a0b2fc..6313324 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -1,18 +1,11 @@ from __future__ import annotations import sys -import warnings -from abc import ABCMeta from typing import Optional, Sequence import numpy as np -import pandas as pd -from scipy.special import logsumexp -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.exceptions import DataConversionWarning -from sklearn.utils import as_float_array +from sklearn.naive_bayes import _BaseNB from sklearn.utils.multiclass import check_classification_targets -from sklearn.utils.validation import _ensure_no_complex_data, check_is_fitted if sys.version_info >= (3, 11): from typing import Self @@ -28,6 +21,7 @@ SKLEARN_V1_6_OR_LATER, _check_feature_names, _check_n_features, + check_X_y, validate_data, ) from .typing import ArrayLike, Float, MatrixLike @@ -35,7 +29,7 @@ __all__ = ["GeneralNB"] -class GeneralNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): +class GeneralNB(_BaseNB): """A General Naive Bayes classifier that supports distinct likelihood distributions for individual features, enabling more tailored modeling beyond the standard single-distribution approaches such as GaussianNB and BernoulliNB. 
@@ -117,15 +111,8 @@ def _get_distributions(self) -> Sequence[DistributionLike]: except Exception: return self.distributions or [] - def _check_inputs(self, X, y) -> None: - # Check if the targets are suitable for classification - check_classification_targets(y) - - # Check if only one class is present in label vector - if self.n_classes_ == 1: - raise ValueError("Classifier can't train when only one class is present") - - X = validate_data( + def _check_X(self, X) -> np.ndarray: + return validate_data( self, X, accept_sparse=False, @@ -134,50 +121,25 @@ def _check_inputs(self, X, y) -> None: None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" ), force_all_finite=True, - ensure_2d=True, - ensure_min_samples=1, - ensure_min_features=1, + reset=False, ) - # Check that the number of samples and labels are compatible - if X.shape[0] != y.shape[0]: - raise ValueError("X.shape[0]=%d and y.shape[0]=%d are incompatible." % (X.shape[0], y.shape[0])) - - def _prepare_X_y(self, X=None, y=None, from_fit: bool = False): - if from_fit and y is None: - raise ValueError("requires y to be passed, but the target y is None.") - - if X is not None: - # Convert to NumPy array if X is Pandas DataFrame - if isinstance(X, pd.DataFrame): - X = X.values - _ensure_no_complex_data(X) - X = ( - X - if any(d in self._get_distributions() for d in NonNumericDistributions) - else as_float_array(X) - ) - - if y is not None: - # Convert to a NumPy array - if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series): - y = y.values - else: - y = np.array(y) - - # Warning in case of y being 2d - if y.ndim > 1: - warnings.warn( - "A column-vector y was passed when a 1d array was expected.", - DataConversionWarning, - ) - - y = y.flatten() - - output = tuple(item for item in [X, y] if item is not None) - return output[0] if len(output) == 1 else output + def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: + X, y = check_X_y( + X, + y, + accept_sparse=False, + accept_large_sparse=False, + dtype=( + None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" + ), + force_all_finite=True, + estimator=self, + ) + check_classification_targets(y) + return X, y - def _prepare_parameters(self) -> None: + def _init_parameters(self) -> None: self.class_prior_: np.ndarray # Set priors if not specified @@ -243,18 +205,12 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: _check_n_features(self, X=X, reset=True) _check_feature_names(self, X=X, reset=True) - X, y = self._prepare_X_y(X, y, from_fit=True) + X, y = self._check_X_y(X, y) - self.classes_: np.ndarray - self.class_count_: np.ndarray - self.classes_, y_, self.class_count_ = np.unique( - y, return_counts=True, return_inverse=True - ) # Unique class labels, their indices, and class counts - self.n_classes_: int = len(self.classes_) # Number of classes + self.classes_, y_, self.class_count_ = np.unique(y, return_counts=True, return_inverse=True) + self.n_classes_: int = len(self.classes_) - self._check_inputs(X, y) - y = y_ - self._prepare_parameters() + self._init_parameters() self.epsilon_ = 0.0 if np.all(np.isreal(X)): @@ -263,7 +219,7 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: self.likelihood_params_: dict[int, list[DistMixin]] = { c: [ get_dist_class(self.distributions_[i]).from_data( - X[y == c, i], alpha=self.alpha, epsilon=self.epsilon_ + X[y_ == c, i], alpha=self.alpha, epsilon=self.epsilon_ ) for i in range(self.n_features_in_) ] @@ -272,86 +228,11 @@ def fit(self, X: MatrixLike, y: 
ArrayLike) -> Self: return self - def predict(self, X: MatrixLike) -> np.ndarray: - """Performs classification on an array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : ndarray of shape (n_samples,) - Predicted target values for X. - """ - p_hat = self.predict_log_proba(X) - return self.classes_[np.argmax(p_hat, axis=1)] - - def predict_log_proba(self, X: MatrixLike) -> np.ndarray: - """Returns log-probability estimates for the array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the log-probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. - """ - # Check is fit had been called - check_is_fitted(self) - - # Input validation - X = validate_data( - self, - X, - accept_large_sparse=False, - force_all_finite=True, - dtype=( - None if any(d in self._get_distributions() for d in NonNumericDistributions) else "numeric" - ), - reset=False, - ) - - # Check if the number of input features matches the data seen during fit - if X.shape[1] != self.n_features_in_: - raise ValueError( - "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) - ) - - n_samples = X.shape[0] - X = self._prepare_X_y(X=X) - - log_joint = np.zeros((n_samples, self.n_classes_)) + def _joint_log_likelihood(self, X) -> np.ndarray: + jll = np.zeros((X.shape[0], self.n_classes_)) for c in range(self.n_classes_): - log_joint[:, c] = np.log(self.class_prior_[c]) + np.sum( + jll[:, c] = np.log(self.class_prior_[c]) + np.sum( [np.log(likelihood(X[:, i])) for i, likelihood in enumerate(self.likelihood_params_[c])], axis=0, ) - - log_proba = log_joint - np.transpose( - np.repeat(logsumexp(log_joint, axis=1).reshape(1, -1), self.n_classes_, axis=0) - ) - return log_proba - - def predict_proba(self, X: MatrixLike) -> np.ndarray: - """Returns probability estimates for the array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. 
- """ - return np.exp(self.predict_log_proba(X)) + return jll From d7381810d036ab664d1668d16f736a5dfc26b84e Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:36:34 +0330 Subject: [PATCH 3/6] fix import bug --- wnb/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wnb/_utils.py b/wnb/_utils.py index 27b568f..3fe76b2 100644 --- a/wnb/_utils.py +++ b/wnb/_utils.py @@ -2,7 +2,8 @@ import sklearn from packaging import version -from sklearn.utils import check_array +from sklearn.utils.validation import check_array +from sklearn.utils.validation import check_X_y as _check_X_y __all__ = [ "SKLEARN_V1_6_OR_LATER", @@ -18,7 +19,6 @@ if SKLEARN_V1_6_OR_LATER: from sklearn.utils.validation import _check_feature_names, _check_n_features - from sklearn.utils.validation import check_X_y as _check_X_y from sklearn.utils.validation import validate_data as _validate_data def validate_data(*args, **kwargs): From 1bf3803f78abcd679162efb928e63d82118483a7 Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 04:40:07 +0330 Subject: [PATCH 4/6] add extra validation for compatibility with earlier sklearn versions --- wnb/gnb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/wnb/gnb.py b/wnb/gnb.py index 6313324..c6d9be6 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -112,7 +112,7 @@ def _get_distributions(self) -> Sequence[DistributionLike]: return self.distributions or [] def _check_X(self, X) -> np.ndarray: - return validate_data( + X = validate_data( self, X, accept_sparse=False, @@ -123,6 +123,11 @@ def _check_X(self, X) -> np.ndarray: force_all_finite=True, reset=False, ) + if X.shape[1] != self.n_features_in_: + raise ValueError( + "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) + ) + return X def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: X, y = check_X_y( From 00b36a19204dc6db149e239170571e2f7352909d Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 05:52:46 +0330 Subject: [PATCH 5/6] major core refactor in gaussian wnb classifier --- tests/test_gwnb.py | 23 ++++ wnb/gnb.py | 10 +- wnb/gwnb.py | 303 +++++++++++++-------------------------------- 3 files changed, 111 insertions(+), 225 deletions(-) diff --git a/tests/test_gwnb.py b/tests/test_gwnb.py index d4ab622..6c09b4f 100644 --- a/tests/test_gwnb.py +++ b/tests/test_gwnb.py @@ -1,6 +1,7 @@ import re import numpy as np +import pandas as pd import pytest from sklearn.base import is_classifier from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal @@ -194,3 +195,25 @@ def test_gwnb_no_cost_hist(): clf = GaussianWNB(max_iter=10) clf.fit(X, y) assert clf.cost_hist_ is None + + +def test_gwnb_attrs(): + """ + Test whether the attributes are properly set. 
+ """ + clf = GaussianWNB().fit(X, y) + assert np.array_equal(clf.class_count_, np.array([3, 3])) + assert np.array_equal(clf.class_prior_, np.array([0.5, 0.5])) + assert np.array_equal(clf.classes_, np.array([1, 2])) + assert clf.n_classes_ == 2 + assert clf.n_features_in_ == 2 + assert not hasattr(clf, "feature_names_in_") + assert np.array_equal(clf.error_weights_, np.array([[0, 1], [-1, 0]])) + assert clf.theta_.shape == (2, 2) + assert clf.std_.shape == (2, 2) + assert clf.var_.shape == (2, 2) + assert clf.coef_.shape == (2,) + + feature_names = [f"x{i}" for i in range(X.shape[1])] + clf = GaussianWNB().fit(pd.DataFrame(X, columns=feature_names), y) + assert np.array_equal(clf.feature_names_in_, np.array(feature_names)) diff --git a/wnb/gnb.py b/wnb/gnb.py index c6d9be6..c992699 100644 --- a/wnb/gnb.py +++ b/wnb/gnb.py @@ -145,14 +145,10 @@ def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: return X, y def _init_parameters(self) -> None: - self.class_prior_: np.ndarray - # Set priors if not specified if self.priors is None: - self.class_prior_ = ( - self.class_count_ / self.class_count_.sum() - ) # Calculate empirical prior probabilities - + # Calculate empirical prior probabilities + self.class_prior_ = self.class_count_ / self.class_count_.sum() else: # Check that the provided priors match the number of classes if len(self.priors) != self.n_classes_: @@ -213,7 +209,7 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: X, y = self._check_X_y(X, y) self.classes_, y_, self.class_count_ = np.unique(y, return_counts=True, return_inverse=True) - self.n_classes_: int = len(self.classes_) + self.n_classes_ = len(self.classes_) self._init_parameters() diff --git a/wnb/gwnb.py b/wnb/gwnb.py index 7ed510d..8ea8d78 100644 --- a/wnb/gwnb.py +++ b/wnb/gwnb.py @@ -2,19 +2,12 @@ import numbers import sys -import warnings -from abc import ABCMeta from typing import Optional import numpy as np -import pandas as pd -from scipy.special import logsumexp from scipy.stats import norm -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.exceptions import DataConversionWarning -from sklearn.utils import as_float_array +from sklearn.naive_bayes import _BaseNB from sklearn.utils.multiclass import check_classification_targets, type_of_target -from sklearn.utils.validation import _ensure_no_complex_data, check_is_fitted if sys.version_info >= (3, 11): from typing import Self @@ -25,6 +18,7 @@ SKLEARN_V1_6_OR_LATER, _check_feature_names, _check_n_features, + check_X_y, validate_data, ) from .typing import ArrayLike, Float, Int, MatrixLike @@ -32,8 +26,8 @@ __all__ = ["GaussianWNB"] -class GaussianWNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): - """Binary Gaussian Minimum Log-likelihood Difference Weighted Naive Bayes (MLD-WNB) classifier +class GaussianWNB(_BaseNB): + """Binary Gaussian Minimum Log-likelihood Difference Weighted Naive Bayes (MLD-WNB) classifier. Parameters ---------- @@ -134,22 +128,7 @@ def __sklearn_tags__(self): def _more_tags(self) -> dict[str, bool]: return {"binary_only": True, "requires_y": True} - def _check_inputs(self, X, y) -> None: - # Check if the targets are suitable for classification - check_classification_targets(y) - - # Check that the dataset has only two unique labels - if (y_type := type_of_target(y)) != "binary": - if SKLEARN_V1_6_OR_LATER: - msg = f"Only binary classification is supported. The type of the target is {y_type}." 
- else: - msg = "Unknown label type: non-binary" - raise ValueError(msg) - - # Check if only one class is present in label vector - if self.n_classes_ == 1: - raise ValueError("Classifier can't train when only one class is present.") - + def _check_X(self, X) -> np.ndarray: X = validate_data( self, X, @@ -157,16 +136,43 @@ def _check_inputs(self, X, y) -> None: accept_large_sparse=False, dtype="numeric", force_all_finite=True, - ensure_2d=True, - ensure_min_samples=1, - ensure_min_features=1, + reset=False, + ) + if X.shape[1] != self.n_features_in_: + raise ValueError( + "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) + ) + return X + + def _check_X_y(self, X, y) -> tuple[np.ndarray, np.ndarray]: + X, y = check_X_y( + X, + y, + accept_sparse=False, + accept_large_sparse=False, + dtype="numeric", + force_all_finite=True, + estimator=self, ) + check_classification_targets(y) - # Check that the number of samples and labels are compatible - if self.__n_samples != y.shape[0]: - raise ValueError("X.shape[0]=%d and y.shape[0]=%d are incompatible." % (X.shape[0], y.shape[0])) + if np.unique(y).shape[0] == 1: + raise ValueError("Classifier can't train when only one class is present") - if self.priors is not None: + if (y_type := type_of_target(y)) != "binary": + if SKLEARN_V1_6_OR_LATER: + msg = f"Only binary classification is supported. The type of the target is {y_type}." + else: + msg = "Unknown label type: non-binary" + raise ValueError(msg) + + return X, y + + def _init_parameters(self) -> None: + if self.priors is None: + # Calculate empirical prior probabilities + self.class_prior_ = self.class_count_ / self.class_count_.sum() + else: # Check that the provided priors match the number of classes if len(self.priors) != self.n_classes_: raise ValueError("Number of priors must match the number of classes.") @@ -177,16 +183,26 @@ def _check_inputs(self, X, y) -> None: if (self.priors < 0).any(): raise ValueError("Priors must be non-negative.") - if self.error_weights is not None: + self.class_prior_ = self.priors + + # Convert to NumPy array if input priors is in a list/tuple/set + if isinstance(self.class_prior_, (list, tuple, set)): + self.class_prior_ = np.array(list(self.class_prior_)) + + if self.error_weights is None: + # Assign equal weight to the errors of both classes + self.error_weights_ = np.array([[0, 1], [-1, 0]]) + else: # Check that the size of error weights matrix matches number of classes if self.error_weights.shape != (self.n_classes_, self.n_classes_): raise ValueError( "The shape of error weights matrix does not match the number of classes, " "must be (n_classes, n_classes)." ) + self.error_weights_ = self.error_weights # Check that the regularization type is either 'l1' or 'l2' - if self.penalty not in ["l1", "l2"]: + if self.penalty not in ("l1", "l2"): raise ValueError("Regularization type must be either 'l1' or 'l2'.") # Check that the regularization parameter is a positive integer @@ -199,62 +215,6 @@ def _check_inputs(self, X, y) -> None: "Maximum number of iteration must be a positive integer; got (max_iter=%r)." 
% self.max_iter ) - def _prepare_X_y(self, X=None, y=None, from_fit: bool = False): - if from_fit and y is None: - raise ValueError("requires y to be passed, but the target y is None.") - - if X is not None: - # Convert to NumPy array if X is Pandas DataFrame - if isinstance(X, pd.DataFrame): - X = X.values - _ensure_no_complex_data(X) - X = as_float_array(X) - - if y is not None: - # Convert to a NumPy array - if isinstance(y, pd.DataFrame) or isinstance(y, pd.Series): - y = y.values - else: - y = np.array(y) - - # Warning in case of y being 2d - if y.ndim > 1: - warnings.warn( - "A column-vector y was passed when a 1d array was expected.", - DataConversionWarning, - ) - - y = y.flatten() - - output = tuple(item for item in [X, y] if item is not None) - return output[0] if len(output) == 1 else output - - def _prepare_parameters(self, X, y) -> None: - # Calculate mean and standard deviation of features for each class - for c in range(self.n_classes_): - self.theta_[:, c] = np.mean(X[y == c, :], axis=0) # Calculate mean of features for class c - self.std_[:, c] = np.std(X[y == c, :], axis=0) # Calculate std of features for class c - self.var_ = np.square(self.std_) # Calculate variance of features using std - - self.class_prior_: np.ndarray - # Update if no priors is provided - if self.priors is None: - self.class_prior_ = ( - self.class_count_ / self.class_count_.sum() - ) # Calculate empirical prior probabilities - else: - self.class_prior_ = self.priors - - # Convert to NumPy array if input priors is in a list/tuple/set - if isinstance(self.class_prior_, (list, tuple, set)): - self.class_prior_ = np.array(list(self.class_prior_)) - - # Update if no error weights is provided - if self.error_weights is None: - self.error_weights_ = np.array([[0, 1], [-1, 0]]) - else: - self.error_weights_ = self.error_weights - def fit(self, X: MatrixLike, y: ArrayLike) -> Self: """Fits Gaussian Binary MLD-WNB classifier according to X, y. 
@@ -277,56 +237,33 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: _check_n_features(self, X=X, reset=True) _check_feature_names(self, X=X, reset=True) - X, y = self._prepare_X_y(X, y, from_fit=True) - - self.classes_: np.ndarray - self.class_count_: np.ndarray - self.classes_, y_, self.class_count_ = np.unique( - y, return_counts=True, return_inverse=True - ) # Unique class labels, their indices, and class counts - self.n_classes_: int = len(self.classes_) # Number of classes - - self.__n_samples = X.shape[0] # Number of samples (for internal use) - - self._check_inputs(X, y) - y = y_ - - self.theta_: np.ndarray = np.zeros( - (self.n_features_in_, self.n_classes_) - ) # Mean of each feature per class (n_features x n_classes) - self.std_: np.ndarray = np.zeros( - (self.n_features_in_, self.n_classes_) - ) # Standard deviation of each feature per class (n_features x n_classes) - self.var_: np.ndarray = np.zeros( - (self.n_features_in_, self.n_classes_) - ) # Variance of each feature per class (n_features x n_classes) - self.coef_: np.ndarray = np.ones((self.n_features_in_,)) # WNB coefficients (n_features x 1) - self.cost_hist_: np.ndarray = np.array( - [np.nan for _ in range(self.max_iter)] - ) # Cost value in each iteration - - self._prepare_parameters(X, y) - - # Learn the weights using gradient descent - self.n_iter_: int = 0 - for self.n_iter_ in range(self.max_iter): - # Predict on X - y_hat = self._predict(X) + X, y = self._check_X_y(X, y) - # Calculate cost - self.cost_hist_[self.n_iter_], _lambda = self._calculate_cost(X, y, y_hat, self.learning_hist) + self.classes_, y_, self.class_count_ = np.unique(y, return_counts=True, return_inverse=True) + self.n_classes_ = len(self.classes_) - # Calculate gradients (most time-consuming) - _grad = self._calculate_grad(X, _lambda) + self._init_parameters() - # Add regularization + self.theta_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_)) + self.std_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_)) + self.var_: np.ndarray = np.zeros((self.n_features_in_, self.n_classes_)) + for c in range(self.n_classes_): + self.theta_[:, c] = np.mean(X[y_ == c, :], axis=0) + self.std_[:, c] = np.std(X[y_ == c, :], axis=0) + self.var_ = np.square(self.std_) + + self.n_iter_: int = 0 + self.coef_: np.ndarray = np.ones((self.n_features_in_,)) + self.cost_hist_: np.ndarray = np.array([np.nan for _ in range(self.max_iter)]) + for self.n_iter_ in range(self.max_iter): + y_hat = self._predict(X) + self.cost_hist_[self.n_iter_], _lambda = self._calculate_cost(X, y_, y_hat, self.learning_hist) + grad = self._calculate_grad(X, _lambda) if self.penalty == "l1": - _grad += self.C * np.sign(self.coef_) + grad += self.C * np.sign(self.coef_) elif self.penalty == "l2": - _grad += 2 * self.C * self.coef_ - - # Update weights - self.coef_ = self.coef_ - self.step_size * _grad + grad += 2 * self.C * self.coef_ + self.coef_ = self.coef_ - self.step_size * grad self.n_iter_ += 1 self.cost_hist_ = None if not self.learning_hist else self.cost_hist_ @@ -334,12 +271,11 @@ def fit(self, X: MatrixLike, y: ArrayLike) -> Self: return self def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list[Float]]: - _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(self.__n_samples)] + _lambda = [self.error_weights_[y[i], y_hat[i]] for i in range(X.shape[0])] if learning_hist: - # Calculate cost _cost = 0.0 - for i in range(self.__n_samples): + for i in range(X.shape[0]): _sum = np.log(self.class_prior_[1] / 
self.class_prior_[0]) x = X[i, :] for j in range(self.n_features_in_): @@ -356,22 +292,22 @@ def _calculate_cost(self, X, y, y_hat, learning_hist: bool) -> tuple[Float, list def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray: _grad = np.repeat( np.log(self.std_[:, 0] / self.std_[:, 1]).reshape(1, -1), - self.__n_samples, + X.shape[0], axis=0, ) _grad += ( 0.5 * ( - (X - np.repeat(self.theta_[:, 0].reshape(1, -1), self.__n_samples, axis=0)) - / (np.repeat(self.std_[:, 0].reshape(1, -1), self.__n_samples, axis=0)) + (X - np.repeat(self.theta_[:, 0].reshape(1, -1), X.shape[0], axis=0)) + / (np.repeat(self.std_[:, 0].reshape(1, -1), X.shape[0], axis=0)) ) ** 2 ) _grad -= ( 0.5 * ( - (X - np.repeat(self.theta_[:, 1].reshape(1, -1), self.__n_samples, axis=0)) - / (np.repeat(self.std_[:, 1].reshape(1, -1), self.__n_samples, axis=0)) + (X - np.repeat(self.theta_[:, 1].reshape(1, -1), X.shape[0], axis=0)) + / (np.repeat(self.std_[:, 1].reshape(1, -1), X.shape[0], axis=0)) ) ** 2 ) @@ -381,58 +317,11 @@ def _calculate_grad(self, X, _lambda: list[Float]) -> np.ndarray: return _grad def _predict(self, X: MatrixLike) -> np.ndarray: - p_hat = self.predict_log_proba(X) - return np.argmax(p_hat, axis=1) - - def predict(self, X: MatrixLike) -> np.ndarray: - """Performs classification on an array of test vectors X. - - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : ndarray of shape (n_samples,) - Predicted target values for X. - """ - p_hat = self.predict_log_proba(X) - y_hat = self.classes_[np.argmax(p_hat, axis=1)] - return y_hat - - def predict_log_proba(self, X: MatrixLike) -> np.ndarray: - """Returns log-probability estimates for the array of test vectors X. + jll = self._joint_log_likelihood(X) + return np.argmax(jll, axis=1) - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the log-probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. - """ - # Check is fit had been called - check_is_fitted(self) - - # Input validation - X = validate_data(self, X, accept_large_sparse=False, force_all_finite=True, reset=False) - - # Check if the number of input features matches the data seen during fit - if X.shape[1] != self.n_features_in_: - raise ValueError( - "Expected input with %d features, got %d instead." % (self.n_features_in_, X.shape[1]) - ) - - n_samples = X.shape[0] - - X = self._prepare_X_y(X=X) - - log_priors = np.tile(np.log(self.class_prior_), (n_samples, 1)) + def _joint_log_likelihood(self, X) -> np.ndarray: + log_priors = np.tile(np.log(self.class_prior_), (X.shape[0], 1)) w_reshaped = np.tile(self.coef_.reshape(-1, 1), (1, self.n_classes_)) term1 = np.sum(np.multiply(w_reshaped, -np.log(np.sqrt(2 * np.pi) * self.std_))) var_inv = np.multiply(w_reshaped, 1.0 / np.multiply(self.std_, self.std_)) @@ -442,26 +331,4 @@ def predict_log_proba(self, X: MatrixLike) -> np.ndarray: - 2.0 * np.matmul(X, mu_by_var) + np.sum(self.theta_.conj() * mu_by_var, axis=0) ) - log_proba = log_priors + term1 + term2 - - log_proba -= np.transpose( - np.repeat(logsumexp(log_proba, axis=1).reshape(1, -1), self.n_classes_, axis=0) - ) - return log_proba - - def predict_proba(self, X: MatrixLike) -> np.ndarray: - """Returns probability estimates for the array of test vectors X. 
- - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The input samples. - - Returns - ------- - C : array-like of shape (n_samples, n_classes) - Returns the probability of the samples for each class in - the model. The columns correspond to the classes in sorted - order, as they appear in the attribute :term:`classes_`. - """ - return np.exp(self.predict_log_proba(X)) + return log_priors + term1 + term2 From 5710ea9a6bcbf02cb0c41dea93fffe0741c921be Mon Sep 17 00:00:00 2001 From: Mehdi Samsami Date: Sun, 19 Jan 2025 06:12:14 +0330 Subject: [PATCH 6/6] bump version -> v0.6.0 --- README.md | 3 ++- uv.lock | 2 +- wnb/__init__.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 45cb00e..f8bc0cc 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@
 
-![Lastest Release](https://img.shields.io/badge/release-v0.5.1-green)
+![Latest Release](https://img.shields.io/badge/release-v0.6.0-green)
 [![PyPI Version](https://img.shields.io/pypi/v/wnb)](https://pypi.org/project/wnb/)
 ![Python Versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue)
![GitHub Workflow Status (build)](https://github.com/msamsami/wnb/actions/workflows/build.yml/badge.svg) @@ -102,6 +102,7 @@ Both Scikit-learn classifiers and WNB classifiers share these well-known methods - `predict(X)` - `predict_proba(X)` - `predict_log_proba(X)` +- `predict_joint_log_proba(X)` - `score(X, y)` - `get_params()` - `set_params(**params)` diff --git a/uv.lock b/uv.lock index c9eb14f..0041729 100644 --- a/uv.lock +++ b/uv.lock @@ -1008,7 +1008,7 @@ wheels = [ [[package]] name = "wnb" -version = "0.5.1" +version = "0.6.0" source = { editable = "." } dependencies = [ { name = "pandas", version = "2.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9'" }, diff --git a/wnb/__init__.py b/wnb/__init__.py index 3523c4e..54f9482 100644 --- a/wnb/__init__.py +++ b/wnb/__init__.py @@ -2,7 +2,7 @@ Python library for the implementations of general and weighted naive Bayes (WNB) classifiers. """ -__version__ = "0.5.1" +__version__ = "0.6.0" __author__ = "Mehdi Samsami"
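
The refactors in patches 2 and 5 make `GeneralNB` and `GaussianWNB` inherit from scikit-learn's `_BaseNB`, so `predict`, `predict_proba`, `predict_log_proba`, and the `predict_joint_log_proba` entry added to the README's method list are all driven by each class's `_joint_log_likelihood`. The sketch below is not part of the patch series; it is a minimal usage example assuming wnb 0.6.0 (these patches applied), a recent scikit-learn (1.2+ for `predict_joint_log_proba`), and numpy/pandas/scipy, with a toy dataset and feature names invented purely for illustration.

```python
# Minimal usage sketch, assuming wnb 0.6.0 (these patches applied) plus
# numpy, pandas, scipy, and scikit-learn >= 1.2. The toy data and feature
# names below are invented for this example.
import numpy as np
import pandas as pd
from scipy.special import logsumexp

from wnb import GaussianWNB, GeneralNB

# Small two-class dataset so that the binary-only GaussianWNB also applies.
X = pd.DataFrame(
    {
        "x0": [1.0, 1.2, 0.9, 3.1, 3.0, 2.8],
        "x1": [0.2, 0.1, 0.3, 1.9, 2.1, 2.0],
    }
)
y = np.array([1, 1, 1, 2, 2, 2])

# With no `distributions` argument, GeneralNB falls back to normal
# likelihoods for every feature (see test_gnb_attrs in patch 2).
gnb = GeneralNB().fit(X, y)
print(gnb.classes_, gnb.class_prior_, gnb.n_features_in_, gnb.feature_names_in_)

# After the refactor, prediction goes through _BaseNB: predict_log_proba is
# the joint log likelihood normalized with logsumexp over the classes.
jll = gnb.predict_joint_log_proba(X)
assert np.allclose(
    gnb.predict_log_proba(X), jll - logsumexp(jll, axis=1, keepdims=True)
)
print(gnb.predict(X))

# The MLD-WNB classifier refactored in patch 5 exposes the same interface.
gwnb = GaussianWNB(max_iter=10).fit(X, y)
print(gwnb.coef_.shape, gwnb.predict(X))
```

Routing prediction through `_joint_log_likelihood` is what lets both classifiers drop their hand-rolled `predict`, `predict_proba`, and `predict_log_proba` methods in patches 2 and 5 while keeping the same public behavior.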