Merge branch 'main' into summarize_functions

UBC-MDS · Jan 18, 2025 · 16a9747 · 16a9747
2 parents 5436a50 + 31678c7
commit 16a9747
Show file tree

Hide file tree

Showing 13 changed files with 367 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -1,7 +1,5 @@
 # summarease
 
-A package that provides quick summaries of datasets, including data types, missing value counts, and basic statistics.
-
 ## Project Summary
 
 Summarease is a package designed to provide quick insights into a dataset by summarizing its key features. It offers functions that help users understand the structure of the data, making it easier to plan data cleaning and exploratory data analysis (EDA) tasks.
@@ -39,10 +37,10 @@ Summarease is a package designed to provide quick insights into a dataset by sum
 
 Summarease is a lightweight and compact Python package designed for efficiency and ease of use. Despite its simplicity, it offers users great flexibility to customize the output format, whether through detailed tables or insightful visualizations.
 
-Related packages with similar functionalities:
-sweetviz: https://github.com/fbdesignpro/sweetviz
-ydata-profiling: https://github.com/ydataai/ydata-profiling
-dtale: https://github.com/man-group/dtale
+Related packages with similar functionalities:  
+sweetviz: https://github.com/fbdesignpro/sweetviz  
+ydata-profiling: https://github.com/ydataai/ydata-profiling  
+dtale: https://github.com/man-group/dtale  
 
 ## Installation
 
@@ -63,11 +61,15 @@ from summarease.summarize_numeric import summarize_numeric
 from summarease.plot_correlation_heatmap import plot_correlation_heatmap
 from summarease.summarize import summarize
 import matplotlib.pyplot as plt
+
+# TODO
 ```
 
 ## Contributing
 
-Interested in contributing? Check out the contributing guidelines. Please note that this project is released with a Code of Conduct. By contributing to this project, you agree to abide by its terms.
+Interested in contributing? Check out the contributing guidelines. 
+
+Please note that this project is released with a Code of Conduct. By contributing to this project, you agree to abide by its terms.
 
 ## License
 

diff --git a/src/summarease/__init__.py b/src/summarease/__init__.py
@@ -1,3 +1,3 @@
 # read version from installed package
 from importlib.metadata import version
-__version__ = version("summarease")
+__version__ = version("summarease")
diff --git a/src/summarease/clean_data.py b/src/summarease/clean_data.py
@@ -1,4 +1,7 @@
-def clean_data(dataset, replace_values=None, drop_duplicates=True, standardize_columns=True):
+import pandas as pd
+
+def clean_data(dataset: pd.DataFrame, replace_values: list = None,
+              drop_duplicates: bool = True, standardize_columns: bool = True):
     """Clean the input dataset by standardizing column names, replacing invalid values, 
     and ensuring proper data types.
 

diff --git a/src/summarease/summarize_categorical.py b/src/summarease/summarize_categorical.py
@@ -1,4 +1,6 @@
-def summarize_categorical(dataset, summarize_by="table"):
+import pandas as pd
+
+def summarize_categorical(dataset: pd.DataFrame, summarize_by: str = "table"):
     """
     Summarize the categorical variables in the dataset by providing the number of unique categories
     for each categorical column. If any categorical columns have too many unique categories, a warning 

diff --git a/src/summarease/summarize_dtypes.py b/src/summarease/summarize_dtypes.py
@@ -1,33 +1,47 @@
-def summarize_dtypes(dataset, summarize_by="table"):
-    """Summarize the data types in the dataset.
+import pandas as pd
+
+def summarize_dtypes_table(dataset: pd.DataFrame) -> pd.DataFrame:
+    """
+    Summarize the data types in the dataset and return a DataFrame.
 
     Parameters
     ----------
     dataset : DataFrame
         The input dataset to analyze.
-    summarize_by : str, within {"table", "plot"}, optional
-        Specifies the output format:
-        - "table": Returns a summary as a table.
-        - "plot": Visualizes the data type distribution as a plot.
-        Default is "table".
 
     Returns
     -------
-    dict or None
-        A dictionary containing the data type counts if `summarize_by="table"`.
-        If `summarize_by="plot"`, displays a visualization and returns None.
+    DataFrame
+        A DataFrame summarizing the counts of each data type.
 
-    Notes
-    -----
-    This function counts the occurrences of each data type in the dataset and
-    either generates a summary table or visualizes the distribution using a plot.
+    Raises
+    ------
+    TypeError
+        If the input dataset is not a pandas DataFrame.
 
     Examples
     --------
-    >>> summarize_dtypes(dataset=data, summarize_by="table")
-    {'int64': 5, 'float64': 3, 'object': 4}
-    
-    >>> summarize_dtypes(dataset=data, summarize_by="plot")
-    # Displays a bar plot showing the distribution of data types.
+    >>> data = pd.DataFrame({
+    ...     'int_col': [1, 2, 3],
+    ...     'float_col': [1.1, 2.2, 3.3],
+    ...     'str_col': ['a', 'b', 'c'],
+    ...     'bool_col': [True, False, True]
+    ... })
+    >>> summarize_dtypes_table(data)
+       DataType  Count
+    0    int64      1
+    1  float64      1
+    2   object      1
+    3     bool      1
     """
-    pass
+    if not isinstance(dataset, pd.DataFrame):
+        raise TypeError("The input dataset must be a pandas DataFrame.")
+
+    # Get data types and their counts
+    dtype_counts = dataset.dtypes.value_counts().reset_index()
+    dtype_counts.columns = ['DataType', 'Count']
+
+    # Convert DataType column to string for consistent output
+    dtype_counts['DataType'] = dtype_counts['DataType'].astype(str)
+
+    return dtype_counts
diff --git a/src/summarease/summarize_missing_values.py b/src/summarease/summarize_missing_values.py
@@ -1,4 +1,6 @@
-def summarize_missing_values(dataset, summarize_by="table"):
+import pandas as pd
+
+def summarize_missing_values(dataset: pd.DataFrame, summarize_by: str = "table"):
     """
     Summarize the missing values in the dataset, providing information on the number and percentage of 
     missing values for each column. Generate a summary table or visualization to show the missing values

diff --git a/src/summarease/summarize_numeric.py b/src/summarease/summarize_numeric.py
@@ -51,8 +51,8 @@ def plot_correlation_heatmap(dataset_numeric):
 
     return heatmap
 
-
 def summarize_numeric(dataset, summarize_by="table"):
+
     """
     Summarize the numeric variables in the dataset by providing the summary statistics (e.g., mean, 
     standard deviation, min, max, etc.) for each numeric column or plotting the correlation heatmap 
@@ -92,18 +92,30 @@ def summarize_numeric(dataset, summarize_by="table"):
 
     assert summarize_by in {"table", "plot"}, f"Argument 'summarize_by' should be one of the following options: [table, plot]! You have {summarize_by}."
 
-    # Select the numeric columns from the dataset
-    dataset_numeric = dataset.select_dtypes(include=['number'])
+
+    numeric_columns = dataset.select_dtypes(include='number').columns
 
+    if numeric_columns.empty:  # Check if there are no numeric columns
+        print("No numeric columns found in the dataset.")
+        return {}
+
     outputs = {}
 
-    if (summarize_by == "plot"):
-        outputs["numeric_plot"] = plot_numeric_density(dataset_numeric)
-
-        if (dataset_numeric.shape[1] > 1):
-            outputs["corr_plot"] = plot_correlation_heatmap(dataset_numeric)
+    if summarize_by == "table":
+        # Store the summary so callers get it back, and print it for interactive use
+        outputs["numeric_describe"] = dataset[numeric_columns].describe()
+        print(outputs["numeric_describe"])
+
+    elif summarize_by == "plot":
+        # Plots are skipped when the dataset has fewer than two rows
+        if len(dataset) < 2:
+            print("Insufficient data for meaningful plots.")
+            return {}
 
-    elif (summarize_by == "table"):
-        outputs["numeric_describe"] = dataset_numeric.describe()
+        numeric_data = dataset[numeric_columns]
+        outputs["numeric_plot"] = plot_numeric_density(numeric_data)
 
+        if len(numeric_columns) > 1:
+            outputs["corr_plot"] = plot_correlation_heatmap(numeric_data)
+
     return outputs
diff --git a/src/summarease/summarize_outliers.py b/src/summarease/summarize_outliers.py
@@ -1,4 +1,7 @@
-def summarize_outliers(dataset_name, columns=None, summarize_by="table"):
+import pandas as pd
+
+def summarize_outliers(dataset: pd.DataFrame, columns: list = None,
+                       summarize_by: str = "table"):
     """Check and summarize the outliers by Z-scores in specified numeric columns of a table.
 
     Parameters

diff --git a/src/summarease/summarize_target.py b/src/summarease/summarize_target.py
@@ -115,7 +115,15 @@ def summarize_target_balance_plot(summary_df: pd.DataFrame):
     # Validate input DataFrame
     required_columns = {'class', 'proportion', 'imbalanced', 'threshold'}
     if not required_columns.issubset(summary_df.columns):
-        raise ValueError(f"Input DataFrame must contain columns: {required_columns}")
+        raise ValueError(f"Input DataFrame must contain columns: {', '.join(sorted(required_columns))}")
+
+    # Handle empty DataFrame
+    if summary_df.empty:
+        return alt.Chart(pd.DataFrame()).mark_text().encode(
+            text=alt.value("No data available for visualization.")
+        ).properties(
+            title="Categorical Target Balance Visualization (Empty)"
+        )
 
     # Add expected proportion range to the DataFrame
     n_classes = len(summary_df)

diff --git a/tests/test_dtypes.py b/tests/test_dtypes.py
@@ -0,0 +1,73 @@
+import pytest
+import pandas as pd
+import sys
+import os
+from pandas.testing import assert_frame_equal
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
+from summarease.summarize_dtypes import summarize_dtypes_table
+
+
+def test_summarize_dtypes_table():
+    # Test Case 1: Standard input with multiple data types
+    data = pd.DataFrame({
+        'int_col': [1, 2, 3],
+        'float_col': [1.1, 2.2, 3.3],
+        'str_col': ['a', 'b', 'c'],
+        'bool_col': [True, False, True]
+    })
+    result = summarize_dtypes_table(data)
+    expected = pd.DataFrame({
+        'DataType': ['int64', 'float64', 'object', 'bool'],
+        'Count': [1, 1, 1, 1]
+    })
+    assert isinstance(result, pd.DataFrame), "The function should return a DataFrame"
+    assert_frame_equal(result, expected)
+
+    # Test Case 2: Empty DataFrame
+    empty_data = pd.DataFrame()
+    result = summarize_dtypes_table(empty_data)
+    expected = pd.DataFrame(columns=['DataType', 'Count'])
+    assert isinstance(result, pd.DataFrame), "The function should return a DataFrame"
+    assert_frame_equal(result, expected, check_dtype=False)
+
+    # Test Case 3: Single column DataFrame
+    single_col_data = pd.DataFrame({'col1': [1, 2, 3]})
+    result = summarize_dtypes_table(single_col_data)
+    expected = pd.DataFrame({'DataType': ['int64'], 'Count': [1]})
+    assert isinstance(result, pd.DataFrame), "The function should return a DataFrame"
+    assert_frame_equal(result, expected, check_dtype=False)
+
+    # Test Case 4: DataFrame with datetime column
+    datetime_data = pd.DataFrame({
+        'int_col': [1, 2, 3],
+        'datetime_col': pd.date_range("2023-01-01", periods=3)
+    })
+    result = summarize_dtypes_table(datetime_data)
+    expected = pd.DataFrame({
+        'DataType': ['int64', 'datetime64[ns]'],
+        'Count': [1, 1]
+    })
+    assert isinstance(result, pd.DataFrame), "The function should return a DataFrame"
+    assert_frame_equal(result, expected, check_dtype=False)
+
+    # Test Case 5: Non-DataFrame input (list)
+    with pytest.raises(TypeError):
+        summarize_dtypes_table([1, 2, 3])
+
+    # Test Case 6: Non-DataFrame input (None)
+    with pytest.raises(TypeError):
+        summarize_dtypes_table(None)
+
+    # Test Case 7: Mixed data types including NaN
+    mixed_data = pd.DataFrame({
+        'int_col': [1, 2, None],
+        'float_col': [1.1, None, 3.3],
+        'str_col': ['a', None, 'c']
+    })
+    result = summarize_dtypes_table(mixed_data)
+    expected = pd.DataFrame({
+        'DataType': ['float64', 'object'],
+        'Count': [2, 1]
+        })
+    assert isinstance(result, pd.DataFrame), "The function should return a DataFrame"
+    assert_frame_equal(result, expected, check_dtype=False)
diff --git a/tests/test_summarease.py b/tests/test_summarease.py
@@ -1 +1,2 @@
 from summarease import summarease
+
diff --git a/tests/test_summarize_numeric.py b/tests/test_summarize_numeric.py
@@ -0,0 +1,100 @@
+import pandas as pd
+from summarease.summarize_numeric import summarize_numeric
+import pytest
+
+from io import StringIO
+from unittest.mock import patch
+
+# Test 1: Dataframe with numerical variables with at least one non-null value
+@pytest.fixture
+def df_with_numeric_data():
+    return pd.DataFrame({
+        'A': [1, 2, 3, 4, 5],
+        'B': [5, 4, 3, 2, 1],
+        'C': ['a', 'b', 'c', 'd', 'e']
+    })
+
+def test_summarize_numeric_with_numerical_data(df_with_numeric_data):
+    with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
+        summarize_numeric(df_with_numeric_data, summarize_by="table")
+        output = mock_stdout.getvalue()
+        assert "count" in output  # Check if summary statistics are printed
+        assert "mean" in output
+        assert "std" in output
+
+# Test 2: Dataframe with one numerical variable with at least one non-null value
+@pytest.fixture
+def df_with_single_numeric_column():
+    return pd.DataFrame({
+        'A': [1, 2, 3, 4, 5],
+        'B': ['a', 'b', 'c', 'd', 'e']
+    })
+
+def test_summarize_numeric_with_single_column(df_with_single_numeric_column):
+    with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
+        summarize_numeric(df_with_single_numeric_column, summarize_by="table")
+        output = mock_stdout.getvalue()
+        assert "count" in output
+        assert "mean" in output
+        assert "std" in output
+
+# Test 3: Second argument is one of the accepted enumerations
+def test_summarize_numeric_with_valid_summarize_by():
+    valid_args = ["table", "plot"]
+    for arg in valid_args:
+        result = summarize_numeric(pd.DataFrame({'A': [1, 2, 3]}), summarize_by=arg)
+        assert result is not None  # Ensure it completes without error
+
+# Test 4: Dataframe with no numerical variables
+@pytest.fixture
+def df_with_no_numeric_columns():
+    return pd.DataFrame({
+        'A': ['a', 'b', 'c', 'd', 'e'],
+        'B': ['f', 'g', 'h', 'i', 'j']
+    })
+
+def test_summarize_numeric_no_numeric_columns(df_with_no_numeric_columns):
+    with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
+        summarize_numeric(df_with_no_numeric_columns, summarize_by="table")
+        output = mock_stdout.getvalue()
+        assert "No numeric columns found in the dataset." in output
+
+# Test 5: Dataframe with a numerical variable that contains all null values
+@pytest.fixture
+def df_with_all_null_numeric():
+    return pd.DataFrame({
+        'A': [None, None, None, None, None],
+        'B': ['f', 'g', 'h', 'i', 'j']
+    })
+
+def test_summarize_numeric_with_all_null_values(df_with_all_null_numeric):
+    with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
+        summarize_numeric(df_with_all_null_numeric, summarize_by="table")
+        output = mock_stdout.getvalue()
+        assert "count" not in output  # Check that no summary stats are printed
+        assert "mean" not in output
+
+# Test 6: Too few data to create plot
+@pytest.fixture
+def df_with_too_few_data():
+    return pd.DataFrame({
+        'A': [1],
+        'B': [5]
+    })
+
+def test_summarize_numeric_too_few_data(df_with_too_few_data):
+    with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
+        result = summarize_numeric(df_with_too_few_data, summarize_by="plot")
+        output = mock_stdout.getvalue()
+        # Expect no plot output, since there's only one row of data
+        assert "numeric_plot" not in result
+
+# Test 7: Erroneous/Adversarial Input. First argument not a dataframe
+def test_summarize_numeric_invalid_first_argument():
+    with pytest.raises(AssertionError):
+        summarize_numeric("not a dataframe", summarize_by="table")
+
+# Test 8: Erroneous/Adversarial Input. Second argument invalid
+def test_summarize_numeric_invalid_summarize_by():
+    with pytest.raises(AssertionError):
+        summarize_numeric(pd.DataFrame({'A': [1, 2, 3]}), summarize_by="invalid")