Skip to content

Commit

Permalink
Merge branch 'main' into summarize_functions
Browse files Browse the repository at this point in the history
  • Loading branch information
HrayrMuradyan authored Jan 18, 2025
2 parents 5436a50 + 31678c7 commit 16a9747
Show file tree
Hide file tree
Showing 13 changed files with 367 additions and 47 deletions.
16 changes: 9 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# summarease

A package that provides quick summaries of datasets, including data types, missing value counts, and basic statistics.

## Project Summary

Summarease is a package designed to provide quick insights into a dataset by summarizing its key features. It offers functions that help users understand the structure of the data, making it easier to plan data cleaning and exploratory data analysis (EDA) tasks.
Expand Down Expand Up @@ -39,10 +37,10 @@ Summarease is a package designed to provide quick insights into a dataset by sum

Summarease is a lightweight and compact Python package designed for efficiency and ease of use. Despite its simplicity, it offers users great flexibility to customize the output format, whether through detailed tables or insightful visualizations.

Related packages with similar functionalities:
sweetviz: https://github.com/fbdesignpro/sweetviz
ydata-profiling: https://github.com/ydataai/ydata-profiling
dtale: https://github.com/man-group/dtale
Related packages with similar functionalities:
sweetviz: https://github.com/fbdesignpro/sweetviz
ydata-profiling: https://github.com/ydataai/ydata-profiling
dtale: https://github.com/man-group/dtale

## Installation

Expand All @@ -63,11 +61,15 @@ from summarease.summarize_numeric import summarize_numeric
from summarease.plot_correlation_heatmap import plot_correlation_heatmap
from summarease.summarize import summarize
import matplotlib.pyplot as plt

TODO
```

## Contributing

Interested in contributing? Check out the contributing guidelines. Please note that this project is released with a Code of Conduct. By contributing to this project, you agree to abide by its terms.
Interested in contributing? Check out the contributing guidelines.

Please note that this project is released with a Code of Conduct. By contributing to this project, you agree to abide by its terms.

## License

Expand Down
2 changes: 1 addition & 1 deletion src/summarease/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Read the version from the installed package metadata.
from importlib.metadata import version

# Call version() with the distribution name. The chained form
# `__version__ = version = ("summarease")` would bind both names to the
# literal string "summarease" instead of looking up the installed version.
__version__ = version("summarease")
3 changes: 2 additions & 1 deletion src/summarease/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
def clean_data(dataset, replace_values=None, drop_duplicates=True, standardize_columns=True):
def clean_data(dataset: pd.Dataframe, replace_values: list = None,
drop_duplicates: bool = True, standardize_columns: bool = True):
"""Clean the input dataset by standardizing column names, replacing invalid values,
and ensuring proper data types.
Expand Down
2 changes: 1 addition & 1 deletion src/summarease/summarize_categorical.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def summarize_categorical(dataset, summarize_by="table"):
def summarize_categorical(dataset: pd.DataFrame, summarize_by: str = "table"):
"""
Summarize the categorical variables in the dataset by providing the number of unique categories
for each categorical column. If any categorical columns have too many unique categories, a warning
Expand Down
54 changes: 34 additions & 20 deletions src/summarease/summarize_dtypes.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,47 @@
def summarize_dtypes(dataset, summarize_by="table"):
"""Summarize the data types in the dataset.
import pandas as pd

def summarize_dtypes_table(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize the data types in the dataset and return a DataFrame.

    Parameters
    ----------
    dataset : DataFrame
        The input dataset to analyze.

    Returns
    -------
    DataFrame
        A DataFrame with one row per distinct column dtype and two columns:
        ``DataType`` (the dtype name as a string) and ``Count`` (the number
        of columns in `dataset` having that dtype).

    Raises
    ------
    TypeError
        If the input dataset is not a pandas DataFrame.

    Examples
    --------
    >>> data = pd.DataFrame({
    ...     'int_col': [1, 2, 3],
    ...     'float_col': [1.1, 2.2, 3.3],
    ...     'str_col': ['a', 'b', 'c'],
    ...     'bool_col': [True, False, True]
    ... })
    >>> summarize_dtypes_table(data)
      DataType  Count
    0    int64      1
    1  float64      1
    2   object      1
    3     bool      1
    """
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("The input dataset must be a pandas DataFrame.")

    # Count how many columns share each dtype, then turn the resulting
    # Series (dtype -> count) into a two-column DataFrame.
    dtype_counts = dataset.dtypes.value_counts().reset_index()
    dtype_counts.columns = ['DataType', 'Count']

    # Convert DataType column to string for consistent output
    dtype_counts['DataType'] = dtype_counts['DataType'].astype(str)

    return dtype_counts
2 changes: 1 addition & 1 deletion src/summarease/summarize_missing_values.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
def summarize_missing_values(dataset, summarize_by="table"):
def summarize_missing_values(dataset: pd.DataFrame, summarize_by: str = "table"):
"""
Summarize the missing values in the dataset, providing information on the number and percentage of
missing values for each column. Generate a summary table or visualization to show the missing values
Expand Down
32 changes: 22 additions & 10 deletions src/summarease/summarize_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ def plot_correlation_heatmap(dataset_numeric):

return heatmap


def summarize_numeric(dataset, summarize_by="table"):

"""
Summarize the numeric variables in the dataset by providing the summary statistics (e.g., mean,
standard deviation, min, max, etc.) for each numeric column or plotting the correlation heatmap
Expand Down Expand Up @@ -92,18 +92,30 @@ def summarize_numeric(dataset, summarize_by="table"):

assert summarize_by in {"table", "plot"}, f"Argument 'summarize_by' should be one of the following options: [table, plot]! You have {summarize_by}."

# Select the numeric columns from the dataset
dataset_numeric = dataset.select_dtypes(include=['number'])

numeric_columns = dataset.select_dtypes(include='number').columns

if numeric_columns.empty: # Check if there are no numeric columns
print("No numeric columns found in the dataset.")
return

outputs = {}

if (summarize_by == "plot"):
outputs["numeric_plot"] = plot_numeric_density(dataset_numeric)

if (dataset_numeric.shape[1] > 1):
outputs["corr_plot"] = plot_correlation_heatmap(dataset_numeric)
if summarize_by == "table":
# Generate summary statistics for numeric columns
summary = dataset[numeric_columns].describe()
print(summary)

elif summarize_by == "plot":
# Generate a correlation heatmap for numeric columns
if len(dataset) < 2:
print("Insufficient data for meaningful plots.")
return {}

elif (summarize_by == "table"):
outputs["numeric_describe"] = dataset_numeric.describe()
numeric_data = dataset[numeric_columns]
outputs["numeric_plot"] = plot_numeric_density(numeric_data)

if len(numeric_columns) > 1:
outputs["corr_plot"] = plot_correlation_heatmap(numeric_data)

return outputs
3 changes: 2 additions & 1 deletion src/summarease/summarize_outliers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
def summarize_outliers(dataset_name, columns=None, summarize_by="table"):
def summarize_outliers(dataset: pd.Dataframe, columns: list = None,
summarize_by: str = "table"):
"""Check and summarize the outliers by Z-scores in specified numeric columns of a table.
Parameters
Expand Down
10 changes: 9 additions & 1 deletion src/summarease/summarize_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,15 @@ def summarize_target_balance_plot(summary_df: pd.DataFrame):
# Validate input DataFrame
required_columns = {'class', 'proportion', 'imbalanced', 'threshold'}
if not required_columns.issubset(summary_df.columns):
raise ValueError(f"Input DataFrame must contain columns: {required_columns}")
raise ValueError(f"Input DataFrame must contain columns: {', '.join(sorted(required_columns))}")

# Handle empty DataFrame
if summary_df.empty:
return alt.Chart(pd.DataFrame()).mark_text().encode(
text=alt.value("No data available for visualization.")
).properties(
title="Categorical Target Balance Visualization (Empty)"
)

# Add expected proportion range to the DataFrame
n_classes = len(summary_df)
Expand Down
73 changes: 73 additions & 0 deletions tests/test_dtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import pytest
import pandas as pd
import sys
import os
from pandas.testing import assert_frame_equal
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src')))
from summarease.summarize_dtypes import summarize_dtypes_table


def test_summarize_dtypes_table():
    """Check summarize_dtypes_table on typical, edge-case, and invalid inputs."""
    type_message = "The function should return a DataFrame"

    # Case 1: one column per dtype; compared with strict dtype checking.
    multi_dtype_frame = pd.DataFrame({
        'int_col': [1, 2, 3],
        'float_col': [1.1, 2.2, 3.3],
        'str_col': ['a', 'b', 'c'],
        'bool_col': [True, False, True]
    })
    summary = summarize_dtypes_table(multi_dtype_frame)
    wanted = pd.DataFrame({
        'DataType': ['int64', 'float64', 'object', 'bool'],
        'Count': [1, 1, 1, 1]
    })
    assert isinstance(summary, pd.DataFrame), type_message
    assert_frame_equal(summary, wanted)

    # Cases 2-4: empty frame, single column, and a datetime column.
    # These compare values only (check_dtype=False).
    loose_cases = [
        (
            pd.DataFrame(),
            pd.DataFrame(columns=['DataType', 'Count']),
        ),
        (
            pd.DataFrame({'col1': [1, 2, 3]}),
            pd.DataFrame({'DataType': ['int64'], 'Count': [1]}),
        ),
        (
            pd.DataFrame({
                'int_col': [1, 2, 3],
                'datetime_col': pd.date_range("2023-01-01", periods=3)
            }),
            pd.DataFrame({
                'DataType': ['int64', 'datetime64[ns]'],
                'Count': [1, 1]
            }),
        ),
    ]
    for frame, wanted in loose_cases:
        summary = summarize_dtypes_table(frame)
        assert isinstance(summary, pd.DataFrame), type_message
        assert_frame_equal(summary, wanted, check_dtype=False)

    # Cases 5-6: anything that is not a DataFrame must raise TypeError.
    for bad_input in ([1, 2, 3], None):
        with pytest.raises(TypeError):
            summarize_dtypes_table(bad_input)

    # Case 7: columns containing missing values.
    frame_with_gaps = pd.DataFrame({
        'int_col': [1, 2, None],
        'float_col': [1.1, None, 3.3],
        'str_col': ['a', None, 'c']
    })
    summary = summarize_dtypes_table(frame_with_gaps)
    wanted = pd.DataFrame({
        'DataType': ['float64', 'object'],
        'Count': [2, 1]
    })
    assert isinstance(summary, pd.DataFrame), type_message
    assert_frame_equal(summary, wanted, check_dtype=False)
1 change: 1 addition & 0 deletions tests/test_summarease.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
from summarease import summarease

100 changes: 100 additions & 0 deletions tests/test_summarize_numeric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import pandas as pd
from summarease.summarize_numeric import summarize_numeric
import pytest

from io import StringIO
from unittest.mock import patch

# Test 1: Dataframe with numerical variables with at least one non-null value
@pytest.fixture
def df_with_numeric_data():
    """Five-row frame with integer columns A and B and string column C."""
    columns = {
        'A': [1, 2, 3, 4, 5],
        'B': [5, 4, 3, 2, 1],
        'C': ['a', 'b', 'c', 'd', 'e'],
    }
    return pd.DataFrame(columns)

def test_summarize_numeric_with_numerical_data(df_with_numeric_data):
    """Table mode should print summary statistics for the numeric columns."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_numeric_data, summarize_by="table")
        printed = captured.getvalue()
    # The printed summary is expected to carry these statistic labels.
    for label in ("count", "mean", "std"):
        assert label in printed

# Test 2: Dataframe with one numerical variable with at least one non-null value
@pytest.fixture
def df_with_single_numeric_column():
    """Five-row frame with one integer column A and one string column B."""
    columns = {
        'A': [1, 2, 3, 4, 5],
        'B': ['a', 'b', 'c', 'd', 'e'],
    }
    return pd.DataFrame(columns)

def test_summarize_numeric_with_single_column(df_with_single_numeric_column):
    """Table mode should print summary statistics with a single numeric column."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_single_numeric_column, summarize_by="table")
        printed = captured.getvalue()
    for label in ("count", "mean", "std"):
        assert label in printed

# Test 3: Second argument is one of the accepted enumerations
def test_summarize_numeric_with_valid_summarize_by():
    """Each accepted ``summarize_by`` value should yield a non-None result."""
    for mode in ("table", "plot"):
        frame = pd.DataFrame({'A': [1, 2, 3]})
        outcome = summarize_numeric(frame, summarize_by=mode)
        assert outcome is not None  # Ensure it completes without error

# Test 4: Dataframe with no numerical variables
@pytest.fixture
def df_with_no_numeric_columns():
    """Five-row frame whose two columns (A, B) both hold strings."""
    columns = {
        'A': ['a', 'b', 'c', 'd', 'e'],
        'B': ['f', 'g', 'h', 'i', 'j'],
    }
    return pd.DataFrame(columns)

def test_summarize_numeric_no_numeric_columns(df_with_no_numeric_columns):
    """With no numeric columns the function should print a notice."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_no_numeric_columns, summarize_by="table")
        printed = captured.getvalue()
    expected_notice = "No numeric columns found in the dataset."
    assert expected_notice in printed

# Test 5: Dataframe with a numerical variable that contains all null values
@pytest.fixture
def df_with_all_null_numeric():
    """Five-row frame where column A holds only None and column B holds strings."""
    # NOTE(review): pandas presumably infers object dtype for an all-None list,
    # so column A may not be selected as numeric at all; confirm this is the
    # scenario the test intends to cover.
    columns = {
        'A': [None, None, None, None, None],
        'B': ['f', 'g', 'h', 'i', 'j'],
    }
    return pd.DataFrame(columns)

def test_summarize_numeric_with_all_null_values(df_with_all_null_numeric):
    """No summary statistics should be printed when column A is entirely null."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_all_null_numeric, summarize_by="table")
        printed = captured.getvalue()
    # Check that no summary stats are printed
    for label in ("count", "mean"):
        assert label not in printed

# Test 6: Too few data to create plot
@pytest.fixture
def df_with_too_few_data():
    """Single-row frame with two integer columns, A and B."""
    single_row = {'A': [1], 'B': [5]}
    return pd.DataFrame(single_row)

def test_summarize_numeric_too_few_data(df_with_too_few_data):
    """A single-row frame must not produce plots when ``summarize_by="plot"``."""
    with patch("sys.stdout", new_callable=StringIO) as mock_stdout:
        result = summarize_numeric(df_with_too_few_data, summarize_by="plot")
        output = mock_stdout.getvalue()
    # Expect no plot output, since there's only one row of data
    assert "numeric_plot" not in result
    # The captured stdout used to go unchecked; verify the explanatory
    # message is printed as well.
    assert "Insufficient data for meaningful plots." in output

# Test 7: Erroneous/Adversarial Input. First argument not a dataframe
def test_summarize_numeric_invalid_first_argument():
    """Passing a non-DataFrame as the dataset is expected to raise AssertionError."""
    not_a_frame = "not a dataframe"
    with pytest.raises(AssertionError):
        summarize_numeric(not_a_frame, summarize_by="table")

# Test 8: Erroneous/Adversarial Input. Second argument invalid
def test_summarize_numeric_invalid_summarize_by():
    """An unsupported ``summarize_by`` value is expected to raise AssertionError."""
    frame = pd.DataFrame({'A': [1, 2, 3]})
    with pytest.raises(AssertionError):
        summarize_numeric(frame, summarize_by="invalid")
Loading

0 comments on commit 16a9747

Please sign in to comment.