-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into summarize_functions
- Loading branch information
Showing 13 changed files with 367 additions and 47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
# read version from installed package
from importlib.metadata import version

# Call version() with the distribution name. A chained assignment such as
# `__version__ = version = "summarease"` would instead store the package name
# as the version and overwrite the imported `version` function.
__version__ = version("summarease")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,47 @@ | ||
import pandas as pd


def summarize_dtypes_table(dataset: pd.DataFrame) -> pd.DataFrame:
    """
    Summarize the data types in the dataset and return a DataFrame.

    Parameters
    ----------
    dataset : DataFrame
        The input dataset to analyze.

    Returns
    -------
    DataFrame
        A DataFrame summarizing the counts of each data type. It has one row
        per distinct column dtype and two columns: ``DataType`` (the dtype
        rendered as a string) and ``Count`` (how many columns of ``dataset``
        have that dtype). Rows appear in the order produced by
        ``Series.value_counts``.

    Raises
    ------
    TypeError
        If the input dataset is not a pandas DataFrame.

    Examples
    --------
    >>> data = pd.DataFrame({
    ...     'int_col': [1, 2, 3],
    ...     'float_col': [1.1, 2.2, 3.3],
    ...     'str_col': ['a', 'b', 'c'],
    ...     'bool_col': [True, False, True]
    ... })
    >>> summarize_dtypes_table(data)
      DataType  Count
    0    int64      1
    1  float64      1
    2   object      1
    3     bool      1
    """
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("The input dataset must be a pandas DataFrame.")

    # Get data types and their counts; reset_index moves the dtype labels
    # out of the index into an ordinary column next to the counts.
    dtype_counts = dataset.dtypes.value_counts().reset_index()
    # Name the two columns positionally so the result does not depend on
    # whatever labels reset_index generated.
    dtype_counts.columns = ['DataType', 'Count']

    # Convert DataType column to string for consistent output
    dtype_counts['DataType'] = dtype_counts['DataType'].astype(str)

    return dtype_counts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import pytest | ||
import pandas as pd | ||
import sys | ||
import os | ||
from pandas.testing import assert_frame_equal | ||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../src'))) | ||
from summarease.summarize_dtypes import summarize_dtypes_table | ||
|
||
|
||
def test_summarize_dtypes_table():
    """Exercise summarize_dtypes_table on typical, edge-case and invalid inputs."""
    wrong_type_msg = "The function should return a DataFrame"

    # Test Case 1: Standard input with multiple data types
    # NOTE(review): the expected row order assumes equal counts keep the
    # column order of the input — confirm against the pandas version in use.
    frame = pd.DataFrame({
        'int_col': [1, 2, 3],
        'float_col': [1.1, 2.2, 3.3],
        'str_col': ['a', 'b', 'c'],
        'bool_col': [True, False, True]
    })
    actual = summarize_dtypes_table(frame)
    wanted = pd.DataFrame({
        'DataType': ['int64', 'float64', 'object', 'bool'],
        'Count': [1, 1, 1, 1]
    })
    assert isinstance(actual, pd.DataFrame), wrong_type_msg
    assert_frame_equal(actual, wanted)

    # Test Case 2: Empty DataFrame
    actual = summarize_dtypes_table(pd.DataFrame())
    wanted = pd.DataFrame(columns=['DataType', 'Count'])
    assert isinstance(actual, pd.DataFrame), wrong_type_msg
    assert_frame_equal(actual, wanted, check_dtype=False)

    # Test Case 3: Single column DataFrame
    actual = summarize_dtypes_table(pd.DataFrame({'col1': [1, 2, 3]}))
    wanted = pd.DataFrame({'DataType': ['int64'], 'Count': [1]})
    assert isinstance(actual, pd.DataFrame), wrong_type_msg
    assert_frame_equal(actual, wanted, check_dtype=False)

    # Test Case 4: DataFrame with datetime column
    frame = pd.DataFrame({
        'int_col': [1, 2, 3],
        'datetime_col': pd.date_range("2023-01-01", periods=3)
    })
    actual = summarize_dtypes_table(frame)
    wanted = pd.DataFrame({
        'DataType': ['int64', 'datetime64[ns]'],
        'Count': [1, 1]
    })
    assert isinstance(actual, pd.DataFrame), wrong_type_msg
    assert_frame_equal(actual, wanted, check_dtype=False)

    # Test Cases 5 and 6: Non-DataFrame input (a list, then None)
    for not_a_frame in ([1, 2, 3], None):
        with pytest.raises(TypeError):
            summarize_dtypes_table(not_a_frame)

    # Test Case 7: Mixed data types including NaN
    frame = pd.DataFrame({
        'int_col': [1, 2, None],
        'float_col': [1.1, None, 3.3],
        'str_col': ['a', None, 'c']
    })
    actual = summarize_dtypes_table(frame)
    wanted = pd.DataFrame({
        'DataType': ['float64', 'object'],
        'Count': [2, 1]
    })
    assert isinstance(actual, pd.DataFrame), wrong_type_msg
    assert_frame_equal(actual, wanted, check_dtype=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
from summarease import summarease | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import pandas as pd | ||
from summarease.summarize_numeric import summarize_numeric | ||
import pytest | ||
|
||
from io import StringIO | ||
from unittest.mock import patch | ||
|
||
# Test 1: Dataframe with numerical variables with at least one non-null value
@pytest.fixture
def df_with_numeric_data():
    """Frame with two integer columns ('A', 'B') and one string column ('C')."""
    columns = {
        'A': [1, 2, 3, 4, 5],
        'B': [5, 4, 3, 2, 1],
        'C': list('abcde'),
    }
    return pd.DataFrame(columns)


def test_summarize_numeric_with_numerical_data(df_with_numeric_data):
    """The table summary written to stdout mentions count, mean and std."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_numeric_data, summarize_by="table")
        printed = captured.getvalue()
    # Check if summary statistics are printed
    for statistic in ("count", "mean", "std"):
        assert statistic in printed
|
||
# Test 2: Dataframe with one numerical variable with at least one non-null value
@pytest.fixture
def df_with_single_numeric_column():
    """Frame with one integer column ('A') and one string column ('B')."""
    columns = {
        'A': [1, 2, 3, 4, 5],
        'B': list('abcde'),
    }
    return pd.DataFrame(columns)


def test_summarize_numeric_with_single_column(df_with_single_numeric_column):
    """A single numeric column still yields count, mean and std on stdout."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_single_numeric_column, summarize_by="table")
        printed = captured.getvalue()
    for statistic in ("count", "mean", "std"):
        assert statistic in printed
|
||
# Test 3: Second argument is one of the accepted enumerations
def test_summarize_numeric_with_valid_summarize_by():
    """Each accepted `summarize_by` value returns something other than None."""
    for mode in ("table", "plot"):
        outcome = summarize_numeric(pd.DataFrame({'A': [1, 2, 3]}), summarize_by=mode)
        # Ensure it completes without error
        assert outcome is not None
|
||
# Test 4: Dataframe with no numerical variables
@pytest.fixture
def df_with_no_numeric_columns():
    """Frame whose two columns ('A', 'B') both hold strings."""
    columns = {
        'A': list('abcde'),
        'B': list('fghij'),
    }
    return pd.DataFrame(columns)


def test_summarize_numeric_no_numeric_columns(df_with_no_numeric_columns):
    """Without numeric columns, the explanatory message is written to stdout."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_no_numeric_columns, summarize_by="table")
        printed = captured.getvalue()
    assert "No numeric columns found in the dataset." in printed
|
||
# Test 5: Dataframe with a numerical variable that contains all null values
@pytest.fixture
def df_with_all_null_numeric():
    """Frame with an all-None column ('A') and a string column ('B')."""
    # NOTE(review): a column built only from None is presumably stored with
    # object dtype rather than a numeric dtype — confirm this fixture really
    # exercises the "numeric but all null" path.
    columns = {
        'A': [None] * 5,
        'B': list('fghij'),
    }
    return pd.DataFrame(columns)


def test_summarize_numeric_with_all_null_values(df_with_all_null_numeric):
    """No summary statistics are written to stdout for the all-null column."""
    with patch("sys.stdout", new_callable=StringIO) as captured:
        summarize_numeric(df_with_all_null_numeric, summarize_by="table")
        printed = captured.getvalue()
    # Check that no summary stats are printed
    for statistic in ("count", "mean"):
        assert statistic not in printed
|
||
# Test 6: Too few data to create plot
@pytest.fixture
def df_with_too_few_data():
    """Single-row frame with two integer columns ('A', 'B')."""
    return pd.DataFrame({
        'A': [1],
        'B': [5]
    })


def test_summarize_numeric_too_few_data(df_with_too_few_data):
    """With a single row, the "plot" result must not contain "numeric_plot"."""
    # stdout is patched only to keep anything the call prints out of the test
    # log; the captured text is not inspected by this test.
    with patch("sys.stdout", new_callable=StringIO):
        result = summarize_numeric(df_with_too_few_data, summarize_by="plot")
    # Expect no plot output, since there's only one row of data
    # NOTE(review): the membership check assumes `result` supports `in`
    # (e.g. a dict); it raises TypeError if the call returns None — confirm.
    assert "numeric_plot" not in result
|
||
# Test 7: Erroneous/Adversarial Input. First argument not a dataframe
def test_summarize_numeric_invalid_first_argument():
    """A non-DataFrame first argument must trigger an AssertionError."""
    not_a_frame = "not a dataframe"
    with pytest.raises(AssertionError):
        summarize_numeric(not_a_frame, summarize_by="table")
|
||
# Test 8: Erroneous/Adversarial Input. Second argument invalid
def test_summarize_numeric_invalid_summarize_by():
    """An unrecognised `summarize_by` value must trigger an AssertionError."""
    valid_frame = pd.DataFrame({'A': [1, 2, 3]})
    with pytest.raises(AssertionError):
        summarize_numeric(valid_frame, summarize_by="invalid")
Oops, something went wrong.