diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..9d55e22 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Poetry 2.0.1 and should not be changed by hand. +package = [] + +[metadata] +lock-version = "2.1" +python-versions = "^3.11" +content-hash = "81b2fa642d7f2d1219cf80112ace12d689d053d81be7f7addb98144d56fc0fb2" diff --git a/src/summarease/__init__.py b/src/summarease/__init__.py index 600dab9..a2bbc97 100644 --- a/src/summarease/__init__.py +++ b/src/summarease/__init__.py @@ -1,3 +1,3 @@ # read version from installed package from importlib.metadata import version -__version__ = version= ("summarease") +__version__ = version("summarease") diff --git a/src/summarease/clean_data.py b/src/summarease/clean_data.py deleted file mode 100644 index 4f3b54b..0000000 --- a/src/summarease/clean_data.py +++ /dev/null @@ -1,41 +0,0 @@ -def clean_data(dataset: pd.Dataframe, replace_values: list = None, - drop_duplicates: bool = True, standardize_columns: bool = True): - """Clean the input dataset by standardizing column names, replacing invalid values, - and ensuring proper data types. - - Parameters - ---------- - dataset : DataFrame - The input dataset to clean. - replace_values : list of str, optional - A list of invalid or placeholder values (e.g., ["?", "NA", "-"]) to be replaced - with NaN. Default is None, which uses the default list ["?", "NA", "-"]. - drop_duplicates : bool, optional - Whether to remove duplicate rows from the dataset. Default is True. - standardize_columns : bool, optional - Whether to standardize column names by converting them to lowercase, - removing spaces, and replacing special characters with underscores. Default is True. - - Returns - ------- - DataFrame - A cleaned DataFrame with standardized column names, missing values replaced, - and appropriate data types inferred. - - Notes - ----- - - Column names will be converted to lowercase, and special characters or spaces - will be replaced with underscores if `standardize_columns` is True. - - Invalid or missing values specified in `replace_values` will be replaced with NaN. - - Attempts to convert text columns to numeric types where applicable. - - Examples - -------- - >>> clean_data(dataset=data, replace_values=["?", "NA"], drop_duplicates=True) - # Returns a cleaned DataFrame with invalid values replaced, duplicates removed, - # and standardized column names. - - >>> clean_data(dataset=data, standardize_columns=False) - # Returns a cleaned DataFrame without altering column names. - """ - pass diff --git a/src/summarease/plot_correlation_heatmap.py b/src/summarease/plot_correlation_heatmap.py deleted file mode 100644 index 27494e6..0000000 --- a/src/summarease/plot_correlation_heatmap.py +++ /dev/null @@ -1,28 +0,0 @@ -import pandas as pd - -def plot_correlation_heatmap(dataset: pd.Dataframe, numeric_columns: list = None, - save_path: str = None): - """ - Generate and save a correlation heatmap for the specified numeric columns in a dataset. - - Parameters: - ---------- - dataset : pd.DataFrame - The input dataset containing the data for the heatmap. - - numeric_columns : list of str, optional - A list of column names to include in the correlation heatmap. If None, all numeric columns in the dataset will be used. - - save_path : str, optional - File path to save the generated heatmap. If None, the plot will not be saved. - - Returns: - ------- - None - Displays the correlation heatmap or optionally saves it to the specified location. - - Example: - ------- - >>> plot_correlation_heatmap(dataset=df, numeric_columns=["col1", "col2", "col3"], save_path="heatmap.png") - """ - pass \ No newline at end of file diff --git a/src/summarease/summarize.py b/src/summarease/summarize.py index 3e58b6d..9b2ad55 100644 --- a/src/summarease/summarize.py +++ b/src/summarease/summarize.py @@ -3,7 +3,8 @@ import fpdf from pathlib import Path from summarize_numeric import summarize_numeric -from summarize_target import summarize_target_df +from summarize_target import summarize_target_df, summarize_target_balance_plot +from summarize_dtypes import summarize_dtypes_table from PIL import Image @@ -204,8 +205,10 @@ def add_table(pdf, table, pdf_height, pdf_width, element_padding=15): return pdf def switch_page_if_needed(pdf): - if pdf.get_y() > 30: + assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {pdf}" + if pdf.get_y() > 50: pdf.add_page() + print("New page created before the header.") return pdf def summarize(dataset: pd.DataFrame, @@ -348,7 +351,7 @@ def summarize(dataset: pd.DataFrame, page_width = pdf.w page_height = pdf.h - element_padding = 15 + element_padding = 10 text_line_padding = 10 # Set the font to Helvetica, set the size, write the title @@ -370,15 +373,30 @@ def summarize(dataset: pd.DataFrame, if summarize_by == "plot": summarized_numeric_output = summarize_numeric(dataset, summarize_by="plot") - for key, item in summarized_numeric_output.items(): - plot_file = plot_output_path / f'{key}.png' - str_plot_file = str(plot_file) - item.save(plot_file) - pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=15) + if summarized_numeric_output: + for key, item in summarized_numeric_output.items(): + plot_file = plot_output_path / f'{key}.png' + str_plot_file = str(plot_file) + item.save(plot_file) + pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=10) + + if target_variable is not None: + pdf = switch_page_if_needed(pdf) + pdf.set_font("Helvetica", size=13) + pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Target Variable Summary", ln=True, align='C') + pdf.set_font("Helvetica", size=11) + pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L') + summarized_target_output = summarize_target_df(dataset, target_variable, target_type) + summarized_target_plot = summarize_target_balance_plot(summarized_target_output) + target_plot_file = plot_output_path / "target_plot.png" + summarized_target_plot.save(target_plot_file) + pdf = add_image(pdf, target_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=0) elif summarize_by == "table": summarized_numeric_output = summarize_numeric(dataset, summarize_by="table") - pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15) + if summarized_numeric_output: + pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15) + if target_variable is not None: pdf = switch_page_if_needed(pdf) summarized_target_output = summarize_target_df(dataset, target_variable, target_type) @@ -388,7 +406,10 @@ def summarize(dataset: pd.DataFrame, pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L') pdf = add_table(pdf, table = summarized_target_output, pdf_height=page_height, pdf_width=page_width, element_padding=15) - + summarized_dtypes_table = summarize_dtypes_table(dataset) + pdf.set_font("Helvetica", size=13) + pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Dataset Data Types Summary", ln=True, align='C') + pdf = add_table(pdf, table = summarized_dtypes_table, pdf_height=page_height, pdf_width=page_width, element_padding=15) pdf.output(output_path) assert output_path.exists(), "Something went wrong... The PDF output was not saved." diff --git a/src/summarease/summarize_categorical.py b/src/summarease/summarize_categorical.py deleted file mode 100644 index c20ca0d..0000000 --- a/src/summarease/summarize_categorical.py +++ /dev/null @@ -1,31 +0,0 @@ -def summarize_categorical(dataset: pd.DataFrame, summarize_by: str = "table"): - """ - Summarize the categorical variables in the dataset by providing the number of unique categories - for each categorical column. If any categorical columns have too many unique categories, a warning - is issued. - - Parameters: - ----------- - dataset : pd.DataFrame - The dataset to analyze. - summarize_by (str): - The format for summarizing the categorical variables. - Options are "table" (default) or "plot". If "table", a summary table is - generated with the counts of unique categories for each categorical column. - If "plot", a bar plot will be displayed for each categorical column showing - the frequency of each unique category. - - Returns: - ------- - None: Displays either a table or a plot, depending on the `summarize_by` argument. - - Notes: - ------ - If a categorical column has more than a threshold number of unique categories, a warning will - be printed to notify the user that the column may be too granular for meaningful analysis. - - Example: - ------- - >>> summarize_categorical(dataset=df, summarize_by="table") - """ - pass diff --git a/src/summarease/summarize_missing_values.py b/src/summarease/summarize_missing_values.py deleted file mode 100644 index 4d2a567..0000000 --- a/src/summarease/summarize_missing_values.py +++ /dev/null @@ -1,31 +0,0 @@ -def summarize_missing_values(dataset: pd.DataFrame, summarize_by: str = "table"): - """ - Summarize the missing values in the dataset, providing information on the number and percentage of - missing values for each column. Generate a summary table or visualization to show the missing values - depending on the `summarize_by` argument. - - Parameters: - ----------- - dataset : pd.DataFrame - The dataset to analyze. - summarize_by (str): - The format for summarizing the missing values. - Options are "table" (default) or "plot". If "table", a summary table is - generated showing the count and percentage of missing values for each column. - If "plot", a bar plot will be displayed showing the missing values per column - as a percentage of the total. - - Returns: - ------- - None: Displays either a table or a plot, depending on the `summarize_by` argument. - - Notes: - ------ - - This function only summarizes missing values; it does not handle the imputation or removal of them. - - The missing values are defined as NaN, None, or similar missing indicators. - - Example: - ------- - >>> summarize_missing_values(dataset=df, summarize_by="table") - """ - pass \ No newline at end of file diff --git a/src/summarease/summarize_numeric.py b/src/summarease/summarize_numeric.py index 0f769bf..9289376 100644 --- a/src/summarease/summarize_numeric.py +++ b/src/summarease/summarize_numeric.py @@ -39,6 +39,10 @@ def plot_correlation_heatmap(dataset_numeric): corr_melted = corr.reset_index().melt(id_vars='index') corr_melted.columns = ['Var1', 'Var2', 'Correlation'] + # Round the correlation values to 2 decimal places + corr_melted['Correlation'] = corr_melted['Correlation'].round(2) + + # Create the heatmap with correlation values heatmap = alt.Chart(corr_melted).mark_rect().encode( x='Var1:N', y='Var2:N', @@ -49,7 +53,15 @@ def plot_correlation_heatmap(dataset_numeric): height=400 ) - return heatmap + # Add correlation value labels on the heatmap cells + text = alt.Chart(corr_melted).mark_text(dy=-5).encode( + x='Var1:N', + y='Var2:N', + text='Correlation:Q' + ) + + # Overlay the text on top of the heatmap + return heatmap + text def summarize_numeric(dataset, summarize_by="table"): @@ -91,31 +103,22 @@ def summarize_numeric(dataset, summarize_by="table"): summarize_by = summarize_by.lower() assert summarize_by in {"table", "plot"}, f"Argument 'summarize_by' should be one of the following options: [table, plot]! You have {summarize_by}." - - numeric_columns = dataset.select_dtypes(include='number').columns + # Select the numeric columns from the dataset + dataset_numeric = dataset.select_dtypes(include=['number']) - if numeric_columns.empty: # Check if there are no numeric columns - print("No numeric columns found in the dataset.") + if dataset_numeric.empty: return - - outputs = {} - - if summarize_by == "table": - # Generate summary statistics for numeric columns - summary = dataset[numeric_columns].describe() - print(summary) - elif summarize_by == "plot": - # Generate a correlation heatmap for numeric columns - if len(dataset) < 2: - print("Insufficient data for meaningful plots.") - return {} + outputs = {} - numeric_data = dataset[numeric_columns] - outputs["numeric_plot"] = plot_numeric_density(numeric_data) + if (summarize_by == "plot"): + outputs["numeric_plot"] = plot_numeric_density(dataset_numeric) - if len(numeric_columns) > 1: - outputs["corr_plot"] = plot_correlation_heatmap(numeric_data) + if (dataset_numeric.shape[1] > 1): + outputs["corr_plot"] = plot_correlation_heatmap(dataset_numeric) + elif (summarize_by == "table"): + outputs["numeric_describe"] = dataset_numeric.describe() + return outputs \ No newline at end of file diff --git a/src/summarease/summarize_outliers.py b/src/summarease/summarize_outliers.py deleted file mode 100644 index 31556f3..0000000 --- a/src/summarease/summarize_outliers.py +++ /dev/null @@ -1,33 +0,0 @@ -def summarize_outliers(dataset: pd.Dataframe, columns: list = None, - summarize_by: str = "table"): - """Check and summarize the outliers by Z-scores in specified numeric columns of a table. - - Parameters - ---------- - dataset_name : DataFrame - The input dataset containing numeric column(s) for outlier check. - columns : list of str, optional - A list of column names to check the outliers. - Default is None. - If none, all numeric columns in the table will be checked. - summarize_by : str, within {"table", "plot"}, optional - The method to summarize the outliers: - "table": Default, returns a summary table showing outliers for each column. - "plot": Visualizes a boxplot to display outliers. - - Returns - ------- - dict or None - If summarize_by="table", returns a dictionary. - Keys: column names; values: lists of indices of rows containing outliers. - If summarize_by="plot", returns None. A plot will be displayed. - - Notes: - ----- - Outliers are idenfied with a absolute value of z-score greater than 3. - - Examples - -------- - >>> summarize_outliers(data, summarize_by="table") - """ - pass \ No newline at end of file