almost final changes 2

UBC-MDS · Jan 19, 2025 · 2fe8f6a
1 parent 6d4e5c1
commit 2fe8f6a
Show file tree

Hide file tree

Showing 9 changed files with 63 additions and 196 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/summarease/__init__.py b/src/summarease/__init__.py
@@ -1,3 +1,3 @@
 # read version from installed package
 from importlib.metadata import version
-__version__ = version= ("summarease")
+__version__ = version("summarease")
diff --git a/src/summarease/clean_data.py b/src/summarease/clean_data.py
diff --git a/src/summarease/plot_correlation_heatmap.py b/src/summarease/plot_correlation_heatmap.py
diff --git a/src/summarease/summarize.py b/src/summarease/summarize.py
@@ -3,7 +3,8 @@
 import fpdf
 from pathlib import Path
 from summarize_numeric import summarize_numeric
-from summarize_target import summarize_target_df
+from summarize_target import summarize_target_df, summarize_target_balance_plot
+from summarize_dtypes import summarize_dtypes_table
 from PIL import Image
 
 
@@ -204,8 +205,10 @@ def add_table(pdf, table, pdf_height, pdf_width, element_padding=15):
     return pdf
 
 def switch_page_if_needed(pdf):
-    if pdf.get_y() > 30:
+    assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {pdf}"
+    if pdf.get_y() > 50:
         pdf.add_page()
+        print("New page created before the header.")
     return pdf
 
 def summarize(dataset: pd.DataFrame,
@@ -348,7 +351,7 @@ def summarize(dataset: pd.DataFrame,
     page_width = pdf.w
     page_height = pdf.h
 
-    element_padding = 15
+    element_padding = 10
     text_line_padding = 10
 
     # Set the font to Helvetica, set the size, write the title
@@ -370,15 +373,30 @@ def summarize(dataset: pd.DataFrame,
 
     if summarize_by == "plot":
         summarized_numeric_output = summarize_numeric(dataset, summarize_by="plot")
-        for key, item in summarized_numeric_output.items():
-            plot_file = plot_output_path / f'{key}.png'
-            str_plot_file = str(plot_file)
-            item.save(plot_file)
-            pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=15)
+        if summarized_numeric_output:
+            for key, item in summarized_numeric_output.items():
+                plot_file = plot_output_path / f'{key}.png'
+                str_plot_file = str(plot_file)
+                item.save(plot_file)
+                pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=10)
+
+        if target_variable is not None:
+            pdf = switch_page_if_needed(pdf)
+            pdf.set_font("Helvetica", size=13)
+            pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Target Variable Summary", ln=True, align='C')
+            pdf.set_font("Helvetica", size=11)
+            pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L')
+            summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
+            summarized_target_plot = summarize_target_balance_plot(summarized_target_output)
+            target_plot_file = plot_output_path / "target_plot.png"
+            summarized_target_plot.save(target_plot_file)
+            pdf = add_image(pdf, target_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=0)
 
     elif summarize_by == "table":
         summarized_numeric_output = summarize_numeric(dataset, summarize_by="table")
-        pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15)
+        if summarized_numeric_output:
+            pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15)
+
         if target_variable is not None:
             pdf = switch_page_if_needed(pdf)
             summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
@@ -388,7 +406,10 @@ def summarize(dataset: pd.DataFrame,
             pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L')
             pdf = add_table(pdf, table = summarized_target_output, pdf_height=page_height, pdf_width=page_width, element_padding=15)
 
-
+    summarized_dtypes_table = summarize_dtypes_table(dataset)
+    pdf.set_font("Helvetica", size=13)
+    pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Dataset Data Types Summary", ln=True, align='C')
+    pdf = add_table(pdf, table = summarized_dtypes_table, pdf_height=page_height, pdf_width=page_width, element_padding=15)
 
     pdf.output(output_path)
     assert output_path.exists(), "Something went wrong... The PDF output was not saved."

diff --git a/src/summarease/summarize_categorical.py b/src/summarease/summarize_categorical.py
diff --git a/src/summarease/summarize_missing_values.py b/src/summarease/summarize_missing_values.py
diff --git a/src/summarease/summarize_numeric.py b/src/summarease/summarize_numeric.py
@@ -39,6 +39,10 @@ def plot_correlation_heatmap(dataset_numeric):
     corr_melted = corr.reset_index().melt(id_vars='index')
     corr_melted.columns = ['Var1', 'Var2', 'Correlation']
 
+    # Round the correlation values to 2 decimal places
+    corr_melted['Correlation'] = corr_melted['Correlation'].round(2)
+
+    # Create the heatmap with correlation values
     heatmap = alt.Chart(corr_melted).mark_rect().encode(
         x='Var1:N',
         y='Var2:N',
@@ -49,7 +53,15 @@ def plot_correlation_heatmap(dataset_numeric):
         height=400
     )
 
-    return heatmap
+    # Add correlation value labels on the heatmap cells
+    text = alt.Chart(corr_melted).mark_text(dy=-5).encode(
+        x='Var1:N',
+        y='Var2:N',
+        text='Correlation:Q'
+    )
+
+    # Overlay the text on top of the heatmap
+    return heatmap + text
 
 def summarize_numeric(dataset, summarize_by="table"):
 
@@ -91,31 +103,22 @@ def summarize_numeric(dataset, summarize_by="table"):
     summarize_by = summarize_by.lower()
 
     assert summarize_by in {"table", "plot"}, f"Argument 'summarize_by' should be one of the following options: [table, plot]! You have {summarize_by}."
-
 
-    numeric_columns = dataset.select_dtypes(include='number').columns
+    # Select the numeric columns from the dataset
+    dataset_numeric = dataset.select_dtypes(include=['number'])
 
-    if numeric_columns.empty:  # Check if there are no numeric columns
-        print("No numeric columns found in the dataset.")
+    if dataset_numeric.empty:
         return
-
-    outputs = {}
-
-    if summarize_by == "table":
-        # Generate summary statistics for numeric columns
-        summary = dataset[numeric_columns].describe()
-        print(summary)
 
-    elif summarize_by == "plot":
-        # Generate a correlation heatmap for numeric columns
-        if len(dataset) < 2:
-            print("Insufficient data for meaningful plots.")
-            return {}
+    outputs = {}
 
-        numeric_data = dataset[numeric_columns]
-        outputs["numeric_plot"] = plot_numeric_density(numeric_data)
+    if (summarize_by == "plot"):
+        outputs["numeric_plot"] = plot_numeric_density(dataset_numeric)
 
-        if len(numeric_columns) > 1:
-            outputs["corr_plot"] = plot_correlation_heatmap(numeric_data)
+        if (dataset_numeric.shape[1] > 1):
+            outputs["corr_plot"] = plot_correlation_heatmap(dataset_numeric)
 
+    elif (summarize_by == "table"):
+        outputs["numeric_describe"] = dataset_numeric.describe()
+
     return outputs
diff --git a/src/summarease/summarize_outliers.py b/src/summarease/summarize_outliers.py