Skip to content

Commit

Permalink
Merge pull request #40 from UBC-MDS/summarize_functions
Browse files Browse the repository at this point in the history
Summarize functions
  • Loading branch information
Green-zy authored Jan 18, 2025
2 parents 31678c7 + 16a9747 commit b7ab278
Show file tree
Hide file tree
Showing 5 changed files with 726 additions and 41 deletions.
303 changes: 302 additions & 1 deletion src/summarease/summarize.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,213 @@
import pandas as pd
from fpdf import FPDF
import fpdf
from pathlib import Path
from summarize_numeric import summarize_numeric
from summarize_target import summarize_target_df
from PIL import Image


def validate_or_create_path(path):
    """
    Ensure that the directory needed by `path` exists, creating it if necessary.

    A path that already exists (file or directory) is left untouched. For a path
    that does not exist yet, the function decides what it represents from its name:
    a path with a file suffix (e.g. ``out/summary.pdf``) is treated as a file and
    only its parent directory is created; a path without a suffix is treated as a
    directory and is created itself.

    Args:
        path (Path): The path to validate or create. Can represent a file or directory.

    Raises:
        TypeError: If the provided `path` is not an instance of `Path`.

    Notes:
        - `Path.is_file()` only returns True for files that already exist, so it
          cannot be used to recognise a file that is still to be written; the
          suffix is used instead. A not-yet-existing directory whose name contains
          a dot (e.g. ``results.v2``) is therefore treated as a file path.
        - `mkdir` is called with `parents=True` and `exist_ok=True`, so missing
          parent directories are created and no error is raised if the directory
          already exists.
        - The file itself is never created, only directories.
    """
    if not isinstance(path, Path):
        raise TypeError(f"Expected a Path object, got {type(path)}.")

    # Anything that already exists has all of its parent directories in place.
    if path.exists():
        return

    # Not on disk yet: a suffix means "file to be written", so only the parent
    # directory is needed; otherwise the path itself is the directory to create.
    target_dir = path.parent if path.suffix else path
    target_dir.mkdir(parents=True, exist_ok=True)


def add_image(pdf, image_path, pdf_height, pdf_width, element_padding=15):
    """
    Add an image to a PDF document below the current y-position.

    The image height is converted from pixels to millimetres (assuming 96 DPI) and
    compared with the usable page height (`pdf_height` minus twice the top margin).
    If the image does not fit below the current y-position a new page is added
    first. The image is drawn at the left margin with a width equal to the usable
    page width, reduced by a scale factor when the image is taller than the page;
    FPDF keeps the aspect ratio because only the width is given.

    Args:
        pdf: A FPDF object representing the PDF document to which the image will be added.
        image_path (str or Path): The file path of the image. The extension must be one of
            .jpg, .jpeg, .png, .gif, .bmp, .tiff or .webp (case-insensitive).
        pdf_height (int or float): The total height of the PDF page in FPDF units.
        pdf_width (int or float): The total width of the PDF page in FPDF units.
        element_padding (int, optional): Vertical padding added above the image and to the
            line break that follows it. Default is 15.

    Returns:
        pdf: The same FPDF object, with the image added.

    Raises:
        AssertionError: If an argument has the wrong type or the image extension is
            not supported.
        ValueError: If `image_path` does not point to an existing file.
    """
    assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {type(pdf)}."
    assert isinstance(image_path, (Path, str)), f"Argument 'image_path' should be a Path class or string. You have {type(image_path)}."
    assert isinstance(pdf_height, (int, float)), f"Argument 'pdf_height' should be an integer or float. You have {type(pdf_height)}."
    # Check pdf_width itself (the float case used to test pdf_height by mistake).
    assert isinstance(pdf_width, (int, float)), f"Argument 'pdf_width' should be an integer or float. You have {type(pdf_width)}."
    assert isinstance(element_padding, int), f"Argument 'element_padding' should be an integer. You have {type(element_padding)}."

    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
    image_path = Path(image_path)
    # Compare the lower-cased suffix so that e.g. "plot.PNG" is accepted.
    assert image_path.suffix.lower() in image_extensions, f"Unsupported image format. Should be {image_extensions}"
    image_path_str = str(image_path)

    if not image_path.is_file():
        raise ValueError(f"File not found: {image_path_str}")

    y_position = pdf.get_y()  # y-position before anything is added
    page_height = pdf_height - 2 * pdf.t_margin

    # Only the pixel height is needed; convert it to millimetres at 96 DPI.
    with Image.open(image_path_str) as img:
        _, image_height = img.size
    dpi = 96
    element_height_mm = image_height / dpi * 25.4

    # Shrink images that are taller than the usable page height.
    if element_height_mm > page_height:
        scale_factor = page_height / element_height_mm
    else:
        scale_factor = 1

    if y_position + element_height_mm > page_height:
        print("Switching page before plotting", image_path_str)
        pdf.add_page()

    print("Y position before adding an image", pdf.get_y())
    # NOTE(review): this line break advances by the current y value, i.e. it doubles
    # the y-position before the image is placed. Kept as-is to preserve the existing
    # layout - confirm whether this is intended.
    pdf.ln(pdf.get_y())
    y_position = pdf.get_y()

    # Add the image to the PDF
    image_width_on_page = int(scale_factor * (pdf_width - 2 * pdf.l_margin))
    pdf.image(image_path_str, x=pdf.l_margin, y=y_position + element_padding, w=image_width_on_page)
    # Move the cursor past the image (uses the unscaled 96-DPI height).
    pdf.ln(element_height_mm + element_padding)
    print("Y position after adding an image", pdf.get_y())

    return pdf

def add_table(pdf, table, pdf_height, pdf_width, element_padding=15):
    """
    Add a table to the PDF document, one bordered cell per value.

    The DataFrame index is prepended as an 'Index' column, column widths are derived
    from the longest entry (or the column name) and scaled so that the table spans
    `pdf_width - 2 * element_padding`. The header row and the index column are drawn
    with a light gray background.

    Args:
        pdf: A FPDF object representing the PDF document to which the table will be added.
        table (pandas.DataFrame): The data to render. It is not modified; the index
            column is added to an internal copy.
        pdf_height (int or float): The total height of the PDF page in FPDF units.
            Validated but not otherwise used by this function.
        pdf_width (int or float): The total width of the PDF page in FPDF units.
        element_padding (int, optional): Horizontal padding subtracted on both sides when
            scaling the column widths. Default is 15.

    Returns:
        pdf: The same FPDF object, with the table added.

    Raises:
        AssertionError: If an argument has the wrong type or `table` is empty.
        ValueError: Raised by pandas if `table` already contains a column named 'Index'.

    Notes:
        - Column names longer than the cell allows are truncated with '...' and
          drawn with a smaller font (size 8 instead of 9).
        - Python `int`/`float` values (including subclasses) are rounded to
          2 decimal places.
    """
    assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {type(pdf)}."
    assert isinstance(table, pd.DataFrame), f"Argument 'table' should be a pandas Dataframe. You have {type(table)}."
    assert isinstance(pdf_height, (int, float)), f"Argument 'pdf_height' should be an integer or float. You have {type(pdf_height)}."
    # Check pdf_width itself (the float case used to test pdf_height by mistake).
    assert isinstance(pdf_width, (int, float)), f"Argument 'pdf_width' should be an integer or float. You have {type(pdf_width)}."
    assert isinstance(element_padding, int), f"Argument 'element_padding' should be an integer. You have {type(element_padding)}."
    assert not table.empty, f"The table shouldn't be empty"

    pdf.set_font('Arial', '', 9)

    # Work on a copy so the caller's DataFrame does not gain an 'Index' column
    # (which would also make a second call with the same frame fail).
    table = table.copy()

    # Insert index as a new column (at the start)
    table.insert(0, 'Index', table.index)

    # Calculate maximum column width based on the longest entry or the column name.
    # Labels are converted with str() so non-string column labels are supported.
    col_widths = []
    for col in table.columns:
        longest_entry = table[col].apply(lambda x: len(str(x))).max()
        max_length = max(longest_entry, len(str(col)))
        col_widths.append(max_length * 2)

    # Give the index column a minimum width of 20
    col_widths[0] = max(col_widths[0], 20)

    total_width = sum(col_widths)

    # Scale column widths to fit within the page width, maintaining the relative proportions
    scale_factor = (pdf_width - 2 * element_padding) / total_width
    col_widths = [w * scale_factor for w in col_widths]

    # Gray fill used for the header row and the index column
    pdf.set_fill_color(230, 230, 230)

    # Header row: every cell filled
    for width, col in zip(col_widths, table.columns):
        col_name = str(col)
        # Integer number of characters that fit in the cell
        max_length_for_col = int(width // 2)
        # Truncate names that do not fit and draw them slightly smaller
        if len(col_name) > max_length_for_col:
            col_name = col_name[:max_length_for_col] + '...'
            pdf.set_font('Arial', '', 8)
        pdf.cell(width, 10, col_name, border=1, align='C', fill=True)
        pdf.set_font('Arial', '', 9)
    pdf.ln()

    # Data rows: only the first (index) column is filled
    for i in range(len(table)):
        for j, col in enumerate(table.columns):
            value = table[col].iloc[i]
            # Round numeric values to 2 decimals
            if isinstance(value, (int, float)):
                value = round(value, 2)

            if j == 0:
                pdf.set_fill_color(230, 230, 230)
            pdf.cell(col_widths[j], 10, str(value), border=1, align='C', fill=(j == 0))
        pdf.ln()

    return pdf

def switch_page_if_needed(pdf, threshold=30):
    """
    Start a new page when the cursor has moved past `threshold` on the current page.

    Args:
        pdf: The PDF document object; it must provide `get_y()` and `add_page()`.
        threshold (int or float, optional): Vertical position (in the document's
            units) above which a new page is added. Default is 30, the value that
            was previously hard-coded.

    Returns:
        pdf: The same object that was passed in, possibly with a new page added.
    """
    if pdf.get_y() > threshold:
        pdf.add_page()
    return pdf

def summarize(dataset: pd.DataFrame,
dataset_name: str = "Dataset Summary",
description: str = "Dataset summary generated by summarease.",
Expand All @@ -7,6 +217,7 @@ def summarize(dataset: pd.DataFrame,
summarize_by: str = "mix",
auto_cleaning: bool = False,
target_variable: str = None,
target_type: str = "categorical",
output_file: str = "summary.pdf",
output_dir: str = "./summarease_summary/"
):
Expand Down Expand Up @@ -52,6 +263,9 @@ def summarize(dataset: pd.DataFrame,
target_variable : str, optional, default=None
The name of the target variable in the dataset. This helps in identifying the dependent variable for further analysis.
target_type : str, optional, default="categorical"
The type of the target variable. Must be one of {"categorical", "numerical"}.
output_file : str, optional, default="summary.pdf"
The name of the output file where the summary will be saved.
Expand Down Expand Up @@ -91,5 +305,92 @@ def summarize(dataset: pd.DataFrame,
# This will generate a summary of the `data` dataframe, display the first three observations,
# clean the dataset, and save the summary as 'employee_summary.pdf' in the default output directory.
"""
pass
assert isinstance(dataset, pd.DataFrame), f"Argument 'dataset' should be pandas dataframe (pd.DataFrame)! You have {type(dataset)}."
assert isinstance(dataset_name, str), f"Argument 'dataset_name' should be string (str)! You have {type(dataset_name)}."
assert isinstance(description, str), f"Argument 'description' should be string (str)! You have {type(description)}."
assert isinstance(show_observations, str), f"Argument 'show_observations' should be a string (str)! You have {type(show_observations)}."
assert isinstance(show_n_observations, int), f"Argument 'show_n_observations' should be an integer (int)! You have {type(show_n_observations)}."
assert isinstance(show_warnings, bool), f"Argument 'show_warnings' should be a boolean (bool)! You have {type(show_warnings)}."
assert isinstance(summarize_by, str), f"Argument 'summarize_by' should be a string (str)! You have {type(summarize_by)}."
assert isinstance(auto_cleaning, bool), f"Argument 'auto_cleaning' should be a boolean (bool)! You have {type(auto_cleaning)}."
if target_variable is not None:
assert isinstance(target_variable, str), f"Argument 'target_variable' should be a string (str)! You have {type(target_variable)}."
assert isinstance(target_type, str), f"Argument 'target_type' should be a string (str)! You have {type(target_type)}."
assert isinstance(output_file, str), f"Argument 'output_file' should be a string (str)! You have {type(output_file)}."
assert isinstance(output_dir, str), f"Argument 'output_dir' should be a string (str)! You have {type(output_dir)}."
assert show_observations in {"random", "head", "tail"}, f"Argument 'show_observations' should be one of the following options: [random, head, tail]! You have {show_observations}."

summarize_by = summarize_by.lower()
assert summarize_by in {"table", "plot", "mix"}, f"Argument 'summarize_by' should be one of the following options: [table, plot, mix]! You have {summarize_by}."

output_dir = Path(output_dir)
output_path = output_dir / output_file

assert (output_path.suffix == ".pdf") or (output_path.suffix == ""), f"The 'output_file' should either have a .pdf extension or no extension! You have {output_path.suffix}."

# If the path doesn't exist, create it
validate_or_create_path(output_dir)

if summarize_by in {"plot", "mix"}:
plot_output_path = output_dir / "img"
validate_or_create_path(plot_output_path)


dataset_shape = dataset.shape
assert (dataset_shape[1] >= 2 and dataset_shape[1] <= 15), f"The function currently supports dataframes having less than 15 columns and more than 2 columns! You have {dataset_shape[1]}"

# Create the PDF
pdf = FPDF()

# Add a new page
pdf.add_page()

page_width = pdf.w
page_height = pdf.h

element_padding = 15
text_line_padding = 10

# Set the font to Helvetica, set the size, write the title
pdf.set_font("Helvetica", size=15)
pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt=dataset_name, ln=True, align='C')

# Change the size for the description and write it
pdf.set_font("Helvetica", size=11)
pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=description, align='L')

pdf = switch_page_if_needed(pdf)
pdf.set_font("Helvetica", size=13)
pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Numeric Columns Summary", ln=True, align='C')

numeric_description = "Here are numeric columns"

pdf.set_font("Helvetica", size=11)
pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=numeric_description, align='L')

if summarize_by == "plot":
summarized_numeric_output = summarize_numeric(dataset, summarize_by="plot")
for key, item in summarized_numeric_output.items():
plot_file = plot_output_path / f'{key}.png'
str_plot_file = str(plot_file)
item.save(plot_file)
pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=15)

elif summarize_by == "table":
summarized_numeric_output = summarize_numeric(dataset, summarize_by="table")
pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15)
if target_variable is not None:
pdf = switch_page_if_needed(pdf)
summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
pdf.set_font("Helvetica", size=13)
pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Target Variable Summary", ln=True, align='C')
pdf.set_font("Helvetica", size=11)
pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L')
pdf = add_table(pdf, table = summarized_target_output, pdf_height=page_height, pdf_width=page_width, element_padding=15)



pdf.output(output_path)
assert output_path.exists(), "Something went wrong... The PDF output was not saved."
print("PDF created with FPDF!")

6 changes: 4 additions & 2 deletions src/summarease/summarize_numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ def plot_correlation_heatmap(dataset_numeric):

return heatmap

def summarize_numeric(dataset: pd.DataFrame, summarize_by: str = "table"):
def summarize_numeric(dataset, summarize_by="table"):

"""
Summarize the numeric variables in the dataset by providing the summary statistics (e.g., mean,
standard deviation, min, max, etc.) for each numeric column or plotting the correlation heatmap
Expand All @@ -70,7 +71,7 @@ def summarize_numeric(dataset: pd.DataFrame, summarize_by: str = "table"):
Returns:
-------
None: Displays either a table of summary statistics or a plot (correlation heatmap), depending on the
A table of summary statistics or a plot (correlation heatmap), depending on the
`summarize_by` argument.
Notes:
Expand All @@ -91,6 +92,7 @@ def summarize_numeric(dataset: pd.DataFrame, summarize_by: str = "table"):

assert summarize_by in {"table", "plot"}, f"Argument 'summarize_by' should be one of the following options: [table, plot]! You have {summarize_by}."


numeric_columns = dataset.select_dtypes(include='number').columns

if numeric_columns.empty: # Check if there are no numeric columns
Expand Down
5 changes: 4 additions & 1 deletion src/summarease/summarize_target.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
import pandas as pd
import altair as alt
import warnings

def summarize_target_df(dataset_name: pd.DataFrame, target_variable: str,
target_type: str, threshold=0.2):
"""Summarize and evaluate the target variable for categarical or numerical types.
Expand Down Expand Up @@ -163,7 +167,6 @@ def summarize_target_balance_plot(summary_df: pd.DataFrame):
y=alt.Y('expected_upper:Q')
)

# Combine all charts
balance_chart = (actual_dist + error_bar + lower_ticks + upper_ticks).properties(
width=600,
height=400,
Expand Down
Loading

0 comments on commit b7ab278

Please sign in to comment.