Skip to content

Commit

Permalink
almost final changes 2
Browse files: browse the repository at this point in the history
  • Loading branch information
HrayrMuradyan committed Jan 19, 2025
1 parent 6d4e5c1 commit 2fe8f6a
Show file tree
Hide file tree
Showing 9 changed files with 63 additions and 196 deletions.
7 changes: 7 additions & 0 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion src/summarease/__init__.py
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
# read version from installed package
from importlib.metadata import version
__version__ = version= ("summarease")
__version__ = version("summarease")
41 changes: 0 additions & 41 deletions src/summarease/clean_data.py

This file was deleted.

28 changes: 0 additions & 28 deletions src/summarease/plot_correlation_heatmap.py

This file was deleted.

41 changes: 31 additions & 10 deletions src/summarease/summarize.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,8 @@
import fpdf
from pathlib import Path
from summarize_numeric import summarize_numeric
from summarize_target import summarize_target_df
from summarize_target import summarize_target_df, summarize_target_balance_plot
from summarize_dtypes import summarize_dtypes_table
from PIL import Image


Expand Down Expand Up @@ -204,8 +205,10 @@ def add_table(pdf, table, pdf_height, pdf_width, element_padding=15):
return pdf

def switch_page_if_needed(pdf):
if pdf.get_y() > 30:
assert isinstance(pdf, FPDF), f"Argument 'pdf' should be FPDF class. You have {pdf}"
if pdf.get_y() > 50:
pdf.add_page()
print("New page created before the header.")
return pdf

def summarize(dataset: pd.DataFrame,
Expand Down Expand Up @@ -348,7 +351,7 @@ def summarize(dataset: pd.DataFrame,
page_width = pdf.w
page_height = pdf.h

element_padding = 15
element_padding = 10
text_line_padding = 10

# Set the font to Helvetica, set the size, write the title
Expand All @@ -370,15 +373,30 @@ def summarize(dataset: pd.DataFrame,

if summarize_by == "plot":
summarized_numeric_output = summarize_numeric(dataset, summarize_by="plot")
for key, item in summarized_numeric_output.items():
plot_file = plot_output_path / f'{key}.png'
str_plot_file = str(plot_file)
item.save(plot_file)
pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=15)
if summarized_numeric_output:
for key, item in summarized_numeric_output.items():
plot_file = plot_output_path / f'{key}.png'
str_plot_file = str(plot_file)
item.save(plot_file)
pdf = add_image(pdf, image_path=str_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=10)

if target_variable is not None:
pdf = switch_page_if_needed(pdf)
pdf.set_font("Helvetica", size=13)
pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Target Variable Summary", ln=True, align='C')
pdf.set_font("Helvetica", size=11)
pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L')
summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
summarized_target_plot = summarize_target_balance_plot(summarized_target_output)
target_plot_file = plot_output_path / "target_plot.png"
summarized_target_plot.save(target_plot_file)
pdf = add_image(pdf, target_plot_file, pdf_height=page_height, pdf_width=page_width, element_padding=0)

elif summarize_by == "table":
summarized_numeric_output = summarize_numeric(dataset, summarize_by="table")
pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15)
if summarized_numeric_output:
pdf = add_table(pdf, table = summarized_numeric_output["numeric_describe"], pdf_height=page_height, pdf_width=page_width, element_padding=15)

if target_variable is not None:
pdf = switch_page_if_needed(pdf)
summarized_target_output = summarize_target_df(dataset, target_variable, target_type)
Expand All @@ -388,7 +406,10 @@ def summarize(dataset: pd.DataFrame,
pdf.multi_cell(page_width - 2 * pdf.l_margin, text_line_padding, txt=f"Target variable is a {target_type} variable. Please find the information about the target variable below:", align='L')
pdf = add_table(pdf, table = summarized_target_output, pdf_height=page_height, pdf_width=page_width, element_padding=15)


summarized_dtypes_table = summarize_dtypes_table(dataset)
pdf.set_font("Helvetica", size=13)
pdf.cell(page_width - 2 * pdf.l_margin, element_padding, txt="Dataset Data Types Summary", ln=True, align='C')
pdf = add_table(pdf, table = summarized_dtypes_table, pdf_height=page_height, pdf_width=page_width, element_padding=15)

pdf.output(output_path)
assert output_path.exists(), "Something went wrong... The PDF output was not saved."
Expand Down
31 changes: 0 additions & 31 deletions src/summarease/summarize_categorical.py

This file was deleted.

31 changes: 0 additions & 31 deletions src/summarease/summarize_missing_values.py

This file was deleted.

45 changes: 24 additions & 21 deletions src/summarease/summarize_numeric.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -39,6 +39,10 @@ def plot_correlation_heatmap(dataset_numeric):
corr_melted = corr.reset_index().melt(id_vars='index')
corr_melted.columns = ['Var1', 'Var2', 'Correlation']

# Round the correlation values to 2 decimal places
corr_melted['Correlation'] = corr_melted['Correlation'].round(2)

# Create the heatmap with correlation values
heatmap = alt.Chart(corr_melted).mark_rect().encode(
x='Var1:N',
y='Var2:N',
Expand All @@ -49,7 +53,15 @@ def plot_correlation_heatmap(dataset_numeric):
height=400
)

return heatmap
# Add correlation value labels on the heatmap cells
text = alt.Chart(corr_melted).mark_text(dy=-5).encode(
x='Var1:N',
y='Var2:N',
text='Correlation:Q'
)

# Overlay the text on top of the heatmap
return heatmap + text

def summarize_numeric(dataset, summarize_by="table"):

Expand Down Expand Up @@ -91,31 +103,22 @@ def summarize_numeric(dataset, summarize_by="table"):
summarize_by = summarize_by.lower()

assert summarize_by in {"table", "plot"}, f"Argument 'summarize_by' should be one of the following options: [table, plot]! You have {summarize_by}."


numeric_columns = dataset.select_dtypes(include='number').columns
# Select the numeric columns from the dataset
dataset_numeric = dataset.select_dtypes(include=['number'])

if numeric_columns.empty: # Check if there are no numeric columns
print("No numeric columns found in the dataset.")
if dataset_numeric.empty:
return

outputs = {}

if summarize_by == "table":
# Generate summary statistics for numeric columns
summary = dataset[numeric_columns].describe()
print(summary)

elif summarize_by == "plot":
# Generate a correlation heatmap for numeric columns
if len(dataset) < 2:
print("Insufficient data for meaningful plots.")
return {}
outputs = {}

numeric_data = dataset[numeric_columns]
outputs["numeric_plot"] = plot_numeric_density(numeric_data)
if (summarize_by == "plot"):
outputs["numeric_plot"] = plot_numeric_density(dataset_numeric)

if len(numeric_columns) > 1:
outputs["corr_plot"] = plot_correlation_heatmap(numeric_data)
if (dataset_numeric.shape[1] > 1):
outputs["corr_plot"] = plot_correlation_heatmap(dataset_numeric)

elif (summarize_by == "table"):
outputs["numeric_describe"] = dataset_numeric.describe()

return outputs
33 changes: 0 additions & 33 deletions src/summarease/summarize_outliers.py

This file was deleted.

0 comments on commit 2fe8f6a

Please sign in to comment.