Skip to content

Commit

Permalink
only call parsers once (#1898)
Browse files: browse the repository at this point in the history.
* only call parsers once

Signed-off-by: cosmicBboy <[email protected]>

* fix dask test error

Signed-off-by: cosmicBboy <[email protected]>

---------

Signed-off-by: cosmicBboy <[email protected]>
  • Loading branch information
cosmicBboy authored Jan 22, 2025
1 parent a832172 commit 33fe68b
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 13 deletions.
1 change: 1 addition & 0 deletions pandera/api/pandas/types.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -108,6 +108,7 @@ def register_geopandas_backend():
register_fn = {
"pandas": register_pandas_backend,
"dask_expr": register_dask_backend,
"dask": register_dask_backend,
"modin": register_modin_backend,
"pyspark": register_pyspark_backend,
"geopandas": register_geopandas_backend,
Expand Down
28 changes: 15 additions & 13 deletions pandera/backends/pandas/components.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -119,27 +119,29 @@ def validate_column(check_obj, column_name, return_check_obj=False):
except SchemaErrors as exc:
error_handler.collect_errors(exc.schema_errors)

if schema.parsers:
for parser_index, parser in enumerate(schema.parsers):
check_obj[column_name] = self.run_parser(
check_obj[column_name],
parser,
parser_index,
).parser_output

if is_table(check_obj[column_name]):
for i in range(check_obj[column_name].shape[1]):
validate_column(
check_obj[column_name].iloc[:, [i]], column_name
validated_column = validate_column(
check_obj[column_name].iloc[:, [i]],
column_name,
return_check_obj=True,
)
if schema.parsers:
check_obj[column_name] = validated_column
else:
if getattr(schema, "drop_invalid_rows", False):
# replace the check_obj with the validated check_obj
# replace the check_obj with the validated
check_obj = validate_column(
check_obj, column_name, return_check_obj=True
)
else:
validate_column(check_obj, column_name)

validated_column = validate_column(
check_obj,
column_name,
return_check_obj=True,
)
if schema.parsers:
check_obj[column_name] = validated_column

if lazy and error_handler.collected_errors:
raise SchemaErrors(
Expand Down
19 changes: 19 additions & 0 deletions tests/core/test_parsers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -94,3 +94,22 @@ def int_column_lt_100(cls, series: pd.Series):
)
with pytest.raises(pa.errors.SchemaInitError, match=err_msg):
Schema.to_schema()


def test_parser_called_once():
    """Check that a column parser runs exactly once per validation call."""
    frame = pd.DataFrame({"col": [2.0, 4.0, 9.0]})

    # Each parser invocation appends one entry, so the list length is the
    # number of times the parser was called.
    invocations = []

    class DFModel(pa.DataFrameModel):
        col: float

        @pa.parser("col")
        @classmethod
        def negate(cls, series):
            # Record the call, then flip the sign of every value.
            invocations.append(1)
            return -series

    DFModel.validate(frame)

    assert len(invocations) == 1

0 comments on commit 33fe68b

Please sign in to comment.