Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support config files #169

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ Do not strip the output ::

nbstripout --keep-output

Do not reassign the cell ids to be sequential ::

nbstripout --keep-id

To mark special cells so that the output is not stripped, you can either:

1. Set the ``keep_output`` tag on the cell. To do this, enable the tags
Expand Down Expand Up @@ -311,8 +315,9 @@ Stripping metadata

The following metadata is stripped by default:

* Notebook metadata: ``signature``, ``widgets``
* Cell metadata: ``ExecuteTime``, ``collapsed``, ``execution``, ``scrolled``
* Notebook metadata: ``signature``, ``widgets``
* Cell metadata: ``ExecuteTime``, ``collapsed``, ``execution``,
``heading_collapsed``, ``hidden``, ``scrolled``

Additional metadata to be stripped can be configured via either

Expand All @@ -337,6 +342,10 @@ Additional metadata to be stripped can be configured via either

--extra-keys "metadata.celltoolbar cell.metadata.heading_collapsed"

Note: Only notebook and cell metadata is currently supported and every key
specified via ``filter.nbstripout.extrakeys`` or ``--extra-keys`` must start
with ``metadata.`` for notebook and ``cell.metadata.`` for cell metadata.

You can keep certain metadata with either

* ``git config (--global/--system) filter.nbstripout.keepmetadatakeys``, e.g. ::
Expand Down
23 changes: 16 additions & 7 deletions nbstripout/_nbstripout.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@
*.ipynb diff=ipynb
"""

from argparse import ArgumentParser, RawDescriptionHelpFormatter
from argparse import ArgumentParser, Namespace, RawDescriptionHelpFormatter
import collections
import io
import json
Expand All @@ -120,7 +120,7 @@
import sys
import warnings

from nbstripout._utils import strip_output, strip_zeppelin_output
from nbstripout._utils import strip_output, strip_zeppelin_output, merge_configuration_file
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from nbstripout._utils import strip_output, strip_zeppelin_output, merge_configuration_file
from nbstripout._utils import strip_output, strip_zeppelin_output, merge_configuration_files

try:
# Jupyter >= 4
from nbformat import read, write, NO_CONVERT
Expand Down Expand Up @@ -351,7 +351,7 @@ def status(git_config, install_location=INSTALL_LOCATION_LOCAL, verbose=False):
return 1


def main():
def setup_commandline() -> Namespace:
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is no longer returning a Namespace but a parser, right?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should probably be called get_parser or sth like that.

parser = ArgumentParser(epilog=__doc__, formatter_class=RawDescriptionHelpFormatter)
task = parser.add_mutually_exclusive_group()
task.add_argument('--dry-run', action='store_true',
Expand All @@ -373,14 +373,17 @@ def main():
help='Do not strip the execution count/prompt number')
parser.add_argument('--keep-output', action='store_true',
help='Do not strip output', default=None)
parser.add_argument('--keep-id', action='store_true',
help='Keep the randomly generated cell ids, '
'which will be different after each execution.')
parser.add_argument('--extra-keys', default='',
help='Space separated list of extra keys to strip '
'from metadata, e.g. metadata.foo cell.metadata.bar')
parser.add_argument('--keep-metadata-keys', default='',
help='Space separated list of metadata keys to keep'
', e.g. metadata.foo cell.metadata.bar')
parser.add_argument('--drop-empty-cells', action='store_true',
help='Remove cells where `source` is empty or contains only whitepace')
help='Remove cells where `source` is empty or contains only whitespace')
parser.add_argument('--drop-tagged-cells', default='',
help='Space separated list of cell-tags that remove an entire cell')
parser.add_argument('--strip-init-cells', action='store_true',
Expand Down Expand Up @@ -408,7 +411,13 @@ def main():
help='Prints stripped files to STDOUT')

parser.add_argument('files', nargs='*', help='Files to strip output from')
args = parser.parse_args()

return parser


def main():
parser = setup_commandline()
args = merge_configuration_file(parser)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
args = merge_configuration_file(parser)
args = merge_configuration_files(parser)


git_config = ['git', 'config']

Expand Down Expand Up @@ -487,7 +496,7 @@ def main():
warnings.simplefilter("ignore", category=UserWarning)
nb = read(f, as_version=NO_CONVERT)

nb = strip_output(nb, args.keep_output, args.keep_count, extra_keys, args.drop_empty_cells,
nb = strip_output(nb, args.keep_output, args.keep_count, args.keep_id, extra_keys, args.drop_empty_cells,
args.drop_tagged_cells.split(), args.strip_init_cells, _parse_size(args.max_size))

if args.dry_run:
Expand Down Expand Up @@ -533,7 +542,7 @@ def main():
warnings.simplefilter("ignore", category=UserWarning)
nb = read(input_stream, as_version=NO_CONVERT)

nb = strip_output(nb, args.keep_output, args.keep_count, extra_keys, args.drop_empty_cells,
nb = strip_output(nb, args.keep_output, args.keep_count, args.keep_id, extra_keys, args.drop_empty_cells,
args.drop_tagged_cells.split(), args.strip_init_cells, _parse_size(args.max_size))

if args.dry_run:
Expand Down
117 changes: 113 additions & 4 deletions nbstripout/_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
from collections import defaultdict
from argparse import ArgumentParser, Namespace, _StoreTrueAction, _StoreFalseAction
import os
import sys
from collections import defaultdict
from functools import partial
from typing import Any, Dict, Optional

__all__ = ["pop_recursive", "strip_output", "strip_zeppelin_output", "MetadataError"]
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please keep imports sorted


Expand Down Expand Up @@ -94,7 +98,7 @@ def strip_zeppelin_output(nb):
return nb


def strip_output(nb, keep_output, keep_count, extra_keys=[], drop_empty_cells=False, drop_tagged_cells=[],
def strip_output(nb, keep_output, keep_count, keep_id, extra_keys=[], drop_empty_cells=False, drop_tagged_cells=[],
strip_init_cells=False, max_size=0):
"""
Strip the outputs, execution count/prompt number and miscellaneous
Expand Down Expand Up @@ -124,7 +128,7 @@ def strip_output(nb, keep_output, keep_count, extra_keys=[], drop_empty_cells=Fa
for tag_to_drop in drop_tagged_cells:
conditionals.append(lambda c: tag_to_drop not in c.get("metadata", {}).get("tags", []))

for cell in _cells(nb, conditionals):
for i, cell in enumerate(_cells(nb, conditionals)):
keep_output_this_cell = determine_keep_output(cell, keep_output, strip_init_cells)

# Remove the outputs, unless directed otherwise
Expand All @@ -148,7 +152,112 @@ def strip_output(nb, keep_output, keep_count, extra_keys=[], drop_empty_cells=Fa
cell['prompt_number'] = None
if 'execution_count' in cell and not keep_count:
cell['execution_count'] = None

# Replace the cell id with an incremental value that will be consistent across runs
if 'id' in cell and not keep_id:
cell['id'] = str(i)
for field in keys['cell']:
pop_recursive(cell, field)
return nb


def process_pyproject_toml(toml_file_path: str, parser: ArgumentParser) -> Optional[Dict[str, Any]]:
"""Extract config mapping from pyproject.toml file."""
try:
import tomllib # python 3.11+
except ModuleNotFoundError:
import tomli as tomllib

with open(toml_file_path, 'rb') as f:
dict_config = tomllib.load(f).get('tool', {}).get('nbstripout', None)

if not dict_config:
# could be {} if 'tool' not found, or None if 'nbstripout' not found
return dict_config

# special processing of boolean options, make sure we don't have invalid types
for a in parser._actions:
if a.dest in dict_config and isinstance(a, (_StoreTrueAction, _StoreFalseAction)):
if not isinstance(dict_config[a.dest], bool):
Comment on lines +178 to +179
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big fan of reaching into the internals of ArgumentParser but also don't have a great idea what to do instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I thought that this would still be more robust than keeping an explicit list of Boolean arguments, although that wouldn't be impossible. I looked at a range of options, including shifting to an alternative arguments library, like Click, but that all got too complicated.
I think that this is a reasonable compromise if we keep on top of adding new Python versions to the test suite whenever they become available.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, agree

raise ValueError(f'Argument {a.dest} in pyproject.toml must be a boolean, not {dict_config[a.dest]}')

return dict_config


def process_setup_cfg(cfg_file_path, parser: ArgumentParser) -> Optional[Dict[str, Any]]:
"""Extract config mapping from setup.cfg file."""
import configparser

reader = configparser.ConfigParser()
reader.read(cfg_file_path)
if not reader.has_section('nbstripout'):
return None

raw_config = reader['nbstripout']
dict_config = dict(raw_config)

# special processing of boolean options, to convert various configparser bool types to true/false
for a in parser._actions:
if a.dest in raw_config and isinstance(a, (_StoreTrueAction, _StoreFalseAction)):
dict_config[a.dest] = raw_config.getboolean(a.dest)
Comment on lines +199 to +200
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto


return dict_config


def merge_configuration_files(parser: ArgumentParser, args_str=None) -> Namespace:
"""Merge flags from config files into args."""
CONFIG_FILES = {
'pyproject.toml': partial(process_pyproject_toml, parser=parser),
'setup.cfg': partial(process_setup_cfg, parser=parser),
}

# parse args as-is to look for configuration files
args = parser.parse_args(args_str)

# Traverse the file tree common to all files given as argument looking for
# a configuration file
# TODO: make this more like Black:
# By default Black looks for pyproject.toml starting from the common base directory of all files and
# directories passed on the command line. If it’s not there, it looks in parent directories. It stops looking
# when it finds the file, or a .git directory, or a .hg directory, or the root of the file system, whichever
# comes first.
# if no files are given, start from cwd
config_path = os.path.commonpath([os.path.abspath(file) for file in args.files]) if args.files else os.path.abspath(os.getcwd())
config: Optional[Dict[str, Any]] = None
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line gets a bit long with a ternary

while True:
for config_file, processor in CONFIG_FILES.items():
config_file_path = os.path.join(config_path, config_file)
if os.path.isfile(config_file_path):
config = processor(config_file_path)
if config is not None:
break
if config is not None:
break
config_path, tail = os.path.split(config_path)
if not tail:
break

# black starts with default arguments (from click), updates that with the config file,
# then applies command line arguments. this all happens in the click framework, before main() is called
# we can use parser.set_defaults
if config:
# check all arguments are valid
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hadn't picked up on this in my first round review: The way you implemented this, config file settings take precedence over command line arguments, however it should be the other way around.

I realise this will be a bit harder to implement. You could turn the Namespace returned by argparse into a dict and merge that into the dict you get from reading the config files, where the argparse dict overwrites.

What makes this even more annoying is that you don't know if a value has been set by the caller or is simply the default :( I may need to look around for an idiomatic way to implement this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry about the delay. Have to prioritize something else atm. Hope to come back to this at some point but if anyone sees fit, feel free to take over.

valid_args = vars(args).keys()
for name in config.keys():
if name not in valid_args:
raise ValueError(f'{name} in the config file is not a valid option')

# separate into default-overrides and special treatment
extra_keys: Optional[str] = None
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a brief comment why extra_keys needs special treatment.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% sure that it does. I left this with the same functionality that @janosh did.
This special treatment lets you specify a base set of extra keys in the config files, and then add to that in an ad-hoc manner, but does not let you remove extra keys....
Alternative treatment would be that whenever you specify extra keys you would have to list the complete set.
The more I think about it, the more I think that perhaps special treatment is NOT justified.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think that's a fair point and treating it specially may lead to surprising results.

@janosh can you explain your rationale? Do you agree that special treatment of this flag may be surprising?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't remember for sure at this point, but I think it might be left over from the original autoflake8 config file PR. I agree minimal user surprise is usually the way to go.

if 'extra_keys' in config:
extra_keys = config['extra_keys']
del config['extra_keys']

# merge the configuration options as new defaults, and re-parse the arguments
parser.set_defaults(**config)
args = parser.parse_args(args_str)

# merge extra_keys using set union
if extra_keys:
args.extra_keys = ' '.join(sorted(set(extra_keys.split()) | set(args.extra_keys.split())))

return args
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
"Intended Audience :: Developers",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Topic :: Software Development :: Version Control",
])
90 changes: 90 additions & 0 deletions tests/e2e_notebooks/test_max_size.ipynb.expected_sequential_id
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0",
"metadata": {},
"source": [
"This notebook tests that outputs can be cleared based on size"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"aaaaaaaaaa\n"
]
}
],
"source": [
"print(\"a\"*10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2",
"metadata": {},
"outputs": [],
"source": [
"print(\"a\"*100)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.4"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading