From 5e6167aa1a232eb3aa603332b503527123c345a4 Mon Sep 17 00:00:00 2001 From: Craig de Stigter Date: Wed, 13 Nov 2024 15:33:04 +1300 Subject: [PATCH] Use `orjson` for faster JSONL output refs #1018 When generating a large (2GB) diff as JSON-Lines this takes 20-30% less time than the stdlib. It may be possible to use this in other places, but note that orjson doesn't support streaming encoding (iterencode), which means it is of limited utility where we're trying to stream JSON diffs of huge datasets. This change uses it for individual features in JSONL diffs only where the lack of iterencode() isn't a concern. orjson is MIT licensed. --- kart/json_diff_writers.py | 14 ++++++++++---- kart/output_util.py | 25 ++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 5 deletions(-) diff --git a/kart/json_diff_writers.py b/kart/json_diff_writers.py index 48509fff1..0a928adad 100644 --- a/kart/json_diff_writers.py +++ b/kart/json_diff_writers.py @@ -1,4 +1,4 @@ -import json +import orjson import logging import threading from datetime import datetime, timedelta, timezone @@ -19,7 +19,11 @@ from kart.diff_structs import FILES_KEY, BINARY_FILE, DatasetDiff from kart.key_filters import DeltaFilter from kart.log import commit_obj_to_json -from kart.output_util import dump_json_output, resolve_output_path +from kart.output_util import ( + dump_json_output, + resolve_output_path, + orjson_encode_default, +) from kart.tabular.feature_output import feature_as_geojson, feature_as_json from kart.timestamps import datetime_to_iso8601_utc, timedelta_to_iso8601_tz @@ -241,9 +245,11 @@ def __init__(self, *args, diff_estimate_accuracy=None, delta_filter=None, **kwar self._output_lock = threading.RLock() def dump(self, obj): + output: bytes = orjson.dumps( + obj, default=orjson_encode_default, option=orjson.OPT_APPEND_NEWLINE + ) with self._output_lock: - json.dump(obj, self.fp, separators=self.separators) - self.fp.write("\n") + self.fp.buffer.write(output) def write_header(self): self.dump( diff --git a/kart/output_util.py b/kart/output_util.py index 78af8e7ad..b46b2f22e 100644 --- a/kart/output_util.py +++ b/kart/output_util.py @@ -8,6 +8,7 @@ import types from pathlib import Path +import orjson import pygments from pygments.lexers import JsonLexer @@ -15,10 +16,19 @@ _terminal_formatter = None +# note: `json` and `orjson` libraries aren't quite interchangeable. +# * orjson is much faster, so we use it where we can +# * orjson doesn't support custom separators +# * orjson doesn't support iterencode(), so can't stream unbounded iterators to stdout :( +ORJSON_OPTIONS = { + "compact": 0, # orjson doesn't support custom separators, so extracompact and compact look identical + "extracompact": 0, + "pretty": orjson.OPT_INDENT_2, +} JSON_PARAMS = { "compact": {}, - "pretty": {"indent": 2}, "extracompact": {"separators": (",", ":")}, + "pretty": {"indent": 2}, } @@ -37,6 +47,19 @@ def __iter__(self): return itertools.chain(self._head, *self[:1]) +def orjson_encode_default(obj): + """ + Hook to extend the default serialisation of `orjson.dumps()` + """ + if isinstance(obj, tuple): + return list(obj) + + if hasattr(obj, "__json__"): + return obj.__json__() + + raise TypeError + + class ExtendedJsonEncoder(json.JSONEncoder): """A JSONEncoder that tries calling __json__() if it can't serialise an object another way."""