Skip to content

Commit

Permalink
Use orjson for faster JSONL output
Browse files Browse the repository at this point in the history
refs #1018

When generating a large (2GB) diff as JSON-Lines this takes 20-30% less
time than the stdlib.

It may be possible to use this in other places, but note that orjson
doesn't support streaming encoding (iterencode), which means it is of
limited utility where we're trying to stream JSON diffs of huge
datasets.

This change uses it for individual features in JSONL diffs only where
the lack of iterencode() isn't a concern.

orjson is MIT licensed.
  • Loading branch information
craigds committed Nov 13, 2024
1 parent 2a92cfd commit 5e6167a
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 5 deletions.
14 changes: 10 additions & 4 deletions kart/json_diff_writers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import json
import orjson
import logging
import threading
from datetime import datetime, timedelta, timezone
Expand All @@ -19,7 +19,11 @@
from kart.diff_structs import FILES_KEY, BINARY_FILE, DatasetDiff
from kart.key_filters import DeltaFilter
from kart.log import commit_obj_to_json
from kart.output_util import dump_json_output, resolve_output_path
from kart.output_util import (
dump_json_output,
resolve_output_path,
orjson_encode_default,
)
from kart.tabular.feature_output import feature_as_geojson, feature_as_json
from kart.timestamps import datetime_to_iso8601_utc, timedelta_to_iso8601_tz

Expand Down Expand Up @@ -241,9 +245,11 @@ def __init__(self, *args, diff_estimate_accuracy=None, delta_filter=None, **kwar
self._output_lock = threading.RLock()

def dump(self, obj):
output: bytes = orjson.dumps(
obj, default=orjson_encode_default, option=orjson.OPT_APPEND_NEWLINE
)
with self._output_lock:
json.dump(obj, self.fp, separators=self.separators)
self.fp.write("\n")
self.fp.buffer.write(output)

def write_header(self):
self.dump(
Expand Down
25 changes: 24 additions & 1 deletion kart/output_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,27 @@
import types
from pathlib import Path

import orjson
import pygments
from pygments.lexers import JsonLexer

from .wkt_lexer import WKTLexer

_terminal_formatter = None

# note: `json` and `orjson` libraries aren't quite interchangeable.
# * orjson is much faster, so we use it where we can
# * orjson doesn't support custom separators
# * orjson doesn't support iterencode(), so can't stream unbounded iterators to stdout :(
ORJSON_OPTIONS = {
"compact": 0, # orjson doesn't support custom separators, so extracompact and compact look identical
"extracompact": 0,
"pretty": orjson.OPT_INDENT_2,
}
JSON_PARAMS = {
"compact": {},
"pretty": {"indent": 2},
"extracompact": {"separators": (",", ":")},
"pretty": {"indent": 2},
}


Expand All @@ -37,6 +47,19 @@ def __iter__(self):
return itertools.chain(self._head, *self[:1])


def orjson_encode_default(obj):
"""
Hook to extend the default serialisation of `orjson.dumps()`
"""
if isinstance(obj, tuple):
return list(obj)

if hasattr(obj, "__json__"):
return obj.__json__()

raise TypeError


class ExtendedJsonEncoder(json.JSONEncoder):
"""A JSONEncoder that tries calling __json__() if it can't serialise an object another way."""

Expand Down

0 comments on commit 5e6167a

Please sign in to comment.