From 5e6167aa1a232eb3aa603332b503527123c345a4 Mon Sep 17 00:00:00 2001
From: Craig de Stigter <craig@destigter.nz>
Date: Wed, 13 Nov 2024 15:33:04 +1300
Subject: [PATCH] Use `orjson` for faster JSONL output

refs #1018

When generating a large (2GB) diff as JSON-Lines, this takes 20-30% less
time than the stdlib `json` module.

It may be possible to use this in other places, but note that orjson
doesn't support streaming encoding (iterencode), which means it is of
limited utility where we're trying to stream JSON diffs of huge
datasets.

This change uses it only for individual features in JSONL diffs, where
the lack of iterencode() isn't a concern.

orjson is MIT licensed.
---
 kart/json_diff_writers.py | 14 ++++++++++----
 kart/output_util.py       | 25 ++++++++++++++++++++++++-
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/kart/json_diff_writers.py b/kart/json_diff_writers.py
index 48509fff1..0a928adad 100644
--- a/kart/json_diff_writers.py
+++ b/kart/json_diff_writers.py
@@ -1,4 +1,4 @@
-import json
+import orjson
 import logging
 import threading
 from datetime import datetime, timedelta, timezone
@@ -19,7 +19,11 @@
 from kart.diff_structs import FILES_KEY, BINARY_FILE, DatasetDiff
 from kart.key_filters import DeltaFilter
 from kart.log import commit_obj_to_json
-from kart.output_util import dump_json_output, resolve_output_path
+from kart.output_util import (
+    dump_json_output,
+    resolve_output_path,
+    orjson_encode_default,
+)
 from kart.tabular.feature_output import feature_as_geojson, feature_as_json
 from kart.timestamps import datetime_to_iso8601_utc, timedelta_to_iso8601_tz
 
@@ -241,9 +245,11 @@ def __init__(self, *args, diff_estimate_accuracy=None, delta_filter=None, **kwar
         self._output_lock = threading.RLock()
 
     def dump(self, obj):
+        output: bytes = orjson.dumps(
+            obj, default=orjson_encode_default, option=orjson.OPT_APPEND_NEWLINE
+        )
         with self._output_lock:
-            json.dump(obj, self.fp, separators=self.separators)
-            self.fp.write("\n")
+            self.fp.buffer.write(output)
 
     def write_header(self):
         self.dump(
diff --git a/kart/output_util.py b/kart/output_util.py
index 78af8e7ad..b46b2f22e 100644
--- a/kart/output_util.py
+++ b/kart/output_util.py
@@ -8,6 +8,7 @@
 import types
 from pathlib import Path
 
+import orjson
 import pygments
 from pygments.lexers import JsonLexer
 
@@ -15,10 +16,19 @@
 
 _terminal_formatter = None
 
+# note: `json` and `orjson` libraries aren't quite interchangeable.
+#   * orjson is much faster, so we use it where we can
+#   * orjson doesn't support custom separators
+#   * orjson doesn't support iterencode(), so can't stream unbounded iterators to stdout :(
+ORJSON_OPTIONS = {
+    "compact": 0,  # orjson doesn't support custom separators, so extracompact and compact look identical
+    "extracompact": 0,
+    "pretty": orjson.OPT_INDENT_2,
+}
 JSON_PARAMS = {
     "compact": {},
-    "pretty": {"indent": 2},
     "extracompact": {"separators": (",", ":")},
+    "pretty": {"indent": 2},
 }
 
 
@@ -37,6 +47,19 @@ def __iter__(self):
         return itertools.chain(self._head, *self[:1])
 
 
+def orjson_encode_default(obj):
+    """
+    Hook to extend the default serialisation of `orjson.dumps()`
+    """
+    if isinstance(obj, tuple):
+        return list(obj)
+
+    if hasattr(obj, "__json__"):
+        return obj.__json__()
+
+    raise TypeError
+
+
 class ExtendedJsonEncoder(json.JSONEncoder):
     """A JSONEncoder that tries calling __json__() if it can't serialise an object another way."""