diff --git a/README.md b/README.md
index e60d19409..939628248 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ with initialize_config_dir(config_dir=str(cdir)):
 
 # Documentation
 
-[Find our docs here](https://psychic-chainsaw-f197cc2b.pages.github.io/_build/html/index.html)
+[Find our docs here](https://astrazeneca.github.io/KAZU/_build/html/index.html)
 
 ## License
 
diff --git a/docs/_autosummary/kazu.modelling.ontology_preprocessing.base.rst b/docs/_autosummary/kazu.modelling.ontology_preprocessing.base.rst
index 254ab3f3d..d3d9b116c 100644
--- a/docs/_autosummary/kazu.modelling.ontology_preprocessing.base.rst
+++ b/docs/_autosummary/kazu.modelling.ontology_preprocessing.base.rst
@@ -1,4 +1,4 @@
-kazu.modelling.ontology\_preprocessing.base
+kazu.modelling.ontology\_preprocessing.base
 ===========================================
 
 .. automodule:: kazu.modelling.ontology_preprocessing.base
@@ -19,6 +19,7 @@ kazu.modelling.ontology\_preprocessing.base
    
       BiologicalProcessGeneOntologyParser
       CLOOntologyParser
+      CLOntologyParser
       CellosaurusOntologyParser
       CellularComponentGeneOntologyParser
       ChemblOntologyParser
diff --git a/docs/_build/html/_autosummary/kazu.modelling.distillation.models.html b/docs/_build/html/_autosummary/kazu.modelling.distillation.models.html
index 37596446e..00936a5c2 100644
--- a/docs/_build/html/_autosummary/kazu.modelling.distillation.models.html
+++ b/docs/_build/html/_autosummary/kazu.modelling.distillation.models.html
@@ -444,7 +444,7 @@
 <dd><p>Implement one or more PyTorch DataLoaders for training.</p>
 <dl class="simple">
 <dt>Return:</dt><dd><p>A collection of <a class="reference external" href="https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader" title="(in PyTorch v1.13)"><code class="xref py py-class docutils literal notranslate"><span class="pre">torch.utils.data.DataLoader</span></code></a> specifying training samples.
-In the case of multiple dataloaders, please see this <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html#multiple-dataloaders" title="(in PyTorch Lightning v1.8.3.post1)"><span class="xref std std-ref">section</span></a>.</p>
+In the case of multiple dataloaders, please see this <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html#multiple-dataloaders" title="(in PyTorch Lightning v1.8.4)"><span class="xref std std-ref">section</span></a>.</p>
 </dd>
 </dl>
 <p>The dataloader you return will not be reloaded unless you set
@@ -463,7 +463,7 @@
 <p>do not assign state in prepare_data</p>
 </div>
 <ul class="simple">
-<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
+<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">prepare_data()</span></code></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">setup()</span></code></p></li>
 </ul>
@@ -529,8 +529,8 @@
 a positive integer.</p>
 <p>It’s recommended that all data downloads and preparation happen in <code class="xref py py-meth docutils literal notranslate"><span class="pre">prepare_data()</span></code>.</p>
 <ul class="simple">
-<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
-<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.validate" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">validate()</span></code></a></p></li>
+<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
+<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.validate" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">validate()</span></code></a></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">prepare_data()</span></code></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">setup()</span></code></p></li>
 </ul>
@@ -1108,7 +1108,7 @@
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.modelling.distillation.models.TaskSpecificDistillation">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.modelling.distillation.models.</span></span><span class="sig-name descname"><span class="pre">TaskSpecificDistillation</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/distillation/models.py#L143-L244"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.distillation.models.TaskSpecificDistillation" title="Permalink to this definition">#</a></dt>
-<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
+<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.distillation.models.TaskSpecificDistillation.__init__">
 <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">temperature</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">warmup_steps</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">learning_rate</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">weight_decay</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">accumulate_grad_batches</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_epochs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">schedule</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/distillation/models.py#L144-L189"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.distillation.models.TaskSpecificDistillation.__init__" title="Permalink to this definition">#</a></dt>
diff --git a/docs/_build/html/_autosummary/kazu.modelling.hf_lightning_wrappers.html b/docs/_build/html/_autosummary/kazu.modelling.hf_lightning_wrappers.html
index fd23c0159..b6cd36241 100644
--- a/docs/_build/html/_autosummary/kazu.modelling.hf_lightning_wrappers.html
+++ b/docs/_build/html/_autosummary/kazu.modelling.hf_lightning_wrappers.html
@@ -352,7 +352,7 @@
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.modelling.hf_lightning_wrappers.PLAutoModel">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.modelling.hf_lightning_wrappers.</span></span><span class="sig-name descname"><span class="pre">PLAutoModel</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/hf_lightning_wrappers.py#L23-L38"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.hf_lightning_wrappers.PLAutoModel" title="Permalink to this definition">#</a></dt>
-<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
+<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.hf_lightning_wrappers.PLAutoModel.__init__">
 <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/hf_lightning_wrappers.py#L24-L33"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.hf_lightning_wrappers.PLAutoModel.__init__" title="Permalink to this definition">#</a></dt>
@@ -371,13 +371,13 @@
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.hf_lightning_wrappers.PLAutoModel.predict_step">
 <span class="sig-name descname"><span class="pre">predict_step</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">batch</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_idx</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dataloader_idx</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/hf_lightning_wrappers.py#L35-L38"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.hf_lightning_wrappers.PLAutoModel.predict_step" title="Permalink to this definition">#</a></dt>
-<dd><p>Step function called during <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.predict" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">predict()</span></code></a>. By default, it
+<dd><p>Step function called during <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.predict" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">predict()</span></code></a>. By default, it
 calls <code class="xref py py-meth docutils literal notranslate"><span class="pre">forward()</span></code>. Override to add any processing logic.</p>
 <p>The <code class="xref py py-meth docutils literal notranslate"><span class="pre">predict_step()</span></code> is used
 to scale inference on multi-devices.</p>
-<p>To prevent an OOM error, it is possible to use <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a>
+<p>To prevent an OOM error, it is possible to use <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a>
 callback to write the predictions to disk or database after each batch or on epoch end.</p>
-<p>The <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a> should be used while using a spawn
+<p>The <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a> should be used while using a spawn
 based accelerator. This happens for <code class="docutils literal notranslate"><span class="pre">Trainer(strategy=&quot;ddp_spawn&quot;)</span></code>
 or training on 8 TPU cores with <code class="docutils literal notranslate"><span class="pre">Trainer(accelerator=&quot;tpu&quot;,</span> <span class="pre">devices=8)</span></code> as predictions won’t be returned.</p>
 <p>Example</p>
@@ -424,7 +424,7 @@
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.modelling.hf_lightning_wrappers.</span></span><span class="sig-name descname"><span class="pre">PLAutoModelForTokenClassification</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/hf_lightning_wrappers.py#L7-L20"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification" title="Permalink to this definition">#</a></dt>
-<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
+<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification.__init__">
 <span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/hf_lightning_wrappers.py#L8-L17"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification.__init__" title="Permalink to this definition">#</a></dt>
@@ -443,13 +443,13 @@
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification.predict_step">
 <span class="sig-name descname"><span class="pre">predict_step</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">batch</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_idx</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dataloader_idx</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/hf_lightning_wrappers.py#L19-L20"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification.predict_step" title="Permalink to this definition">#</a></dt>
-<dd><p>Step function called during <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.predict" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">predict()</span></code></a>. By default, it
+<dd><p>Step function called during <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.predict" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">predict()</span></code></a>. By default, it
 calls <code class="xref py py-meth docutils literal notranslate"><span class="pre">forward()</span></code>. Override to add any processing logic.</p>
 <p>The <code class="xref py py-meth docutils literal notranslate"><span class="pre">predict_step()</span></code> is used
 to scale inference on multi-devices.</p>
-<p>To prevent an OOM error, it is possible to use <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a>
+<p>To prevent an OOM error, it is possible to use <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a>
 callback to write the predictions to disk or database after each batch or on epoch end.</p>
-<p>The <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a> should be used while using a spawn
+<p>The <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a> should be used while using a spawn
 based accelerator. This happens for <code class="docutils literal notranslate"><span class="pre">Trainer(strategy=&quot;ddp_spawn&quot;)</span></code>
 or training on 8 TPU cores with <code class="docutils literal notranslate"><span class="pre">Trainer(accelerator=&quot;tpu&quot;,</span> <span class="pre">devices=8)</span></code> as predictions won’t be returned.</p>
 <p>Example</p>
diff --git a/docs/_build/html/_autosummary/kazu.modelling.language.string_similarity_scorers.html b/docs/_build/html/_autosummary/kazu.modelling.language.string_similarity_scorers.html
index 651b54182..2feafdf47 100644
--- a/docs/_build/html/_autosummary/kazu.modelling.language.string_similarity_scorers.html
+++ b/docs/_build/html/_autosummary/kazu.modelling.language.string_similarity_scorers.html
@@ -370,7 +370,7 @@
 <dd><p>Bases: <a class="reference internal" href="#kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer" title="kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer"><code class="xref py py-class docutils literal notranslate"><span class="pre">StringSimilarityScorer</span></code></a>, <a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Protocol" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">Protocol</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.language.string_similarity_scorers.BooleanStringSimilarityScorer.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../.pyenv/versions/3.9.13/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.language.string_similarity_scorers.BooleanStringSimilarityScorer.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../miniforge3/envs/kazu_39/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.language.string_similarity_scorers.BooleanStringSimilarityScorer.__init__" title="Permalink to this definition">#</a></dt>
 <dd></dd></dl>
 
 </dd></dl>
@@ -436,7 +436,7 @@
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>sapbert</strong> (<a class="reference internal" href="kazu.modelling.linking.sapbert.train.html#kazu.modelling.linking.sapbert.train.PLSapbertModel" title="kazu.modelling.linking.sapbert.train.PLSapbertModel"><em>PLSapbertModel</em></a>) – </p></li>
-<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.3.post1)"><em>Trainer</em></a>) – </p></li>
+<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.4)"><em>Trainer</em></a>) – </p></li>
 </ul>
 </dd>
 </dl>
@@ -451,7 +451,7 @@
 <p>calculates a NumericMetric based on a string match or a normalised string match and a normalised term</p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../.pyenv/versions/3.9.13/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../miniforge3/envs/kazu_39/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer.__init__" title="Permalink to this definition">#</a></dt>
 <dd></dd></dl>
 
 </dd></dl>
diff --git a/docs/_build/html/_autosummary/kazu.modelling.linking.sapbert.train.html b/docs/_build/html/_autosummary/kazu.modelling.linking.sapbert.train.html
index 6255bfb7e..c9bbc707a 100644
--- a/docs/_build/html/_autosummary/kazu.modelling.linking.sapbert.train.html
+++ b/docs/_build/html/_autosummary/kazu.modelling.linking.sapbert.train.html
@@ -510,7 +510,7 @@
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.modelling.linking.sapbert.train.PLSapbertModel">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.modelling.linking.sapbert.train.</span></span><span class="sig-name descname"><span class="pre">PLSapbertModel</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/linking/sapbert/train.py#L238-L482"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.linking.sapbert.train.PLSapbertModel" title="Permalink to this definition">#</a></dt>
-<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
+<dd><p>Bases: <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a></p>
 <p>pytorch lightning production implementation of SapBert</p>
 <p>Original source
 <a class="reference external" href="https://github.com/cambridgeltl/sapbert">https://github.com/cambridgeltl/sapbert</a></p>
@@ -624,7 +624,7 @@
 </pre></div>
 </div>
 <p>Metrics can be made available to monitor by simply logging it using
-<code class="docutils literal notranslate"><span class="pre">self.log('metric_to_track',</span> <span class="pre">metric_val)</span></code> in your <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a>.</p>
+<code class="docutils literal notranslate"><span class="pre">self.log('metric_to_track',</span> <span class="pre">metric_val)</span></code> in your <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.core.LightningModule.html#pytorch_lightning.core.LightningModule" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">LightningModule</span></code></a>.</p>
 <dl>
 <dt>Note:</dt><dd><p>The <code class="docutils literal notranslate"><span class="pre">frequency</span></code> value specified in a dict along with the <code class="docutils literal notranslate"><span class="pre">optimizer</span></code> key is an int corresponding
 to the number of sequential batches optimized with the specific optimizer.
@@ -792,7 +792,7 @@
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>texts</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.List" title="(in Python v3.11)"><em>List</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a><em>]</em>) – </p></li>
-<li><p><strong>trainer</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.3.post1)"><em>Trainer</em></a><em>]</em>) – an optional PL Trainer to use. If not specified, uses the default one</p></li>
+<li><p><strong>trainer</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.4)"><em>Trainer</em></a><em>]</em>) – an optional PL Trainer to use. If not specified, uses the default one</p></li>
 <li><p><strong>size</strong> (<em>batch</em>) – optional batch size to use. If not specified, use 16</p></li>
 <li><p><strong>batch_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a><em>]</em>) – </p></li>
 </ul>
@@ -814,7 +814,7 @@
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>loader</strong> (<a class="reference external" href="https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader" title="(in PyTorch v1.13)"><em>DataLoader</em></a>) – </p></li>
-<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.3.post1)"><em>Trainer</em></a>) – the PL Trainer to use</p></li>
+<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.4)"><em>Trainer</em></a>) – the PL Trainer to use</p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns<span class="colon">:</span></dt>
@@ -834,13 +834,13 @@
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.modelling.linking.sapbert.train.PLSapbertModel.predict_step">
 <span class="sig-name descname"><span class="pre">predict_step</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">batch</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_idx</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dataloader_idx</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/linking/sapbert/train.py#L375-L376"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.linking.sapbert.train.PLSapbertModel.predict_step" title="Permalink to this definition">#</a></dt>
-<dd><p>Step function called during <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.predict" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">predict()</span></code></a>. By default, it
+<dd><p>Step function called during <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.predict" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">predict()</span></code></a>. By default, it
 calls <code class="xref py py-meth docutils literal notranslate"><span class="pre">forward()</span></code>. Override to add any processing logic.</p>
 <p>The <code class="xref py py-meth docutils literal notranslate"><span class="pre">predict_step()</span></code> is used
 to scale inference on multi-devices.</p>
-<p>To prevent an OOM error, it is possible to use <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a>
+<p>To prevent an OOM error, it is possible to use <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a>
 callback to write the predictions to disk or database after each batch or on epoch end.</p>
-<p>The <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a> should be used while using a spawn
+<p>The <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.callbacks.BasePredictionWriter.html#pytorch_lightning.callbacks.BasePredictionWriter" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-class docutils literal notranslate"><span class="pre">BasePredictionWriter</span></code></a> should be used while using a spawn
 based accelerator. This happens for <code class="docutils literal notranslate"><span class="pre">Trainer(strategy=&quot;ddp_spawn&quot;)</span></code>
 or training on 8 TPU cores with <code class="docutils literal notranslate"><span class="pre">Trainer(accelerator=&quot;tpu&quot;,</span> <span class="pre">devices=8)</span></code> as predictions won’t be returned.</p>
 <p>Example</p>
@@ -883,7 +883,7 @@
 <dd><p>Implement one or more PyTorch DataLoaders for training.</p>
 <dl class="simple">
 <dt>Return:</dt><dd><p>A collection of <a class="reference external" href="https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader" title="(in PyTorch v1.13)"><code class="xref py py-class docutils literal notranslate"><span class="pre">torch.utils.data.DataLoader</span></code></a> specifying training samples.
-In the case of multiple dataloaders, please see this <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html#multiple-dataloaders" title="(in PyTorch Lightning v1.8.3.post1)"><span class="xref std std-ref">section</span></a>.</p>
+In the case of multiple dataloaders, please see this <a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/guides/data.html#multiple-dataloaders" title="(in PyTorch Lightning v1.8.4)"><span class="xref std std-ref">section</span></a>.</p>
 </dd>
 </dl>
 <p>The dataloader you return will not be reloaded unless you set
@@ -902,7 +902,7 @@
 <p>do not assign state in prepare_data</p>
 </div>
 <ul class="simple">
-<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
+<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">prepare_data()</span></code></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">setup()</span></code></p></li>
 </ul>
@@ -1041,8 +1041,8 @@
 a positive integer.</p>
 <p>It’s recommended that all data downloads and preparation happen in <code class="xref py py-meth docutils literal notranslate"><span class="pre">prepare_data()</span></code>.</p>
 <ul class="simple">
-<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
-<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.validate" title="(in PyTorch Lightning v1.8.3.post1)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">validate()</span></code></a></p></li>
+<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.fit" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">fit()</span></code></a></p></li>
+<li><p><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer.validate" title="(in PyTorch Lightning v1.8.4)"><code class="xref py py-meth docutils literal notranslate"><span class="pre">validate()</span></code></a></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">prepare_data()</span></code></p></li>
 <li><p><code class="xref py py-meth docutils literal notranslate"><span class="pre">setup()</span></code></p></li>
 </ul>
diff --git a/docs/_build/html/_autosummary/kazu.modelling.ontology_preprocessing.base.html b/docs/_build/html/_autosummary/kazu.modelling.ontology_preprocessing.base.html
index 48c8af5e1..d847a321d 100644
--- a/docs/_build/html/_autosummary/kazu.modelling.ontology_preprocessing.base.html
+++ b/docs/_build/html/_autosummary/kazu.modelling.ontology_preprocessing.base.html
@@ -346,49 +346,52 @@
 <tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.CLOOntologyParser" title="kazu.modelling.ontology_preprocessing.base.CLOOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CLOOntologyParser</span></code></a></p></td>
 <td><p>input is a CLO Owl file <a class="reference external" href="https://www.ebi.ac.uk/ols/ontologies/clo">https://www.ebi.ac.uk/ols/ontologies/clo</a></p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser" title="kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CellosaurusOntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.CLOntologyParser" title="kazu.modelling.ontology_preprocessing.base.CLOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CLOntologyParser</span></code></a></p></td>
+<td><p>input should be an CL owl file e.g.</p></td>
+</tr>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser" title="kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CellosaurusOntologyParser</span></code></a></p></td>
 <td><p>input is an obo file from cellosaurus, e.g.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser" title="kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CellularComponentGeneOntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser" title="kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">CellularComponentGeneOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser" title="kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ChemblOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser" title="kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ChemblOntologyParser</span></code></a></p></td>
 <td><p>input is a sqllite dump from Chembl, e.g.</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.EnsemblOntologyParser" title="kazu.modelling.ontology_preprocessing.base.EnsemblOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">EnsemblOntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.EnsemblOntologyParser" title="kazu.modelling.ontology_preprocessing.base.EnsemblOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">EnsemblOntologyParser</span></code></a></p></td>
 <td><p>input is a json from HGNC e.g.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.GeneOntologyParser" title="kazu.modelling.ontology_preprocessing.base.GeneOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GeneOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.GeneOntologyParser" title="kazu.modelling.ontology_preprocessing.base.GeneOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">GeneOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.JsonLinesOntologyParser" title="kazu.modelling.ontology_preprocessing.base.JsonLinesOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">JsonLinesOntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.JsonLinesOntologyParser" title="kazu.modelling.ontology_preprocessing.base.JsonLinesOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">JsonLinesOntologyParser</span></code></a></p></td>
 <td><p>A parser for a jsonlines dataset.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.MeddraOntologyParser" title="kazu.modelling.ontology_preprocessing.base.MeddraOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MeddraOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.MeddraOntologyParser" title="kazu.modelling.ontology_preprocessing.base.MeddraOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MeddraOntologyParser</span></code></a></p></td>
 <td><p>input is an unzipped directory to a MEddra release (Note, requires licence).</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.MolecularFunctionGeneOntologyParser" title="kazu.modelling.ontology_preprocessing.base.MolecularFunctionGeneOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MolecularFunctionGeneOntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.MolecularFunctionGeneOntologyParser" title="kazu.modelling.ontology_preprocessing.base.MolecularFunctionGeneOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MolecularFunctionGeneOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.MondoOntologyParser" title="kazu.modelling.ontology_preprocessing.base.MondoOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MondoOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.MondoOntologyParser" title="kazu.modelling.ontology_preprocessing.base.MondoOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">MondoOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OntologyParser" title="kazu.modelling.ontology_preprocessing.base.OntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OntologyParser" title="kazu.modelling.ontology_preprocessing.base.OntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OntologyParser</span></code></a></p></td>
 <td><p>Parse an ontology (or similar) into a set of outputs suitable for NLP entity linking.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OpenTargetsDiseaseOntologyParser" title="kazu.modelling.ontology_preprocessing.base.OpenTargetsDiseaseOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OpenTargetsDiseaseOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OpenTargetsDiseaseOntologyParser" title="kazu.modelling.ontology_preprocessing.base.OpenTargetsDiseaseOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OpenTargetsDiseaseOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OpenTargetsMoleculeOntologyParser" title="kazu.modelling.ontology_preprocessing.base.OpenTargetsMoleculeOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OpenTargetsMoleculeOntologyParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OpenTargetsMoleculeOntologyParser" title="kazu.modelling.ontology_preprocessing.base.OpenTargetsMoleculeOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OpenTargetsMoleculeOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OpenTargetsTargetOntologyParser" title="kazu.modelling.ontology_preprocessing.base.OpenTargetsTargetOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OpenTargetsTargetOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.OpenTargetsTargetOntologyParser" title="kazu.modelling.ontology_preprocessing.base.OpenTargetsTargetOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">OpenTargetsTargetOntologyParser</span></code></a></p></td>
 <td><p></p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.RDFGraphParser" title="kazu.modelling.ontology_preprocessing.base.RDFGraphParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">RDFGraphParser</span></code></a></p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.RDFGraphParser" title="kazu.modelling.ontology_preprocessing.base.RDFGraphParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">RDFGraphParser</span></code></a></p></td>
 <td><p>Parser for Owl files.</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.UberonOntologyParser" title="kazu.modelling.ontology_preprocessing.base.UberonOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">UberonOntologyParser</span></code></a></p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.UberonOntologyParser" title="kazu.modelling.ontology_preprocessing.base.UberonOntologyParser"><code class="xref py py-obj docutils literal notranslate"><span class="pre">UberonOntologyParser</span></code></a></p></td>
 <td><p>input should be an UBERON owl file e.g.</p></td>
 </tr>
 </tbody>
@@ -477,6 +480,58 @@
 
 </dd></dl>
 
+<dl class="py class">
+<dt class="sig sig-object py" id="kazu.modelling.ontology_preprocessing.base.CLOntologyParser">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.modelling.ontology_preprocessing.base.</span></span><span class="sig-name descname"><span class="pre">CLOntologyParser</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/ontology_preprocessing/base.py#L1522-L1556"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.ontology_preprocessing.base.CLOntologyParser" title="Permalink to this definition">#</a></dt>
+<dd><p>Bases: <a class="reference internal" href="#kazu.modelling.ontology_preprocessing.base.RDFGraphParser" title="kazu.modelling.ontology_preprocessing.base.RDFGraphParser"><code class="xref py py-class docutils literal notranslate"><span class="pre">RDFGraphParser</span></code></a></p>
+<p>input should be an CL owl file
+e.g.
+<a class="reference external" href="https://www.ebi.ac.uk/ols/ontologies/cl">https://www.ebi.ac.uk/ols/ontologies/cl</a></p>
+<dl class="py method">
+<dt class="sig sig-object py" id="kazu.modelling.ontology_preprocessing.base.CLOntologyParser.__init__">
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">in_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">entity_class</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">string_scorer</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">synonym_merge_threshold</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.7</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_origin</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'unknown'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">synonym_generator</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">excluded_ids</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/ontology_preprocessing/base.py#L1529-L1553"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.ontology_preprocessing.base.CLOntologyParser.__init__" title="Permalink to this definition">#</a></dt>
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>in_path</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – Path to some resource that should be processed (e.g. owl file, db config, tsv etc)</p></li>
+<li><p><strong>entity_class</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The entity class to associate with this parser throughout the pipeline.
+Also used in the parser when calling StringNormalizer to determine the class-appropriate behaviour.</p></li>
+<li><p><strong>name</strong> – A string to represent a parser in the overall pipeline. Should be globally unique</p></li>
+<li><p><strong>string_scorer</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference internal" href="kazu.modelling.language.string_similarity_scorers.html#kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer" title="kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer"><em>StringSimilarityScorer</em></a><em>]</em>) – Optional protocol of StringSimilarityScorer.  Used for resolving ambiguous symbolic
+synonyms via similarity calculation of the default label associated with the conflicted labels. If no
+instance is provided, all synonym conflicts will be assumed to refer to different concepts. This is not
+recommended!</p></li>
+<li><p><strong>synonym_merge_threshold</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.11)"><em>float</em></a>) – similarity threshold to trigger a merge of conflicted synonyms into a single
+EquivalentIdSet. See docs for score_and_group_ids for further details</p></li>
+<li><p><strong>data_origin</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – The origin of this dataset - e.g. HGNC release 2.1, MEDDRA 24.1 etc. Note, this is different
+from the parser.name, as is used to identify the origin of a mapping back to a data source</p></li>
+<li><p><strong>synonym_generator</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference internal" href="kazu.modelling.ontology_preprocessing.synonym_generation.html#kazu.modelling.ontology_preprocessing.synonym_generation.CombinatorialSynonymGenerator" title="kazu.modelling.ontology_preprocessing.synonym_generation.CombinatorialSynonymGenerator"><em>CombinatorialSynonymGenerator</em></a><em>]</em>) – optional CombinatorialSynonymGenerator. Used to generate synonyms for dictionary
+based NER matching</p></li>
+<li><p><strong>excluded_ids</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Set" title="(in Python v3.11)"><em>Set</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a><em>]</em><em>]</em>) – optional set of ids to exclude from the parsing process</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="kazu.modelling.ontology_preprocessing.base.CLOntologyParser.find_kb">
+<span class="sig-name descname"><span class="pre">find_kb</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">string</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/ontology_preprocessing/base.py#L1555-L1556"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.ontology_preprocessing.base.CLOntologyParser.find_kb" title="Permalink to this definition">#</a></dt>
+<dd><p>split an IDX somehow to find the ontology SOURCE reference</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>string</strong> (<a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a>) – the IDX string to process</p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p></p>
+</dd>
+<dt class="field-odd">Return type<span class="colon">:</span></dt>
+<dd class="field-odd"><p><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)">str</a></p>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser">
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.modelling.ontology_preprocessing.base.</span></span><span class="sig-name descname"><span class="pre">CellosaurusOntologyParser</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/modelling/ontology_preprocessing/base.py#L1282-L1397"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser" title="Permalink to this definition">#</a></dt>
diff --git a/docs/_build/html/_autosummary/kazu.pipeline.pipeline.html b/docs/_build/html/_autosummary/kazu.pipeline.pipeline.html
index e9183404d..cf28d5a09 100644
--- a/docs/_build/html/_autosummary/kazu.pipeline.pipeline.html
+++ b/docs/_build/html/_autosummary/kazu.pipeline.pipeline.html
@@ -373,12 +373,12 @@
 </div>
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.FailedDocsFileHandler">
-<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">FailedDocsFileHandler</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L98-L126"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsFileHandler" title="Permalink to this definition">#</a></dt>
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">FailedDocsFileHandler</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L97-L125"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsFileHandler" title="Permalink to this definition">#</a></dt>
 <dd><p>Bases: <a class="reference internal" href="#kazu.pipeline.pipeline.FailedDocsHandler" title="kazu.pipeline.pipeline.FailedDocsHandler"><code class="xref py py-class docutils literal notranslate"><span class="pre">FailedDocsHandler</span></code></a></p>
 <p>implementation logs docs to a directory, along with exception message</p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.FailedDocsFileHandler.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">log_dir</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L103-L104"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsFileHandler.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">log_dir</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L102-L103"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsFileHandler.__init__" title="Permalink to this definition">#</a></dt>
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><p><strong>log_dir</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.11)"><em>Path</em></a>) – </p>
@@ -390,25 +390,25 @@
 
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.FailedDocsHandler">
-<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">FailedDocsHandler</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L64-L76"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsHandler" title="Permalink to this definition">#</a></dt>
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">FailedDocsHandler</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L63-L75"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsHandler" title="Permalink to this definition">#</a></dt>
 <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
 <p>class to somehow handle failed docs</p>
 </dd></dl>
 
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.FailedDocsLogHandler">
-<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">FailedDocsLogHandler</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L79-L95"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsLogHandler" title="Permalink to this definition">#</a></dt>
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">FailedDocsLogHandler</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L78-L94"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.FailedDocsLogHandler" title="Permalink to this definition">#</a></dt>
 <dd><p>Bases: <a class="reference internal" href="#kazu.pipeline.pipeline.FailedDocsHandler" title="kazu.pipeline.pipeline.FailedDocsHandler"><code class="xref py py-class docutils literal notranslate"><span class="pre">FailedDocsHandler</span></code></a></p>
 <p>implementation that logs to warning</p>
 </dd></dl>
 
 <dl class="py class">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.Pipeline">
-<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">Pipeline</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L129-L236"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline" title="Permalink to this definition">#</a></dt>
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">Pipeline</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L128-L245"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline" title="Permalink to this definition">#</a></dt>
 <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.Pipeline.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">steps</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">failure_handler</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">profile_steps_dir</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_doc_len</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">200000</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L130-L161"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">steps</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">failure_handler</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">profile_steps_dir</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">skip_doc_len</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">200000</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L129-L170"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.__init__" title="Permalink to this definition">#</a></dt>
 <dd><p>A basic pipeline, used to help run a series of steps</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
@@ -424,7 +424,7 @@
 
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.Pipeline.prefilter_docs">
-<span class="sig-name descname"><span class="pre">prefilter_docs</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">docs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L163-L174"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.prefilter_docs" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">prefilter_docs</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">docs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L172-L183"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.prefilter_docs" title="Permalink to this definition">#</a></dt>
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><p><strong>docs</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.List" title="(in Python v3.11)"><em>List</em></a><em>[</em><a class="reference internal" href="kazu.data.data.html#kazu.data.data.Document" title="kazu.data.data.Document"><em>Document</em></a><em>]</em>) – </p>
@@ -434,7 +434,7 @@
 
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.Pipeline.profile">
-<span class="sig-name descname"><span class="pre">profile</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">step_times</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_time</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_metrics_dict</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L200-L226"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.profile" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">profile</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">step_times</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_time</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">batch_metrics_dict</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L209-L235"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.profile" title="Permalink to this definition">#</a></dt>
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
@@ -448,12 +448,12 @@
 
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.Pipeline.reset">
-<span class="sig-name descname"><span class="pre">reset</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L232-L236"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.reset" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">reset</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L241-L245"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.reset" title="Permalink to this definition">#</a></dt>
 <dd></dd></dl>
 
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.Pipeline.update_failed_docs">
-<span class="sig-name descname"><span class="pre">update_failed_docs</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">step</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">failed_docs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L228-L230"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.update_failed_docs" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">update_failed_docs</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">step</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">failed_docs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L237-L239"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.Pipeline.update_failed_docs" title="Permalink to this definition">#</a></dt>
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
@@ -468,7 +468,7 @@
 
 <dl class="py function">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.batch_metrics">
-<span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">batch_metrics</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">docs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L50-L61"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.batch_metrics" title="Permalink to this definition">#</a></dt>
+<span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">batch_metrics</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">docs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L49-L60"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.batch_metrics" title="Permalink to this definition">#</a></dt>
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><p><strong>docs</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.List" title="(in Python v3.11)"><em>List</em></a><em>[</em><a class="reference internal" href="kazu.data.data.html#kazu.data.data.Document" title="kazu.data.data.Document"><em>Document</em></a><em>]</em>) – </p>
@@ -478,7 +478,7 @@
 
 <dl class="py function">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.calc_doc_size">
-<span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">calc_doc_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">doc</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L46-L47"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.calc_doc_size" title="Permalink to this definition">#</a></dt>
+<span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">calc_doc_size</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">doc</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L45-L46"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.calc_doc_size" title="Permalink to this definition">#</a></dt>
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><p><strong>doc</strong> (<a class="reference internal" href="kazu.data.data.html#kazu.data.data.Document" title="kazu.data.data.Document"><em>Document</em></a>) – </p>
@@ -488,7 +488,7 @@
 
 <dl class="py function">
 <dt class="sig sig-object py" id="kazu.pipeline.pipeline.load_steps_and_log_memory_usage">
-<span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">load_steps_and_log_memory_usage</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">cfg</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L20-L43"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.load_steps_and_log_memory_usage" title="Permalink to this definition">#</a></dt>
+<span class="sig-prename descclassname"><span class="pre">kazu.pipeline.pipeline.</span></span><span class="sig-name descname"><span class="pre">load_steps_and_log_memory_usage</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">cfg</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/pipeline/pipeline.py#L19-L42"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.pipeline.pipeline.load_steps_and_log_memory_usage" title="Permalink to this definition">#</a></dt>
 <dd><p>Loads steps based on the pipeline config, and log the memory increase after loading each step.</p>
 <p>Note that you can instantiate the pipeline directly from the config in a way that gives the
 same results, but this is useful for monitoring/debugging high memory usage.</p>
diff --git a/docs/_build/html/_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring.html b/docs/_build/html/_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring.html
index 8b051931b..13100920d 100644
--- a/docs/_build/html/_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring.html
+++ b/docs/_build/html/_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring.html
@@ -351,7 +351,7 @@
 <table class="autosummary longtable docutils align-default">
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="#kazu.steps.linking.post_processing.disambiguation.context_scoring.TfIdfScorer" title="kazu.steps.linking.post_processing.disambiguation.context_scoring.TfIdfScorer"><code class="xref py py-obj docutils literal notranslate"><span class="pre">TfIdfScorer</span></code></a></p></td>
-<td><p>This class manages a set of TFIDF models (via <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">sklearn.feature_extraction.text.TfidfVectorizer</span></code></a>).</p></td>
+<td><p>This class manages a set of TFIDF models (via <a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.2)"><code class="xref py py-class docutils literal notranslate"><span class="pre">sklearn.feature_extraction.text.TfidfVectorizer</span></code></a>).</p></td>
 </tr>
 </tbody>
 </table>
@@ -361,7 +361,7 @@
 <em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">kazu.steps.linking.post_processing.disambiguation.context_scoring.</span></span><span class="sig-name descname"><span class="pre">TfIdfScorer</span></span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/steps/linking/post_processing/disambiguation/context_scoring.py#L42-L106"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.steps.linking.post_processing.disambiguation.context_scoring.TfIdfScorer" title="Permalink to this definition">#</a></dt>
 <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/functions.html#object" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></a></p>
 <p>This class manages a set of TFIDF models (via
-<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">sklearn.feature_extraction.text.TfidfVectorizer</span></code></a>).</p>
+<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.2)"><code class="xref py py-class docutils literal notranslate"><span class="pre">sklearn.feature_extraction.text.TfidfVectorizer</span></code></a>).</p>
 <p>It’s a singleton, so that the models can be accessed in multiple locations without the need to
 load them into memory multiple times.</p>
 <dl class="py method">
@@ -370,7 +370,7 @@
 <dd><dl class="field-list simple">
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.11)"><em>Path</em></a>) – to a directory of files containing serialised
-<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.1)"><code class="xref py py-class docutils literal notranslate"><span class="pre">sklearn.feature_extraction.text.TfidfVectorizer</span></code></a>. The individual filenames
+<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.2)"><code class="xref py py-class docutils literal notranslate"><span class="pre">sklearn.feature_extraction.text.TfidfVectorizer</span></code></a>. The individual filenames
 are used to map the models to the relevant parser</p>
 </dd>
 </dl>
@@ -404,7 +404,7 @@
 <dd class="field-odd"><p><strong>path</strong> (<a class="reference external" href="https://docs.python.org/3/library/pathlib.html#pathlib.Path" title="(in Python v3.11)"><em>Path</em></a>) – </p>
 </dd>
 <dt class="field-even">Return type<span class="colon">:</span></dt>
-<dd class="field-even"><p><a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.1)"><em>TfidfVectorizer</em></a></p>
+<dd class="field-even"><p><a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer" title="(in scikit-learn v1.2)"><em>TfidfVectorizer</em></a></p>
 </dd>
 </dl>
 </dd></dl>
diff --git a/docs/_build/html/_autosummary/kazu.steps.linking.sapbert.html b/docs/_build/html/_autosummary/kazu.steps.linking.sapbert.html
index 37b28241d..bec1653b4 100644
--- a/docs/_build/html/_autosummary/kazu.steps.linking.sapbert.html
+++ b/docs/_build/html/_autosummary/kazu.steps.linking.sapbert.html
@@ -360,7 +360,7 @@
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>indices</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.List" title="(in Python v3.11)"><em>List</em></a><em>[</em><a class="reference internal" href="kazu.utils.link_index.html#kazu.utils.link_index.EmbeddingIndex" title="kazu.utils.link_index.EmbeddingIndex"><em>EmbeddingIndex</em></a><em>]</em>) – list of EmbeddingIndex to use with this model</p></li>
 <li><p><strong>embedding_model</strong> (<a class="reference internal" href="kazu.modelling.linking.sapbert.train.html#kazu.modelling.linking.sapbert.train.PLSapbertModel" title="kazu.modelling.linking.sapbert.train.PLSapbertModel"><em>PLSapbertModel</em></a>) – The SapBERT model to use to generate embeddings for entity mentions in input documents</p></li>
-<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.3.post1)"><em>Trainer</em></a>) – PL trainer to call when generating embeddings</p></li>
+<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.4)"><em>Trainer</em></a>) – PL trainer to call when generating embeddings</p></li>
 <li><p><strong>min_string_length_to_trigger</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Dict" title="(in Python v3.11)"><em>Dict</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/stdtypes.html#str" title="(in Python v3.11)"><em>str</em></a><em>, </em><a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a><em>]</em><em>]</em>) – a per entity class mapping that signals sapbert will not run on matches
 shorter than this. (sapbert is less good at symbolic matching than string processing techniques)</p></li>
 <li><p><strong>ignore_high_conf</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – If a perfect match has already been found, don’t run sapbert</p></li>
diff --git a/docs/_build/html/_autosummary/kazu.steps.ner.hf_token_classification.html b/docs/_build/html/_autosummary/kazu.steps.ner.hf_token_classification.html
index 64809aaa2..2ca2c7a86 100644
--- a/docs/_build/html/_autosummary/kazu.steps.ner.hf_token_classification.html
+++ b/docs/_build/html/_autosummary/kazu.steps.ner.hf_token_classification.html
@@ -365,7 +365,7 @@
 <li><p><strong>batch_size</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – batch size for dataloader</p></li>
 <li><p><strong>stride</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – passed to HF tokenizers (for splitting long docs)</p></li>
 <li><p><strong>max_sequence_length</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#int" title="(in Python v3.11)"><em>int</em></a>) – passed to HF tokenizers (for splitting long docs)</p></li>
-<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.3.post1)"><em>Trainer</em></a>) – </p></li>
+<li><p><strong>trainer</strong> (<a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.4)"><em>Trainer</em></a>) – </p></li>
 <li><p><strong>detect_subspans</strong> (<a class="reference external" href="https://docs.python.org/3/library/functions.html#bool" title="(in Python v3.11)"><em>bool</em></a>) – attempt to detect nested entities (threshold must be configured)</p></li>
 <li><p><strong>threshold</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://docs.python.org/3/library/functions.html#float" title="(in Python v3.11)"><em>float</em></a><em>]</em>) – the confidence threshold used to detect nested entities</p></li>
 <li><p><strong>entity_splitter</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference internal" href="kazu.steps.ner.entity_post_processing.html#kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter" title="kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter"><em>NonContiguousEntitySplitter</em></a><em>]</em>) – instance of <a class="reference internal" href="kazu.steps.ner.entity_post_processing.html#kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter" title="kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter"><code class="xref py py-class docutils literal notranslate"><span class="pre">kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter</span></code></a> to detect non-contiguous entities</p></li>
diff --git a/docs/_build/html/_autosummary/kazu.steps.other.cleanup.html b/docs/_build/html/_autosummary/kazu.steps.other.cleanup.html
index b3cc22373..3b6199448 100644
--- a/docs/_build/html/_autosummary/kazu.steps.other.cleanup.html
+++ b/docs/_build/html/_autosummary/kazu.steps.other.cleanup.html
@@ -367,7 +367,7 @@
 <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Protocol" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">Protocol</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.steps.other.cleanup.CleanupAction.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../.pyenv/versions/3.9.13/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.steps.other.cleanup.CleanupAction.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../miniforge3/envs/kazu_39/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.steps.other.cleanup.CleanupAction.__init__" title="Permalink to this definition">#</a></dt>
 <dd></dd></dl>
 
 <dl class="py method">
diff --git a/docs/_build/html/_autosummary/kazu.steps.step.html b/docs/_build/html/_autosummary/kazu.steps.step.html
index 365bb4f21..3f2e63110 100644
--- a/docs/_build/html/_autosummary/kazu.steps.step.html
+++ b/docs/_build/html/_autosummary/kazu.steps.step.html
@@ -365,7 +365,7 @@
 <dd><p>Bases: <a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Protocol" title="(in Python v3.11)"><code class="xref py py-class docutils literal notranslate"><span class="pre">Protocol</span></code></a></p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.steps.step.Step.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../.pyenv/versions/3.9.13/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.steps.step.Step.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../miniforge3/envs/kazu_39/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.steps.step.Step.__init__" title="Permalink to this definition">#</a></dt>
 <dd></dd></dl>
 
 <dl class="py method">
diff --git a/docs/_build/html/_autosummary/kazu.utils.link_index.html b/docs/_build/html/_autosummary/kazu.utils.link_index.html
index 9d1f1a447..e7b437dc3 100644
--- a/docs/_build/html/_autosummary/kazu.utils.link_index.html
+++ b/docs/_build/html/_autosummary/kazu.utils.link_index.html
@@ -511,7 +511,7 @@
 <dt class="field-odd">Parameters<span class="colon">:</span></dt>
 <dd class="field-odd"><ul class="simple">
 <li><p><strong>embedding_model</strong> (<a class="reference internal" href="kazu.modelling.linking.sapbert.train.html#kazu.modelling.linking.sapbert.train.PLSapbertModel" title="kazu.modelling.linking.sapbert.train.PLSapbertModel"><em>PLSapbertModel</em></a>) – </p></li>
-<li><p><strong>trainer</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.3.post1)"><em>Trainer</em></a><em>]</em>) – </p></li>
+<li><p><strong>trainer</strong> (<a class="reference external" href="https://docs.python.org/3/library/typing.html#typing.Optional" title="(in Python v3.11)"><em>Optional</em></a><em>[</em><a class="reference external" href="https://pytorch-lightning.readthedocs.io/en/stable/api/pytorch_lightning.trainer.trainer.Trainer.html#pytorch_lightning.trainer.trainer.Trainer" title="(in PyTorch Lightning v1.8.4)"><em>Trainer</em></a><em>]</em>) – </p></li>
 </ul>
 </dd>
 <dt class="field-even">Returns<span class="colon">:</span></dt>
diff --git a/docs/_build/html/_autosummary/kazu.utils.string_normalizer.html b/docs/_build/html/_autosummary/kazu.utils.string_normalizer.html
index cdac989a3..e9f29c873 100644
--- a/docs/_build/html/_autosummary/kazu.utils.string_normalizer.html
+++ b/docs/_build/html/_autosummary/kazu.utils.string_normalizer.html
@@ -696,7 +696,7 @@
 <p>protocol describing methods a normalizer should implement</p>
 <dl class="py method">
 <dt class="sig sig-object py" id="kazu.utils.string_normalizer.EntityClassNormalizer.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../.pyenv/versions/3.9.13/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.utils.string_normalizer.EntityClassNormalizer.__init__" title="Permalink to this definition">#</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference external" href="https://github.com/AZ-AI/kazu/blob/main/kazu/../../../miniforge3/envs/kazu_39/lib/python3.9/typing.py#L1087-L1113"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#kazu.utils.string_normalizer.EntityClassNormalizer.__init__" title="Permalink to this definition">#</a></dt>
 <dd></dd></dl>
 
 <dl class="py method">
diff --git a/docs/_build/html/_sources/_autosummary/kazu.modelling.ontology_preprocessing.base.rst.txt b/docs/_build/html/_sources/_autosummary/kazu.modelling.ontology_preprocessing.base.rst.txt
index 254ab3f3d..d3d9b116c 100644
--- a/docs/_build/html/_sources/_autosummary/kazu.modelling.ontology_preprocessing.base.rst.txt
+++ b/docs/_build/html/_sources/_autosummary/kazu.modelling.ontology_preprocessing.base.rst.txt
@@ -1,4 +1,4 @@
-kazu.modelling.ontology\_preprocessing.base
+﻿kazu.modelling.ontology\_preprocessing.base
 ===========================================
 
 .. automodule:: kazu.modelling.ontology_preprocessing.base
@@ -19,6 +19,7 @@ kazu.modelling.ontology\_preprocessing.base
    
       BiologicalProcessGeneOntologyParser
       CLOOntologyParser
+      CLOntologyParser
       CellosaurusOntologyParser
       CellularComponentGeneOntologyParser
       ChemblOntologyParser
diff --git a/docs/_build/html/_sources/curating_for_explosion.rst.txt b/docs/_build/html/_sources/curating_for_explosion.rst.txt
index 0a446dbfb..84d56f8cc 100644
--- a/docs/_build/html/_sources/curating_for_explosion.rst.txt
+++ b/docs/_build/html/_sources/curating_for_explosion.rst.txt
@@ -7,106 +7,23 @@ Many entities in Biomedical NER do not require sophisticated NER or disambiguati
 unambiguous, and have few genuine synonyms. For instance, terms such as "Breast Cancer" and "mitosis" can be taken at face value, and
 simple string matching techniques can be employed (in our case, we use the `Spacy PhraseMatcher <https://spacy.io/api/phrasematcher>`_).
 
-However, the terms in ontologies tend to be noisy when taken 'wholesale', and need curation in order to ensure high precision matching.
+However, the string labels in ontologies tend to be noisy when taken 'wholesale', and need curation in order to ensure high precision matching.
+For instance, the `Gene Ontology reference for envelope <http://amigo.geneontology.org/amigo/term/GO:0031975>`_ is highly ambiguous -
+we wouldn't want this to be tagged every time we see the string 'envelope' appear in text. On the other hand
+`cornified envelope assembly <http://amigo.geneontology.org/amigo/term/GO:1903575>`_ is highly specific, and whenever we see this string,
+we can safely assume it refers to this GO id.
 
-In Kazu, we take the following approach:
-
-1. generate synonym candidates from the raw ontology to build a putative pipeline.
-
-    .. code-block::
-
-        from kazu.modelling.ontology_matching import assemble_pipeline
-        from kazu.modelling.ontology_preprocessing.base import MondoOntologyParser
-        from kazu.modelling.ontology_preprocessing.synonym_generation import (
-            CombinatorialSynonymGenerator,
-            StringReplacement,
-            StopWordRemover,
-        )
-
-        syn_generator = CombinatorialSynonymGenerator(
-            [StopWordRemover(), StringReplacement(include_greek=True)]
-        )
-        parser = MondoOntologyParser(
-            in_path="", data_origin="test", synonym_generator=syn_generator
-        )
-        nlp = assemble_pipeline.main(
-            parser_name_to_entity_type={parser.name: "disease"},
-            parsers=[parser],
-            labels={"disease"},
-            output_dir="~/noisy_spacy_pipeline",
-        )
-
-
-2. we then run this pipeline over a large corpora of text, and look at the frequency of each hit. Note, the below
-   is for illustration only - you'll probably want a more sophisticated set up when doing this on a large document set!
-
-    .. code-block::
-
-        from kazu.data.data import Document
-        from kazu.steps.joint_ner_and_linking.explosion import ExplosionStringMatchingStep
-        from dataclasses import dataclass, field
-        from typing import List
-        import json
-
-
-        @dataclass
-        class AnnotatedPhrase:
-            term: str
-            action: str
-            symbolic: bool
-            case_sensitive: bool
-            term_norm_mapping: dict[str, str] = field(default_factory=dict)
-            examples: list[str] = field(default_factory=list)
+Given an ontology can contain hundreds of thousands of labels, how do we curate these? It's too labour intensive to look at every one. Therefore, we
+apply some pragmatism in order to produce a set of precise labels we want to use for dictionary based NER and linking.
 
+In Kazu, we take the following approach:
 
-        class AnnotatedPhraseEncoder(json.JSONEncoder):
-            def default(self, obj):
-                if isinstance(obj, AnnotatedPhrase):
-                    return obj.__dict__
-                # Base class default() raises TypeError:
-                return json.JSONEncoder.default(self, obj)
-
-
-        def save(path, data):
-            with open(path, "w") as f:
-                f.writelines(json.dumps(x, cls=AnnotatedPhraseEncoder) + "\n" for x in data)
-
-
-        # get_docs represents some function to get documents relevant to you
-        docs: List[Document] = get_docs()
-        noisy_step = ExplosionStringMatchingStep(path="~/noisy_spacy_pipeline")
-
-        noisy_step(docs)
-        curatable_phrases = []
-        for doc in docs:
-            for section in doc.sections:
-                for ent in section.entities:
-                    term_norm_mapping = {
-                        term.parser_name: term.term_norm for term in ent.syn_term_to_synonym_terms
-                    }
-                    symbolic = any(x.is_symbolic for x in ent.syn_term_to_synonym_terms)
-                    to_curate = AnnotatedPhrase(
-                        term=ent.match,
-                        action="to_curate",
-                        case_sensitive=True,
-                        symbolic=symbolic,
-                        term_norm_mapping=term_norm_mapping,
-                        examples=[section.text[ent.start : ent.end]],
-                    )
-                    curatable_phrases.append(to_curate)
-
-        save("~/phrases_to_curate.jsonl", curatable_phrases)
-
-
-3. we curate the phrases_to_curate.jsonl file, according to whether they look like good matches or not for a given parser, and whether case matters.
-
-4. Now, the final pipeline can be generated as follows:
-
-    .. code-block::
+1. Generate synonym candidates from the raw ontology to build a putative list of terms we might want to use. If the term is symbolic,
+   we assume it's case sensitive. Otherwise assume case insensitive.
+2. Build a pipeline from this list, execute this pipeline over a large corpus of target data, and explore the results to get a sense of
+   which terms are 'noisy'.
+3. Curate the top x hits by frequency, to determine whether a given term is precise enough in its own right to be valid for dictionary based NER.
+   We assume here that if a term doesn't hit frequently enough to be considered in step 2, it's probably safe to include. Depending on your target
+   data, this may be invalid - so in practice, the curation approach is iterative.
 
-        nlp = assemble_pipeline.main(
-            parser_name_to_entity_type={parser.name: "disease"},
-            curated_list="~/phrases_to_curate.jsonl",
-            labels={"disease"},
-            output_dir="~/<kazu model pack>/spacy_pipeline",
-        )
+TODO: add a worked example
diff --git a/docs/_build/html/_sources/quickstart.rst.txt b/docs/_build/html/_sources/quickstart.rst.txt
index 84d004997..e0f2a6a95 100644
--- a/docs/_build/html/_sources/quickstart.rst.txt
+++ b/docs/_build/html/_sources/quickstart.rst.txt
@@ -15,56 +15,56 @@ Ensure you are on version 21.0 or newer of pip.
 
 Model Pack
 ----------
-In order to use the majority of Kazu, you will need the model pack, which contains
-the pretrained models required by the pipeline. This is available from <TBA>
+In order to use the majority of Kazu, you will need a model pack, which contains
+the pretrained models and knowledge bases/ontologies required by the pipeline.
+These are available from the `release page <https://github.com/astrazeneca/kazu/releases>`_.
 
-Running Steps
--------------
-Components are wrapped as instances of :class:`kazu.steps.step.Step`.
-
-.. include:: single_step_example.rst
+Default configuration
+---------------------
+Kazu has a LOT of moving parts, each of which can be configured according to your requirements.
+Since this can get complicated, we use `Hydra <https://hydra.cc/docs/intro/>`_ to manage different
+configurations, and provide a 'default' configuration that is generally useful in most circumstances
+(and is also a good starting point for your own tweaks). This default configuration is located in
+the 'conf/' directory of the model pack.
 
-Advanced Pipeline configuration with Hydra
--------------------------------------------
-
-To create an NLP pipeline, you need to instantiate steps. Given the large amount
-of configuration required, the easiest way to do this is with Hydra https://hydra.cc/docs/intro/
-
-Here, you will need a hydra config directory (see kazu/conf for an example).
-
-First, export the path of your config directory to KAZU_CONFIG_DIR.
-
-To use the example kazu/conf config you will need to
-set the environment variable KAZU_MODEL_PACK to a path for a kazu model pack,
-or manually update the model paths that use the variable - search for
-`${oc.env:KAZU_MODEL_PACK}` in kazu/conf).
+Processing your first document
+------------------------------
 
 .. testcode::
     :skipif: kazu_config_missing or kazu_model_pack_missing
 
-    import os
-    from hydra import compose, initialize_config_dir
+    from hydra import initialize_config_dir, compose
     from hydra.utils import instantiate
     from kazu.data.data import Document
     from kazu.pipeline import Pipeline
-    # some text we want to process
-    text = """EGFR is a gene"""
+    from pathlib import Path
+    import os
 
-    with initialize_config_dir(config_dir=os.environ.get("KAZU_CONFIG_DIR")):
-        cfg = compose(config_name="config")
-        # instantiate a pipeline based on Hydra defaults
+    # the hydra config is kept in the model pack. Ensure this env
+    # variable is set to your model pack location
+    cdir = Path(os.environ["KAZU_MODEL_PACK"]).joinpath('conf')
+    with initialize_config_dir(config_dir=str(cdir)):
+        cfg = compose(
+            config_name="config",
+            overrides=[],
+        )
         pipeline: Pipeline = instantiate(cfg.Pipeline)
-        # create an instance of Document from our text string
+        text = "EGFR mutations are often implicated in lung cancer"
         doc = Document.create_simple_document(text)
-        # Pipeline takes a List[Document] as an argument to __call__
-        # and returns a processed List[Document]
-        result: Document = pipeline([doc])[0]
-        # a Document is composed of Sections
-        # (a Document created with create_simple_document has only one)
-        print(result.sections[0].get_text())
+        pipeline([doc])
+        print(f"{doc.sections[0].text}")
+
 
 .. testoutput::
     :hide:
     :skipif: kazu_config_missing or kazu_model_pack_missing
 
-    EGFR is a gene
+    EGFR mutations are often implicated in lung cancer
+
+You can now inspect the doc object, and explore what entities were detected on each section.
+
+Running Steps
+-------------
+Components are wrapped as instances of :class:`kazu.steps.step.Step`.
+
+.. include:: single_step_example.rst
diff --git a/docs/_build/html/_static/debug.css b/docs/_build/html/_static/debug.css
index 74d4aec33..3264805cc 100644
--- a/docs/_build/html/_static/debug.css
+++ b/docs/_build/html/_static/debug.css
@@ -64,6 +64,6 @@ body {
 .sb-footer__inner {
   background: salmon;
 }
-.sb-article {
+[role="main"] {
   background: white;
 }
diff --git a/docs/_build/html/_static/pygments.css b/docs/_build/html/_static/pygments.css
index 754715093..053cfd3db 100644
--- a/docs/_build/html/_static/pygments.css
+++ b/docs/_build/html/_static/pygments.css
@@ -54,7 +54,6 @@
 .highlight .nt { color: #204a87; font-weight: bold } /* Name.Tag */
 .highlight .nv { color: #000000 } /* Name.Variable */
 .highlight .ow { color: #204a87; font-weight: bold } /* Operator.Word */
-.highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */
 .highlight .w { color: #f8f8f8 } /* Text.Whitespace */
 .highlight .mb { color: #0000cf; font-weight: bold } /* Literal.Number.Bin */
 .highlight .mf { color: #0000cf; font-weight: bold } /* Literal.Number.Float */
@@ -139,7 +138,6 @@ body[data-theme="dark"] .highlight .py { color: #d0d0d0 } /* Name.Property */
 body[data-theme="dark"] .highlight .nt { color: #6ebf26; font-weight: bold } /* Name.Tag */
 body[data-theme="dark"] .highlight .nv { color: #40ffff } /* Name.Variable */
 body[data-theme="dark"] .highlight .ow { color: #6ebf26; font-weight: bold } /* Operator.Word */
-body[data-theme="dark"] .highlight .pm { color: #d0d0d0 } /* Punctuation.Marker */
 body[data-theme="dark"] .highlight .w { color: #666666 } /* Text.Whitespace */
 body[data-theme="dark"] .highlight .mb { color: #51b2fd } /* Literal.Number.Bin */
 body[data-theme="dark"] .highlight .mf { color: #51b2fd } /* Literal.Number.Float */
@@ -224,7 +222,6 @@ body:not([data-theme="light"]) .highlight .py { color: #d0d0d0 } /* Name.Propert
 body:not([data-theme="light"]) .highlight .nt { color: #6ebf26; font-weight: bold } /* Name.Tag */
 body:not([data-theme="light"]) .highlight .nv { color: #40ffff } /* Name.Variable */
 body:not([data-theme="light"]) .highlight .ow { color: #6ebf26; font-weight: bold } /* Operator.Word */
-body:not([data-theme="light"]) .highlight .pm { color: #d0d0d0 } /* Punctuation.Marker */
 body:not([data-theme="light"]) .highlight .w { color: #666666 } /* Text.Whitespace */
 body:not([data-theme="light"]) .highlight .mb { color: #51b2fd } /* Literal.Number.Bin */
 body:not([data-theme="light"]) .highlight .mf { color: #51b2fd } /* Literal.Number.Float */
diff --git a/docs/_build/html/_static/skeleton.css b/docs/_build/html/_static/skeleton.css
index 467c878c6..012311671 100644
--- a/docs/_build/html/_static/skeleton.css
+++ b/docs/_build/html/_static/skeleton.css
@@ -143,19 +143,19 @@ article {
 
 .sb-article-container,
 .sb-footer-content__inner,
-.drop-secondary-sidebar-for-full-width-content .sb-article,
+.drop-secondary-sidebar-for-full-width-content [role="main"],
 .drop-secondary-sidebar-for-full-width-content .match-content-width {
   width: 100vw;
 }
 
-.sb-article,
+[role="main"],
 .match-content-width {
   padding: 0 1rem;
   box-sizing: border-box;
 }
 
 @media (min-width: 32rem) {
-  .sb-article,
+  [role="main"],
   .match-content-width {
     padding: 0 2rem;
   }
@@ -167,33 +167,33 @@ article {
     width: auto;
   }
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 42rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 42rem;
   }
 }
 @media (min-width: 46rem) {
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 46rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 46rem;
   }
 }
 @media (min-width: 50rem) {
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 50rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 50rem;
   }
@@ -208,33 +208,33 @@ article {
     display: none !important;
   }
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 59rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 42rem;
   }
 }
 @media (min-width: 63rem) {
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 63rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 46rem;
   }
 }
 @media (min-width: 67rem) {
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 67rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 50rem;
   }
@@ -249,11 +249,11 @@ article {
     display: none !important;
   }
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 59rem;
   }
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 42rem;
   }
@@ -261,24 +261,24 @@ article {
 
 /* Full desktop views */
 @media (min-width: 80rem) {
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 46rem;
   }
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 63rem;
   }
 }
 
 @media (min-width: 84rem) {
-  .sb-article,
+  [role="main"],
   .match-content-width {
     width: 50rem;
   }
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 67rem;
   }
@@ -286,7 +286,7 @@ article {
 
 @media (min-width: 88rem) {
   .sb-footer-content__inner,
-  .drop-secondary-sidebar-for-full-width-content .sb-article,
+  .drop-secondary-sidebar-for-full-width-content [role="main"],
   .drop-secondary-sidebar-for-full-width-content .match-content-width {
     width: 67rem;
   }
diff --git a/docs/_build/html/curating_for_explosion.html b/docs/_build/html/curating_for_explosion.html
index af3928048..4c9e6c790 100644
--- a/docs/_build/html/curating_for_explosion.html
+++ b/docs/_build/html/curating_for_explosion.html
@@ -339,110 +339,24 @@
 <p>Many entities in Biomedical NER do not require sophisticated NER or disambiguation techniques, as they are often
 unambiguous, and have few genuine synonyms. For instance, terms such as “Breast Cancer” and “mitosis” can be taken at face value, and
 simple string matching techniques can be employed (in our case, we use the <a class="reference external" href="https://spacy.io/api/phrasematcher">Spacy PhraseMatcher</a>).</p>
-<p>However, the terms in ontologies tend to be noisy when taken ‘wholesale’, and need curation in order to ensure high precision matching.</p>
+<p>However, the string labels in ontologies tend to be noisy when taken ‘wholesale’, and need curation in order to ensure high precision matching.
+For instance, the <a class="reference external" href="http://amigo.geneontology.org/amigo/term/GO:0031975">Gene Ontology reference for envelope</a> is highly ambiguous -
+we wouldn’t want this to be tagged every time we see the string ‘envelope’ appear in text. On the other hand
+<a class="reference external" href="http://amigo.geneontology.org/amigo/term/GO:1903575">cornified envelope assembly</a> is highly specific, and whenever we see this string,
+we can safely assume it refers to this GO id.</p>
+<p>Given an ontology can contain hundreds of thousands of labels, how do we curate these? It’s too labour intensive to look at every one. Therefore, we
+apply some pragmatism in order to produce a set of precise labels we want to use for dictionary based NER and linking.</p>
 <p>In Kazu, we take the following approach:</p>
-<ol class="arabic">
-<li><p>generate synonym candidates from the raw ontology to build a putative pipeline.</p>
-<blockquote>
-<div><div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">kazu.modelling.ontology_matching</span> <span class="kn">import</span> <span class="n">assemble_pipeline</span>
-<span class="kn">from</span> <span class="nn">kazu.modelling.ontology_preprocessing.base</span> <span class="kn">import</span> <span class="n">MondoOntologyParser</span>
-<span class="kn">from</span> <span class="nn">kazu.modelling.ontology_preprocessing.synonym_generation</span> <span class="kn">import</span> <span class="p">(</span>
-    <span class="n">CombinatorialSynonymGenerator</span><span class="p">,</span>
-    <span class="n">StringReplacement</span><span class="p">,</span>
-    <span class="n">StopWordRemover</span><span class="p">,</span>
-<span class="p">)</span>
-
-<span class="n">syn_generator</span> <span class="o">=</span> <span class="n">CombinatorialSynonymGenerator</span><span class="p">(</span>
-    <span class="p">[</span><span class="n">StopWordRemover</span><span class="p">(),</span> <span class="n">StringReplacement</span><span class="p">(</span><span class="n">include_greek</span><span class="o">=</span><span class="kc">True</span><span class="p">)]</span>
-<span class="p">)</span>
-<span class="n">parser</span> <span class="o">=</span> <span class="n">MondoOntologyParser</span><span class="p">(</span>
-    <span class="n">in_path</span><span class="o">=</span><span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="n">data_origin</span><span class="o">=</span><span class="s2">&quot;test&quot;</span><span class="p">,</span> <span class="n">synonym_generator</span><span class="o">=</span><span class="n">syn_generator</span>
-<span class="p">)</span>
-<span class="n">nlp</span> <span class="o">=</span> <span class="n">assemble_pipeline</span><span class="o">.</span><span class="n">main</span><span class="p">(</span>
-    <span class="n">parser_name_to_entity_type</span><span class="o">=</span><span class="p">{</span><span class="n">parser</span><span class="o">.</span><span class="n">name</span><span class="p">:</span> <span class="s2">&quot;disease&quot;</span><span class="p">},</span>
-    <span class="n">parsers</span><span class="o">=</span><span class="p">[</span><span class="n">parser</span><span class="p">],</span>
-    <span class="n">labels</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;disease&quot;</span><span class="p">},</span>
-    <span class="n">output_dir</span><span class="o">=</span><span class="s2">&quot;~/noisy_spacy_pipeline&quot;</span><span class="p">,</span>
-<span class="p">)</span>
-</pre></div>
-</div>
-</div></blockquote>
-</li>
-<li><p>we then run this pipeline over a large corpora of text, and look at the frequency of each hit. Note, the below
-is for illustration only - you’ll probably want a more sophisticated set up when doing this on a large document set!</p>
-<blockquote>
-<div><div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">kazu.data.data</span> <span class="kn">import</span> <span class="n">Document</span>
-<span class="kn">from</span> <span class="nn">kazu.steps.joint_ner_and_linking.explosion</span> <span class="kn">import</span> <span class="n">ExplosionStringMatchingStep</span>
-<span class="kn">from</span> <span class="nn">dataclasses</span> <span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
-<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">List</span>
-<span class="kn">import</span> <span class="nn">json</span>
-
-
-<span class="nd">@dataclass</span>
-<span class="k">class</span> <span class="nc">AnnotatedPhrase</span><span class="p">:</span>
-    <span class="n">term</span><span class="p">:</span> <span class="nb">str</span>
-    <span class="n">action</span><span class="p">:</span> <span class="nb">str</span>
-    <span class="n">symbolic</span><span class="p">:</span> <span class="nb">bool</span>
-    <span class="n">case_sensitive</span><span class="p">:</span> <span class="nb">bool</span>
-    <span class="n">term_norm_mapping</span><span class="p">:</span> <span class="nb">dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">dict</span><span class="p">)</span>
-    <span class="n">examples</span><span class="p">:</span> <span class="nb">list</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">list</span><span class="p">)</span>
-
-
-<span class="k">class</span> <span class="nc">AnnotatedPhraseEncoder</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">JSONEncoder</span><span class="p">):</span>
-    <span class="k">def</span> <span class="nf">default</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">obj</span><span class="p">):</span>
-        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">obj</span><span class="p">,</span> <span class="n">AnnotatedPhrase</span><span class="p">):</span>
-            <span class="k">return</span> <span class="n">obj</span><span class="o">.</span><span class="vm">__dict__</span>
-        <span class="c1"># Base class default() raises TypeError:</span>
-        <span class="k">return</span> <span class="n">json</span><span class="o">.</span><span class="n">JSONEncoder</span><span class="o">.</span><span class="n">default</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">obj</span><span class="p">)</span>
-
-
-<span class="k">def</span> <span class="nf">save</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">data</span><span class="p">):</span>
-    <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="s2">&quot;w&quot;</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
-        <span class="n">f</span><span class="o">.</span><span class="n">writelines</span><span class="p">(</span><span class="n">json</span><span class="o">.</span><span class="n">dumps</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="bp">cls</span><span class="o">=</span><span class="n">AnnotatedPhraseEncoder</span><span class="p">)</span> <span class="o">+</span> <span class="s2">&quot;</span><span class="se">\n</span><span class="s2">&quot;</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">data</span><span class="p">)</span>
-
-
-<span class="c1"># get_docs represents some function to get documents relevant to you</span>
-<span class="n">docs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Document</span><span class="p">]</span> <span class="o">=</span> <span class="n">get_docs</span><span class="p">()</span>
-<span class="n">noisy_step</span> <span class="o">=</span> <span class="n">ExplosionStringMatchingStep</span><span class="p">(</span><span class="n">path</span><span class="o">=</span><span class="s2">&quot;~/noisy_spacy_pipeline&quot;</span><span class="p">)</span>
-
-<span class="n">noisy_step</span><span class="p">(</span><span class="n">docs</span><span class="p">)</span>
-<span class="n">curatable_phrases</span> <span class="o">=</span> <span class="p">[]</span>
-<span class="k">for</span> <span class="n">doc</span> <span class="ow">in</span> <span class="n">docs</span><span class="p">:</span>
-    <span class="k">for</span> <span class="n">section</span> <span class="ow">in</span> <span class="n">doc</span><span class="o">.</span><span class="n">sections</span><span class="p">:</span>
-        <span class="k">for</span> <span class="n">ent</span> <span class="ow">in</span> <span class="n">section</span><span class="o">.</span><span class="n">entities</span><span class="p">:</span>
-            <span class="n">term_norm_mapping</span> <span class="o">=</span> <span class="p">{</span>
-                <span class="n">term</span><span class="o">.</span><span class="n">parser_name</span><span class="p">:</span> <span class="n">term</span><span class="o">.</span><span class="n">term_norm</span> <span class="k">for</span> <span class="n">term</span> <span class="ow">in</span> <span class="n">ent</span><span class="o">.</span><span class="n">syn_term_to_synonym_terms</span>
-            <span class="p">}</span>
-            <span class="n">symbolic</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">is_symbolic</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">ent</span><span class="o">.</span><span class="n">syn_term_to_synonym_terms</span><span class="p">)</span>
-            <span class="n">to_curate</span> <span class="o">=</span> <span class="n">AnnotatedPhrase</span><span class="p">(</span>
-                <span class="n">term</span><span class="o">=</span><span class="n">ent</span><span class="o">.</span><span class="n">match</span><span class="p">,</span>
-                <span class="n">action</span><span class="o">=</span><span class="s2">&quot;to_curate&quot;</span><span class="p">,</span>
-                <span class="n">case_sensitive</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
-                <span class="n">symbolic</span><span class="o">=</span><span class="n">symbolic</span><span class="p">,</span>
-                <span class="n">term_norm_mapping</span><span class="o">=</span><span class="n">term_norm_mapping</span><span class="p">,</span>
-                <span class="n">examples</span><span class="o">=</span><span class="p">[</span><span class="n">section</span><span class="o">.</span><span class="n">text</span><span class="p">[</span><span class="n">ent</span><span class="o">.</span><span class="n">start</span> <span class="p">:</span> <span class="n">ent</span><span class="o">.</span><span class="n">end</span><span class="p">]],</span>
-            <span class="p">)</span>
-            <span class="n">curatable_phrases</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">to_curate</span><span class="p">)</span>
-
-<span class="n">save</span><span class="p">(</span><span class="s2">&quot;~/phrases_to_curate.jsonl&quot;</span><span class="p">,</span> <span class="n">curatable_phrases</span><span class="p">)</span>
-</pre></div>
-</div>
-</div></blockquote>
-</li>
-<li><p>we curate the phrases_to_curate.jsonl file, according to whether they look like good matches or not for a given parser, and whether case matters.</p></li>
-<li><p>Now, the final pipeline can be generated as follows:</p>
-<blockquote>
-<div><div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">nlp</span> <span class="o">=</span> <span class="n">assemble_pipeline</span><span class="o">.</span><span class="n">main</span><span class="p">(</span>
-    <span class="n">parser_name_to_entity_type</span><span class="o">=</span><span class="p">{</span><span class="n">parser</span><span class="o">.</span><span class="n">name</span><span class="p">:</span> <span class="s2">&quot;disease&quot;</span><span class="p">},</span>
-    <span class="n">curated_list</span><span class="o">=</span><span class="s2">&quot;~/phrases_to_curate.jsonl&quot;</span><span class="p">,</span>
-    <span class="n">labels</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;disease&quot;</span><span class="p">},</span>
-    <span class="n">output_dir</span><span class="o">=</span><span class="s2">&quot;~/&lt;kazu model pack&gt;/spacy_pipeline&quot;</span><span class="p">,</span>
-<span class="p">)</span>
-</pre></div>
-</div>
-</div></blockquote>
-</li>
+<ol class="arabic simple">
+<li><p>Generate synonym candidates from the raw ontology to build a putative list of terms we might want to use. If the term is symbolic,
+we assume it’s case sensitive. Otherwise assume case insensitive.</p></li>
+<li><p>Build a pipeline from this list, execute this pipeline over a large corpus of target data, and explore the results to get a sense of
+which terms are ‘noisy’.</p></li>
+<li><p>Curate the top x hits by frequency, to determine whether a given term is precise enough in its own right to be valid for dictionary-based NER.
+We assume here that if a term doesn’t hit frequently enough to be considered in step 2, it’s probably safe to include. Depending on your target
+data, this may be invalid -  so in practice, the curation approach is iterative.</p></li>
 </ol>
+<p>TODO: add a worked example</p>
 </section>
 
         </article>
diff --git a/docs/_build/html/genindex.html b/docs/_build/html/genindex.html
index 05c2d0a23..a7466a609 100644
--- a/docs/_build/html/genindex.html
+++ b/docs/_build/html/genindex.html
@@ -410,6 +410,8 @@ <h2>_</h2>
           <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser.__init__">(kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser method)</a>
 </li>
           <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser.__init__">(kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser method)</a>
+</li>
+          <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CLOntologyParser.__init__">(kazu.modelling.ontology_preprocessing.base.CLOntologyParser method)</a>
 </li>
           <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CLOOntologyParser.__init__">(kazu.modelling.ontology_preprocessing.base.CLOOntologyParser method)</a>
 </li>
@@ -765,10 +767,12 @@ <h2>C</h2>
         <li><a href="_autosummary/kazu.steps.other.cleanup.html#kazu.steps.other.cleanup.CleanupAction">CleanupAction (class in kazu.steps.other.cleanup)</a>
 </li>
         <li><a href="_autosummary/kazu.steps.other.cleanup.html#kazu.steps.other.cleanup.CleanupStep">CleanupStep (class in kazu.steps.other.cleanup)</a>
+</li>
+        <li><a href="_autosummary/kazu.utils.build_and_test_model_packs.html#kazu.utils.build_and_test_model_packs.ModelPackBuilder.clear_cached_resources_from_model_pack_dir">clear_cached_resources_from_model_pack_dir() (kazu.utils.build_and_test_model_packs.ModelPackBuilder static method)</a>
 </li>
     </ul></td>
     <td style="width: 33%; vertical-align: top;"><ul>
-        <li><a href="_autosummary/kazu.utils.build_and_test_model_packs.html#kazu.utils.build_and_test_model_packs.ModelPackBuilder.clear_cached_resources_from_model_pack_dir">clear_cached_resources_from_model_pack_dir() (kazu.utils.build_and_test_model_packs.ModelPackBuilder static method)</a>
+        <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CLOntologyParser">CLOntologyParser (class in kazu.modelling.ontology_preprocessing.base)</a>
 </li>
         <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CLOOntologyParser">CLOOntologyParser (class in kazu.modelling.ontology_preprocessing.base)</a>
 </li>
@@ -1040,6 +1044,8 @@ <h2>F</h2>
 
         <ul>
           <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser.find_kb">(kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser method)</a>
+</li>
+          <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CLOntologyParser.find_kb">(kazu.modelling.ontology_preprocessing.base.CLOntologyParser method)</a>
 </li>
           <li><a href="_autosummary/kazu.modelling.ontology_preprocessing.base.html#kazu.modelling.ontology_preprocessing.base.CLOOntologyParser.find_kb">(kazu.modelling.ontology_preprocessing.base.CLOOntologyParser method)</a>
 </li>
diff --git a/docs/_build/html/index.html b/docs/_build/html/index.html
index 0b31d3ffe..13cee9bf8 100644
--- a/docs/_build/html/index.html
+++ b/docs/_build/html/index.html
@@ -350,8 +350,9 @@ <h1>Welcome to Kazu’s documentation!<a class="headerlink" href="#welcome-to-ka
 <li class="toctree-l1"><a class="reference internal" href="quickstart.html">Quickstart</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="quickstart.html#installation">Installation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="quickstart.html#model-pack">Model Pack</a></li>
+<li class="toctree-l2"><a class="reference internal" href="quickstart.html#default-configuration">Default configuration</a></li>
+<li class="toctree-l2"><a class="reference internal" href="quickstart.html#processing-your-first-document">Processing your first document</a></li>
 <li class="toctree-l2"><a class="reference internal" href="quickstart.html#running-steps">Running Steps</a></li>
-<li class="toctree-l2"><a class="reference internal" href="quickstart.html#advanced-pipeline-configuration-with-hydra">Advanced Pipeline configuration with Hydra</a></li>
 </ul>
 </li>
 <li class="toctree-l1"><a class="reference internal" href="default_pipeline.html">The Default Kazu Pipeline</a></li>
diff --git a/docs/_build/html/objects.inv b/docs/_build/html/objects.inv
index 0d98af4f5..b4570ee04 100644
Binary files a/docs/_build/html/objects.inv and b/docs/_build/html/objects.inv differ
diff --git a/docs/_build/html/quickstart.html b/docs/_build/html/quickstart.html
index 6d4f9b472..68704c968 100644
--- a/docs/_build/html/quickstart.html
+++ b/docs/_build/html/quickstart.html
@@ -346,8 +346,43 @@ <h2>Installation<a class="headerlink" href="#installation" title="Permalink to t
 </section>
 <section id="model-pack">
 <h2>Model Pack<a class="headerlink" href="#model-pack" title="Permalink to this heading">#</a></h2>
-<p>In order to use the majority of Kazu, you will need the model pack, which contains
-the pretrained models required by the pipeline. This is available from &lt;TBA&gt;</p>
+<p>In order to use the majority of Kazu, you will need a model pack, which contains
+the pretrained models and knowledge bases/ontologies required by the pipeline.
+These are available on the <a class="reference external" href="https://github.com/astrazeneca/kazu/releases">release page</a>.</p>
+</section>
+<section id="default-configuration">
+<h2>Default configuration<a class="headerlink" href="#default-configuration" title="Permalink to this heading">#</a></h2>
+<p>Kazu has a LOT of moving parts, each of which can be configured according to your requirements.
+Since this can get complicated, we use <a class="reference external" href="https://hydra.cc/docs/intro/">Hydra</a> to manage different
+configurations, and provide a ‘default’ configuration that is generally useful in most circumstances
+(and is also a good starting point for your own tweaks). This default configuration is located in
+the ‘conf/’ directory of the model pack.</p>
+</section>
+<section id="processing-your-first-document">
+<h2>Processing your first document<a class="headerlink" href="#processing-your-first-document" title="Permalink to this heading">#</a></h2>
+<div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span> <span class="nn">hydra</span> <span class="kn">import</span> <span class="n">initialize_config_dir</span><span class="p">,</span> <span class="n">compose</span>
+<span class="kn">from</span> <span class="nn">hydra.utils</span> <span class="kn">import</span> <span class="n">instantiate</span>
+<span class="kn">from</span> <span class="nn">kazu.data.data</span> <span class="kn">import</span> <span class="n">Document</span>
+<span class="kn">from</span> <span class="nn">kazu.pipeline</span> <span class="kn">import</span> <span class="n">Pipeline</span>
+<span class="kn">from</span> <span class="nn">pathlib</span> <span class="kn">import</span> <span class="n">Path</span>
+<span class="kn">import</span> <span class="nn">os</span>
+
+<span class="c1"># the hydra config is kept in the model pack. Ensure this env</span>
+<span class="c1"># variable is set to your model pack location</span>
+<span class="n">cdir</span> <span class="o">=</span> <span class="n">Path</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;KAZU_MODEL_PACK&quot;</span><span class="p">])</span><span class="o">.</span><span class="n">joinpath</span><span class="p">(</span><span class="s1">&#39;conf&#39;</span><span class="p">)</span>
+<span class="k">with</span> <span class="n">initialize_config_dir</span><span class="p">(</span><span class="n">config_dir</span><span class="o">=</span><span class="nb">str</span><span class="p">(</span><span class="n">cdir</span><span class="p">)):</span>
+    <span class="n">cfg</span> <span class="o">=</span> <span class="n">compose</span><span class="p">(</span>
+        <span class="n">config_name</span><span class="o">=</span><span class="s2">&quot;config&quot;</span><span class="p">,</span>
+        <span class="n">overrides</span><span class="o">=</span><span class="p">[],</span>
+    <span class="p">)</span>
+    <span class="n">pipeline</span><span class="p">:</span> <span class="n">Pipeline</span> <span class="o">=</span> <span class="n">instantiate</span><span class="p">(</span><span class="n">cfg</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">)</span>
+    <span class="n">text</span> <span class="o">=</span> <span class="s2">&quot;EGFR mutations are often implicated in lung cancer&quot;</span>
+    <span class="n">doc</span> <span class="o">=</span> <span class="n">Document</span><span class="o">.</span><span class="n">create_simple_document</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
+    <span class="n">pipeline</span><span class="p">([</span><span class="n">doc</span><span class="p">])</span>
+    <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">doc</span><span class="o">.</span><span class="n">sections</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+</pre></div>
+</div>
+<p>You can now inspect the doc object and explore what entities were detected in each section.</p>
 </section>
 <section id="running-steps">
 <h2>Running Steps<a class="headerlink" href="#running-steps" title="Permalink to this heading">#</a></h2>
@@ -383,39 +418,6 @@ <h2>Running Steps<a class="headerlink" href="#running-steps" title="Permalink to
 </pre></div>
 </div>
 </section>
-<section id="advanced-pipeline-configuration-with-hydra">
-<h2>Advanced Pipeline configuration with Hydra<a class="headerlink" href="#advanced-pipeline-configuration-with-hydra" title="Permalink to this heading">#</a></h2>
-<p>To create an NLP pipeline, you need to instantiate steps. Given the large amount
-of configuration required, the easiest way to do this is with Hydra <a class="reference external" href="https://hydra.cc/docs/intro/">https://hydra.cc/docs/intro/</a></p>
-<p>Here, you will need a hydra config directory (see kazu/conf for an example).</p>
-<p>First, export the path of your config directory to KAZU_CONFIG_DIR.</p>
-<p>To use the example kazu/conf config you will need to
-set the environment variable KAZU_MODEL_PACK to a path for a kazu model pack,
-or manually update the model paths that use the variable - search for
-<cite>${oc.env:KAZU_MODEL_PACK}</cite> in kazu/conf).</p>
-<div class="highlight-python3 notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
-<span class="kn">from</span> <span class="nn">hydra</span> <span class="kn">import</span> <span class="n">compose</span><span class="p">,</span> <span class="n">initialize_config_dir</span>
-<span class="kn">from</span> <span class="nn">hydra.utils</span> <span class="kn">import</span> <span class="n">instantiate</span>
-<span class="kn">from</span> <span class="nn">kazu.data.data</span> <span class="kn">import</span> <span class="n">Document</span>
-<span class="kn">from</span> <span class="nn">kazu.pipeline</span> <span class="kn">import</span> <span class="n">Pipeline</span>
-<span class="c1"># some text we want to process</span>
-<span class="n">text</span> <span class="o">=</span> <span class="s2">&quot;&quot;&quot;EGFR is a gene&quot;&quot;&quot;</span>
-
-<span class="k">with</span> <span class="n">initialize_config_dir</span><span class="p">(</span><span class="n">config_dir</span><span class="o">=</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;KAZU_CONFIG_DIR&quot;</span><span class="p">)):</span>
-    <span class="n">cfg</span> <span class="o">=</span> <span class="n">compose</span><span class="p">(</span><span class="n">config_name</span><span class="o">=</span><span class="s2">&quot;config&quot;</span><span class="p">)</span>
-    <span class="c1"># instantiate a pipeline based on Hydra defaults</span>
-    <span class="n">pipeline</span><span class="p">:</span> <span class="n">Pipeline</span> <span class="o">=</span> <span class="n">instantiate</span><span class="p">(</span><span class="n">cfg</span><span class="o">.</span><span class="n">Pipeline</span><span class="p">)</span>
-    <span class="c1"># create an instance of Document from our text string</span>
-    <span class="n">doc</span> <span class="o">=</span> <span class="n">Document</span><span class="o">.</span><span class="n">create_simple_document</span><span class="p">(</span><span class="n">text</span><span class="p">)</span>
-    <span class="c1"># Pipeline takes a List[Document] as an argument to __call__</span>
-    <span class="c1"># and returns a processed List[Document]</span>
-    <span class="n">result</span><span class="p">:</span> <span class="n">Document</span> <span class="o">=</span> <span class="n">pipeline</span><span class="p">([</span><span class="n">doc</span><span class="p">])[</span><span class="mi">0</span><span class="p">]</span>
-    <span class="c1"># a Document is composed of Sections</span>
-    <span class="c1"># (a Document created with create_simple_document has only one)</span>
-    <span class="nb">print</span><span class="p">(</span><span class="n">result</span><span class="o">.</span><span class="n">sections</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">get_text</span><span class="p">())</span>
-</pre></div>
-</div>
-</section>
 </section>
 
         </article>
@@ -478,8 +480,9 @@ <h2>Advanced Pipeline configuration with Hydra<a class="headerlink" href="#advan
 <li><a class="reference internal" href="#">Quickstart</a><ul>
 <li><a class="reference internal" href="#installation">Installation</a></li>
 <li><a class="reference internal" href="#model-pack">Model Pack</a></li>
+<li><a class="reference internal" href="#default-configuration">Default configuration</a></li>
+<li><a class="reference internal" href="#processing-your-first-document">Processing your first document</a></li>
 <li><a class="reference internal" href="#running-steps">Running Steps</a></li>
-<li><a class="reference internal" href="#advanced-pipeline-configuration-with-hydra">Advanced Pipeline configuration with Hydra</a></li>
 </ul>
 </li>
 </ul>
diff --git a/docs/_build/html/searchindex.js b/docs/_build/html/searchindex.js
index 931d9ff8f..5a1ef695e 100644
--- a/docs/_build/html/searchindex.js
+++ b/docs/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["_autosummary/kazu", "_autosummary/kazu.data", "_autosummary/kazu.data.data", "_autosummary/kazu.data.pytorch", "_autosummary/kazu.modelling", "_autosummary/kazu.modelling.annotation", "_autosummary/kazu.modelling.annotation.acceptance_test", "_autosummary/kazu.modelling.annotation.label_studio", "_autosummary/kazu.modelling.database", "_autosummary/kazu.modelling.database.in_memory_db", "_autosummary/kazu.modelling.distillation", "_autosummary/kazu.modelling.distillation.data_utils", "_autosummary/kazu.modelling.distillation.dataprocessor", "_autosummary/kazu.modelling.distillation.lightning_plugins", "_autosummary/kazu.modelling.distillation.metrics", "_autosummary/kazu.modelling.distillation.models", "_autosummary/kazu.modelling.distillation.tiny_transformers", "_autosummary/kazu.modelling.distillation.train", "_autosummary/kazu.modelling.hf_lightning_wrappers", "_autosummary/kazu.modelling.language", "_autosummary/kazu.modelling.language.language_phenomena", "_autosummary/kazu.modelling.language.string_similarity_scorers", "_autosummary/kazu.modelling.linking", "_autosummary/kazu.modelling.linking.sapbert", "_autosummary/kazu.modelling.linking.sapbert.train", "_autosummary/kazu.modelling.ontology_matching", "_autosummary/kazu.modelling.ontology_matching.assemble_pipeline", "_autosummary/kazu.modelling.ontology_matching.ontology_matcher", "_autosummary/kazu.modelling.ontology_preprocessing", "_autosummary/kazu.modelling.ontology_preprocessing.base", "_autosummary/kazu.modelling.ontology_preprocessing.synonym_generation", "_autosummary/kazu.pipeline", "_autosummary/kazu.pipeline.pipeline", "_autosummary/kazu.steps", "_autosummary/kazu.steps.document_post_processing", "_autosummary/kazu.steps.document_post_processing.abbreviation_finder", "_autosummary/kazu.steps.joint_ner_and_linking", "_autosummary/kazu.steps.joint_ner_and_linking.explosion", "_autosummary/kazu.steps.linking", "_autosummary/kazu.steps.linking.dictionary", 
"_autosummary/kazu.steps.linking.mapping_step", "_autosummary/kazu.steps.linking.post_processing", "_autosummary/kazu.steps.linking.post_processing.disambiguation", "_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring", "_autosummary/kazu.steps.linking.post_processing.disambiguation.strategies", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies.strategies", "_autosummary/kazu.steps.linking.post_processing.strategy_runner", "_autosummary/kazu.steps.linking.post_processing.xref_manager", "_autosummary/kazu.steps.linking.sapbert", "_autosummary/kazu.steps.ner", "_autosummary/kazu.steps.ner.entity_post_processing", "_autosummary/kazu.steps.ner.hf_token_classification", "_autosummary/kazu.steps.ner.seth", "_autosummary/kazu.steps.ner.spacy_ner", "_autosummary/kazu.steps.ner.tokenized_word_processor", "_autosummary/kazu.steps.other", "_autosummary/kazu.steps.other.cleanup", "_autosummary/kazu.steps.other.merge_overlapping_ents", "_autosummary/kazu.steps.other.stanza", "_autosummary/kazu.steps.step", "_autosummary/kazu.utils", "_autosummary/kazu.utils.abbreviation_detector", "_autosummary/kazu.utils.build_and_test_model_packs", "_autosummary/kazu.utils.caching", "_autosummary/kazu.utils.grouping", "_autosummary/kazu.utils.link_index", "_autosummary/kazu.utils.spacy_pipeline", "_autosummary/kazu.utils.stanza_pipeline", "_autosummary/kazu.utils.stopwatch", "_autosummary/kazu.utils.string_normalizer", "_autosummary/kazu.utils.utils", "_autosummary/kazu.web", "_autosummary/kazu.web.jwtauth", "_autosummary/kazu.web.routes", "_autosummary/kazu.web.server", "apidocs_autosummary", "curating_for_explosion", "datamodel", "default_pipeline", "index", "introduction", "kazu_webservice", "label_studio_integration", "ontology_parser", "pipeline_example", "quickstart", "scaling_kazu", "single_step_example"], "filenames": ["_autosummary/kazu.rst", 
"_autosummary/kazu.data.rst", "_autosummary/kazu.data.data.rst", "_autosummary/kazu.data.pytorch.rst", "_autosummary/kazu.modelling.rst", "_autosummary/kazu.modelling.annotation.rst", "_autosummary/kazu.modelling.annotation.acceptance_test.rst", "_autosummary/kazu.modelling.annotation.label_studio.rst", "_autosummary/kazu.modelling.database.rst", "_autosummary/kazu.modelling.database.in_memory_db.rst", "_autosummary/kazu.modelling.distillation.rst", "_autosummary/kazu.modelling.distillation.data_utils.rst", "_autosummary/kazu.modelling.distillation.dataprocessor.rst", "_autosummary/kazu.modelling.distillation.lightning_plugins.rst", "_autosummary/kazu.modelling.distillation.metrics.rst", "_autosummary/kazu.modelling.distillation.models.rst", "_autosummary/kazu.modelling.distillation.tiny_transformers.rst", "_autosummary/kazu.modelling.distillation.train.rst", "_autosummary/kazu.modelling.hf_lightning_wrappers.rst", "_autosummary/kazu.modelling.language.rst", "_autosummary/kazu.modelling.language.language_phenomena.rst", "_autosummary/kazu.modelling.language.string_similarity_scorers.rst", "_autosummary/kazu.modelling.linking.rst", "_autosummary/kazu.modelling.linking.sapbert.rst", "_autosummary/kazu.modelling.linking.sapbert.train.rst", "_autosummary/kazu.modelling.ontology_matching.rst", "_autosummary/kazu.modelling.ontology_matching.assemble_pipeline.rst", "_autosummary/kazu.modelling.ontology_matching.ontology_matcher.rst", "_autosummary/kazu.modelling.ontology_preprocessing.rst", "_autosummary/kazu.modelling.ontology_preprocessing.base.rst", "_autosummary/kazu.modelling.ontology_preprocessing.synonym_generation.rst", "_autosummary/kazu.pipeline.rst", "_autosummary/kazu.pipeline.pipeline.rst", "_autosummary/kazu.steps.rst", "_autosummary/kazu.steps.document_post_processing.rst", "_autosummary/kazu.steps.document_post_processing.abbreviation_finder.rst", "_autosummary/kazu.steps.joint_ner_and_linking.rst", 
"_autosummary/kazu.steps.joint_ner_and_linking.explosion.rst", "_autosummary/kazu.steps.linking.rst", "_autosummary/kazu.steps.linking.dictionary.rst", "_autosummary/kazu.steps.linking.mapping_step.rst", "_autosummary/kazu.steps.linking.post_processing.rst", "_autosummary/kazu.steps.linking.post_processing.disambiguation.rst", "_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring.rst", "_autosummary/kazu.steps.linking.post_processing.disambiguation.strategies.rst", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies.rst", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies.strategies.rst", "_autosummary/kazu.steps.linking.post_processing.strategy_runner.rst", "_autosummary/kazu.steps.linking.post_processing.xref_manager.rst", "_autosummary/kazu.steps.linking.sapbert.rst", "_autosummary/kazu.steps.ner.rst", "_autosummary/kazu.steps.ner.entity_post_processing.rst", "_autosummary/kazu.steps.ner.hf_token_classification.rst", "_autosummary/kazu.steps.ner.seth.rst", "_autosummary/kazu.steps.ner.spacy_ner.rst", "_autosummary/kazu.steps.ner.tokenized_word_processor.rst", "_autosummary/kazu.steps.other.rst", "_autosummary/kazu.steps.other.cleanup.rst", "_autosummary/kazu.steps.other.merge_overlapping_ents.rst", "_autosummary/kazu.steps.other.stanza.rst", "_autosummary/kazu.steps.step.rst", "_autosummary/kazu.utils.rst", "_autosummary/kazu.utils.abbreviation_detector.rst", "_autosummary/kazu.utils.build_and_test_model_packs.rst", "_autosummary/kazu.utils.caching.rst", "_autosummary/kazu.utils.grouping.rst", "_autosummary/kazu.utils.link_index.rst", "_autosummary/kazu.utils.spacy_pipeline.rst", "_autosummary/kazu.utils.stanza_pipeline.rst", "_autosummary/kazu.utils.stopwatch.rst", "_autosummary/kazu.utils.string_normalizer.rst", "_autosummary/kazu.utils.utils.rst", "_autosummary/kazu.web.rst", "_autosummary/kazu.web.jwtauth.rst", "_autosummary/kazu.web.routes.rst", "_autosummary/kazu.web.server.rst", 
"apidocs_autosummary.rst", "curating_for_explosion.rst", "datamodel.rst", "default_pipeline.rst", "index.rst", "introduction.rst", "kazu_webservice.rst", "label_studio_integration.rst", "ontology_parser.rst", "pipeline_example.rst", "quickstart.rst", "scaling_kazu.rst", "single_step_example.rst"], "titles": ["kazu", "kazu.data", "kazu.data.data", "kazu.data.pytorch", "kazu.modelling", "kazu.modelling.annotation", "kazu.modelling.annotation.acceptance_test", "kazu.modelling.annotation.label_studio", "kazu.modelling.database", "kazu.modelling.database.in_memory_db", "kazu.modelling.distillation", "kazu.modelling.distillation.data_utils", "kazu.modelling.distillation.dataprocessor", "kazu.modelling.distillation.lightning_plugins", "kazu.modelling.distillation.metrics", "kazu.modelling.distillation.models", "kazu.modelling.distillation.tiny_transformers", "kazu.modelling.distillation.train", "kazu.modelling.hf_lightning_wrappers", "kazu.modelling.language", "kazu.modelling.language.language_phenomena", "kazu.modelling.language.string_similarity_scorers", "kazu.modelling.linking", "kazu.modelling.linking.sapbert", "kazu.modelling.linking.sapbert.train", "kazu.modelling.ontology_matching", "kazu.modelling.ontology_matching.assemble_pipeline", "kazu.modelling.ontology_matching.ontology_matcher", "kazu.modelling.ontology_preprocessing", "kazu.modelling.ontology_preprocessing.base", "kazu.modelling.ontology_preprocessing.synonym_generation", "kazu.pipeline", "kazu.pipeline.pipeline", "kazu.steps", "kazu.steps.document_post_processing", "kazu.steps.document_post_processing.abbreviation_finder", "kazu.steps.joint_ner_and_linking", "kazu.steps.joint_ner_and_linking.explosion", "kazu.steps.linking", "kazu.steps.linking.dictionary", "kazu.steps.linking.mapping_step", "kazu.steps.linking.post_processing", "kazu.steps.linking.post_processing.disambiguation", "kazu.steps.linking.post_processing.disambiguation.context_scoring", 
"kazu.steps.linking.post_processing.disambiguation.strategies", "kazu.steps.linking.post_processing.mapping_strategies", "kazu.steps.linking.post_processing.mapping_strategies.strategies", "kazu.steps.linking.post_processing.strategy_runner", "kazu.steps.linking.post_processing.xref_manager", "kazu.steps.linking.sapbert", "kazu.steps.ner", "kazu.steps.ner.entity_post_processing", "kazu.steps.ner.hf_token_classification", "kazu.steps.ner.seth", "kazu.steps.ner.spacy_ner", "kazu.steps.ner.tokenized_word_processor", "kazu.steps.other", "kazu.steps.other.cleanup", "kazu.steps.other.merge_overlapping_ents", "kazu.steps.other.stanza", "kazu.steps.step", "kazu.utils", "kazu.utils.abbreviation_detector", "kazu.utils.build_and_test_model_packs", "kazu.utils.caching", "kazu.utils.grouping", "kazu.utils.link_index", "kazu.utils.spacy_pipeline", "kazu.utils.stanza_pipeline", "kazu.utils.stopwatch", "kazu.utils.string_normalizer", "kazu.utils.utils", "kazu.web", "kazu.web.jwtauth", "kazu.web.routes", "kazu.web.server", "API Reference", "Curating a knowledge base for NER and Linking", "Kazu Data Model", "At a glance: How to use the default Kazu pipeline", "Welcome to Kazu\u2019s documentation!", "Introduction", "TBA", "Visualising results in Label Studio", "The OntologyParser", "&lt;no title&gt;", "Quickstart", "TBA", "&lt;no title&gt;"], "terms": {"modul": [0, 1, 4, 5, 8, 10, 15, 16, 19, 22, 23, 24, 25, 28, 31, 33, 34, 36, 38, 41, 42, 45, 50, 56, 61, 72, 80], "class": [2, 3, 6, 7, 9, 12, 13, 15, 16, 18, 21, 24, 27, 29, 30, 32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 73, 75, 77, 79, 84], "autonameenum": 2, "sourc": [2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 21, 24, 26, 27, 29, 30, 32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 75, 81, 84], "base": [2, 3, 6, 7, 9, 12, 13, 15, 16, 18, 21, 24, 26, 27, 30, 32, 35, 37, 39, 40, 43, 
44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 73, 75, 78, 80, 84, 86], "enum": 2, "subclass": [2, 15, 16, 55], "creat": [2, 7, 15, 24, 27, 29, 32, 43, 44, 47, 48, 52, 66, 78, 83, 84, 86, 88], "an": [2, 9, 15, 18, 21, 24, 26, 27, 29, 32, 44, 46, 47, 55, 58, 59, 62, 64, 66, 70, 78, 81, 83, 84, 86, 88], "where": [2, 15, 24, 46, 48, 66, 79], "valu": [2, 15, 24, 46, 60, 77, 84], "ar": [2, 15, 21, 24, 26, 27, 29, 43, 44, 46, 47, 48, 51, 52, 53, 55, 58, 59, 60, 62, 64, 66, 70, 73, 77, 78, 79, 81, 84, 86, 88], "name": [2, 7, 9, 24, 27, 29, 44, 46, 47, 48, 54, 55, 58, 59, 60, 66, 73, 77, 81, 84], "when": [2, 13, 15, 24, 27, 29, 44, 47, 48, 49, 55, 60, 66, 69, 77, 81, 84], "us": [2, 9, 14, 15, 18, 21, 24, 26, 29, 30, 32, 35, 39, 40, 43, 44, 46, 47, 48, 49, 52, 55, 59, 60, 62, 63, 66, 70, 73, 77, 81, 83, 84, 86], "auto": 2, "taken": [2, 46, 77], "from": [2, 13, 15, 24, 26, 27, 29, 30, 32, 44, 47, 48, 52, 54, 58, 59, 62, 63, 66, 69, 70, 73, 75, 77, 78, 81, 83, 84, 85, 86, 88], "python": [2, 29, 54, 59], "doc": [2, 6, 7, 26, 27, 29, 32, 44, 51, 52, 54, 57, 60, 62, 71, 77, 78, 83, 85, 86, 88], "licens": [2, 70, 73, 81], "under": [2, 55, 73, 81], "zero": 2, "claus": [2, 70, 73], "bsd": [2, 70, 73], "charspan": [2, 7], "object": [2, 6, 7, 9, 15, 21, 24, 27, 29, 30, 32, 43, 44, 46, 47, 51, 55, 57, 62, 63, 64, 67, 68, 69, 70, 71, 84], "A": [2, 13, 15, 24, 29, 32, 35, 37, 40, 44, 46, 48, 52, 53, 54, 55, 58, 59, 60, 62, 64, 70, 71, 73, 78, 84], "concept": [2, 29, 52, 78, 81, 84], "similar": [2, 21, 24, 29, 46, 59, 84], "spaci": [2, 26, 27, 37, 54, 60, 62, 67, 77, 79], "span": [2, 6, 26, 27, 37, 47, 55, 58, 62, 78, 79, 86, 88], "except": [2, 6, 32, 60, 63, 73], "charact": [2, 55, 62, 70, 78, 86, 88], "index": [2, 15, 18, 24, 48, 52, 58, 66, 71], "rather": [2, 60, 81], "than": [2, 15, 21, 29, 46, 47, 49, 52, 60, 81], "token": [2, 3, 15, 21, 24, 26, 46, 52, 55, 59, 70, 71, 73, 83], "__init__": [2, 3, 6, 7, 13, 15, 16, 18, 21, 24, 27, 29, 30, 
32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 64, 66, 67, 68, 69, 70, 71, 73], "start": [2, 17, 24, 51, 55, 58, 69, 75, 77, 78, 81, 86, 88], "end": [2, 15, 18, 24, 47, 51, 55, 58, 63, 77, 78, 86, 88], "paramet": [2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 17, 18, 21, 24, 26, 27, 29, 30, 32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 75, 84], "int": [2, 6, 7, 9, 14, 15, 18, 24, 27, 29, 32, 37, 39, 43, 46, 49, 52, 55, 58, 62, 64, 66, 69, 71], "return": [2, 6, 7, 9, 11, 12, 13, 14, 15, 17, 18, 24, 26, 27, 29, 30, 32, 37, 43, 44, 46, 47, 48, 52, 55, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 71, 73, 75, 77, 78, 84, 86, 88], "type": [2, 6, 7, 9, 11, 12, 13, 14, 15, 17, 18, 21, 24, 26, 27, 29, 30, 32, 37, 43, 44, 46, 47, 48, 52, 55, 58, 60, 62, 63, 64, 65, 66, 68, 70, 71, 73, 75, 77, 83, 84], "none": [2, 6, 9, 13, 15, 16, 17, 18, 24, 26, 27, 29, 30, 32, 35, 39, 44, 46, 47, 49, 52, 53, 55, 62, 63, 66, 67, 70, 73, 75, 81, 83, 85], "is_completely_overlap": 2, "other": [2, 9, 29, 35, 46, 47, 48, 70, 73, 79, 80, 81, 84], "true": [2, 15, 24, 27, 29, 30, 37, 46, 49, 55, 58, 71, 77, 84], "complet": [2, 84], "overlap": [2, 7, 55, 58, 79, 81], "thi": [2, 9, 15, 16, 18, 21, 24, 26, 27, 29, 30, 32, 35, 43, 44, 46, 47, 48, 49, 52, 54, 55, 58, 60, 62, 63, 66, 70, 73, 77, 79, 81, 83, 84, 86], "is_partially_overlap": 2, "partial": [2, 58, 81], "document": [2, 6, 7, 32, 44, 46, 47, 49, 52, 53, 54, 55, 57, 60, 70, 71, 73, 75, 77, 78, 81, 83, 84, 85, 86, 88], "idx": [2, 9, 29, 46, 66, 84], "str": [2, 6, 7, 9, 11, 12, 13, 15, 21, 24, 26, 27, 29, 30, 32, 35, 37, 39, 43, 44, 46, 47, 48, 49, 51, 52, 53, 55, 57, 58, 60, 62, 63, 66, 67, 68, 69, 70, 71, 73, 75, 77, 84], "section": [2, 6, 7, 15, 24, 35, 52, 54, 62, 71, 75, 77, 78, 83, 86, 88], "list": [2, 6, 7, 12, 14, 15, 21, 24, 26, 27, 29, 30, 32, 35, 39, 43, 44, 46, 47, 48, 49, 51, 52, 55, 57, 58, 60, 62, 63, 64, 66, 70, 71, 73, 77, 78, 83, 
86, 88], "factori": [2, 6, 27, 46, 55], "metadata": [2, 9, 24, 29, 37, 60, 66, 78, 83, 84], "dict": [2, 6, 7, 9, 13, 15, 24, 27, 29, 30, 32, 37, 44, 46, 47, 48, 49, 51, 52, 55, 58, 66, 70, 71, 73, 75, 77], "ani": [2, 6, 9, 13, 15, 18, 24, 26, 29, 44, 46, 47, 53, 55, 58, 60, 62, 63, 66, 70, 71, 73, 77, 84], "as_minified_dict": 2, "drop_unmapped_": 2, "fals": [2, 15, 16, 24, 27, 46, 47, 48, 52, 55, 66], "drop_term": 2, "bool": [2, 9, 15, 16, 18, 24, 27, 29, 30, 37, 46, 47, 48, 49, 52, 53, 55, 57, 58, 62, 63, 66, 68, 70, 71, 73, 77], "classmethod": [2, 7, 30, 46, 60, 68, 70, 73], "create_simple_docu": [2, 78, 83, 85, 86, 88], "text": [2, 11, 15, 24, 29, 35, 43, 51, 52, 53, 55, 58, 59, 62, 66, 70, 71, 75, 77, 78, 83, 84, 85, 86], "instanc": [2, 15, 16, 18, 24, 29, 44, 46, 48, 52, 55, 62, 66, 67, 68, 77, 78, 84, 86, 88], "string": [2, 6, 9, 13, 14, 21, 24, 26, 27, 29, 30, 43, 44, 46, 49, 51, 52, 62, 66, 70, 71, 77, 84, 86], "The": [2, 13, 15, 18, 24, 26, 27, 29, 32, 43, 44, 47, 49, 52, 55, 58, 59, 60, 62, 66, 70, 78, 80, 81], "field": [2, 24, 26, 27, 44, 54, 77, 81], "gener": [2, 15, 24, 26, 29, 30, 39, 44, 48, 49, 52, 59, 60, 62, 63, 66, 77, 78, 84], "uuid": 2, "uuid4": 2, "hex": 2, "from_named_section_text": 2, "named_sect": 2, "get_ent": [2, 78, 86, 88], "get": [2, 9, 12, 15, 24, 47, 52, 55, 77, 84, 86], "all": [2, 9, 15, 16, 21, 24, 27, 29, 30, 44, 46, 48, 58, 59, 63, 66, 70, 73, 79, 84], "entiti": [2, 6, 7, 24, 27, 29, 37, 39, 44, 46, 47, 49, 51, 52, 53, 55, 57, 58, 59, 62, 64, 70, 71, 77, 78, 79, 81, 83, 84, 86, 88], "json": [2, 26, 29, 48, 77, 83], "kwarg": [2, 18, 21, 24, 57, 60, 68, 70], "custom": [2, 24, 63, 70, 80, 81, 83], "encod": [2, 3, 15, 24, 52, 71], "need": [2, 7, 9, 15, 16, 24, 26, 27, 29, 43, 44, 46, 48, 52, 55, 67, 77, 84, 86], "handl": [2, 15, 24, 32, 44, 47, 52, 55, 60, 70, 78, 84], "serialis": [2, 43], "issu": [2, 81, 84], "our": [2, 59, 77, 81, 83, 84, 86], "model": [2, 43, 44, 46, 49, 52, 55, 59, 60, 62, 63, 66, 67, 77, 79, 80, 81, 83, 84], 
"param": [2, 58, 66, 70], "drop": 2, "have": [2, 6, 15, 24, 29, 44, 46, 47, 54, 58, 60, 70, 71, 77, 81], "map": [2, 6, 7, 15, 27, 29, 30, 43, 44, 46, 47, 48, 49, 52, 55, 57, 58, 71, 79, 84], "synonym": [2, 9, 26, 27, 29, 30, 44, 46, 64, 77, 84], "term": [2, 21, 26, 27, 44, 46, 47, 64, 77, 84], "addit": [2, 15, 24, 30, 48, 78, 84], "pass": [2, 15, 16, 24, 39, 46, 52, 60], "dump": [2, 29, 77], "documentjsonutil": 2, "conversionexcept": 2, "doc_to_json_dict": 2, "option": [2, 9, 13, 15, 18, 24, 26, 27, 29, 30, 32, 35, 39, 44, 46, 47, 49, 52, 53, 55, 62, 63, 66, 70, 73, 81], "union": [2, 9, 11, 13, 15, 24, 26, 27, 29, 30, 37, 66, 68, 71, 73, 84], "float": [2, 6, 9, 14, 15, 24, 29, 32, 44, 46, 52, 55, 66], "static": [2, 6, 7, 9, 24, 29, 43, 44, 46, 47, 52, 63, 66, 70], "empti": 2, "x": [2, 15, 21, 24, 30, 44, 77, 78, 83, 84, 85, 86, 88], "minify_json_dict": 2, "doc_json_dict": 2, "in_plac": 2, "obj_to_dict_repr": 2, "obj": [2, 77], "remove_empty_el": 2, "d": [2, 15, 24, 30, 59, 70, 84], "recurs": 2, "remov": [2, 13, 30, 62, 70], "element": [2, 7, 46, 60, 70], "dictionari": [2, 15, 24, 29, 47, 66, 79, 80, 83, 84], "atomic_typ": 2, "nonetyp": 2, "listlike_typ": 2, "tupl": [2, 6, 7, 9, 15, 24, 27, 29, 37, 46, 47, 48, 52, 55, 58, 60, 62, 63, 65, 66, 70, 71, 73], "set": [2, 6, 9, 12, 15, 24, 27, 29, 30, 37, 39, 43, 44, 46, 47, 48, 55, 58, 66, 77, 81, 84, 86], "frozenset": [2, 29, 44, 46, 47], "contain": [2, 15, 24, 29, 43, 53, 55, 78, 84, 86], "inform": [2, 6, 7, 47, 55, 62, 78, 79, 83, 84], "about": [2, 84], "singl": [2, 15, 24, 29, 46, 60, 70, 71, 78, 84, 86, 88], "detect": [2, 35, 44, 46, 47, 52, 53, 55, 62, 70, 78, 79, 84], "within": [2, 16, 26, 30, 46, 52, 58], "most": [2, 24, 30, 44, 46, 70, 79, 84], "import": [2, 29, 77, 78, 81, 83, 84, 85, 86, 88], "match": [2, 6, 15, 21, 24, 26, 27, 29, 30, 39, 46, 47, 49, 51, 62, 71, 77, 78, 86, 88], "actual": [2, 15, 24, 46, 55, 81, 84], "syn_term_to_synonym_term": [2, 77], "synonymtermwithmetr": [2, 46, 47, 64, 66], "candid": [2, 
24, 62, 77, 79, 84], "knowledgebas": [2, 24, 29, 79, 81, 84], "hit": [2, 27, 44, 48, 77], "final": [2, 15, 30, 58, 70, 77, 81, 84], "product": [2, 24, 26, 73, 81], "link": [2, 9, 29, 30, 79, 80, 81, 83, 84], "refer": [2, 24, 29, 44, 46, 48, 84], "underli": [2, 29, 47, 48, 52, 84], "entity_class": [2, 6, 7, 26, 27, 29, 47, 51, 53, 70, 78, 84, 86, 88], "namespac": [2, 39, 47, 51, 55, 58, 60, 62, 78, 86, 88], "add_map": 2, "deprec": 2, "as_brat": 2, "self": [2, 15, 18, 24, 29, 49, 55, 60, 69, 77, 84], "third": 2, "parti": 2, "biomed": [2, 24, 35, 49, 59, 62, 70, 77, 81], "nlp": [2, 26, 27, 29, 59, 62, 77, 81, 84, 86], "brat": 2, "format": [2, 9, 14, 48, 55], "see": [2, 6, 15, 24, 29, 30, 35, 44, 46, 47, 58, 62, 78, 79, 81, 86], "calc_starts_and_end": 2, "from_span": 2, "join_str": 2, "indic": [2, 24, 39, 49, 55, 66, 78, 86, 88], "also": [2, 15, 24, 26, 29, 30, 44, 58, 59, 63, 66, 70, 71, 81, 83, 84], "requir": [2, 24, 29, 44, 46, 47, 48, 62, 63, 70, 77, 81, 86], "produc": [2, 3, 15, 24, 29, 30, 46, 47, 48, 55, 69], "repres": [2, 21, 24, 29, 55, 58, 66, 70, 77], "join": [2, 84], "togeth": 2, "encompass": 2, "onli": [2, 9, 15, 24, 27, 29, 44, 46, 47, 55, 60, 66, 69, 70, 73, 77, 78, 84, 86, 88], "one": [2, 6, 15, 16, 24, 27, 29, 35, 46, 48, 84, 86], "defin": [2, 15, 16, 24, 27, 29, 62], "both": [2, 16, 24, 30, 46, 48, 59, 84], "thei": [2, 29, 46, 47, 55, 62, 77, 84], "If": [2, 9, 15, 24, 26, 29, 44, 46, 48, 49, 54, 58, 62, 64, 70, 84], "multipl": [2, 15, 24, 29, 43, 44, 46, 55, 63, 67, 84], "becom": [2, 81], "patholog": 2, "while": [2, 15, 16, 18, 24, 59], "mai": [2, 29, 44, 46, 47, 48, 55, 63, 73, 78, 81, 84, 86, 88], "technic": [2, 44], "sens": [2, 44, 84], "distinct": [2, 29, 84], "semant": [2, 84], "mean": [2, 84], "For": [2, 15, 24, 48, 59, 62, 77, 78, 79, 81, 83, 84], "consid": [2, 44, 46, 47, 55, 58], "case": [2, 15, 24, 27, 29, 44, 62, 70, 77, 79, 84], "we": [2, 7, 15, 21, 24, 27, 29, 44, 47, 48, 52, 53, 55, 59, 66, 67, 70, 77, 79, 81, 83, 84, 86], "want": [2, 
29, 47, 48, 55, 60, 77, 81, 84, 86], "select": [2, 44, 46, 58, 84], "longest": [2, 24, 30, 46, 58], "annot": [2, 44, 59, 63, 80, 81, 83], "suggest": [2, 84], "some": [2, 15, 24, 26, 29, 44, 51, 55, 58, 70, 77, 79, 81, 84, 86], "ner": [2, 7, 15, 26, 29, 30, 39, 47, 59, 78, 79, 80, 81, 83, 84], "system": [2, 47, 59, 81, 84], "1": [2, 15, 24, 29, 43, 44, 46, 66, 69, 70, 83, 84, 85], "patient": 2, "ha": [2, 15, 21, 24, 27, 29, 44, 46, 47, 49, 71, 78, 84, 86, 88], "metastat": 2, "liver": 2, "cancer": [2, 77], "entity1": 2, "16": [2, 24, 49], "39": 2, "entity2": 2, "27": 2, "40": 2, "result": [2, 6, 24, 29, 32, 39, 47, 49, 52, 54, 58, 59, 60, 66, 71, 80, 84, 86], "part": [2, 47, 51, 58, 59, 70], "same": [2, 32, 46, 58, 59, 84], "2": [2, 15, 18, 24, 29, 43, 46, 51, 52, 70, 71, 81, 83, 84, 85], "non": [2, 27, 29, 46, 47, 51, 52, 58, 70, 81, 83], "contigu": [2, 51, 52, 58, 70, 81, 83], "lung": 2, "0": [2, 6, 15, 18, 21, 24, 29, 44, 46, 51, 58, 69, 78, 81, 84, 86, 88], "4": [2, 21, 47, 70], "1521": 2, "9": [2, 21, 51, 59, 70], "21": [2, 86], "load_contiguous_ent": [2, 51, 78, 86, 88], "update_term": 2, "iter": [2, 6, 7, 9, 27, 29, 30, 37, 43, 44, 46, 47, 48, 49, 52, 57, 60, 62, 64, 65, 66, 71, 78, 86, 88], "match_norm": [2, 46], "equivalentidaggregationstrategi": [2, 9, 29, 44], "enumer": [2, 62], "merged_as_non_symbol": [2, 84], "no_strategi": [2, 29], "resolved_by_similar": 2, "synonym_is_ambigu": 2, "unambigu": [2, 29, 44, 77], "equivalentidset": [2, 9, 29, 44, 46, 84], "represent": [2, 24, 44, 47, 49, 70], "kb": [2, 9, 24, 29, 71, 84], "id": [2, 7, 9, 14, 24, 29, 44, 46, 48, 52, 55, 84], "s": [2, 11, 15, 24, 26, 27, 29, 30, 43, 44, 52, 58, 60, 70, 71, 78, 81, 84], "thing": [2, 15, 24, 84], "ids_to_sourc": 2, "linkrank": [2, 46, 57], "ambigu": [2, 29, 44, 46, 84], "highly_lik": 2, "possibl": [2, 18, 24, 46, 47, 70, 73], "probabl": [2, 70, 77], "fulli": [2, 59, 81], "disambigu": [2, 29, 46, 70, 77, 80], "default_label": [2, 24, 29, 66, 84], "parser_nam": [2, 44, 46, 77], 
"mapping_strategi": [2, 80], "confid": [2, 46, 52, 55], "disambiguation_strategi": [2, 46], "xref_source_parser_nam": [2, 46], "preprocessed_text": 2, "get_text": [2, 86], "access": [2, 26, 43, 55, 83], "directli": [2, 26, 32, 48, 58, 81], "method": [2, 15, 24, 27, 29, 44, 46, 47, 55, 60, 70, 84], "provid": [2, 9, 26, 29, 30, 55, 59, 62, 66, 70, 73, 83], "conveni": [2, 24, 55, 78], "wrapper": [2, 18, 37, 40, 52, 64, 66], "avail": [2, 24, 27, 44, 48, 59, 66, 70, 86], "offset_map": 2, "properti": [2, 6, 7, 27, 47, 68, 73], "sentence_span": 2, "synonymterm": [2, 9, 26, 27, 29, 30, 47, 66, 78, 84], "normalis": [2, 21, 29, 46, 66, 70, 81], "ontologypars": [2, 26, 27, 29, 66, 80], "implement": [2, 3, 13, 15, 21, 24, 29, 32, 35, 52, 54, 55, 59, 60, 62, 70], "It": [2, 15, 24, 29, 43, 84], "compos": [2, 15, 24, 78, 86], "uniqu": [2, 29, 44, 47, 84], "e": [2, 7, 15, 24, 29, 30, 46, 47, 52, 55, 58, 62, 64, 70, 81, 84], "g": [2, 15, 24, 29, 30, 46, 55, 58, 62, 64, 70], "breast": [2, 77], "number": [2, 15, 21, 24, 47, 52, 59, 66, 70], "associated_id_set": [2, 27, 84], "determin": [2, 29, 55, 70, 84], "score_and_group_id": [2, 29, 84], "associ": [2, 9, 24, 26, 29, 44, 46, 47, 52, 55, 59, 66, 81, 84], "term_norm": [2, 21, 27, 46, 66, 77], "is_symbol": [2, 29, 77], "mapping_typ": [2, 29, 66, 84], "aggregated_bi": 2, "is_ambigu": 2, "allow": [2, 9, 52, 58, 81, 84], "metric": [2, 15, 24, 80], "score": [2, 6, 14, 44, 46], "As": 2, "hash": [2, 27], "function": [2, 6, 9, 11, 13, 14, 16, 17, 18, 24, 26, 32, 43, 47, 60, 62, 63, 65, 69, 71, 73, 75, 77, 86], "care": [2, 16, 46, 66, 84], "should": [2, 15, 16, 18, 24, 26, 27, 29, 44, 46, 47, 48, 55, 58, 63, 66, 70, 79, 81, 84], "search_scor": 2, "embed_scor": 2, "bool_scor": 2, "exact_match": 2, "from_synonym_term": 2, "merge_metr": 2, "hfdataset": [3, 52], "iterabledataset": [3, 24], "simpl": [3, 18, 24, 35, 51, 54, 62, 64, 66, 77, 84], "torch": [3, 13, 15, 24, 66], "util": [3, 15, 24, 35, 39, 80, 83, 85, 86], "hf": [3, 24, 52], "input_id": 
[3, 16, 24], "batchencod": [3, 24, 52, 71], "acceptancetestfailur": 6, "aggregatedaccuracyresult": 6, "tp": [6, 27], "fp": [6, 27], "fn": 6, "fp_counter": 6, "collect": [6, 12, 15, 24], "counter": 6, "fn_counter": 6, "fp_items_to_task": 6, "fn_items_to_task": 6, "add_fn": 6, "item": [6, 15, 24, 26, 65], "task": [6, 7, 15, 59, 83], "add_fp": 6, "tasks_for_fn": 6, "tasks_for_fp": 6, "fn_info": 6, "fp_info": 6, "precis": [6, 15, 24, 47, 66, 77], "recal": [6, 30, 47, 70], "sectionscor": 6, "gold_ent": [6, 83], "test_ent": 6, "calculate_linking_match": 6, "calculate_ner_match": 6, "group_mappings_by_sourc": 6, "ent": [6, 51, 54, 58, 71, 77], "acceptance_criteria": 6, "aggregate_linking_result": 6, "class_and_scor": 6, "aggregate_ner_result": 6, "analyse_full_pipelin": [6, 83], "pipelin": [6, 26, 27, 29, 37, 40, 54, 59, 63, 67, 68, 77, 78, 80, 81, 83, 85], "check_annotation_consist": 6, "cfg": [6, 17, 24, 32, 63, 75, 83, 85, 86], "check_ent_class_consist": 6, "ent_to_task_lookup": 6, "match_str": 6, "messag": [6, 32, 69], "check": [6, 21, 27, 29, 30, 46, 62, 64, 66, 70, 78, 84, 86, 88], "differ": [6, 9, 15, 24, 29, 44, 47, 48, 52, 55, 63, 84], "check_ent_mapping_consist": 6, "inconsist": [6, 81], "check_ent_match_abnorm": 6, "gold": [6, 14, 83], "standard": [6, 14, 24, 83], "look": [6, 9, 27, 44, 46, 48, 66, 70, 77, 83, 84], "bit": [6, 24], "weird": 6, "check_results_meet_threshold": 6, "threshold": [6, 29, 44, 46, 52, 55, 84], "execute_full_pipeline_acceptance_test": 6, "score_sect": 6, "scorer": [6, 44, 46, 84], "per": [6, 15, 24, 29, 44, 46, 47, 49, 58], "kazutolabelstudioconvert": [7, 83], "convert": [7, 11, 12, 24, 29, 46, 55, 71, 83], "label": [7, 14, 15, 16, 24, 27, 29, 46, 48, 52, 55, 77, 80, 84], "studio": [7, 80], "sinc": [7, 16, 29, 47, 48, 53, 55, 66, 70, 81], "ls": [7, 83], "region": 7, "new": [7, 24, 29, 30, 55, 62, 78, 81, 84, 86, 88], "everi": [7, 16, 24, 29], "even": [7, 44, 58, 62, 70, 73], "ones": [7, 44, 46, 47, 84], "add": [7, 9, 15, 18, 24, 29, 37, 
55, 60, 66, 78, 86, 88], "etc": [7, 29, 48, 66, 78, 83, 85], "convert_docs_to_task": [7, 83], "convert_single_doc_to_task": 7, "lstokazuconvers": 7, "convert_tasks_to_doc": 7, "create_": 7, "create_map": [7, 46], "taxonomy_hit": 7, "task_id": 7, "create_sect": 7, "labelstudioannotationview": [7, 83], "ner_label": [7, 83], "i": [7, 9, 21, 24, 29, 30, 46, 47, 52, 55, 81, 84], "valid": [7, 13, 15, 24, 29, 30, 46, 84], "colour": 7, "build_label": 7, "dom": 7, "build_taxonomi": 7, "create_main_view": 7, "getdom": 7, "labelstudiomanag": [7, 83], "project_nam": [7, 83], "header": [7, 48, 73, 83], "url": [7, 53, 59, 70, 83], "http": [7, 24, 29, 48, 49, 53, 59, 62, 70, 73, 83, 84, 86], "localhost": [7, 83], "8080": [7, 83], "create_linking_project": [7, 83], "view": [7, 83], "delete_project_if_exist": 7, "export_from_l": [7, 83], "get_all_task": 7, "get_task": 7, "import_to_l": 7, "project_id": 7, "metadatadatabas": [9, 44, 46, 48, 66], "singleton": [9, 21, 43, 59, 63, 67, 71], "ontolog": [9, 24, 26, 29, 37, 39, 48, 66, 77, 79, 81, 84], "purpos": [9, 70, 73, 81], "up": [9, 15, 48, 49, 66, 77, 81], "process": [9, 15, 18, 24, 26, 27, 29, 32, 43, 47, 49, 52, 53, 55, 58, 59, 60, 63, 64, 70, 78, 81, 86, 88], "load": [9, 13, 24, 27, 32, 43, 48, 54, 66, 67, 83], "onc": [9, 46, 83], "reduc": [9, 64], "memori": [9, 32, 43, 66], "usag": [9, 32], "add_pars": 9, "note": [9, 15, 21, 24, 29, 30, 32, 44, 46, 47, 60, 62, 70, 77, 86], "assum": [9, 11, 29, 58, 70], "global": [9, 29, 84], "call": [9, 15, 16, 18, 24, 29, 30, 44, 46, 47, 49, 53, 55, 60, 63, 64, 69, 70, 84], "overrid": [9, 18, 24, 29, 46, 84], "exist": [9, 27, 53, 81], "entri": [9, 15, 24, 29], "kei": [9, 15, 24, 26, 27, 29, 52, 65, 83], "get_al": 9, "get_by_idx": 9, "queri": [9, 24, 29, 39, 44, 46, 49, 52, 66, 84], "get_by_index": 9, "loaded_pars": 9, "synonymdatabas": [9, 66], "get_syns_for_id": 9, "strategy_filt": 9, "get_syns_sharing_id": 9, "syn": [9, 29, 66, 84], "parser": [9, 26, 27, 29, 43, 44, 46, 47, 48, 66, 77, 80], 
"aggreg": 9, "via": [9, 29, 43, 47, 48, 52, 55, 66, 79, 81], "strategi": [9, 18, 24, 29, 47, 66, 80], "default": [9, 15, 18, 24, 26, 29, 46, 47, 48, 60, 66, 70, 77, 80, 81, 84, 86], "to_unicod": 11, "unicod": 11, "alreadi": [11, 47, 49, 54, 81], "utf": 11, "8": [11, 18, 24, 51, 70], "input": [11, 15, 24, 26, 29, 49, 60, 63, 66], "byte": 11, "nerprocessor": [12, 15], "seqtagprocessor": 12, "get_aug_exampl": 12, "data_dir": [12, 15], "transform": [12, 15, 24, 52, 55, 60, 84], "inputexampl": [12, 15], "dev": 12, "get_dev_exampl": 12, "get_test_exampl": 12, "test": [12, 13, 51, 63, 77, 83], "get_train_exampl": 12, "train": [12, 15, 16, 18, 59, 80, 81], "data": [12, 15, 24, 26, 29, 48, 52, 63, 66, 70, 73, 77, 80, 81, 83, 85, 86, 88], "sequenc": [12, 15, 24, 52, 55, 78], "tag": [12, 15, 55, 59, 79], "studentmodelcheckpointio": 13, "checkpointio": 13, "plugin": 13, "save": [13, 48, 60, 66, 77], "student": 13, "without": [13, 24, 26, 43, 67, 70, 73, 81, 84], "teacher": 13, "model_name_or_path": [13, 24], "load_checkpoint": 13, "path": [13, 15, 24, 26, 27, 29, 30, 32, 37, 43, 48, 52, 53, 54, 63, 66, 67, 68, 71, 77, 84, 86], "storage_opt": 13, "checkpoint": 13, "resum": 13, "ckpt": 13, "predict": [13, 14, 15, 18, 24, 52], "stage": 13, "arg": [13, 15, 18, 21, 24, 57, 60, 70], "map_loc": 13, "devic": [13, 18, 24], "specifi": [13, 15, 24, 48, 53, 58, 63, 71], "how": [13, 24, 29, 44, 55, 83, 84], "remap": 13, "storag": 13, "locat": [13, 24, 43, 48, 58, 66], "remove_checkpoint": 13, "file": [13, 26, 29, 32, 43, 48, 62, 66, 73, 77], "filesystem": 13, "save_checkpoint": 13, "current": [13, 18, 24, 27, 59, 66, 70, 86], "content": [13, 48, 83], "includ": [13, 15, 24, 37, 70, 73, 81, 83], "state_dict": 13, "optimizer_st": 13, "callback": [13, 18, 24], "accuraci": [14, 15, 24], "pred": [14, 29], "numeric_label_f1_scor": 14, "label_list": [14, 15], "calcul": [14, 15, 21, 24, 29, 66, 84], "f1": 14, "seqev": 14, "numer": [14, 51, 70, 81], "2d": [14, 24], "arrai": 14, "mappingid": 14, 
"nerdataset": 15, "dataset": [15, 24, 29, 59], "design": [15, 47, 55, 59, 70, 84], "fly": 15, "tokenis": [15, 46, 55], "speed": [15, 81], "multi": [15, 18, 24], "cach": [15, 39, 44, 46, 48, 49, 63, 66, 80], "prevent": [15, 18, 24], "repeat": 15, "exampl": [15, 18, 24, 48, 77, 78, 83, 84, 86, 88], "label_map": 15, "max_length": [15, 24, 71], "autotoken": [15, 71], "typic": [15, 51, 84], "dataprocessor": [15, 80], "maximum": [15, 47, 52, 66], "can": [15, 21, 24, 26, 29, 32, 40, 43, 44, 46, 47, 52, 53, 55, 62, 66, 67, 70, 77, 78, 81, 83, 84], "longer": [15, 52], "truncat": [15, 24], "convert_single_exampl": 15, "ex_index": 15, "tensor": [15, 24, 52, 55, 66], "sequencetaggingdistillationbas": 15, "taskspecificdistil": 15, "temperatur": 15, "warmup_step": 15, "learning_r": 15, "weight_decai": [15, 24], "batch_siz": [15, 24, 49, 52], "accumulate_grad_batch": 15, "max_epoch": 15, "student_model_path": 15, "teacher_model_path": 15, "num_work": [15, 24], "schedul": [15, 24], "specif": [15, 24, 47, 48, 73, 81], "step": [15, 18, 24, 27, 30, 32, 67, 77, 78, 79, 80, 81, 84, 88], "listconfig": 15, "get_training_exampl": 15, "train_dataload": [15, 24], "more": [15, 21, 24, 29, 46, 47, 55, 59, 60, 77, 79, 84], "pytorch": [15, 16, 24, 80], "dataload": [15, 18, 24, 52], "sampl": [15, 24], "In": [15, 24, 29, 30, 35, 44, 46, 58, 62, 77, 81, 84, 86], "pleas": [15, 24, 78, 84], "you": [15, 24, 32, 47, 60, 62, 77, 83, 84, 86], "reload": [15, 24], "unless": [15, 24, 60], "paramref": [15, 24], "pytorch_lightn": [15, 24], "trainer": [15, 18, 21, 24, 49, 52, 66], "reload_dataloaders_every_n_epoch": [15, 24], "posit": [15, 24, 27], "integ": [15, 24, 66, 70], "follow": [15, 24, 27, 29, 47, 55, 58, 70, 73, 77, 79, 81, 83, 84], "pattern": [15, 24, 29, 51], "download": [15, 24, 48, 68], "prepare_data": [15, 24], "split": [15, 24, 29, 46, 51, 52, 55, 66, 70, 79, 84], "setup": [15, 24, 46], "howev": [15, 24, 46, 47, 62, 70, 73, 77, 81, 84], "abov": [15, 24, 29, 44, 46, 55, 70, 73], "necessari": 
[15, 24, 47], "distribut": [15, 24, 70, 73], "do": [15, 24, 29, 60, 77, 84, 86], "assign": [15, 24, 46, 53, 55, 84], "state": [15, 16, 24, 44, 46, 47, 59, 63, 81], "fit": [15, 24, 52, 70, 73], "lightn": [15, 18, 24], "correct": [15, 24, 83], "sampler": [15, 24], "arbitrari": [15, 24, 58], "hardwar": [15, 24], "There": [15, 24, 55, 81, 84], "yourself": [15, 24], "def": [15, 18, 24, 77, 83, 84, 85], "totensor": [15, 24], "normal": [15, 24, 32, 48, 53, 60, 66, 70, 84], "5": [15, 24, 47, 51, 59, 70], "mnist": [15, 24], "root": [15, 24], "loader": [15, 24, 52], "shuffl": [15, 24], "cifar": [15, 24], "mnist_load": [15, 24], "cifar_load": [15, 24], "each": [15, 18, 24, 26, 29, 32, 44, 46, 48, 55, 58, 60, 77, 81, 84], "batch": [15, 18, 24, 49, 52, 60], "batch_mnist": [15, 24], "batch_cifar": [15, 24], "val_dataload": [15, 24], "recommend": [15, 24, 26, 29, 83], "prepar": [15, 24, 44, 46, 81], "happen": [15, 18, 24], "them": [15, 16, 24, 29, 43, 46, 66, 81], "loader_a": [15, 24], "loader_b": [15, 24], "loader_n": [15, 24], "don": [15, 24, 44, 49, 52, 53, 55], "t": [15, 18, 21, 24, 26, 30, 44, 46, 49, 52, 53, 55, 70], "validation_step": [15, 24], "argument": [15, 24, 26, 63, 86], "dataloader_idx": [15, 18, 24], "which": [15, 24, 26, 29, 44, 47, 48, 55, 60, 70, 79, 84, 86], "order": [15, 24, 27, 47, 48, 58, 62, 71, 77, 86], "here": [15, 24, 44, 83, 84, 86], "sequencetaggingdistillationforfinallay": 15, "layer": 15, "soft_cross_entropi": 15, "target": [15, 48, 66, 81, 84], "tensor_to_jagged_arrai": 15, "attention_mask": [15, 16], "training_step": [15, 24], "batch_idx": [15, 18, 24], "comput": [15, 16, 24, 59, 60], "loss": [15, 24, 70, 73], "progress": [15, 24], "bar": [15, 24], "logger": [15, 24], "output": [15, 18, 24, 26, 29], "your": [15, 24, 83, 86], "displai": [15, 24], "optimizer_idx": [15, 24], "optim": [15, 24, 59], "present": [15, 24, 30, 66, 86], "hidden": [15, 24], "core": [15, 18, 24, 27, 84], "lightningmodul": [15, 18, 24], "truncated_bptt_step": [15, 24], "must": 
[15, 24, 29, 46, 52, 55, 60, 70, 73, 84], "skip": [15, 24, 39], "next": [15, 24, 47, 55, 78, 86, 88], "automat": [15, 24, 60], "support": [15, 24, 66, 81], "gpu": [15, 18, 24, 81], "tpu": [15, 18, 24], "ipu": [15, 24], "deepspe": [15, 24], "forward": [15, 16, 18, 24, 46], "fancier": [15, 24], "like": [15, 24, 29, 44, 55, 70, 77, 81, 84], "someth": [15, 24, 29, 84], "y": [15, 24, 44], "z": [15, 24, 30], "out": [15, 24, 44, 47, 70, 73, 81], "gan": [15, 24], "decod": [15, 24], "back": [15, 24, 29, 55, 83], "propag": [15, 24], "through": [15, 24, 60], "time": [15, 24, 43, 44, 47, 60, 67], "previou": [15, 24, 55], "backprop": [15, 24], "lstm": [15, 24], "shown": [15, 24], "smooth": [15, 24], "averag": [15, 24], "over": [15, 24, 47, 53, 54, 60, 77, 78, 81], "last": [15, 24, 44, 52, 58, 81], "so": [15, 24, 27, 29, 40, 43, 44, 47, 53, 55, 62, 67, 70, 78, 81, 83, 86, 88], "validation_epoch_end": [15, 24], "val_step_output": 15, "epoch": [15, 18, 24], "pseudocod": [15, 24], "val_out": [15, 24], "val_batch": [15, 24], "val_data": [15, 24], "append": [15, 24, 77, 78, 86, 88], "didn": 15, "won": [15, 18, 24], "With": 15, "outer": 15, "inner": [15, 55], "individu": [15, 43], "dataloader_output_result": 15, "dataloader_out": 15, "dataloader_i_output": 15, "log": [15, 24, 26, 32, 69, 78], "final_metr": 15, "final_valu": 15, "oper": [15, 24, 47, 55], "might": [15, 24, 29, 48, 62, 70, 81], "anyth": [15, 24, 84], "interest": [15, 24, 81], "val": [15, 24], "validation_step_end": [15, 24], "own": [15, 24, 83, 84], "6": [15, 24, 46, 70], "imag": [15, 24], "whatev": [15, 24], "sample_img": [15, 24], "grid": [15, 24], "torchvis": [15, 24], "make_grid": [15, 24], "experi": [15, 24, 81], "add_imag": [15, 24], "example_imag": [15, 24], "acc": [15, 24], "labels_hat": [15, 24], "argmax": [15, 24], "dim": [15, 24], "val_acc": [15, 24], "sum": [15, 24], "len": [15, 24, 46, 78, 86, 88], "log_dict": [15, 24], "val_loss": [15, 24], "quickli": [15, 24], "switch": [15, 24, 70], "between": [15, 24, 
44, 46, 63, 66], "tell": [15, 24, 48, 81, 84], "been": [15, 24, 49, 70, 78, 81, 86, 88], "put": [15, 24, 77], "eval": [15, 24], "mode": [15, 24], "gradient": [15, 24], "disabl": [15, 24, 29], "At": [15, 24], "goe": [15, 24, 84], "enabl": [15, 24, 81], "allow_zero_length_dataloader_with_multiple_devic": 15, "prepare_data_per_nod": 15, "sequencetaggingdistillationforintermediatelay": 15, "intermedi": 15, "embed": [15, 24, 49, 66, 84], "platform": 15, "configure_optim": [15, 24], "configur": [15, 24, 26, 29, 46, 52, 55, 58, 63, 66, 79, 80, 81, 84], "learn": [15, 24, 60, 70], "rate": [15, 24], "get_optimizer_grouped_paramet": 15, "student_model": 15, "bert": [16, 24, 52, 55, 64], "tinybertforsequencetag": 16, "bertpretrainedmodel": 16, "config": [16, 26, 29, 32, 52, 63, 83, 85, 86], "num_label": 16, "fit_siz": 16, "768": 16, "initi": [16, 27], "intern": [16, 55], "share": [16, 59], "nn": 16, "scriptmodul": 16, "token_type_id": 16, "is_stud": 16, "perform": [16, 27, 44, 46, 59, 79], "overridden": [16, 29], "although": [16, 81], "recip": 16, "afterward": 16, "instead": [16, 48, 66], "former": [16, 24], "take": [16, 77, 83, 84, 86], "run": [16, 24, 27, 32, 46, 47, 49, 52, 54, 59, 60, 63, 77, 78, 80, 83], "regist": 16, "hook": [16, 24], "latter": [16, 24, 62], "silent": 16, "ignor": [16, 55, 60], "dictconfig": [17, 24, 32, 63, 75, 83, 85], "plautomodel": 18, "veri": [18, 62, 84], "automodel": [18, 24], "predict_step": [18, 24], "dure": [18, 24], "By": [18, 24, 70], "logic": [18, 24, 55, 58], "scale": [18, 24, 80, 81], "infer": [18, 24, 49, 60], "To": [18, 24, 59, 84, 86], "oom": [18, 24], "error": [18, 24, 60, 63], "basepredictionwrit": [18, 24], "write": [18, 24, 26, 80], "disk": [18, 24, 26, 27, 48, 66], "databas": [18, 24, 29, 44, 46, 66, 80], "after": [18, 24, 32, 46, 47, 55], "spawn": [18, 24], "acceler": [18, 24, 81], "ddp_spawn": [18, 24], "mymodel": [18, 24], "dm": [18, 24], "plautomodelfortokenclassif": 18, "automodelfortokenclassif": [18, 52], 
"booleanstringsimilarityscor": [21, 46, 66], "stringsimilarityscor": [21, 29], "protocol": [21, 29, 57, 60, 70], "entitynounmodifierstringsimilarityscor": 21, "modifi": [21, 27, 35, 62, 66, 70], "phrase": [21, 27, 29, 70, 84], "reference_term": [21, 66], "noun_modifier_phras": 21, "entitysubtypestringsimilarityscor": 21, "mention": [21, 24, 44, 49], "norm": 21, "numeric_class_phras": 21, "re": [21, 29, 70, 81, 83], "compil": [21, 29, 70], "numbermatchstringsimilarityscor": 21, "number_find": 21, "rapidfuzzstringsimilarityscor": 21, "rapid": 21, "fuzz": 21, "count": 21, "10": [21, 24, 53, 59, 70], "char": [21, 43, 58, 66, 70, 71], "token_sort_ratio": 21, "otherwis": [21, 29, 44, 66, 70, 73, 84], "wratio": 21, "sapbertstringsimilarityscor": [21, 29], "inherit": 21, "sapbert": [21, 80, 84], "plsapbertmodel": [21, 24, 49, 66], "numericmetr": 21, "namedtupl": 24, "__new__": 24, "_cl": 24, "iri": [24, 29], "alia": 24, "goldstandardexampl": 24, "gold_default_label": 24, "gold_iri": 24, "hfsapbertinferencedataset": 24, "inferenc": 24, "track": [24, 47], "vector": 24, "environ": [24, 54, 86], "hfsapbertpairwisedataset": 24, "encodings_1": 24, "encodings_2": 24, "ndarrai": [24, 44], "identifi": [24, 29, 35, 46, 47, 48, 62, 84], "origin": [24, 29, 46, 47, 49, 55, 59, 62, 70, 73, 78], "github": [24, 62, 70], "com": [24, 59, 62, 70, 73], "cambridgeltl": 24, "credit": [24, 62, 70], "inproceed": [24, 59], "liu2021self": 24, "titl": [24, 53, 59, 70, 78], "align": [24, 49, 55], "pretrain": [24, 49, 86], "author": [24, 44, 53, 59, 70, 73, 83], "liu": 24, "fangyu": 24, "shareghi": 24, "ehsan": 24, "meng": 24, "zaiqiao": 24, "basaldella": 24, "marco": 24, "collier": 24, "nigel": 24, "booktitl": [24, 59], "proceed": [24, 59], "2021": [24, 49, 59], "confer": 24, "north": 24, "american": [24, 59], "chapter": 24, "linguist": [24, 59], "human": [24, 46, 53, 59], "languag": [24, 26, 27, 53, 59, 62, 80, 81], "technolog": 24, "page": [24, 59], "4228": 24, "4238": 24, "month": [24, 53, 59, 
70], "jun": [24, 53], "year": [24, 53, 59, 70, 81], "sapbert_training_param": 24, "sapbert_evaluation_manag": 24, "from_pretrain": [24, 52], "sapberttrainingparam": 24, "sapbertevaluationdatamanag": 24, "choos": [24, 58, 84], "what": [24, 47, 48, 84], "But": [24, 81], "two": [24, 46, 78, 84, 86, 88], "first": [24, 52, 58, 62, 70, 78, 84, 86, 88], "second": [24, 60, 70], "lr": 24, "lr_scheduler_config": 24, "lr_schedul": 24, "whose": 24, "describ": [24, 60, 70], "frequenc": [24, 77], "its": [24, 27, 60, 73], "below": [24, 58, 77, 79], "unit": 24, "size": [24, 39, 49, 52, 66, 84], "could": [24, 29, 47, 81, 84], "updat": [24, 55, 64, 86], "wherea": 24, "interv": 24, "mani": [24, 29, 46, 59, 77, 81, 84], "correspond": [24, 44, 46, 62], "monitor": [24, 32], "reducelronplateau": 24, "enforc": 24, "thu": [24, 29], "stop": [24, 69, 75], "found": [24, 49, 62, 64], "warn": [24, 26, 32, 58], "strict": [24, 27, 70, 73], "learningratemonitor": 24, "keyword": 24, "condit": [24, 53, 55, 70, 71, 73], "adam": 24, "metric_to_track": 24, "often": [24, 44, 77, 84], "check_val_every_n_epoch": 24, "optimizer1": 24, "optimizer2": 24, "sgd": 24, "scheduler1": 24, "scheduler2": 24, "lambdalr": 24, "made": 24, "simpli": [24, 46], "metric_v": 24, "along": [24, 32, 70], "sequenti": [24, 55, 66], "given": [24, 29, 44, 46, 48, 52, 63, 70, 71, 77, 81, 84, 86], "optimizer_on": 24, "01": 24, "optimizer_two": 24, "cycl": 24, "continu": [24, 55], "being": [24, 29, 55, 59], "1e": 24, "3": [24, 43, 46, 51, 70, 73, 84], "gen_opt": 24, "model_gen": 24, "dis_opt": 24, "model_di": 24, "02": 24, "dis_sch": 24, "cosineann": 24, "t_max": 24, "gen_sch": 24, "exponentiallr": 24, "99": 24, "procedur": 24, "improv": [24, 70], "wasserstein": 24, "algorithm": [24, 29, 35, 46, 58, 62, 73, 81, 84], "arxiv": 24, "org": [24, 29, 49, 53, 59, 70, 84], "ab": 24, "1704": 24, "00028": 24, "n_critic": 24, "know": [24, 84], "backward": 24, "lbfg": 24, "closur": 24, "control": 24, "those": 24, "optimizer_step": 24, 
"evaluate_topk_acc": 24, "level": [24, 46], "k": [24, 30, 47, 58], "nearest": 24, "neighbour": 24, "get_candidate_dict": 24, "np_candid": 24, "golden_iri": 24, "row": 24, "datafram": [24, 29, 66, 84], "get_embed": 24, "come": [24, 26, 84], "get_embeddings_for_str": 24, "pl": [24, 49, 66], "get_embeddings_from_dataload": 24, "cl": [24, 60, 77], "log_result": 24, "dataset_nam": 24, "sapbertevaluationdataset": 24, "ontology_sourc": 24, "query_sourc": 24, "dataset_idx": 24, "sapbertdatacollatorwithpad": 24, "collat": [24, 52], "pad": 24, "pad_to_multiple_of": 24, "pretrainedtokenizerbas": 24, "paddingstrategi": 24, "manag": [24, 43, 47, 48, 66, 81, 83], "pars": [24, 29, 59, 73, 84], "evalu": [24, 59], "maintain": [24, 39, 49, 81], "construct": [24, 48], "debug": [24, 32, 70], "datasourc": [24, 84], "space": [24, 71], "against": [24, 47, 59, 62, 63, 83], "three": [24, 84], "column": [24, 29], "basemodel": [24, 75], "miner_margin": 24, "topk": 24, "train_batch_s": 24, "train_fil": 24, "type_of_triplet": 24, "get_embedding_dataloader_from_str": 24, "50": 24, "datacollatorwithpad": [24, 52], "callabl": [24, 51, 53, 57, 60, 65], "init_hf_collate_fn": 24, "custom_token": 26, "main": [26, 49, 62, 77, 83, 85], "output_dir": [26, 63, 77], "curated_list": [26, 27, 77], "span_kei": [26, 27], "raw_hit": [26, 27], "serial": [26, 27], "ontologymatch": [26, 27], "english": [26, 59], "sentenc": [26, 27, 37, 59], "written": [26, 73], "caller": 26, "built": [26, 59, 63], "try": [26, 46], "understand": [26, 84], "noisi": [26, 30, 77, 84], "raw": [26, 46, 55, 73, 77], "tend": [26, 77], "curat": [26, 27, 30, 79, 80, 81, 84], "befor": [26, 47, 66, 70], "appli": [26, 35, 46, 47, 55, 66, 70], "build": [26, 44, 48, 63, 66, 77], "attempt": [26, 47, 48, 52, 54, 66, 81, 84], "directori": [26, 29, 32, 43, 48, 63, 66, 86], "jsonl": [26, 77], "line": [26, 29], "case_sensit": [26, 27, 77], "attribut": [26, 29, 84], "store": [26, 27, 29, 44], "recognis": [26, 81], "curatedterm": 27, "action": [27, 
77], "term_norm_map": [27, 77], "done": [27, 44, 55], "phrasematch": [27, 77, 79], "match_id_sep": 27, "parser_name_to_entity_typ": [27, 77], "create_lowercase_phrasematcher_from_pars": 27, "matcher": [27, 37], "compon": [27, 51, 81, 83, 84, 86], "deseri": 27, "lowercas": 27, "shape": 27, "create_phrasematchers_from_curated_list": 27, "redundantli": 27, "sensit": 27, "go": [27, 29], "redund": [27, 29], "filter_by_context": 27, "These": [27, 59], "filter": [27, 44, 46, 78, 84, 86, 88], "work": [27, 29, 44, 46, 47, 55, 60, 62, 70, 81, 84], "best": [27, 46, 58, 81], "segment": [27, 59], "from_disk": 27, "exclud": [27, 29, 58], "pipe": [27, 60], "place": [27, 60], "set_context_match": 27, "set_label": 27, "span_in_fp_context": 27, "ent_class": 27, "regard": [27, 81], "span_in_fp_coocc": 27, "co": [27, 47], "occ": 27, "dic": 27, "least": [27, 29, 39, 49, 81, 84], "span_in_tp_context": 27, "span_in_tp_coocc": 27, "to_disk": 27, "nr_lowercase_rul": 27, "nr_strict_rul": 27, "ontologymatcherconfig": 27, "biologicalprocessgeneontologypars": 29, "geneontologypars": 29, "in_path": [29, 77, 84], "string_scor": [29, 84], "synonym_merge_threshold": 29, "7": [29, 44, 47, 51, 70], "data_origin": [29, 77], "unknown": 29, "synonym_gener": [29, 77, 80], "excluded_id": 29, "resourc": [29, 48, 63], "owl": 29, "db": [29, 84], "tsv": 29, "throughout": [29, 32, 35], "stringnorm": [29, 47, 70, 84], "appropri": [29, 46, 47, 55, 79, 84], "behaviour": [29, 84], "overal": 29, "resolv": [29, 47, 84], "symbol": [29, 46, 47, 49, 70, 77, 84], "conflict": [29, 63], "trigger": [29, 46], "merg": [29, 58, 79, 84], "further": [29, 47, 64, 78], "detail": [29, 47, 79], "hgnc": 29, "releas": [29, 59, 81], "meddra": [29, 48], "24": 29, "combinatorialsynonymgener": [29, 30, 77], "cloontologypars": 29, "rdfgraphpars": 29, "clo": [29, 79], "www": [29, 48], "ebi": [29, 48], "ac": [29, 48], "uk": [29, 48], "ol": 29, "find_kb": [29, 84], "somehow": [29, 32], "find": [29, 55, 71, 79, 84], 
"cellosaurusontologypars": 29, "obo": [29, 84], "cellosauru": [29, 79], "ftp": 29, "expasi": 29, "parse_to_datafram": [29, 84], "long": [29, 52, 62, 84], "thin": [29, 84], "pd": [29, 84], "prefer": [29, 46, 58, 60, 62, 81], "xref": [29, 48], "exactsyn": 29, "usual": 29, "respons": [29, 46, 47], "id_to_sourc": 29, "original_syn_set": 29, "treat": 29, "seper": 29, "cell": 29, "cell_line_r": 29, "ignorecas": 29, "cellularcomponentgeneontologypars": 29, "chemblontologypars": [29, 84], "sqllite": 29, "chembl": [29, 79, 84], "pub": [29, 59], "chembldb": 29, "latest": 29, "chembl_29_sqlit": 29, "tar": 29, "gz": 29, "ensemblontologypars": 29, "genenam": 29, "hgnc_complete_set": 29, "additional_syns_path": 29, "load_go": 29, "graph": [29, 81], "populate_databas": 29, "popul": [29, 54], "instances_in_db": 29, "jsonlinesontologypars": 29, "jsonlin": 29, "implemet": 29, "json_dict_to_parser_dict": 29, "json_dict_to_parser_record": 29, "jsons_gen": 29, "yield": 29, "record": 29, "compat": 29, "expect": [29, 30, 47, 54, 60], "structur": [29, 84], "superclass": 29, "read": 29, "meddraontologypars": 29, "unzip": 29, "licenc": 29, "mdhier": 29, "asc": 29, "llt": 29, "molecularfunctiongeneontologypars": 29, "parsed_datafram": 29, "mondoontologypars": [29, 48, 77], "is_valid_iri": 29, "abc": [29, 30, 44, 48, 66], "suitabl": [29, 48, 70], "composit": [29, 81, 84], "seed": 29, "speak": [29, 60], "therefor": [29, 55, 62, 81], "cox": 29, "ensg00000095303": 29, "OR": [29, 70, 73], "ensg00000198804": 29, "noun": [29, 66, 70, 84], "far": 29, "less": [29, 49], "form": [29, 47, 62, 70, 73, 84], "mulipl": 29, "subset": [29, 44, 52, 84], "accordingli": [29, 55, 64, 79], "meddra_diseas": 29, "meddra_diagnost": 29, "drop_excluded_id": 29, "export_metadata": 29, "export_synonym_term": 29, "generate_synonym": 29, "resolut": 29, "populate_metadata_databas": 29, "populate_synonym_databas": 29, "resolve_synonym": 29, "synonym_df": 29, "duplic": [29, 44], "paracetamol": 29, "confus": [29, 55, 70, 84], 
"manner": 29, "decid": [29, 47, 79, 84], "wai": [29, 32, 58, 70, 73, 86], "cluster": 29, "turn": 29, "depend": [29, 47, 52, 53, 59, 81], "whether": [29, 37, 46, 47, 52, 55, 70, 73, 77, 84], "job": [29, 44, 84], "concret": 29, "stringscor": 29, "referenc": [29, 48, 84], "group": [29, 47, 58, 80, 84], "compar": [29, 59], "comparison": 29, "upon": [29, 55], "all_synonym_column_nam": 29, "minimum_metadata_column_nam": 29, "opentargetsdiseaseontologypars": [29, 48], "look_for_mondo": 29, "ot_id": 29, "db_xref": 29, "allowed_sourc": 29, "hp": [29, 84], "mondo": [29, 48, 79, 84], "opentargetsmoleculeontologypars": 29, "opentargetstargetontologypars": 29, "gene": [29, 46, 51, 53, 70, 78, 79, 83, 86, 88], "frequent": 29, "reli": 29, "answer": 29, "postcard": 29, "anyon": [29, 84], "better": [29, 59], "idea": [29, 84], "annotation_field": 29, "chemicalprob": 29, "constraint": 29, "functiondescript": 29, "hallmark": 29, "pathwai": 29, "safetyli": 29, "subcellularloc": 29, "targetclass": 29, "tractabl": 29, "uri_regex": 29, "synonym_pred": 29, "include_entity_pattern": 29, "exclude_entity_pattern": 29, "node": 29, "convert_to_rdflib_ref": 29, "_uri_regex": 29, "uberonontologypars": 29, "uberon": [29, 79], "synonymgener": 30, "greeksymbolsubstitut": 30, "all_sub": 30, "alpha": [30, 70], "\u03b1": [30, 70], "beta": [30, 70], "\u03b2": [30, 70], "\u03d0": [30, 70], "chi": [30, 70], "\u03c7": [30, 70], "delta": [30, 70], "\u03b4": [30, 70], "epsilon": [30, 70], "\u03b5": [30, 70], "eta": [30, 70], "\u03b7": [30, 70], "sigma": [30, 70], "\u03c2": [30, 70], "gamma": [30, 70], "\u03b3": [30, 70], "iota": [30, 70], "\u03b9": [30, 70], "kappa": [30, 70], "\u03ba": [30, 70], "lambda": [30, 70, 78, 84, 86, 88], "\u03bb": [30, 70], "mu": [30, 70], "\u03bc": [30, 70], "nu": [30, 70], "\u03bd": [30, 70], "omega": [30, 70], "\u03c9": [30, 70], "omicron": [30, 70], "\u03bf": [30, 70], "phi": [30, 70], "\u03c6": [30, 70], "\u03d5": [30, 70], "pi": [30, 70], "\u03c0": [30, 70], "psi": [30, 70], 
"\u03c8": [30, 70], "rho": [30, 70], "\u03c1": [30, 70], "\u03c3": [30, 70], "tau": [30, 70], "\u03c4": [30, 70], "theta": [30, 70], "\u03b8": [30, 70], "upsilon": [30, 70], "\u03c5": [30, 70], "xi": [30, 70], "\u03be": [30, 70], "zeta": [30, 70], "\u03b6": [30, 70], "b": [30, 55, 58, 70], "l": 30, "m": [30, 70], "n": [30, 43, 71, 77], "o": [30, 53, 55], "p": [30, 59, 71], "r": 30, "u": 30, "c": [30, 58, 70, 73], "f": [30, 77, 83, 84], "greek_lett": 30, "lower_greek_lett": 30, "spell": 30, "upper_greek_lett": 30, "separatorexpans": 30, "spacy_pipelin": [30, 51, 54, 77, 80], "spacypipelin": [30, 51, 54, 67], "spellingvariationreplac": 30, "known": [30, 55, 81], "variat": [30, 53], "input_path": 30, "stopwordremov": [30, 77], "stopword": 30, "all_stopword": 30, "caus": [30, 70, 73], "involv": [30, 60], "stringreplac": [30, 77], "replacement_dict": 30, "digit_aware_replacement_dict": 30, "include_greek": [30, 77], "suffixreplac": 30, "interchang": 30, "suffic": [30, 79], "word": [30, 43, 52, 55, 62, 70, 71], "high": [30, 32, 47, 77], "later": 30, "knowledg": [30, 80, 81, 84], "particular": [30, 70, 73], "doesn": 30, "suffix": 30, "anaemia": 30, "ia": 30, "ic": 30, "anaem": 30, "amaemi": 30, "abstract": [30, 44, 48, 59], "faileddocsfilehandl": 32, "faileddocshandl": 32, "log_dir": 32, "fail": [32, 60, 78, 86, 88], "faileddocsloghandl": 32, "failure_handl": 32, "profile_steps_dir": 32, "skip_doc_len": 32, "200000": 32, "basic": [32, 55], "help": 32, "seri": 32, "handler": 32, "profil": 32, "tensorboard": 32, "dir": [32, 48, 63, 66], "prefilter_doc": 32, "step_tim": 32, "batch_tim": 32, "batch_metrics_dict": 32, "reset": [32, 47], "update_failed_doc": 32, "failed_doc": 32, "batch_metr": 32, "calc_doc_s": 32, "load_steps_and_log_memory_usag": 32, "increas": [32, 66], "instanti": [32, 63, 83, 85, 86], "give": [32, 62, 84], "omegaconf": [32, 83, 85], "hydra": [32, 75, 79, 80, 81, 83, 85], "abbreviationfinderstep": [35, 78, 79, 86, 88], "abbrevi": [35, 62, 79, 84], 
"definit": [35, 62], "schwartz": [35, 62], "hearst": [35, 62], "2003": [35, 62], "version": [35, 46, 48, 62, 63, 66, 70, 86], "scispaci": [35, 59, 62], "finder": 35, "rule": [35, 51, 70], "expand": 35, "abbreviation_detector": [35, 80], "kazuabbreviationdetector": [35, 62], "learnt": 35, "exclude_abbrv": [35, 62], "explosionstringmatchingstep": [37, 77, 79], "linker": 37, "include_sentence_offset": 37, "offset": [37, 52], "extract_entity_data_from_span": 37, "dictionaryentitylinkingstep": [39, 79], "link_index": [39, 80], "dictionaryindex": [39, 66], "lookup_cache_s": [39, 49, 64], "5000": [39, 49, 64], "top_n": [39, 49, 66], "20": [39, 49, 81], "skip_ner_namespac": 39, "recent": [39, 49], "lookup": [39, 48, 49], "keep": [39, 49, 84], "load_or_build_cach": [39, 48, 49, 66], "mappingstep": [40, 79], "strategyrunn": [40, 44, 47], "strategy_runn": [40, 80], "tfidfscor": [43, 44], "tfidf": [43, 44], "sklearn": 43, "feature_extract": 43, "tfidfvector": 43, "filenam": 43, "relev": [43, 47, 64, 77, 84], "build_or_load_vector": 43, "build_vector": 43, "load_vector": 43, "create_word_and_char_ngram": 43, "ngram": [43, 66, 71], "compris": 43, "annotationleveldisambiguationstrategi": 44, "disambiguationstrategi": [44, 46], "certain": 44, "colloqui": 44, "incorrect": 44, "annotation_scor": 44, "metadata_db": [44, 46, 66], "proxi": 44, "wide": [44, 59], "studi": [44, 59], "random": 44, "vs": [44, 46], "natur": [44, 59, 70, 84], "pretti": 44, "unsophist": 44, "resort": 44, "id_set": [44, 46], "preprocess": [44, 81], "in_memory_db": [44, 46, 80], "definedelsewhereindocumentdisambiguationstrategi": 44, "chang": [44, 46, 47, 70], "execut": [44, 46, 47, 63], "hopefulli": 44, "smaller": 44, "event": [44, 70, 73, 78], "complex": [44, 46, 47, 81], "mappingstrategi": [44, 46, 47], "coordin": 44, "tfidfdisambiguationstrategi": 44, "retriev": 44, "matrix": 44, "regardless": 44, "sort": [44, 46, 47, 58], "accord": [44, 46, 55, 58, 77, 79], "minimum": 44, "context_threshold": 44, 
"relevant_aggregation_strategi": 44, "context": [44, 58, 84], "search": [44, 46, 48, 66, 70, 86], "build_id_set_represent": 44, "cacheable_build_document_represent": 44, "recalcul": 44, "hashabl": [44, 47], "thrown": 44, "awai": 44, "pragmat": 44, "make": [44, 48, 59, 66, 81, 84], "context_scor": [44, 80], "definedelsewhereindocumentmappingstrategi": 46, "filter_term": 46, "ent_match": 46, "ent_match_norm": 46, "ideal": 46, "scenario": [46, 84], "carri": 46, "found_equivalent_id": 46, "exactmatchmappingstrategi": 46, "exact": [46, 84], "mappingfactori": 46, "additional_metadata": 46, "strip_url": 46, "create_mapping_from_id_set": 46, "actualis": 46, "down": 46, "user": 46, "bear": 46, "still": [46, 47], "remain": 46, "receiv": 46, "either": [46, 60, 81, 84], "disambiguate_if_requir": 46, "filtered_term": 46, "liabl": [46, 70, 73], "strongmatchmappingstrategi": 46, "highest": [46, 55], "greater": 46, "differenti": 46, "close": [46, 55, 84], "search_threshold": 46, "80": 46, "symbolic_onli": 46, "equal": [46, 58], "minu": 46, "strongmatchwithembeddingconfirmationstringmatchingstrategi": 46, "parent": [46, 63], "predefin": [46, 84], "confirm": 46, "broadli": [46, 84], "attach": [46, 47, 78, 86, 88], "refin": 46, "neck": 46, "diseas": [46, 70, 77, 79, 83, 84], "heck": 46, "complex_string_scor": 46, "embedding_threshold": 46, "symbolmatchmappingstrategi": 46, "whitespac": 46, "k8": 46, "mapk8": 46, "shortest": 46, "match_symbol": 46, "s1": 46, "s2": 46, "termnormissubstringmappingstrategi": 46, "exactli": 46, "testin": 46, "min_term_norm_len_to_consid": 46, "length": [46, 52], "namespacestrategyexecut": 47, "role": 47, "had": [47, 81], "successfulli": 47, "applic": [47, 48, 83], "entitykei": 47, "__call__": [47, 55, 60, 86], "docstr": 47, "ent_class_strategi": 47, "default_strategi": 47, "stop_on_success": 47, "get_strategies_for_entity_class": 47, "clear": 47, "readi": [47, 84], "anoth": [47, 81, 84], "longest_mapping_strategy_list_s": 47, "ordin": 47, "variou": [47, 
60, 84], "ground": [47, 81], "success": [47, 78, 86, 88], "henc": 47, "why": [47, 80], "crucial": 47, "higher": [47, 58, 70], "lower": [47, 70], "beyond": 47, "itself": [47, 60, 84], "variabl": [47, 86], "vari": 47, "sub": [47, 48], "again": 47, "divid": 47, "condition": 47, "symbolic_strategi": 47, "non_symbolic_strategi": 47, "ner_namespace_processing_ord": 47, "cross_ref_manag": 47, "low": 47, "combin": 47, "info": [47, 48, 55], "deriv": [47, 73], "crossreferencemanag": [47, 48], "xrefer": 47, "execute_hit_post_processing_strategi": 47, "ents_needing_map": 47, "namespace_strategy_execut": 47, "entity_to_entity_kei": 47, "group_entities_by_symbol": 47, "separ": [47, 71, 84], "elsewher": [47, 84], "unsort": 47, "just": [47, 59], "classify_symbol": [47, 70], "source_to_parser_metadata_lookup": 48, "cross": 48, "hold": [48, 84], "superset": 48, "held": 48, "ref": 48, "build_xref_cach": 48, "xrefdatabas": 48, "create_xref_map": 48, "cache_path": [48, 66], "force_rebuild_cach": [48, 66], "xref_db": 48, "asset": [48, 66], "wa": [48, 62, 66], "oxocrossreferencemanag": 48, "oxo": 48, "servic": [48, 70, 73], "local": [48, 83], "oxo_kazu_name_map": 48, "uri_prefix": 48, "oxo_queri": 48, "covert": 48, "uri": [48, 81], "prefix": [48, 71, 73], "correctli": [48, 55], "reconstruct": 48, "api": [48, 78, 84], "request": [48, 73], "create_oxo_dump": 48, "parse_oxo_dump": 48, "oxo_dump": 48, "accept": [48, 63, 83], "oxo_url": 48, "spot": 48, "sapbertforentitylinkingstep": 49, "wrap": [49, 59, 60, 78, 81, 86], "paper": [49, 79], "aclantholog": 49, "naacl": 49, "334": 49, "pdf": [49, 59], "embedding_model": [49, 66], "min_string_length_to_trigg": 49, "ignore_high_conf": 49, "embeddingindex": [49, 66], "signal": 49, "shorter": 49, "good": [49, 70, 73, 77, 84], "techniqu": [49, 77], "perfect": 49, "process_ent": 49, "noncontiguousentitysplitt": [51, 52], "entity_condit": 51, "splitonconjunctionpattern": 51, "analys": 51, "run_conjunction_rul": 51, 
"splitonnumericallistpatternwithprefix": 51, "increment": 51, "splitter": 51, "brca1": 51, "print": [51, 78, 86, 88], "oncogen": 51, "brca2": 51, "brca3": 51, "transformersmodelfortokenclassificationnerstep": [52, 79], "slide": 52, "window": 52, "larg": [52, 77, 86], "post": [52, 55], "tokenizedwordprocessor": [52, 55], "stride": [52, 71], "max_sequence_length": 52, "detect_subspan": [52, 55], "entity_splitt": 52, "nest": [52, 55, 58, 81, 83], "entity_post_process": [52, 80], "frame_to_tok_word": 52, "batch_encod": 52, "number_of_fram": 52, "frame_index": 52, "section_frame_index": 52, "frame": 52, "word_id": [52, 55], "rel": [52, 81], "total": 52, "whole": [52, 55], "tokenizedword": [52, 55], "get_activ": 52, "namedtuple_values_indic": 52, "consist": [52, 70, 81], "get_dataload": 52, "overflow_to_sample_map": 52, "get_list_of_batch_encoding_frames_for_sect": 52, "section_index": 52, "id2labels_from_label_list": 52, "section_frames_to_tokenised_word": 52, "sethstep": [53, 79], "snp": 53, "extract": [53, 79, 84], "tool": [53, 83], "py4j": 53, "articl": [53, 59, 70], "seth2016": 53, "genet": 53, "variant": 53, "thoma": 53, "philipp": 53, "rockt": 53, "schel": 53, "tim": 53, "hakenberg": 53, "j": 53, "rg": 53, "lichtblau": 53, "yvonn": 53, "leser": 53, "ulf": 53, "journal": [53, 59, 70], "bioinformat": [53, 70], "2016": 53, "doi": [53, 59, 70], "1093": [53, 59, 70], "btw234": 53, "eng": 53, "medlin": 53, "pst": 53, "aheadofprint": 53, "pmid": 53, "27256315": 53, "dx": 53, "seth_fatjar_path": 53, "java_hom": 53, "emerg": 53, "fatjar": 53, "slow": 53, "pre": [53, 83], "protein": 53, "spacynerstep": 54, "model_nam": 54, "instal": [54, 80], "simplespanfind": 55, "spanfind": 55, "id2label": 55, "bio": [55, 59], "get_bio_and_class_label": 55, "process_next_word": 55, "span_continue_condit": 55, "bio_and_class_label": 55, "potenti": [55, 63], "met": [55, 70, 73], "span_breaking_char": 55, "smartspanfind": 55, "complic": 55, "soft": 55, "consider": 55, "wordpiec": 55, "oov": 
55, "problem": [55, 70, 81], "reconstitut": 55, "ne": 55, "inprecis": 55, "art": [55, 59, 81], "closed_span": 55, "tokwordspan": 55, "close_span": 55, "activ": 55, "start_span": 55, "subspan": 55, "dataclass": [55, 77], "clazz": 55, "tok_word": 55, "token_id": 55, "token_confid": 55, "token_offset": 55, "word_char_start": 55, "word_char_end": 55, "becaus": [55, 70], "inher": 55, "obscur": 55, "sometim": 55, "mayb": 55, "classic": 55, "entir": 55, "confidence_threshold": 55, "calculate_span_offset": 55, "make_span_find": 55, "spans_to_ent": 55, "ad": [55, 60, 70, 78], "cleanupact": 57, "cleanupstep": [57, 79], "cleanup_act": 57, "dropmappingsbyconfidencemappingfilt": 57, "ranks_to_drop": 57, "dropunmappedentityfilt": 57, "from_ent_namespac": 57, "entityfiltercleanupact": 57, "filter_fn": 57, "mappingfiltercleanupact": 57, "mergeoverlappingentsstep": [58, 79], "descript": 58, "ent_class_preferred_ord": 58, "ignore_non_contigu": 58, "pick": 58, "proscrib": 58, "basi": 58, "revers": 58, "alphabet": 58, "criteria": 58, "elimin": [58, 84], "prioriti": 58, "lowest": 58, "filter_ents_across_class": 58, "group_entities_by_loc": 58, "select_preferred_ent": 58, "stanzastep": 59, "genia": [59, 81], "treebank": 59, "qi2020stanza": 59, "qi": 59, "peng": 59, "zhang": 59, "yuhao": 59, "yuhui": 59, "bolton": 59, "jason": 59, "man": 59, "christoph": 59, "58th": 59, "annual": 59, "meet": 59, "demonstr": 59, "toolkit": 59, "stanford": 59, "edu": 59, "2020": 59, "jamia": 59, "ocab090": 59, "langlotz": 59, "curti": 59, "clinic": 59, "packag": 59, "librari": 59, "medic": [59, 70], "informat": 59, "volum": 59, "28": 59, "1892": 59, "1899": 59, "06": 59, "sought": 59, "develop": [59, 81], "neural": 59, "syntact": 59, "analysi": 59, "recognit": 59, "extend": [59, 81], "mix": 59, "public": 59, "craft": 59, "well": [59, 62, 81, 83, 84], "privat": 59, "corpu": [59, 63, 81], "radiolog": 59, "report": 59, "domain": [59, 81], "network": 59, "abl": [59, 81], "speech": 59, "lemmat": 59, "popular": 
[59, 83], "open": [59, 77, 81], "corenlp": 59, "biobert": 59, "win": 59, "bionlp": [59, 81], "achiev": 59, "much": 59, "retrain": 59, "par": 59, "substanti": 59, "outperform": 59, "computation": 59, "effici": [59, 60, 81], "introduc": 59, "offer": 59, "eas": 59, "facilit": 59, "research": 59, "publicli": 59, "onlin": 59, "issn": [59, 70], "1527": 59, "974x": 59, "eprint": 59, "academ": 59, "oup": 59, "39731803": 59, "stanza_pipelin": [59, 80], "stanzapipelin": [59, 68], "__name__": [60, 83, 85], "document_batch_step": 60, "batch_doc_cal": 60, "decor": 60, "processing_except": 60, "effort": 60, "repetit": 60, "document_iterating_step": 60, "would": [60, 84], "machin": [60, 70], "standalon": 60, "mutat": [60, 79], "per_doc_cal": 60, "allenai": 62, "blob": [62, 70], "py": [62, 70, 73], "top": 62, "copi": [62, 79], "forc": 62, "delet": [62, 63, 66], "wise": 62, "common": 62, "nsclc": [62, 70], "alwai": [62, 81, 84], "chosen": 62, "filter_match": 62, "matcher_output": 62, "find_abbrevi": 62, "long_form_candid": 62, "short_form_candid": 62, "short": 62, "letter": 62, "_beginning_": 62, "expans": 62, "short_form_filt": 62, "modelpackbuilderror": 63, "modelpackbuild": 63, "build_all_model_pack": 63, "maybe_base_model_pack_path": 63, "maybe_base_configuration_path": 63, "custom_model_pack_param": 63, "zip_pack": 63, "run_acceptance_test": 63, "run_consistency_check": 63, "pack": [63, 77, 80], "zip": 63, "highlight": 63, "build_cach": 63, "clear_cached_resources_from_model_pack_dir": 63, "model_path_path": 63, "process_model_pack_path": 63, "kazu_vers": 63, "uncached_model_pack_path": 63, "build_dir": 63, "consistency_check": 63, "reset_singleton": 63, "zip_model_pack": 63, "model_pack_nam": 63, "subprocess": 63, "compress": 63, "cli": 63, "move": [63, 81], "build_custom_pack_param": 63, "entitylinkinglookupcach": 64, "around": [64, 66, 78], "lfucach": 64, "expens": [64, 81], "check_lookup_cach": 64, "miss": 64, "update_terms_lookup_cach": 64, "sort_then_group": 65, 
"key_func": 65, "cdisttensorembeddingindex": 66, "tensorembeddingindex": 66, "cosin": 66, "distanc": 66, "boolean_scor": 66, "boolean": 66, "apply_boolean_scor": 66, "query_term": 66, "15": 66, "ontology_partition_s": 66, "1000": 66, "enumerate_database_chunk": 66, "chunk_siz": 66, "100000": 66, "partit": 66, "partitt": 66, "predict_ontology_embed": 66, "hungri": 66, "chuck": 66, "set_embedding_model": 66, "build_ontology_cach": 66, "cache_dir": 66, "get_index_data_path": 66, "get_metadata_path": 66, "get_synonym_data_path": 66, "overwrit": 66, "column_type_dict": 66, "matmultensorembeddingindex": 66, "matmul": 66, "synonym_db": 66, "reus": 67, "across": [67, 84], "stanza_nlp": 68, "from_stanza_kwarg": 68, "simple_init": 68, "use_gpu": 68, "call_count_interv": 69, "watch": 69, "call_count": 69, "helper": 69, "benchmark": [69, 83], "anatomystringnorm": 70, "entityclassnorm": 70, "is_symbol_lik": 70, "original_str": 70, "alzheim": 70, "normalize_noun_phras": 70, "revert": 70, "defaultstringnorm": 70, "normalize_symbol": 70, "anatomi": [70, 79, 83], "theoret": 70, "superflu": 70, "anywai": 70, "deplur": 70, "depluralis": 70, "handle_lower_case_prefix": 70, "preserv": 70, "subsequ": 70, "alphanum": 70, "upper": 70, "rest": 70, "unus": 70, "erbb2": 70, "commonli": 70, "ratio": 70, "remove_non_alphanum": 70, "alphanumer": 70, "replace_greek": 70, "replac": 70, "greek": 70, "replace_substr": 70, "rang": 70, "classifi": 70, "roman": 70, "split_on_numb": 70, "sub_greek_char_abbrevi": 70, "substitut": [70, 73], "allowed_additional_char": 70, "greek_sub": 70, "greek_subs_upp": 70, "number_split_pattern": 70, "other_sub": 70, "ii": 70, "iii": 70, "iv": 70, "ix": 70, "vi": 70, "vii": 70, "viii": 70, "11": 70, "xii": 70, "12": 70, "re_sub": 70, "si": 70, "sv": 70, "sx": 70, "re_subs_2": 70, "sa": 70, "sb": 70, "symbol_number_split": 70, "trailing_lowercase_s_split": 70, "diseasestringnorm": 70, "known_disease_short_noun": 70, "flu": 70, "hiv": 70, "sti": 70, "genestringnorm": 
70, "gene_token_classifi": 70, "slightli": 70, "especi": 70, "contrari": 70, "special": [70, 73], "highli": 70, "unusu": 70, "remove_trailing_s_if_otherwise_capitalis": 70, "frustratingli": 70, "pluralis": 70, "erbb": 70, "jsut": 70, "trail": 70, "break": 70, "genuin": [70, 77], "mdh": 70, "gasp10p": 70, "strip": 70, "gene_name_suffix": 70, "ase": 70, "gen": 70, "gon": 70, "gildautil": 70, "indralab": 70, "gilda": 70, "9e383213098144fe82103a3a5aa1bf4c14059e57": 70, "gyori2022gilda": 70, "gyori": 70, "benjamin": 70, "hoyt": 70, "charl": 70, "taplei": 70, "steppi": 70, "albert": 70, "advanc": [70, 80], "2022": 70, "05": 70, "2635": 70, "0041": 70, "bioadv": 70, "vbac034": 70, "copyright": [70, 73], "2019": 70, "harvard": 70, "school": 70, "right": [70, 73], "reserv": [70, 73], "redistribut": [70, 73], "binari": [70, 73], "modif": [70, 73], "permit": [70, 73], "code": [70, 73, 81], "retain": [70, 73], "notic": [70, 73], "disclaim": [70, 73], "reproduc": [70, 73], "materi": [70, 73], "softwar": [70, 73], "IS": [70, 73], "BY": [70, 73], "THE": [70, 73], "holder": [70, 73], "AND": [70, 73], "contributor": [70, 73], "AS": [70, 73, 84], "express": [70, 73], "impli": [70, 73], "warranti": [70, 73], "BUT": [70, 73], "NOT": [70, 73], "limit": [70, 73], "TO": [70, 73], "OF": [70, 73], "merchant": [70, 73], "FOR": [70, 73], "IN": [70, 73], "NO": [70, 73], "shall": [70, 73], "BE": [70, 73], "direct": [70, 73], "indirect": [70, 73], "incident": [70, 73], "exemplari": [70, 73], "consequenti": [70, 73], "damag": [70, 73], "procur": [70, 73], "profit": [70, 73], "busi": [70, 73], "interrupt": [70, 73], "ON": [70, 73, 84], "theori": [70, 73], "liabil": [70, 73], "contract": [70, 73], "tort": [70, 73], "neglig": [70, 73], "aris": [70, 73], "IF": [70, 73], "advis": [70, 73], "SUCH": [70, 73], "statu": 70, "flag": 70, "plural": 70, "non_plur": 70, "braf": 70, "plural_o": 70, "mosquito": 70, "plural_i": 70, "antibodi": 70, "plural_": 70, "plural_cap_": 70, "mapk": 70, "receptor": [70, 
78, 86, 88], "replace_dash": 70, "rep": 70, "dash": 70, "plain": 70, "ascii": 70, "entityclassfilt": 71, "required_entity_class": 71, "assess": 71, "as_path": 71, "create_char_ngram": 71, "create_word_ngram": 71, "documents_to_document_section_batch_encodings_map": 71, "128": 71, "512": 71, "documents_to_document_section_text_map": 71, "dochash": 71, "sectionhash": 71, "documents_to_id_section_map": 71, "filter_entities_with_ontology_map": 71, "find_document_from_ent": 71, "belong": 71, "list_map": 71, "get_cache_dir": 71, "create_if_not_exist": 71, "get_cache_path": 71, "cache_id": 71, "get_match_entity_class_hash": 71, "githubusercont": 73, "amitripshto": 73, "starlett": 73, "jwt": 73, "master": 73, "starlette_jwt": 73, "middlewar": 73, "2018": 73, "amit": 73, "ripshto": 73, "neither": 73, "nor": 73, "endors": 73, "promot": 73, "prior": 73, "permiss": [73, 81], "jwtauthenticationbackend": 73, "authenticationbackend": 73, "secret_kei": 73, "hs256": 73, "bearer": 73, "username_field": 73, "usernam": 73, "audienc": 73, "async": 73, "authent": 73, "authcredenti": 73, "baseus": 73, "get_token_from_head": 73, "jwtuser": 73, "payload": 73, "display_nam": 73, "is_authent": 73, "on_auth_error": 73, "exc": 73, "sectionedwebdocu": 75, "to_kazu_docu": 75, "simplewebdocu": 75, "deploi": 75, "app": 75, "rai": [75, 80], "serv": 75, "sophist": 77, "few": 77, "mitosi": 77, "face": 77, "emploi": 77, "wholesal": 77, "ensur": [77, 81, 86], "kazu": [77, 83, 84, 85, 86, 88], "approach": [77, 81], "ontology_match": [77, 80], "assemble_pipelin": [77, 80], "ontology_preprocess": [77, 80, 84], "syn_gener": 77, "noisy_spacy_pipelin": 77, "corpora": 77, "illustr": 77, "ll": 77, "joint_ner_and_link": [77, 79, 80], "explos": [77, 79, 80], "annotatedphras": 77, "default_factori": 77, "annotatedphraseencod": 77, "jsonencod": 77, "isinst": 77, "__dict__": 77, "rais": 77, "typeerror": 77, "w": 77, "writelin": 77, "get_doc": 77, "noisy_step": 77, "curatable_phras": 77, "to_cur": 77, 
"phrases_to_cur": 77, "matter": 77, "now": [77, 83, 84], "datamodel": 78, "bodi": 78, "document_post_process": [78, 79, 80, 86, 88], "abbreviation_find": [78, 79, 80, 86, 88], "epiderm": [78, 86, 88], "growth": [78, 86, 88], "factor": [78, 86, 88], "egfr": [78, 86, 88], "failur": [78, 86, 88], "egfr_ent": [78, 86, 88], "assert": [78, 86, 88], "ve": 79, "encount": 79, "tinybern2": [79, 81], "emnlp": 79, "tba": [79, 86], "hf_token_classif": [79, 80], "drug": [79, 81, 83, 84], "cell_lin": [79, 83], "cell_typ": [79, 83], "go_bp": [79, 83], "go_cc": [79, 83], "go_mf": [79, 83], "seth": [79, 80], "tagger": 79, "yaml": 79, "schema": 79, "opentargets_molecul": 79, "opentargets_diseas": 79, "opentargets_target": 79, "bp_gene_ontolog": 79, "mf_gene_ontolog": 79, "cc_gene_ontolog": 79, "mapping_step": [79, 80], "merge_overlapping_": [79, 80], "desir": [79, 84], "customis": [79, 83], "cleanup": [79, 80], "introduct": 80, "summari": 80, "quickstart": 80, "visualis": 80, "webservic": 80, "acceptance_test": [80, 83], "label_studio": [80, 83], "distil": [80, 84], "data_util": 80, "lightning_plugin": 80, "tiny_transform": 80, "hf_lightning_wrapp": 80, "language_phenomena": 80, "string_similarity_scor": 80, "post_process": 80, "xref_manag": 80, "spacy_n": 80, "tokenized_word_processor": 80, "stanza": 80, "build_and_test_model_pack": 80, "stopwatch": 80, "string_norm": 80, "web": 80, "jwtauth": 80, "rout": 80, "server": 80, "lightweight": 81, "framework": 81, "astrazeneca": 81, "collabor": 81, "dmi": 81, "lab": 81, "korea": 81, "univers": 81, "whilst": 81, "rework": 81, "integr": [81, 83, 84], "plethora": 81, "wider": 81, "commun": 81, "great": [81, 84], "focu": 81, "literatur": 81, "nativ": [81, 84], "phenomena": 81, "particularli": 81, "challeng": 81, "texta": 81, "recogn": 81, "conceptu": 81, "fashion": 81, "uml": 81, "date": 81, "avoid": 81, "deal": 81, "autom": 81, "clean": 81, "intent": 81, "reprocess": 81, "million": 81, "easili": 81, "sever": 81, "fast": 81, "princip": 81, 
"extens": 81, "isol": 81, "brought": 81, "littl": 81, "scalabl": 81, "easi": [81, 84], "cc": [81, 86], "live": 81, "discoveri": 81, "project": 81, "bikg": 81, "apach": 81, "commerci": 81, "histor": 81, "workflow": 83, "config_path": [83, 85], "conf": [83, 85, 86], "config_nam": [83, 85, 86], "run_doc": [83, 85], "__main__": [83, 85], "red": 83, "darkblu": 83, "orang": 83, "yellow": 83, "green": 83, "speci": 83, "purpl": 83, "pink": 83, "grei": 83, "blue": 83, "brown": 83, "label_studio_url_and_port": 83, "finish": 83, "export": [83, 86], "angu": 84, "robert": 84, "lot": 84, "vocabulari": 84, "uncontextualis": 84, "overload": 84, "ofd": 84, "has_exact_synonym": 84, "osteofibr": 84, "dysplasia": 84, "orofaciodigit": 84, "syndrom": 84, "let": 84, "similarli": 84, "xloa": 84, "ocular": 84, "albin": 84, "recess": 84, "wors": 84, "tga": 84, "dextro": 84, "loop": 84, "transposit": 84, "arteri": 84, "mondo_0019443": 84, "0031348": 84, "sai": 84, "everyth": 84, "familiar": 84, "uncommon": 84, "reconcil": 84, "perspect": 84, "difficult": 84, "arbitrarili": 84, "seem": 84, "nevertheless": 84, "enter": 84, "relat": 84, "piec": 84, "doe": 84, "ought": 84, "enough": 84, "seborrh": 84, "eczema": 84, "purl": 84, "obolibrari": 84, "hp_0001051": 84, "mondo_0006608": 84, "equivalentidsetaggregationstrategi": 84, "4532": 84, "70": 84, "decis": 84, "7426": 84, "despit": 84, "perhap": 84, "fortun": 84, "quit": 84, "sqlite3": 84, "panda": 84, "sqlite": 84, "lend": 84, "tabular": 84, "conn": 84, "connect": 84, "chembl_id": 84, "pref_nam": 84, "syn_typ": 84, "molecule_dictionari": 84, "md": 84, "molecule_synonym": 84, "ms": 84, "molregno": 84, "df": 84, "read_sql": 84, "too": 84, "big": 84, "dropna": 84, "drop_dupl": 84, "inplac": 84, "secondli": 84, "mondo_xxxxx": 84, "hp_xxxxxxx": 84, "full": 84, "trivial": 84, "breviti": 84, "string_1": 84, "string_2": 84, "75": 84, "That": 84, "explor": 84, "capabl": 84, "repo": 86, "newer": 86, "pip": 86, "major": 86, "amount": 86, "easiest": 86, 
"intro": 86, "kazu_config_dir": 86, "kazu_model_pack": 86, "manual": 86, "oc": 86, "env": 86, "os": 86, "initialize_config_dir": 86, "config_dir": 86}, "objects": {"": [[0, 0, 0, "-", "kazu"]], "kazu": [[1, 0, 0, "-", "data"], [4, 0, 0, "-", "modelling"], [31, 0, 0, "-", "pipeline"], [33, 0, 0, "-", "steps"], [61, 0, 0, "-", "utils"], [72, 0, 0, "-", "web"]], "kazu.data": [[2, 0, 0, "-", "data"], [3, 0, 0, "-", "pytorch"]], "kazu.data.data": [[2, 1, 1, "", "AutoNameEnum"], [2, 1, 1, "", "CharSpan"], [2, 1, 1, "", "Document"], [2, 1, 1, "", "DocumentJsonUtils"], [2, 1, 1, "", "Entity"], [2, 1, 1, "", "EquivalentIdAggregationStrategy"], [2, 1, 1, "", "EquivalentIdSet"], [2, 1, 1, "", "LinkRanks"], [2, 1, 1, "", "Mapping"], [2, 1, 1, "", "Section"], [2, 1, 1, "", "SynonymTerm"], [2, 1, 1, "", "SynonymTermWithMetrics"]], "kazu.data.data.CharSpan": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "end"], [2, 2, 1, "", "is_completely_overlapped"], [2, 2, 1, "", "is_partially_overlapped"], [2, 3, 1, "", "start"]], "kazu.data.data.Document": [[2, 2, 1, "", "__init__"], [2, 2, 1, "", "as_minified_dict"], [2, 2, 1, "", "create_simple_document"], [2, 2, 1, "", "from_named_section_texts"], [2, 2, 1, "", "get_entities"], [2, 3, 1, "", "idx"], [2, 2, 1, "", "json"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "sections"]], "kazu.data.data.DocumentJsonUtils": [[2, 4, 1, "", "ConversionException"], [2, 3, 1, "", "atomic_types"], [2, 2, 1, "", "doc_to_json_dict"], [2, 2, 1, "", "empty"], [2, 3, 1, "", "listlike_types"], [2, 2, 1, "", "minify_json_dict"], [2, 2, 1, "", "obj_to_dict_repr"], [2, 2, 1, "", "remove_empty_elements"]], "kazu.data.data.Entity": [[2, 2, 1, "", "__init__"], [2, 2, 1, "", "add_mapping"], [2, 2, 1, "", "as_brat"], [2, 2, 1, "", "calc_starts_and_ends"], [2, 3, 1, "", "end"], [2, 3, 1, "", "entity_class"], [2, 2, 1, "", "from_spans"], [2, 2, 1, "", "is_completely_overlapped"], [2, 2, 1, "", "is_partially_overlapped"], [2, 2, 1, "", "load_contiguous_entity"], [2, 3, 1, "", 
"mappings"], [2, 3, 1, "", "match"], [2, 3, 1, "", "match_norm"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "namespace"], [2, 3, 1, "", "spans"], [2, 3, 1, "", "start"], [2, 3, 1, "", "syn_term_to_synonym_terms"], [2, 2, 1, "", "update_terms"]], "kazu.data.data.EquivalentIdAggregationStrategy": [[2, 3, 1, "", "CUSTOM"], [2, 3, 1, "", "MERGED_AS_NON_SYMBOLIC"], [2, 3, 1, "", "NO_STRATEGY"], [2, 3, 1, "", "RESOLVED_BY_SIMILARITY"], [2, 3, 1, "", "SYNONYM_IS_AMBIGUOUS"], [2, 3, 1, "", "UNAMBIGUOUS"]], "kazu.data.data.EquivalentIdSet": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "ids"], [2, 3, 1, "", "ids_to_source"]], "kazu.data.data.LinkRanks": [[2, 3, 1, "", "AMBIGUOUS"], [2, 3, 1, "", "HIGHLY_LIKELY"], [2, 3, 1, "", "POSSIBLE"], [2, 3, 1, "", "PROBABLE"]], "kazu.data.data.Mapping": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "confidence"], [2, 3, 1, "", "default_label"], [2, 3, 1, "", "disambiguation_strategy"], [2, 3, 1, "", "idx"], [2, 3, 1, "", "mapping_strategy"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "parser_name"], [2, 3, 1, "", "source"], [2, 3, 1, "", "xref_source_parser_name"]], "kazu.data.data.Section": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "entities"], [2, 2, 1, "", "get_text"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "name"], [2, 3, 1, "", "offset_map"], [2, 3, 1, "", "preprocessed_text"], [2, 5, 1, "", "sentence_spans"], [2, 3, 1, "", "text"]], "kazu.data.data.SynonymTerm": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "aggregated_by"], [2, 3, 1, "", "associated_id_sets"], [2, 5, 1, "", "is_ambiguous"], [2, 3, 1, "", "is_symbolic"], [2, 3, 1, "", "mapping_types"], [2, 3, 1, "", "parser_name"], [2, 3, 1, "", "term_norm"], [2, 3, 1, "", "terms"]], "kazu.data.data.SynonymTermWithMetrics": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "bool_score"], [2, 3, 1, "", "embed_score"], [2, 3, 1, "", "exact_match"], [2, 2, 1, "", "from_synonym_term"], [2, 2, 1, "", "merge_metrics"], [2, 3, 1, "", "search_score"]], "kazu.data.pytorch": [[3, 1, 1, "", "HFDataset"]], 
"kazu.data.pytorch.HFDataset": [[3, 2, 1, "", "__init__"]], "kazu.modelling": [[5, 0, 0, "-", "annotation"], [8, 0, 0, "-", "database"], [10, 0, 0, "-", "distillation"], [18, 0, 0, "-", "hf_lightning_wrappers"], [19, 0, 0, "-", "language"], [22, 0, 0, "-", "linking"], [25, 0, 0, "-", "ontology_matching"], [28, 0, 0, "-", "ontology_preprocessing"]], "kazu.modelling.annotation": [[6, 0, 0, "-", "acceptance_test"], [7, 0, 0, "-", "label_studio"]], "kazu.modelling.annotation.acceptance_test": [[6, 4, 1, "", "AcceptanceTestFailure"], [6, 1, 1, "", "AggregatedAccuracyResult"], [6, 1, 1, "", "SectionScorer"], [6, 6, 1, "", "acceptance_criteria"], [6, 6, 1, "", "aggregate_linking_results"], [6, 6, 1, "", "aggregate_ner_results"], [6, 6, 1, "", "analyse_full_pipeline"], [6, 6, 1, "", "check_annotation_consistency"], [6, 6, 1, "", "check_ent_class_consistency"], [6, 6, 1, "", "check_ent_mapping_consistency"], [6, 6, 1, "", "check_ent_match_abnormalities"], [6, 6, 1, "", "check_results_meet_threshold"], [6, 6, 1, "", "execute_full_pipeline_acceptance_test"], [6, 6, 1, "", "score_sections"]], "kazu.modelling.annotation.acceptance_test.AggregatedAccuracyResult": [[6, 2, 1, "", "__init__"], [6, 2, 1, "", "add_fn"], [6, 2, 1, "", "add_fp"], [6, 3, 1, "", "fn"], [6, 3, 1, "", "fn_counter"], [6, 5, 1, "", "fn_info"], [6, 3, 1, "", "fn_items_to_tasks"], [6, 3, 1, "", "fp"], [6, 3, 1, "", "fp_counter"], [6, 5, 1, "", "fp_info"], [6, 3, 1, "", "fp_items_to_tasks"], [6, 5, 1, "", "precision"], [6, 5, 1, "", "recall"], [6, 2, 1, "", "tasks_for_fn"], [6, 2, 1, "", "tasks_for_fp"], [6, 3, 1, "", "tp"]], "kazu.modelling.annotation.acceptance_test.SectionScorer": [[6, 2, 1, "", "__init__"], [6, 2, 1, "", "calculate_linking_matches"], [6, 2, 1, "", "calculate_ner_matches"], [6, 2, 1, "", "group_mappings_by_source"]], "kazu.modelling.annotation.label_studio": [[7, 1, 1, "", "KazuToLabelStudioConverter"], [7, 1, 1, "", "LSToKazuConversion"], [7, 1, 1, "", "LabelStudioAnnotationView"], [7, 1, 
1, "", "LabelStudioManager"]], "kazu.modelling.annotation.label_studio.KazuToLabelStudioConverter": [[7, 2, 1, "", "convert_docs_to_tasks"], [7, 2, 1, "", "convert_single_doc_to_tasks"]], "kazu.modelling.annotation.label_studio.LSToKazuConversion": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "convert_tasks_to_docs"], [7, 2, 1, "", "create_ents"], [7, 2, 1, "", "create_mappings"], [7, 2, 1, "", "create_section"]], "kazu.modelling.annotation.label_studio.LabelStudioAnnotationView": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "build_labels"], [7, 2, 1, "", "build_taxonomy"], [7, 2, 1, "", "create_main_view"], [7, 2, 1, "", "getDOM"]], "kazu.modelling.annotation.label_studio.LabelStudioManager": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "create_linking_project"], [7, 2, 1, "", "delete_project_if_exists"], [7, 2, 1, "", "export_from_ls"], [7, 2, 1, "", "get_all_tasks"], [7, 2, 1, "", "get_tasks"], [7, 2, 1, "", "import_to_ls"], [7, 5, 1, "", "project_id"]], "kazu.modelling.database": [[9, 0, 0, "-", "in_memory_db"]], "kazu.modelling.database.in_memory_db": [[9, 1, 1, "", "MetadataDatabase"], [9, 1, 1, "", "SynonymDatabase"]], "kazu.modelling.database.in_memory_db.MetadataDatabase": [[9, 2, 1, "", "add_parser"], [9, 2, 1, "", "get_all"], [9, 2, 1, "", "get_by_idx"], [9, 2, 1, "", "get_by_index"], [9, 3, 1, "", "loaded_parsers"]], "kazu.modelling.database.in_memory_db.SynonymDatabase": [[9, 2, 1, "", "add"], [9, 2, 1, "", "get"], [9, 2, 1, "", "get_all"], [9, 2, 1, "", "get_syns_for_id"], [9, 2, 1, "", "get_syns_sharing_id"], [9, 3, 1, "", "loaded_parsers"]], "kazu.modelling.distillation": [[11, 0, 0, "-", "data_utils"], [12, 0, 0, "-", "dataprocessor"], [13, 0, 0, "-", "lightning_plugins"], [14, 0, 0, "-", "metrics"], [15, 0, 0, "-", "models"], [16, 0, 0, "-", "tiny_transformers"], [17, 0, 0, "-", "train"]], "kazu.modelling.distillation.data_utils": [[11, 6, 1, "", "to_unicode"]], "kazu.modelling.distillation.dataprocessor": [[12, 1, 1, "", "NerProcessor"], [12, 1, 1, "", 
"SeqTagProcessor"]], "kazu.modelling.distillation.dataprocessor.NerProcessor": [[12, 2, 1, "", "get_aug_examples"], [12, 2, 1, "", "get_dev_examples"], [12, 2, 1, "", "get_test_examples"], [12, 2, 1, "", "get_train_examples"]], "kazu.modelling.distillation.dataprocessor.SeqTagProcessor": [[12, 2, 1, "", "get_aug_examples"], [12, 2, 1, "", "get_dev_examples"], [12, 2, 1, "", "get_train_examples"]], "kazu.modelling.distillation.lightning_plugins": [[13, 1, 1, "", "StudentModelCheckpointIO"]], "kazu.modelling.distillation.lightning_plugins.StudentModelCheckpointIO": [[13, 2, 1, "", "__init__"], [13, 2, 1, "", "load_checkpoint"], [13, 2, 1, "", "remove_checkpoint"], [13, 2, 1, "", "save_checkpoint"]], "kazu.modelling.distillation.metrics": [[14, 6, 1, "", "accuracy"], [14, 6, 1, "", "numeric_label_f1_score"]], "kazu.modelling.distillation.models": [[15, 1, 1, "", "NerDataset"], [15, 1, 1, "", "SequenceTaggingDistillationBase"], [15, 1, 1, "", "SequenceTaggingDistillationForFinalLayer"], [15, 1, 1, "", "SequenceTaggingDistillationForIntermediateLayer"], [15, 1, 1, "", "TaskSpecificDistillation"]], "kazu.modelling.distillation.models.NerDataset": [[15, 2, 1, "", "__init__"], [15, 2, 1, "", "convert_single_example"]], "kazu.modelling.distillation.models.SequenceTaggingDistillationBase": [[15, 2, 1, "", "__init__"], [15, 2, 1, "", "get_training_examples"], [15, 2, 1, "", "train_dataloader"], [15, 3, 1, "", "training"], [15, 2, 1, "", "val_dataloader"]], "kazu.modelling.distillation.models.SequenceTaggingDistillationForFinalLayer": [[15, 2, 1, "", "__init__"], [15, 3, 1, "", "allow_zero_length_dataloader_with_multiple_devices"], [15, 3, 1, "", "precision"], [15, 3, 1, "", "prepare_data_per_node"], [15, 2, 1, "", "soft_cross_entropy"], [15, 2, 1, "", "tensor_to_jagged_array"], [15, 3, 1, "", "training"], [15, 2, 1, "", "training_step"], [15, 2, 1, "", "validation_epoch_end"], [15, 2, 1, "", "validation_step"]], 
"kazu.modelling.distillation.models.SequenceTaggingDistillationForIntermediateLayer": [[15, 2, 1, "", "__init__"], [15, 3, 1, "", "allow_zero_length_dataloader_with_multiple_devices"], [15, 3, 1, "", "precision"], [15, 3, 1, "", "prepare_data_per_node"], [15, 3, 1, "", "training"], [15, 2, 1, "", "training_step"], [15, 2, 1, "", "validation_epoch_end"], [15, 2, 1, "", "validation_step"]], "kazu.modelling.distillation.models.TaskSpecificDistillation": [[15, 2, 1, "", "__init__"], [15, 2, 1, "", "configure_optimizers"], [15, 2, 1, "", "get_optimizer_grouped_parameters"], [15, 2, 1, "", "get_training_examples"], [15, 3, 1, "", "training"]], "kazu.modelling.distillation.tiny_transformers": [[16, 1, 1, "", "TinyBertForSequenceTagging"]], "kazu.modelling.distillation.tiny_transformers.TinyBertForSequenceTagging": [[16, 2, 1, "", "__init__"], [16, 2, 1, "", "forward"], [16, 3, 1, "", "training"]], "kazu.modelling.distillation.train": [[17, 6, 1, "", "start"]], "kazu.modelling.hf_lightning_wrappers": [[18, 1, 1, "", "PLAutoModel"], [18, 1, 1, "", "PLAutoModelForTokenClassification"]], "kazu.modelling.hf_lightning_wrappers.PLAutoModel": [[18, 2, 1, "", "__init__"], [18, 2, 1, "", "predict_step"], [18, 3, 1, "", "training"]], "kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification": [[18, 2, 1, "", "__init__"], [18, 2, 1, "", "predict_step"], [18, 3, 1, "", "training"]], "kazu.modelling.language": [[20, 0, 0, "-", "language_phenomena"], [21, 0, 0, "-", "string_similarity_scorers"]], "kazu.modelling.language.string_similarity_scorers": [[21, 1, 1, "", "BooleanStringSimilarityScorer"], [21, 1, 1, "", "EntityNounModifierStringSimilarityScorer"], [21, 1, 1, "", "EntitySubtypeStringSimilarityScorer"], [21, 1, 1, "", "NumberMatchStringSimilarityScorer"], [21, 1, 1, "", "RapidFuzzStringSimilarityScorer"], [21, 1, 1, "", "SapbertStringSimilarityScorer"], [21, 1, 1, "", "StringSimilarityScorer"]], 
"kazu.modelling.language.string_similarity_scorers.BooleanStringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.language.string_similarity_scorers.EntityNounModifierStringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.language.string_similarity_scorers.EntitySubtypeStringSimilarityScorer": [[21, 3, 1, "", "numeric_class_phrases"]], "kazu.modelling.language.string_similarity_scorers.NumberMatchStringSimilarityScorer": [[21, 3, 1, "", "number_finder"]], "kazu.modelling.language.string_similarity_scorers.SapbertStringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.linking": [[23, 0, 0, "-", "sapbert"]], "kazu.modelling.linking.sapbert": [[24, 0, 0, "-", "train"]], "kazu.modelling.linking.sapbert.train": [[24, 1, 1, "", "Candidate"], [24, 1, 1, "", "GoldStandardExample"], [24, 1, 1, "", "HFSapbertInferenceDataset"], [24, 1, 1, "", "HFSapbertPairwiseDataset"], [24, 1, 1, "", "PLSapbertModel"], [24, 1, 1, "", "SapbertDataCollatorWithPadding"], [24, 1, 1, "", "SapbertEvaluationDataManager"], [24, 1, 1, "", "SapbertEvaluationDataset"], [24, 1, 1, "", "SapbertTrainingParams"], [24, 6, 1, "", "get_embedding_dataloader_from_strings"], [24, 6, 1, "", "init_hf_collate_fn"], [24, 6, 1, "", "start"]], "kazu.modelling.linking.sapbert.train.Candidate": [[24, 2, 1, "", "__new__"], [24, 3, 1, "", "correct"], [24, 3, 1, "", "default_label"], [24, 3, 1, "", "iri"]], "kazu.modelling.linking.sapbert.train.GoldStandardExample": [[24, 2, 1, "", "__new__"], [24, 3, 1, "", "candidates"], [24, 3, 1, "", "gold_default_label"], [24, 3, 1, "", "gold_iri"]], "kazu.modelling.linking.sapbert.train.HFSapbertInferenceDataset": [[24, 2, 1, "", "__init__"]], "kazu.modelling.linking.sapbert.train.HFSapbertPairwiseDataset": [[24, 2, 1, "", "__init__"]], "kazu.modelling.linking.sapbert.train.PLSapbertModel": [[24, 2, 1, "", "__init__"], [24, 2, 1, 
"", "configure_optimizers"], [24, 2, 1, "", "evaluate_topk_acc"], [24, 2, 1, "", "forward"], [24, 2, 1, "", "get_candidate_dict"], [24, 2, 1, "", "get_embeddings"], [24, 2, 1, "", "get_embeddings_for_strings"], [24, 2, 1, "", "get_embeddings_from_dataloader"], [24, 2, 1, "", "log_results"], [24, 2, 1, "", "predict_step"], [24, 2, 1, "", "train_dataloader"], [24, 3, 1, "", "training"], [24, 2, 1, "", "training_step"], [24, 2, 1, "", "val_dataloader"], [24, 2, 1, "", "validation_epoch_end"], [24, 2, 1, "", "validation_step"]], "kazu.modelling.linking.sapbert.train.SapbertDataCollatorWithPadding": [[24, 2, 1, "", "__init__"], [24, 3, 1, "", "max_length"], [24, 3, 1, "", "pad_to_multiple_of"], [24, 3, 1, "", "padding"], [24, 3, 1, "", "tokenizer"]], "kazu.modelling.linking.sapbert.train.SapbertEvaluationDataManager": [[24, 2, 1, "", "__init__"]], "kazu.modelling.linking.sapbert.train.SapbertEvaluationDataset": [[24, 2, 1, "", "__new__"], [24, 3, 1, "", "ontology_source"], [24, 3, 1, "", "query_source"]], "kazu.modelling.linking.sapbert.train.SapbertTrainingParams": [[24, 3, 1, "", "lr"], [24, 3, 1, "", "miner_margin"], [24, 3, 1, "", "num_workers"], [24, 3, 1, "", "topk"], [24, 3, 1, "", "train_batch_size"], [24, 3, 1, "", "train_file"], [24, 3, 1, "", "type_of_triplets"], [24, 3, 1, "", "weight_decay"]], "kazu.modelling.ontology_matching": [[26, 0, 0, "-", "assemble_pipeline"], [27, 0, 0, "-", "ontology_matcher"]], "kazu.modelling.ontology_matching.assemble_pipeline": [[26, 6, 1, "", "custom_tokenizer"], [26, 6, 1, "", "main"]], "kazu.modelling.ontology_matching.ontology_matcher": [[27, 1, 1, "", "CuratedTerm"], [27, 1, 1, "", "OntologyMatcher"], [27, 1, 1, "", "OntologyMatcherConfig"]], "kazu.modelling.ontology_matching.ontology_matcher.CuratedTerm": [[27, 2, 1, "", "__init__"], [27, 3, 1, "", "action"], [27, 3, 1, "", "case_sensitive"], [27, 3, 1, "", "entity_class"], [27, 3, 1, "", "term"], [27, 3, 1, "", "term_norm_mapping"]], 
"kazu.modelling.ontology_matching.ontology_matcher.OntologyMatcher": [[27, 2, 1, "", "__init__"], [27, 2, 1, "", "create_lowercase_phrasematcher_from_parsers"], [27, 2, 1, "", "create_phrasematchers_from_curated_list"], [27, 2, 1, "", "filter_by_contexts"], [27, 2, 1, "", "from_disk"], [27, 5, 1, "", "labels"], [27, 5, 1, "", "match_id_sep"], [27, 5, 1, "", "nr_lowercase_rules"], [27, 5, 1, "", "nr_strict_rules"], [27, 5, 1, "", "parser_name_to_entity_type"], [27, 2, 1, "", "set_context_matchers"], [27, 2, 1, "", "set_labels"], [27, 2, 1, "", "span_in_FP_context"], [27, 2, 1, "", "span_in_FP_coocc"], [27, 2, 1, "", "span_in_TP_context"], [27, 2, 1, "", "span_in_TP_coocc"], [27, 5, 1, "", "span_key"], [27, 2, 1, "", "to_disk"]], "kazu.modelling.ontology_matching.ontology_matcher.OntologyMatcherConfig": [[27, 2, 1, "", "__init__"], [27, 3, 1, "", "labels"], [27, 3, 1, "", "match_id_sep"], [27, 3, 1, "", "parser_name_to_entity_type"], [27, 3, 1, "", "span_key"]], "kazu.modelling.ontology_preprocessing": [[29, 0, 0, "-", "base"], [30, 0, 0, "-", "synonym_generation"]], "kazu.modelling.ontology_preprocessing.base": [[29, 1, 1, "", "BiologicalProcessGeneOntologyParser"], [29, 1, 1, "", "CLOOntologyParser"], [29, 1, 1, "", "CellosaurusOntologyParser"], [29, 1, 1, "", "CellularComponentGeneOntologyParser"], [29, 1, 1, "", "ChemblOntologyParser"], [29, 1, 1, "", "EnsemblOntologyParser"], [29, 1, 1, "", "GeneOntologyParser"], [29, 1, 1, "", "JsonLinesOntologyParser"], [29, 1, 1, "", "MeddraOntologyParser"], [29, 1, 1, "", "MolecularFunctionGeneOntologyParser"], [29, 1, 1, "", "MondoOntologyParser"], [29, 1, 1, "", "OntologyParser"], [29, 1, 1, "", "OpenTargetsDiseaseOntologyParser"], [29, 1, 1, "", "OpenTargetsMoleculeOntologyParser"], [29, 1, 1, "", "OpenTargetsTargetOntologyParser"], [29, 1, 1, "", "RDFGraphParser"], [29, 1, 1, "", "UberonOntologyParser"]], "kazu.modelling.ontology_preprocessing.base.BiologicalProcessGeneOntologyParser": [[29, 2, 1, "", "__init__"]], 
"kazu.modelling.ontology_preprocessing.base.CLOOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"]], "kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "cell_line_re"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "score_and_group_ids"]], "kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser": [[29, 2, 1, "", "__init__"]], "kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.EnsemblOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.GeneOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 3, 1, "", "instances"], [29, 3, 1, "", "instances_in_dbs"], [29, 2, 1, "", "load_go"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "populate_databases"]], "kazu.modelling.ontology_preprocessing.base.JsonLinesOntologyParser": [[29, 2, 1, "", "json_dict_to_parser_records"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "read"]], "kazu.modelling.ontology_preprocessing.base.MeddraOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.MolecularFunctionGeneOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.MondoOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "is_valid_iri"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.OntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "all_synonym_column_names"], [29, 2, 1, "", "drop_excluded_ids"], [29, 2, 1, "", "export_metadata"], [29, 2, 1, "", 
"export_synonym_terms"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "generate_synonyms"], [29, 3, 1, "", "minimum_metadata_column_names"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "populate_databases"], [29, 2, 1, "", "populate_metadata_database"], [29, 2, 1, "", "populate_synonym_database"], [29, 2, 1, "", "resolve_synonyms"], [29, 2, 1, "", "score_and_group_ids"]], "kazu.modelling.ontology_preprocessing.base.OpenTargetsDiseaseOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "allowed_sources"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "json_dict_to_parser_records"], [29, 2, 1, "", "look_for_mondo"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.OpenTargetsMoleculeOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "json_dict_to_parser_records"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.OpenTargetsTargetOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "annotation_fields"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "json_dict_to_parser_records"], [29, 3, 1, "", "parsed_dataframe"], [29, 2, 1, "", "score_and_group_ids"]], "kazu.modelling.ontology_preprocessing.base.RDFGraphParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "convert_to_rdflib_ref"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "is_valid_iri"], [29, 2, 1, "", "parse_to_dataframe"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.UberonOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.synonym_generation": [[30, 1, 1, "", "CombinatorialSynonymGenerator"], [30, 1, 1, "", "GreekSymbolSubstitution"], [30, 1, 1, "", "SeparatorExpansion"], [30, 1, 1, "", "SpellingVariationReplacement"], [30, 1, 1, "", "StopWordRemover"], [30, 1, 1, "", "StringReplacement"], [30, 1, 1, "", "SuffixReplacement"], [30, 1, 1, "", "SynonymGenerator"]], 
"kazu.modelling.ontology_preprocessing.synonym_generation.CombinatorialSynonymGenerator": [[30, 2, 1, "", "__init__"]], "kazu.modelling.ontology_preprocessing.synonym_generation.GreekSymbolSubstitution": [[30, 3, 1, "", "ALL_SUBS"], [30, 3, 1, "", "greek_letter"], [30, 3, 1, "", "lower_greek_letter"], [30, 3, 1, "", "spelling"], [30, 3, 1, "", "upper_greek_letter"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SeparatorExpansion": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SpellingVariationReplacement": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.StopWordRemover": [[30, 3, 1, "", "all_stopwords"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.StringReplacement": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SuffixReplacement": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SynonymGenerator": [[30, 2, 1, "", "call"]], "kazu.pipeline": [[32, 0, 0, "-", "pipeline"]], "kazu.pipeline.pipeline": [[32, 1, 1, "", "FailedDocsFileHandler"], [32, 1, 1, "", "FailedDocsHandler"], [32, 1, 1, "", "FailedDocsLogHandler"], [32, 1, 1, "", "Pipeline"], [32, 6, 1, "", "batch_metrics"], [32, 6, 1, "", "calc_doc_size"], [32, 6, 1, "", "load_steps_and_log_memory_usage"]], "kazu.pipeline.pipeline.FailedDocsFileHandler": [[32, 2, 1, "", "__init__"]], "kazu.pipeline.pipeline.Pipeline": [[32, 2, 1, "", "__init__"], [32, 2, 1, "", "prefilter_docs"], [32, 2, 1, "", "profile"], [32, 2, 1, "", "reset"], [32, 2, 1, "", "update_failed_docs"]], "kazu.steps": [[34, 0, 0, "-", "document_post_processing"], [36, 0, 0, "-", "joint_ner_and_linking"], [38, 0, 0, "-", "linking"], [50, 0, 0, "-", "ner"], [56, 0, 0, "-", "other"], [60, 0, 0, "-", "step"]], "kazu.steps.document_post_processing": 
[[35, 0, 0, "-", "abbreviation_finder"]], "kazu.steps.document_post_processing.abbreviation_finder": [[35, 1, 1, "", "AbbreviationFinderStep"]], "kazu.steps.document_post_processing.abbreviation_finder.AbbreviationFinderStep": [[35, 2, 1, "", "__init__"]], "kazu.steps.joint_ner_and_linking": [[37, 0, 0, "-", "explosion"]], "kazu.steps.joint_ner_and_linking.explosion": [[37, 1, 1, "", "ExplosionStringMatchingStep"]], "kazu.steps.joint_ner_and_linking.explosion.ExplosionStringMatchingStep": [[37, 2, 1, "", "__init__"], [37, 2, 1, "", "extract_entity_data_from_spans"]], "kazu.steps.linking": [[39, 0, 0, "-", "dictionary"], [40, 0, 0, "-", "mapping_step"], [41, 0, 0, "-", "post_processing"], [49, 0, 0, "-", "sapbert"]], "kazu.steps.linking.dictionary": [[39, 1, 1, "", "DictionaryEntityLinkingStep"]], "kazu.steps.linking.dictionary.DictionaryEntityLinkingStep": [[39, 2, 1, "", "__init__"], [39, 2, 1, "", "load_or_build_caches"]], "kazu.steps.linking.mapping_step": [[40, 1, 1, "", "MappingStep"]], "kazu.steps.linking.mapping_step.MappingStep": [[40, 2, 1, "", "__init__"]], "kazu.steps.linking.post_processing": [[42, 0, 0, "-", "disambiguation"], [45, 0, 0, "-", "mapping_strategies"], [47, 0, 0, "-", "strategy_runner"], [48, 0, 0, "-", "xref_manager"]], "kazu.steps.linking.post_processing.disambiguation": [[43, 0, 0, "-", "context_scoring"], [44, 0, 0, "-", "strategies"]], "kazu.steps.linking.post_processing.disambiguation.context_scoring": [[43, 1, 1, "", "TfIdfScorer"], [43, 6, 1, "", "create_word_and_char_ngrams"]], "kazu.steps.linking.post_processing.disambiguation.context_scoring.TfIdfScorer": [[43, 2, 1, "", "__init__"], [43, 2, 1, "", "build_or_load_vectorizers"], [43, 2, 1, "", "build_vectorizers"], [43, 2, 1, "", "load_vectorizer"], [43, 2, 1, "", "load_vectorizers"]], "kazu.steps.linking.post_processing.disambiguation.strategies": [[44, 1, 1, "", "AnnotationLevelDisambiguationStrategy"], [44, 1, 1, "", "DefinedElsewhereInDocumentDisambiguationStrategy"], [44, 1, 
1, "", "DisambiguationStrategy"], [44, 1, 1, "", "TfIdfDisambiguationStrategy"]], "kazu.steps.linking.post_processing.disambiguation.strategies.AnnotationLevelDisambiguationStrategy": [[44, 2, 1, "", "disambiguate"], [44, 3, 1, "", "metadata_db"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.disambiguation.strategies.DefinedElsewhereInDocumentDisambiguationStrategy": [[44, 2, 1, "", "__init__"], [44, 2, 1, "", "disambiguate"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.disambiguation.strategies.DisambiguationStrategy": [[44, 2, 1, "", "disambiguate"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.disambiguation.strategies.TfIdfDisambiguationStrategy": [[44, 3, 1, "", "CONTEXT_SCORE"], [44, 2, 1, "", "__init__"], [44, 2, 1, "", "build_id_set_representation"], [44, 2, 1, "", "cacheable_build_document_representation"], [44, 2, 1, "", "disambiguate"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.mapping_strategies": [[46, 0, 0, "-", "strategies"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies": [[46, 1, 1, "", "DefinedElsewhereInDocumentMappingStrategy"], [46, 1, 1, "", "ExactMatchMappingStrategy"], [46, 1, 1, "", "MappingFactory"], [46, 1, 1, "", "MappingStrategy"], [46, 1, 1, "", "StrongMatchMappingStrategy"], [46, 1, 1, "", "StrongMatchWithEmbeddingConfirmationStringMatchingStrategy"], [46, 1, 1, "", "SymbolMatchMappingStrategy"], [46, 1, 1, "", "TermNormIsSubStringMappingStrategy"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.DefinedElsewhereInDocumentMappingStrategy": [[46, 2, 1, "", "filter_terms"], [46, 3, 1, "", "found_equivalent_ids"], [46, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.ExactMatchMappingStrategy": [[46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.MappingFactory": [[46, 2, 1, "", "create_mapping"], [46, 2, 1, "", 
"create_mapping_from_id_set"], [46, 2, 1, "", "create_mapping_from_id_sets"], [46, 3, 1, "", "metadata_db"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.MappingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "disambiguate_if_required"], [46, 2, 1, "", "filter_terms"], [46, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.StrongMatchMappingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.StrongMatchWithEmbeddingConfirmationStringMatchingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.SymbolMatchMappingStrategy": [[46, 2, 1, "", "filter_terms"], [46, 2, 1, "", "match_symbols"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.TermNormIsSubStringMappingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.strategy_runner": [[47, 1, 1, "", "NamespaceStrategyExecution"], [47, 1, 1, "", "StrategyRunner"], [47, 6, 1, "", "entity_to_entity_key"]], "kazu.steps.linking.post_processing.strategy_runner.NamespaceStrategyExecution": [[47, 2, 1, "", "__init__"], [47, 2, 1, "", "get_strategies_for_entity_class"], [47, 5, 1, "", "longest_mapping_strategy_list_size"], [47, 2, 1, "", "reset"]], "kazu.steps.linking.post_processing.strategy_runner.StrategyRunner": [[47, 2, 1, "", "__init__"], [47, 2, 1, "", "execute_hit_post_processing_strategies"], [47, 2, 1, "", "group_entities_by_symbolism"]], "kazu.steps.linking.post_processing.xref_manager": [[48, 1, 1, "", "CrossReferenceManager"], [48, 1, 1, "", "OxoCrossReferenceManager"]], "kazu.steps.linking.post_processing.xref_manager.CrossReferenceManager": [[48, 2, 1, "", "__init__"], [48, 2, 1, "", "build_xref_cache"], [48, 2, 1, "", "create_xref_mappings"], [48, 2, 1, "", "load"], [48, 2, 1, "", "load_or_build_cache"], 
[48, 2, 1, "", "save"], [48, 3, 1, "", "xref_db"]], "kazu.steps.linking.post_processing.xref_manager.OxoCrossReferenceManager": [[48, 2, 1, "", "__init__"], [48, 2, 1, "", "build_xref_cache"], [48, 2, 1, "", "create_oxo_dump"], [48, 3, 1, "", "headers"], [48, 3, 1, "", "oxo_url"], [48, 2, 1, "", "parse_oxo_dump"], [48, 3, 1, "", "xref_db"]], "kazu.steps.linking.sapbert": [[49, 1, 1, "", "SapBertForEntityLinkingStep"]], "kazu.steps.linking.sapbert.SapBertForEntityLinkingStep": [[49, 2, 1, "", "__init__"], [49, 2, 1, "", "load_or_build_caches"], [49, 2, 1, "", "process_entities"]], "kazu.steps.ner": [[51, 0, 0, "-", "entity_post_processing"], [52, 0, 0, "-", "hf_token_classification"], [53, 0, 0, "-", "seth"], [54, 0, 0, "-", "spacy_ner"], [55, 0, 0, "-", "tokenized_word_processor"]], "kazu.steps.ner.entity_post_processing": [[51, 1, 1, "", "NonContiguousEntitySplitter"], [51, 1, 1, "", "SplitOnConjunctionPattern"], [51, 1, 1, "", "SplitOnNumericalListPatternWithPrefix"]], "kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter": [[51, 2, 1, "", "__init__"]], "kazu.steps.ner.entity_post_processing.SplitOnConjunctionPattern": [[51, 2, 1, "", "__init__"], [51, 2, 1, "", "run_conjunction_rules"]], "kazu.steps.ner.entity_post_processing.SplitOnNumericalListPatternWithPrefix": [[51, 2, 1, "", "__init__"]], "kazu.steps.ner.hf_token_classification": [[52, 1, 1, "", "TransformersModelForTokenClassificationNerStep"]], "kazu.steps.ner.hf_token_classification.TransformersModelForTokenClassificationNerStep": [[52, 2, 1, "", "__init__"], [52, 2, 1, "", "frame_to_tok_word"], [52, 2, 1, "", "get_activations"], [52, 2, 1, "", "get_dataloader"], [52, 2, 1, "", "get_list_of_batch_encoding_frames_for_section"], [52, 2, 1, "", "id2labels_from_label_list"], [52, 2, 1, "", "section_frames_to_tokenised_words"]], "kazu.steps.ner.seth": [[53, 1, 1, "", "SethStep"]], "kazu.steps.ner.seth.SethStep": [[53, 2, 1, "", "__init__"]], "kazu.steps.ner.spacy_ner": [[54, 1, 1, "", 
"SpacyNerStep"]], "kazu.steps.ner.spacy_ner.SpacyNerStep": [[54, 2, 1, "", "__init__"]], "kazu.steps.ner.tokenized_word_processor": [[55, 1, 1, "", "SimpleSpanFinder"], [55, 1, 1, "", "SmartSpanFinder"], [55, 1, 1, "", "SpanFinder"], [55, 1, 1, "", "TokWordSpan"], [55, 1, 1, "", "TokenizedWord"], [55, 1, 1, "", "TokenizedWordProcessor"]], "kazu.steps.ner.tokenized_word_processor.SimpleSpanFinder": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "get_bio_and_class_labels"], [55, 2, 1, "", "process_next_word"], [55, 2, 1, "", "span_continue_condition"]], "kazu.steps.ner.tokenized_word_processor.SmartSpanFinder": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "get_bio_and_class_labels"], [55, 2, 1, "", "process_next_word"], [55, 2, 1, "", "span_continue_condition"]], "kazu.steps.ner.tokenized_word_processor.SpanFinder": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "close_spans"], [55, 2, 1, "", "get_bio_and_class_labels"], [55, 2, 1, "", "process_next_word"], [55, 2, 1, "", "span_continue_condition"], [55, 2, 1, "", "start_span"]], "kazu.steps.ner.tokenized_word_processor.TokWordSpan": [[55, 2, 1, "", "__init__"], [55, 3, 1, "", "clazz"], [55, 3, 1, "", "subspan"], [55, 3, 1, "", "tok_words"]], "kazu.steps.ner.tokenized_word_processor.TokenizedWord": [[55, 2, 1, "", "__init__"], [55, 3, 1, "", "token_confidences"], [55, 3, 1, "", "token_ids"], [55, 3, 1, "", "token_offsets"], [55, 3, 1, "", "tokens"], [55, 3, 1, "", "word_char_end"], [55, 3, 1, "", "word_char_start"], [55, 3, 1, "", "word_id"]], "kazu.steps.ner.tokenized_word_processor.TokenizedWordProcessor": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "calculate_span_offsets"], [55, 2, 1, "", "make_span_finder"], [55, 2, 1, "", "spans_to_entities"]], "kazu.steps.other": [[57, 0, 0, "-", "cleanup"], [58, 0, 0, "-", "merge_overlapping_ents"], [59, 0, 0, "-", "stanza"]], "kazu.steps.other.cleanup": [[57, 1, 1, "", "CleanupAction"], [57, 1, 1, "", "CleanupStep"], [57, 1, 1, "", "DropMappingsByConfidenceMappingFilter"], [57, 1, 
1, "", "DropUnmappedEntityFilter"], [57, 1, 1, "", "EntityFilterCleanupAction"], [57, 1, 1, "", "MappingFilterCleanupAction"]], "kazu.steps.other.cleanup.CleanupAction": [[57, 2, 1, "", "__init__"], [57, 2, 1, "", "cleanup"]], "kazu.steps.other.cleanup.CleanupStep": [[57, 2, 1, "", "__init__"]], "kazu.steps.other.cleanup.DropMappingsByConfidenceMappingFilter": [[57, 2, 1, "", "__init__"]], "kazu.steps.other.cleanup.DropUnmappedEntityFilter": [[57, 2, 1, "", "__init__"]], "kazu.steps.other.cleanup.EntityFilterCleanupAction": [[57, 2, 1, "", "__init__"], [57, 2, 1, "", "cleanup"]], "kazu.steps.other.cleanup.MappingFilterCleanupAction": [[57, 2, 1, "", "__init__"], [57, 2, 1, "", "cleanup"]], "kazu.steps.other.merge_overlapping_ents": [[58, 1, 1, "", "MergeOverlappingEntsStep"]], "kazu.steps.other.merge_overlapping_ents.MergeOverlappingEntsStep": [[58, 2, 1, "", "__init__"], [58, 2, 1, "", "filter_ents_across_class"], [58, 2, 1, "", "group_entities_by_location"], [58, 2, 1, "", "select_preferred_entity"]], "kazu.steps.other.stanza": [[59, 1, 1, "", "StanzaStep"]], "kazu.steps.other.stanza.StanzaStep": [[59, 2, 1, "", "__init__"]], "kazu.steps.step": [[60, 1, 1, "", "Step"], [60, 6, 1, "", "document_batch_step"], [60, 6, 1, "", "document_iterating_step"]], "kazu.steps.step.Step": [[60, 2, 1, "", "__init__"], [60, 2, 1, "", "namespace"]], "kazu.utils": [[62, 0, 0, "-", "abbreviation_detector"], [63, 0, 0, "-", "build_and_test_model_packs"], [64, 0, 0, "-", "caching"], [65, 0, 0, "-", "grouping"], [66, 0, 0, "-", "link_index"], [67, 0, 0, "-", "spacy_pipeline"], [68, 0, 0, "-", "stanza_pipeline"], [69, 0, 0, "-", "stopwatch"], [70, 0, 0, "-", "string_normalizer"], [71, 0, 0, "-", "utils"]], "kazu.utils.abbreviation_detector": [[62, 1, 1, "", "KazuAbbreviationDetector"], [62, 6, 1, "", "filter_matches"], [62, 6, 1, "", "find_abbreviation"], [62, 6, 1, "", "short_form_filter"]], "kazu.utils.abbreviation_detector.KazuAbbreviationDetector": [[62, 2, 1, "", "__init__"]], 
"kazu.utils.build_and_test_model_packs": [[63, 4, 1, "", "ModelPackBuildError"], [63, 1, 1, "", "ModelPackBuilder"], [63, 6, 1, "", "build_custom_pack_params"]], "kazu.utils.build_and_test_model_packs.ModelPackBuilder": [[63, 2, 1, "", "build_all_model_packs"], [63, 2, 1, "", "build_caches"], [63, 2, 1, "", "clear_cached_resources_from_model_pack_dir"], [63, 2, 1, "", "process_model_pack_path"], [63, 2, 1, "", "reset_singletons"], [63, 2, 1, "", "zip_model_pack"]], "kazu.utils.caching": [[64, 1, 1, "", "EntityLinkingLookupCache"]], "kazu.utils.caching.EntityLinkingLookupCache": [[64, 2, 1, "", "__init__"], [64, 2, 1, "", "check_lookup_cache"], [64, 2, 1, "", "update_terms_lookup_cache"]], "kazu.utils.grouping": [[65, 6, 1, "", "sort_then_group"]], "kazu.utils.link_index": [[66, 1, 1, "", "CDistTensorEmbeddingIndex"], [66, 1, 1, "", "DictionaryIndex"], [66, 1, 1, "", "EmbeddingIndex"], [66, 1, 1, "", "Index"], [66, 1, 1, "", "MatMulTensorEmbeddingIndex"], [66, 1, 1, "", "TensorEmbeddingIndex"]], "kazu.utils.link_index.DictionaryIndex": [[66, 2, 1, "", "__init__"], [66, 2, 1, "", "apply_boolean_scorers"], [66, 2, 1, "", "search"]], "kazu.utils.link_index.EmbeddingIndex": [[66, 2, 1, "", "__init__"], [66, 2, 1, "", "enumerate_database_chunks"], [66, 2, 1, "", "predict_ontology_embeddings"], [66, 2, 1, "", "search"], [66, 2, 1, "", "set_embedding_model"]], "kazu.utils.link_index.Index": [[66, 2, 1, "", "__init__"], [66, 2, 1, "", "add"], [66, 2, 1, "", "build_ontology_cache"], [66, 3, 1, "", "column_type_dict"], [66, 2, 1, "", "get_index_data_path"], [66, 2, 1, "", "get_metadata_path"], [66, 2, 1, "", "get_synonym_data_path"], [66, 2, 1, "", "load"], [66, 2, 1, "", "load_or_build_cache"], [66, 2, 1, "", "save"]], "kazu.utils.link_index.MatMulTensorEmbeddingIndex": [[66, 3, 1, "", "metadata_db"], [66, 3, 1, "", "synonym_db"]], "kazu.utils.link_index.TensorEmbeddingIndex": [[66, 2, 1, "", "__init__"], [66, 3, 1, "", "metadata_db"], [66, 3, 1, "", "synonym_db"]], 
"kazu.utils.spacy_pipeline": [[67, 1, 1, "", "SpacyPipeline"]], "kazu.utils.spacy_pipeline.SpacyPipeline": [[67, 2, 1, "", "__init__"], [67, 3, 1, "", "instance"]], "kazu.utils.stanza_pipeline": [[68, 1, 1, "", "StanzaPipeline"]], "kazu.utils.stanza_pipeline.StanzaPipeline": [[68, 2, 1, "", "__init__"], [68, 2, 1, "", "from_stanza_kwargs"], [68, 5, 1, "", "instance"], [68, 2, 1, "", "simple_init"]], "kazu.utils.stopwatch": [[69, 1, 1, "", "Stopwatch"]], "kazu.utils.stopwatch.Stopwatch": [[69, 2, 1, "", "__init__"], [69, 2, 1, "", "message"], [69, 2, 1, "", "start"]], "kazu.utils.string_normalizer": [[70, 1, 1, "", "AnatomyStringNormalizer"], [70, 1, 1, "", "DefaultStringNormalizer"], [70, 1, 1, "", "DiseaseStringNormalizer"], [70, 1, 1, "", "EntityClassNormalizer"], [70, 1, 1, "", "GeneStringNormalizer"], [70, 1, 1, "", "GildaUtils"], [70, 1, 1, "", "StringNormalizer"]], "kazu.utils.string_normalizer.AnatomyStringNormalizer": [[70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"]], "kazu.utils.string_normalizer.DefaultStringNormalizer": [[70, 3, 1, "", "allowed_additional_chars"], [70, 2, 1, "", "depluralize"], [70, 3, 1, "", "greek_subs"], [70, 3, 1, "", "greek_subs_upper"], [70, 2, 1, "", "handle_lower_case_prefixes"], [70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"], [70, 3, 1, "", "number_split_pattern"], [70, 3, 1, "", "other_subs"], [70, 3, 1, "", "re_subs"], [70, 3, 1, "", "re_subs_2"], [70, 2, 1, "", "remove_non_alphanum"], [70, 2, 1, "", "replace_greek"], [70, 2, 1, "", "replace_substrings"], [70, 2, 1, "", "split_on_numbers"], [70, 2, 1, "", "sub_greek_char_abbreviations"], [70, 3, 1, "", "symbol_number_split"], [70, 3, 1, "", "trailing_lowercase_s_split"]], "kazu.utils.string_normalizer.DiseaseStringNormalizer": [[70, 2, 1, "", "is_symbol_like"], [70, 3, 1, "", "known_disease_short_nouns"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 
1, "", "normalize_symbol"]], "kazu.utils.string_normalizer.EntityClassNormalizer": [[70, 2, 1, "", "__init__"], [70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"]], "kazu.utils.string_normalizer.GeneStringNormalizer": [[70, 3, 1, "", "gene_name_suffixes"], [70, 2, 1, "", "gene_token_classifier"], [70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"], [70, 2, 1, "", "remove_trailing_s_if_otherwise_capitalised"]], "kazu.utils.string_normalizer.GildaUtils": [[70, 3, 1, "", "dashes"], [70, 2, 1, "", "depluralize"], [70, 2, 1, "", "replace_dashes"]], "kazu.utils.string_normalizer.StringNormalizer": [[70, 2, 1, "", "classify_symbolic"], [70, 2, 1, "", "normalize"], [70, 3, 1, "", "normalizers"]], "kazu.utils.utils": [[71, 1, 1, "", "EntityClassFilter"], [71, 1, 1, "", "Singleton"], [71, 6, 1, "", "as_path"], [71, 6, 1, "", "create_char_ngrams"], [71, 6, 1, "", "create_word_ngrams"], [71, 6, 1, "", "documents_to_document_section_batch_encodings_map"], [71, 6, 1, "", "documents_to_document_section_text_map"], [71, 6, 1, "", "documents_to_id_section_map"], [71, 6, 1, "", "filter_entities_with_ontology_mappings"], [71, 6, 1, "", "find_document_from_entity"], [71, 6, 1, "", "get_cache_dir"], [71, 6, 1, "", "get_cache_path"], [71, 6, 1, "", "get_match_entity_class_hash"]], "kazu.utils.utils.EntityClassFilter": [[71, 2, 1, "", "__init__"]], "kazu.web": [[73, 0, 0, "-", "jwtauth"], [74, 0, 0, "-", "routes"], [75, 0, 0, "-", "server"]], "kazu.web.jwtauth": [[73, 1, 1, "", "JWTAuthenticationBackend"], [73, 1, 1, "", "JWTUser"], [73, 6, 1, "", "on_auth_error"]], "kazu.web.jwtauth.JWTAuthenticationBackend": [[73, 2, 1, "", "__init__"], [73, 2, 1, "", "authenticate"], [73, 2, 1, "", "get_token_from_header"]], "kazu.web.jwtauth.JWTUser": [[73, 2, 1, "", "__init__"], [73, 5, 1, "", "display_name"], [73, 5, 1, "", "is_authenticated"]], "kazu.web.server": [[75, 1, 1, "", 
"SectionedWebDocument"], [75, 1, 1, "", "SimpleWebDocument"], [75, 6, 1, "", "start"], [75, 6, 1, "", "stop"]], "kazu.web.server.SectionedWebDocument": [[75, 3, 1, "", "sections"], [75, 2, 1, "", "to_kazu_document"]], "kazu.web.server.SimpleWebDocument": [[75, 3, 1, "", "text"], [75, 2, 1, "", "to_kazu_document"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:attribute", "4": "py:exception", "5": "py:property", "6": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "attribute", "Python attribute"], "4": ["py", "exception", "Python exception"], "5": ["py", "property", "Python property"], "6": ["py", "function", "Python function"]}, "titleterms": {"kazu": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 78, 79, 80, 81], "data": [1, 2, 3, 78], "pytorch": 3, "model": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 78, 86], "annot": [5, 6, 7], "acceptance_test": 6, "label_studio": 7, "databas": [8, 9], "in_memory_db": 9, "distil": [10, 11, 12, 13, 14, 15, 16, 17], "data_util": 11, "dataprocessor": 12, "lightning_plugin": 13, "metric": 14, "tiny_transform": 16, "train": [17, 24], "hf_lightning_wrapp": 18, "languag": [19, 20, 21], "language_phenomena": 20, "string_similarity_scor": 21, "link": [22, 23, 24, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 77], "sapbert": [23, 24, 49], "ontology_match": [25, 26, 27], "assemble_pipelin": 26, "ontology_preprocess": [28, 29, 30], "base": [29, 77], "synonym_gener": 30, "pipelin": [31, 32, 79, 86], "step": [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 
56, 57, 58, 59, 60, 86], "document_post_process": [34, 35], "abbreviation_find": 35, "joint_ner_and_link": [36, 37], "explos": 37, "dictionari": 39, "mapping_step": 40, "post_process": [41, 42, 43, 44, 45, 46, 47, 48], "disambigu": [42, 43, 44], "context_scor": 43, "strategi": [44, 46], "mapping_strategi": [45, 46], "strategy_runn": 47, "xref_manag": 48, "ner": [50, 51, 52, 53, 54, 55, 77], "entity_post_process": 51, "hf_token_classif": 52, "seth": 53, "spacy_n": 54, "tokenized_word_processor": 55, "other": [56, 57, 58, 59], "cleanup": 57, "merge_overlapping_": 58, "stanza": 59, "util": [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], "abbreviation_detector": 62, "build_and_test_model_pack": 63, "cach": 64, "group": 65, "link_index": 66, "spacy_pipelin": 67, "stanza_pipelin": 68, "stopwatch": 69, "string_norm": 70, "web": [72, 73, 74, 75], "jwtauth": 73, "rout": 74, "server": 75, "api": [76, 80], "refer": 76, "curat": 77, "knowledg": 77, "At": 79, "glanc": 79, "how": 79, "us": 79, "default": 79, "welcom": 80, "s": 80, "document": 80, "guid": 80, "tutori": 80, "site": 80, "index": 80, "introduct": 81, "why": 81, "summari": 81, "tba": [82, 87], "visualis": 83, "result": 83, "label": 83, "studio": 83, "The": 84, "ontologypars": 84, "write": 84, "custom": 84, "parser": 84, "quickstart": 86, "instal": 86, "pack": 86, "run": 86, "advanc": 86, "configur": 86, "hydra": 86}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx": 56}})
\ No newline at end of file
+Search.setIndex({"docnames": ["_autosummary/kazu", "_autosummary/kazu.data", "_autosummary/kazu.data.data", "_autosummary/kazu.data.pytorch", "_autosummary/kazu.modelling", "_autosummary/kazu.modelling.annotation", "_autosummary/kazu.modelling.annotation.acceptance_test", "_autosummary/kazu.modelling.annotation.label_studio", "_autosummary/kazu.modelling.database", "_autosummary/kazu.modelling.database.in_memory_db", "_autosummary/kazu.modelling.distillation", "_autosummary/kazu.modelling.distillation.data_utils", "_autosummary/kazu.modelling.distillation.dataprocessor", "_autosummary/kazu.modelling.distillation.lightning_plugins", "_autosummary/kazu.modelling.distillation.metrics", "_autosummary/kazu.modelling.distillation.models", "_autosummary/kazu.modelling.distillation.tiny_transformers", "_autosummary/kazu.modelling.distillation.train", "_autosummary/kazu.modelling.hf_lightning_wrappers", "_autosummary/kazu.modelling.language", "_autosummary/kazu.modelling.language.language_phenomena", "_autosummary/kazu.modelling.language.string_similarity_scorers", "_autosummary/kazu.modelling.linking", "_autosummary/kazu.modelling.linking.sapbert", "_autosummary/kazu.modelling.linking.sapbert.train", "_autosummary/kazu.modelling.ontology_matching", "_autosummary/kazu.modelling.ontology_matching.assemble_pipeline", "_autosummary/kazu.modelling.ontology_matching.ontology_matcher", "_autosummary/kazu.modelling.ontology_preprocessing", "_autosummary/kazu.modelling.ontology_preprocessing.base", "_autosummary/kazu.modelling.ontology_preprocessing.synonym_generation", "_autosummary/kazu.pipeline", "_autosummary/kazu.pipeline.pipeline", "_autosummary/kazu.steps", "_autosummary/kazu.steps.document_post_processing", "_autosummary/kazu.steps.document_post_processing.abbreviation_finder", "_autosummary/kazu.steps.joint_ner_and_linking", "_autosummary/kazu.steps.joint_ner_and_linking.explosion", "_autosummary/kazu.steps.linking", "_autosummary/kazu.steps.linking.dictionary", 
"_autosummary/kazu.steps.linking.mapping_step", "_autosummary/kazu.steps.linking.post_processing", "_autosummary/kazu.steps.linking.post_processing.disambiguation", "_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring", "_autosummary/kazu.steps.linking.post_processing.disambiguation.strategies", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies.strategies", "_autosummary/kazu.steps.linking.post_processing.strategy_runner", "_autosummary/kazu.steps.linking.post_processing.xref_manager", "_autosummary/kazu.steps.linking.sapbert", "_autosummary/kazu.steps.ner", "_autosummary/kazu.steps.ner.entity_post_processing", "_autosummary/kazu.steps.ner.hf_token_classification", "_autosummary/kazu.steps.ner.seth", "_autosummary/kazu.steps.ner.spacy_ner", "_autosummary/kazu.steps.ner.tokenized_word_processor", "_autosummary/kazu.steps.other", "_autosummary/kazu.steps.other.cleanup", "_autosummary/kazu.steps.other.merge_overlapping_ents", "_autosummary/kazu.steps.other.stanza", "_autosummary/kazu.steps.step", "_autosummary/kazu.utils", "_autosummary/kazu.utils.abbreviation_detector", "_autosummary/kazu.utils.build_and_test_model_packs", "_autosummary/kazu.utils.caching", "_autosummary/kazu.utils.grouping", "_autosummary/kazu.utils.link_index", "_autosummary/kazu.utils.spacy_pipeline", "_autosummary/kazu.utils.stanza_pipeline", "_autosummary/kazu.utils.stopwatch", "_autosummary/kazu.utils.string_normalizer", "_autosummary/kazu.utils.utils", "_autosummary/kazu.web", "_autosummary/kazu.web.jwtauth", "_autosummary/kazu.web.routes", "_autosummary/kazu.web.server", "apidocs_autosummary", "curating_for_explosion", "datamodel", "default_pipeline", "index", "introduction", "kazu_webservice", "label_studio_integration", "ontology_parser", "pipeline_example", "quickstart", "scaling_kazu", "single_step_example"], "filenames": ["_autosummary/kazu.rst", 
"_autosummary/kazu.data.rst", "_autosummary/kazu.data.data.rst", "_autosummary/kazu.data.pytorch.rst", "_autosummary/kazu.modelling.rst", "_autosummary/kazu.modelling.annotation.rst", "_autosummary/kazu.modelling.annotation.acceptance_test.rst", "_autosummary/kazu.modelling.annotation.label_studio.rst", "_autosummary/kazu.modelling.database.rst", "_autosummary/kazu.modelling.database.in_memory_db.rst", "_autosummary/kazu.modelling.distillation.rst", "_autosummary/kazu.modelling.distillation.data_utils.rst", "_autosummary/kazu.modelling.distillation.dataprocessor.rst", "_autosummary/kazu.modelling.distillation.lightning_plugins.rst", "_autosummary/kazu.modelling.distillation.metrics.rst", "_autosummary/kazu.modelling.distillation.models.rst", "_autosummary/kazu.modelling.distillation.tiny_transformers.rst", "_autosummary/kazu.modelling.distillation.train.rst", "_autosummary/kazu.modelling.hf_lightning_wrappers.rst", "_autosummary/kazu.modelling.language.rst", "_autosummary/kazu.modelling.language.language_phenomena.rst", "_autosummary/kazu.modelling.language.string_similarity_scorers.rst", "_autosummary/kazu.modelling.linking.rst", "_autosummary/kazu.modelling.linking.sapbert.rst", "_autosummary/kazu.modelling.linking.sapbert.train.rst", "_autosummary/kazu.modelling.ontology_matching.rst", "_autosummary/kazu.modelling.ontology_matching.assemble_pipeline.rst", "_autosummary/kazu.modelling.ontology_matching.ontology_matcher.rst", "_autosummary/kazu.modelling.ontology_preprocessing.rst", "_autosummary/kazu.modelling.ontology_preprocessing.base.rst", "_autosummary/kazu.modelling.ontology_preprocessing.synonym_generation.rst", "_autosummary/kazu.pipeline.rst", "_autosummary/kazu.pipeline.pipeline.rst", "_autosummary/kazu.steps.rst", "_autosummary/kazu.steps.document_post_processing.rst", "_autosummary/kazu.steps.document_post_processing.abbreviation_finder.rst", "_autosummary/kazu.steps.joint_ner_and_linking.rst", 
"_autosummary/kazu.steps.joint_ner_and_linking.explosion.rst", "_autosummary/kazu.steps.linking.rst", "_autosummary/kazu.steps.linking.dictionary.rst", "_autosummary/kazu.steps.linking.mapping_step.rst", "_autosummary/kazu.steps.linking.post_processing.rst", "_autosummary/kazu.steps.linking.post_processing.disambiguation.rst", "_autosummary/kazu.steps.linking.post_processing.disambiguation.context_scoring.rst", "_autosummary/kazu.steps.linking.post_processing.disambiguation.strategies.rst", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies.rst", "_autosummary/kazu.steps.linking.post_processing.mapping_strategies.strategies.rst", "_autosummary/kazu.steps.linking.post_processing.strategy_runner.rst", "_autosummary/kazu.steps.linking.post_processing.xref_manager.rst", "_autosummary/kazu.steps.linking.sapbert.rst", "_autosummary/kazu.steps.ner.rst", "_autosummary/kazu.steps.ner.entity_post_processing.rst", "_autosummary/kazu.steps.ner.hf_token_classification.rst", "_autosummary/kazu.steps.ner.seth.rst", "_autosummary/kazu.steps.ner.spacy_ner.rst", "_autosummary/kazu.steps.ner.tokenized_word_processor.rst", "_autosummary/kazu.steps.other.rst", "_autosummary/kazu.steps.other.cleanup.rst", "_autosummary/kazu.steps.other.merge_overlapping_ents.rst", "_autosummary/kazu.steps.other.stanza.rst", "_autosummary/kazu.steps.step.rst", "_autosummary/kazu.utils.rst", "_autosummary/kazu.utils.abbreviation_detector.rst", "_autosummary/kazu.utils.build_and_test_model_packs.rst", "_autosummary/kazu.utils.caching.rst", "_autosummary/kazu.utils.grouping.rst", "_autosummary/kazu.utils.link_index.rst", "_autosummary/kazu.utils.spacy_pipeline.rst", "_autosummary/kazu.utils.stanza_pipeline.rst", "_autosummary/kazu.utils.stopwatch.rst", "_autosummary/kazu.utils.string_normalizer.rst", "_autosummary/kazu.utils.utils.rst", "_autosummary/kazu.web.rst", "_autosummary/kazu.web.jwtauth.rst", "_autosummary/kazu.web.routes.rst", "_autosummary/kazu.web.server.rst", 
"apidocs_autosummary.rst", "curating_for_explosion.rst", "datamodel.rst", "default_pipeline.rst", "index.rst", "introduction.rst", "kazu_webservice.rst", "label_studio_integration.rst", "ontology_parser.rst", "pipeline_example.rst", "quickstart.rst", "scaling_kazu.rst", "single_step_example.rst"], "titles": ["kazu", "kazu.data", "kazu.data.data", "kazu.data.pytorch", "kazu.modelling", "kazu.modelling.annotation", "kazu.modelling.annotation.acceptance_test", "kazu.modelling.annotation.label_studio", "kazu.modelling.database", "kazu.modelling.database.in_memory_db", "kazu.modelling.distillation", "kazu.modelling.distillation.data_utils", "kazu.modelling.distillation.dataprocessor", "kazu.modelling.distillation.lightning_plugins", "kazu.modelling.distillation.metrics", "kazu.modelling.distillation.models", "kazu.modelling.distillation.tiny_transformers", "kazu.modelling.distillation.train", "kazu.modelling.hf_lightning_wrappers", "kazu.modelling.language", "kazu.modelling.language.language_phenomena", "kazu.modelling.language.string_similarity_scorers", "kazu.modelling.linking", "kazu.modelling.linking.sapbert", "kazu.modelling.linking.sapbert.train", "kazu.modelling.ontology_matching", "kazu.modelling.ontology_matching.assemble_pipeline", "kazu.modelling.ontology_matching.ontology_matcher", "kazu.modelling.ontology_preprocessing", "kazu.modelling.ontology_preprocessing.base", "kazu.modelling.ontology_preprocessing.synonym_generation", "kazu.pipeline", "kazu.pipeline.pipeline", "kazu.steps", "kazu.steps.document_post_processing", "kazu.steps.document_post_processing.abbreviation_finder", "kazu.steps.joint_ner_and_linking", "kazu.steps.joint_ner_and_linking.explosion", "kazu.steps.linking", "kazu.steps.linking.dictionary", "kazu.steps.linking.mapping_step", "kazu.steps.linking.post_processing", "kazu.steps.linking.post_processing.disambiguation", "kazu.steps.linking.post_processing.disambiguation.context_scoring", 
"kazu.steps.linking.post_processing.disambiguation.strategies", "kazu.steps.linking.post_processing.mapping_strategies", "kazu.steps.linking.post_processing.mapping_strategies.strategies", "kazu.steps.linking.post_processing.strategy_runner", "kazu.steps.linking.post_processing.xref_manager", "kazu.steps.linking.sapbert", "kazu.steps.ner", "kazu.steps.ner.entity_post_processing", "kazu.steps.ner.hf_token_classification", "kazu.steps.ner.seth", "kazu.steps.ner.spacy_ner", "kazu.steps.ner.tokenized_word_processor", "kazu.steps.other", "kazu.steps.other.cleanup", "kazu.steps.other.merge_overlapping_ents", "kazu.steps.other.stanza", "kazu.steps.step", "kazu.utils", "kazu.utils.abbreviation_detector", "kazu.utils.build_and_test_model_packs", "kazu.utils.caching", "kazu.utils.grouping", "kazu.utils.link_index", "kazu.utils.spacy_pipeline", "kazu.utils.stanza_pipeline", "kazu.utils.stopwatch", "kazu.utils.string_normalizer", "kazu.utils.utils", "kazu.web", "kazu.web.jwtauth", "kazu.web.routes", "kazu.web.server", "API Reference", "Curating a knowledge base for NER and Linking", "Kazu Data Model", "At a glance: How to use the default Kazu pipeline", "Welcome to Kazu\u2019s documentation!", "Introduction", "TBA", "Visualising results in Label Studio", "The OntologyParser", "&lt;no title&gt;", "Quickstart", "TBA", "&lt;no title&gt;"], "terms": {"modul": [0, 1, 4, 5, 8, 10, 15, 16, 19, 22, 23, 24, 25, 28, 31, 33, 34, 36, 38, 41, 42, 45, 50, 56, 61, 72, 80], "class": [2, 3, 6, 7, 9, 12, 13, 15, 16, 18, 21, 24, 27, 29, 30, 32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 73, 75, 79, 84], "autonameenum": 2, "sourc": [2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 21, 24, 26, 27, 29, 30, 32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 75, 81, 84], "base": [2, 3, 6, 7, 9, 12, 13, 15, 16, 18, 21, 24, 26, 27, 30, 32, 35, 37, 39, 40, 43, 44, 
46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 73, 75, 78, 80, 84, 86], "enum": 2, "subclass": [2, 15, 16, 55], "creat": [2, 7, 15, 24, 27, 29, 32, 43, 44, 47, 48, 52, 66, 78, 83, 84, 86, 88], "an": [2, 9, 15, 18, 21, 24, 26, 27, 29, 32, 44, 46, 47, 55, 58, 59, 62, 64, 66, 70, 77, 78, 81, 83, 84, 86, 88], "where": [2, 15, 24, 46, 48, 66, 79], "valu": [2, 15, 24, 46, 60, 77, 84], "ar": [2, 15, 21, 24, 26, 27, 29, 43, 44, 46, 47, 48, 51, 52, 53, 55, 58, 59, 60, 62, 64, 66, 70, 73, 77, 78, 79, 81, 84, 86, 88], "name": [2, 7, 9, 24, 27, 29, 44, 46, 47, 48, 54, 55, 58, 59, 60, 66, 73, 81, 84], "when": [2, 13, 15, 24, 27, 29, 44, 47, 48, 49, 55, 60, 66, 69, 77, 81, 84], "us": [2, 9, 14, 15, 18, 21, 24, 26, 29, 30, 32, 35, 39, 40, 43, 44, 46, 47, 48, 49, 52, 55, 59, 60, 62, 63, 66, 70, 73, 77, 81, 83, 84, 86], "auto": 2, "taken": [2, 46, 77], "from": [2, 13, 15, 24, 26, 27, 29, 30, 32, 44, 47, 48, 52, 54, 58, 59, 62, 63, 66, 69, 70, 73, 75, 77, 78, 81, 83, 84, 85, 86, 88], "python": [2, 29, 54, 59], "doc": [2, 6, 7, 26, 27, 29, 32, 44, 51, 52, 54, 57, 60, 62, 71, 78, 83, 85, 86, 88], "licens": [2, 70, 73, 81], "under": [2, 55, 73, 81], "zero": 2, "claus": [2, 70, 73], "bsd": [2, 70, 73], "charspan": [2, 7], "object": [2, 6, 7, 9, 15, 21, 24, 27, 29, 30, 32, 43, 44, 46, 47, 51, 55, 57, 62, 63, 64, 67, 68, 69, 70, 71, 84, 86], "A": [2, 13, 15, 24, 29, 32, 35, 37, 40, 44, 46, 48, 52, 53, 54, 55, 58, 59, 60, 62, 64, 70, 71, 73, 78, 84], "concept": [2, 29, 52, 78, 81, 84], "similar": [2, 21, 24, 29, 46, 59, 84], "spaci": [2, 26, 27, 37, 54, 60, 62, 67, 77, 79], "span": [2, 6, 26, 27, 37, 47, 55, 58, 62, 78, 79, 86, 88], "except": [2, 6, 32, 60, 63, 73], "charact": [2, 55, 62, 70, 78, 86, 88], "index": [2, 15, 18, 24, 48, 52, 58, 66, 71], "rather": [2, 60, 81], "than": [2, 15, 21, 29, 46, 47, 49, 52, 60, 81], "token": [2, 3, 15, 21, 24, 26, 46, 52, 55, 59, 70, 71, 73, 83], "__init__": [2, 3, 6, 7, 13, 15, 16, 18, 21, 24, 27, 29, 30, 32, 
35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 64, 66, 67, 68, 69, 70, 71, 73], "start": [2, 17, 24, 51, 55, 58, 69, 75, 78, 81, 86, 88], "end": [2, 15, 18, 24, 47, 51, 55, 58, 63, 78, 86, 88], "paramet": [2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 17, 18, 21, 24, 26, 27, 29, 30, 32, 35, 37, 39, 40, 43, 44, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 58, 59, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 73, 75, 84], "int": [2, 6, 7, 9, 14, 15, 18, 24, 27, 29, 32, 37, 39, 43, 46, 49, 52, 55, 58, 62, 64, 66, 69, 71], "return": [2, 6, 7, 9, 11, 12, 13, 14, 15, 17, 18, 24, 26, 27, 29, 30, 32, 37, 43, 44, 46, 47, 48, 52, 55, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 71, 73, 75, 78, 84, 86, 88], "type": [2, 6, 7, 9, 11, 12, 13, 14, 15, 17, 18, 21, 24, 26, 27, 29, 30, 32, 37, 43, 44, 46, 47, 48, 52, 55, 58, 60, 62, 63, 64, 65, 66, 68, 70, 71, 73, 75, 83, 84], "none": [2, 6, 9, 13, 15, 16, 17, 18, 24, 26, 27, 29, 30, 32, 35, 39, 44, 46, 47, 49, 52, 53, 55, 62, 63, 66, 67, 70, 73, 75, 81, 83, 85], "is_completely_overlap": 2, "other": [2, 9, 29, 35, 46, 47, 48, 70, 73, 77, 79, 80, 81, 84], "true": [2, 15, 24, 27, 29, 30, 37, 46, 49, 55, 58, 71, 84], "complet": [2, 84], "overlap": [2, 7, 55, 58, 79, 81], "thi": [2, 9, 15, 16, 18, 21, 24, 26, 27, 29, 30, 32, 35, 43, 44, 46, 47, 48, 49, 52, 54, 55, 58, 60, 62, 63, 66, 70, 73, 77, 79, 81, 83, 84, 86], "is_partially_overlap": 2, "partial": [2, 58, 81], "document": [2, 6, 7, 32, 44, 46, 47, 49, 52, 53, 54, 55, 57, 60, 70, 71, 73, 75, 78, 81, 83, 84, 85, 88], "idx": [2, 9, 29, 46, 66, 84], "str": [2, 6, 7, 9, 11, 12, 13, 15, 21, 24, 26, 27, 29, 30, 32, 35, 37, 39, 43, 44, 46, 47, 48, 49, 51, 52, 53, 55, 57, 58, 60, 62, 63, 66, 67, 68, 69, 70, 71, 73, 75, 84, 86], "section": [2, 6, 7, 15, 24, 35, 52, 54, 62, 71, 75, 78, 83, 86, 88], "list": [2, 6, 7, 12, 14, 15, 21, 24, 26, 27, 29, 30, 32, 35, 39, 43, 44, 46, 47, 48, 49, 51, 52, 55, 57, 58, 60, 62, 63, 64, 66, 70, 71, 73, 77, 78, 83, 86, 88], "factori": [2, 6, 27, 
46, 55], "metadata": [2, 9, 24, 29, 37, 60, 66, 78, 83, 84], "dict": [2, 6, 7, 9, 13, 15, 24, 27, 29, 30, 32, 37, 44, 46, 47, 48, 49, 51, 52, 55, 58, 66, 70, 71, 73, 75], "ani": [2, 6, 9, 13, 15, 18, 24, 26, 29, 44, 46, 47, 53, 55, 58, 60, 62, 63, 66, 70, 71, 73, 84], "as_minified_dict": 2, "drop_unmapped_": 2, "fals": [2, 15, 16, 24, 27, 46, 47, 48, 52, 55, 66], "drop_term": 2, "bool": [2, 9, 15, 16, 18, 24, 27, 29, 30, 37, 46, 47, 48, 49, 52, 53, 55, 57, 58, 62, 63, 66, 68, 70, 71, 73], "classmethod": [2, 7, 30, 46, 60, 68, 70, 73], "create_simple_docu": [2, 78, 83, 85, 86, 88], "text": [2, 11, 15, 24, 29, 35, 43, 51, 52, 53, 55, 58, 59, 62, 66, 70, 71, 75, 77, 78, 83, 84, 85, 86], "instanc": [2, 15, 16, 18, 24, 29, 44, 46, 48, 52, 55, 62, 66, 67, 68, 77, 78, 84, 86, 88], "string": [2, 6, 9, 13, 14, 21, 24, 26, 27, 29, 30, 43, 44, 46, 49, 51, 52, 62, 66, 70, 71, 77, 84], "The": [2, 13, 15, 18, 24, 26, 27, 29, 32, 43, 44, 47, 49, 52, 55, 58, 59, 60, 62, 66, 70, 78, 80, 81], "field": [2, 24, 26, 27, 44, 54, 81], "gener": [2, 15, 24, 26, 29, 30, 39, 44, 48, 49, 52, 59, 60, 62, 63, 66, 77, 78, 84, 86], "uuid": 2, "uuid4": 2, "hex": 2, "from_named_section_text": 2, "named_sect": 2, "get_ent": [2, 78, 86, 88], "get": [2, 9, 12, 15, 24, 47, 52, 55, 77, 84, 86], "all": [2, 9, 15, 16, 21, 24, 27, 29, 30, 44, 46, 48, 58, 59, 63, 66, 70, 73, 79, 84], "entiti": [2, 6, 7, 24, 27, 29, 37, 39, 44, 46, 47, 49, 51, 52, 53, 55, 57, 58, 59, 62, 64, 70, 71, 77, 78, 79, 81, 83, 84, 86, 88], "json": [2, 26, 29, 48, 83], "kwarg": [2, 18, 21, 24, 57, 60, 68, 70], "custom": [2, 24, 63, 70, 80, 81, 83], "encod": [2, 3, 15, 24, 52, 71], "need": [2, 7, 9, 15, 16, 24, 26, 27, 29, 43, 44, 46, 48, 52, 55, 67, 77, 84, 86], "handl": [2, 15, 24, 32, 44, 47, 52, 55, 60, 70, 78, 84], "serialis": [2, 43], "issu": [2, 81, 84], "our": [2, 59, 77, 81, 83, 84], "model": [2, 43, 44, 46, 49, 52, 55, 59, 60, 62, 63, 66, 67, 79, 80, 81, 83, 84], "param": [2, 58, 66, 70], "drop": 2, "have": [2, 6, 15, 24, 
29, 44, 46, 47, 54, 58, 60, 70, 71, 77, 81], "map": [2, 6, 7, 15, 27, 29, 30, 43, 44, 46, 47, 48, 49, 52, 55, 57, 58, 71, 79, 84], "synonym": [2, 9, 26, 27, 29, 30, 44, 46, 64, 77, 84], "term": [2, 21, 26, 27, 44, 46, 47, 64, 77, 84], "addit": [2, 15, 24, 30, 48, 78, 84], "pass": [2, 15, 16, 24, 39, 46, 52, 60], "dump": [2, 29], "documentjsonutil": 2, "conversionexcept": 2, "doc_to_json_dict": 2, "option": [2, 9, 13, 15, 18, 24, 26, 27, 29, 30, 32, 35, 39, 44, 46, 47, 49, 52, 53, 55, 62, 63, 66, 70, 73, 81], "union": [2, 9, 11, 13, 15, 24, 26, 27, 29, 30, 37, 66, 68, 71, 73, 84], "float": [2, 6, 9, 14, 15, 24, 29, 32, 44, 46, 52, 55, 66], "static": [2, 6, 7, 9, 24, 29, 43, 44, 46, 47, 52, 63, 66, 70], "empti": 2, "x": [2, 15, 21, 24, 30, 44, 77, 78, 83, 84, 85, 86, 88], "minify_json_dict": 2, "doc_json_dict": 2, "in_plac": 2, "obj_to_dict_repr": 2, "obj": 2, "remove_empty_el": 2, "d": [2, 15, 24, 30, 59, 70, 84], "recurs": 2, "remov": [2, 13, 30, 62, 70], "element": [2, 7, 46, 60, 70], "dictionari": [2, 15, 24, 29, 47, 66, 77, 79, 80, 83, 84], "atomic_typ": 2, "nonetyp": 2, "listlike_typ": 2, "tupl": [2, 6, 7, 9, 15, 24, 27, 29, 37, 46, 47, 48, 52, 55, 58, 60, 62, 63, 65, 66, 70, 71, 73], "set": [2, 6, 9, 12, 15, 24, 27, 29, 30, 37, 39, 43, 44, 46, 47, 48, 55, 58, 66, 77, 81, 84, 86], "frozenset": [2, 29, 44, 46, 47], "contain": [2, 15, 24, 29, 43, 53, 55, 77, 78, 84, 86], "inform": [2, 6, 7, 47, 55, 62, 78, 79, 83, 84], "about": [2, 84], "singl": [2, 15, 24, 29, 46, 60, 70, 71, 78, 84, 86, 88], "detect": [2, 35, 44, 46, 47, 52, 53, 55, 62, 70, 78, 79, 84, 86], "within": [2, 16, 26, 30, 46, 52, 58], "most": [2, 24, 30, 44, 46, 70, 79, 84, 86], "import": [2, 29, 78, 81, 83, 84, 85, 86, 88], "match": [2, 6, 15, 21, 24, 26, 27, 29, 30, 39, 46, 47, 49, 51, 62, 71, 77, 78, 86, 88], "actual": [2, 15, 24, 46, 55, 81, 84], "syn_term_to_synonym_term": 2, "synonymtermwithmetr": [2, 46, 47, 64, 66], "candid": [2, 24, 62, 77, 79, 84], "knowledgebas": [2, 24, 29, 79, 81, 84], 
"hit": [2, 27, 44, 48, 77], "final": [2, 15, 30, 58, 70, 81, 84], "product": [2, 24, 26, 73, 81], "link": [2, 9, 29, 30, 79, 80, 81, 83, 84], "refer": [2, 24, 29, 44, 46, 48, 77, 84], "underli": [2, 29, 47, 48, 52, 84], "entity_class": [2, 6, 7, 26, 27, 29, 47, 51, 53, 70, 78, 84, 86, 88], "namespac": [2, 39, 47, 51, 55, 58, 60, 62, 78, 86, 88], "add_map": 2, "deprec": 2, "as_brat": 2, "self": [2, 15, 18, 24, 29, 49, 55, 60, 69, 84], "third": 2, "parti": 2, "biomed": [2, 24, 35, 49, 59, 62, 70, 77, 81], "nlp": [2, 26, 27, 29, 59, 62, 81, 84], "brat": 2, "format": [2, 9, 14, 48, 55], "see": [2, 6, 15, 24, 29, 30, 35, 44, 46, 47, 58, 62, 77, 78, 79, 81], "calc_starts_and_end": 2, "from_span": 2, "join_str": 2, "indic": [2, 24, 39, 49, 55, 66, 78, 86, 88], "also": [2, 15, 24, 26, 29, 30, 44, 58, 59, 63, 66, 70, 71, 81, 83, 84, 86], "requir": [2, 24, 29, 44, 46, 47, 48, 62, 63, 70, 77, 81, 86], "produc": [2, 3, 15, 24, 29, 30, 46, 47, 48, 55, 69, 77], "repres": [2, 21, 24, 29, 55, 58, 66, 70], "join": [2, 84], "togeth": 2, "encompass": 2, "onli": [2, 9, 15, 24, 27, 29, 44, 46, 47, 55, 60, 66, 69, 70, 73, 78, 84, 86, 88], "one": [2, 6, 15, 16, 24, 27, 29, 35, 46, 48, 77, 84], "defin": [2, 15, 16, 24, 27, 29, 62], "both": [2, 16, 24, 30, 46, 48, 59, 84], "thei": [2, 29, 46, 47, 55, 62, 77, 84], "If": [2, 9, 15, 24, 26, 29, 44, 46, 48, 49, 54, 58, 62, 64, 70, 77, 84], "multipl": [2, 15, 24, 29, 43, 44, 46, 55, 63, 67, 84], "becom": [2, 81], "patholog": 2, "while": [2, 15, 16, 18, 24, 59], "mai": [2, 29, 44, 46, 47, 48, 55, 63, 73, 77, 78, 81, 84, 86, 88], "technic": [2, 44], "sens": [2, 44, 77, 84], "distinct": [2, 29, 84], "semant": [2, 84], "mean": [2, 84], "For": [2, 15, 24, 48, 59, 62, 77, 78, 79, 81, 83, 84], "consid": [2, 44, 46, 47, 55, 58, 77], "case": [2, 15, 24, 27, 29, 44, 62, 70, 77, 79, 84], "we": [2, 7, 15, 21, 24, 27, 29, 44, 47, 48, 52, 53, 55, 59, 66, 67, 70, 77, 79, 81, 83, 84, 86], "want": [2, 29, 47, 48, 55, 60, 77, 81, 84], "select": [2, 44, 46, 58, 
84], "longest": [2, 24, 30, 46, 58], "annot": [2, 44, 59, 63, 80, 81, 83], "suggest": [2, 84], "some": [2, 15, 24, 26, 29, 44, 51, 55, 58, 70, 77, 79, 81, 84], "ner": [2, 7, 15, 26, 29, 30, 39, 47, 59, 78, 79, 80, 81, 83, 84], "system": [2, 47, 59, 81, 84], "1": [2, 15, 24, 29, 43, 44, 46, 66, 69, 70, 83, 84, 85], "patient": 2, "ha": [2, 15, 21, 24, 27, 29, 44, 46, 47, 49, 71, 78, 84, 86, 88], "metastat": 2, "liver": 2, "cancer": [2, 77, 86], "entity1": 2, "16": [2, 24, 49], "39": 2, "entity2": 2, "27": 2, "40": 2, "result": [2, 6, 24, 29, 32, 39, 47, 49, 52, 54, 58, 59, 60, 66, 71, 77, 80, 84], "part": [2, 47, 51, 58, 59, 70, 86], "same": [2, 32, 46, 58, 59, 84], "2": [2, 15, 18, 24, 29, 43, 46, 51, 52, 70, 71, 77, 81, 83, 84, 85], "non": [2, 27, 29, 46, 47, 51, 52, 58, 70, 81, 83], "contigu": [2, 51, 52, 58, 70, 81, 83], "lung": [2, 86], "0": [2, 6, 15, 18, 21, 24, 29, 44, 46, 51, 58, 69, 78, 81, 84, 86, 88], "4": [2, 21, 47, 70], "1521": 2, "9": [2, 21, 51, 59, 70], "21": [2, 86], "load_contiguous_ent": [2, 51, 78, 86, 88], "update_term": 2, "iter": [2, 6, 7, 9, 27, 29, 30, 37, 43, 44, 46, 47, 48, 49, 52, 57, 60, 62, 64, 65, 66, 71, 77, 78, 86, 88], "match_norm": [2, 46], "equivalentidaggregationstrategi": [2, 9, 29, 44], "enumer": [2, 62], "merged_as_non_symbol": [2, 84], "no_strategi": [2, 29], "resolved_by_similar": 2, "synonym_is_ambigu": 2, "unambigu": [2, 29, 44, 77], "equivalentidset": [2, 9, 29, 44, 46, 84], "represent": [2, 24, 44, 47, 49, 70], "kb": [2, 9, 24, 29, 71, 84], "id": [2, 7, 9, 14, 24, 29, 44, 46, 48, 52, 55, 77, 84], "s": [2, 11, 15, 24, 26, 27, 29, 30, 43, 44, 52, 58, 60, 70, 71, 77, 78, 81, 84], "thing": [2, 15, 24, 84], "ids_to_sourc": 2, "linkrank": [2, 46, 57], "ambigu": [2, 29, 44, 46, 77, 84], "highly_lik": 2, "possibl": [2, 18, 24, 46, 47, 70, 73], "probabl": [2, 70, 77], "fulli": [2, 59, 81], "disambigu": [2, 29, 46, 70, 77, 80], "default_label": [2, 24, 29, 66, 84], "parser_nam": [2, 44, 46], "mapping_strategi": [2, 80], "confid": 
[2, 46, 52, 55], "disambiguation_strategi": [2, 46], "xref_source_parser_nam": [2, 46], "preprocessed_text": 2, "get_text": 2, "access": [2, 26, 43, 55, 83], "directli": [2, 26, 32, 48, 58, 81], "method": [2, 15, 24, 27, 29, 44, 46, 47, 55, 60, 70, 84], "provid": [2, 9, 26, 29, 30, 55, 59, 62, 66, 70, 73, 83, 86], "conveni": [2, 24, 55, 78], "wrapper": [2, 18, 37, 40, 52, 64, 66], "avail": [2, 24, 27, 44, 48, 59, 66, 70, 86], "offset_map": 2, "properti": [2, 6, 7, 27, 47, 68, 73], "sentence_span": 2, "synonymterm": [2, 9, 26, 27, 29, 30, 47, 66, 78, 84], "normalis": [2, 21, 29, 46, 66, 70, 81], "ontologypars": [2, 26, 27, 29, 66, 80], "implement": [2, 3, 13, 15, 21, 24, 29, 32, 35, 52, 54, 55, 59, 60, 62, 70], "It": [2, 15, 24, 29, 43, 77, 84], "compos": [2, 15, 24, 78, 86], "uniqu": [2, 29, 44, 47, 84], "e": [2, 7, 15, 24, 29, 30, 46, 47, 52, 55, 58, 62, 64, 70, 81, 84], "g": [2, 15, 24, 29, 30, 46, 55, 58, 62, 64, 70], "breast": [2, 77], "number": [2, 15, 21, 24, 47, 52, 59, 66, 70], "associated_id_set": [2, 27, 84], "determin": [2, 29, 55, 70, 77, 84], "score_and_group_id": [2, 29, 84], "associ": [2, 9, 24, 26, 29, 44, 46, 47, 52, 55, 59, 66, 81, 84], "term_norm": [2, 21, 27, 46, 66], "is_symbol": [2, 29], "mapping_typ": [2, 29, 66, 84], "aggregated_bi": 2, "is_ambigu": 2, "allow": [2, 9, 52, 58, 81, 84], "metric": [2, 15, 24, 80], "score": [2, 6, 14, 44, 46], "As": 2, "hash": [2, 27], "function": [2, 6, 9, 11, 13, 14, 16, 17, 18, 24, 26, 32, 43, 47, 60, 62, 63, 65, 69, 71, 73, 75, 86], "care": [2, 16, 46, 66, 84], "should": [2, 15, 16, 18, 24, 26, 27, 29, 44, 46, 47, 48, 55, 58, 63, 66, 70, 79, 81, 84], "search_scor": 2, "embed_scor": 2, "bool_scor": 2, "exact_match": 2, "from_synonym_term": 2, "merge_metr": 2, "hfdataset": [3, 52], "iterabledataset": [3, 24], "simpl": [3, 18, 24, 35, 51, 54, 62, 64, 66, 77, 84], "torch": [3, 13, 15, 24, 66], "util": [3, 15, 24, 35, 39, 80, 83, 85, 86], "hf": [3, 24, 52], "input_id": [3, 16, 24], "batchencod": [3, 24, 52, 71], 
"acceptancetestfailur": 6, "aggregatedaccuracyresult": 6, "tp": [6, 27], "fp": [6, 27], "fn": 6, "fp_counter": 6, "collect": [6, 12, 15, 24], "counter": 6, "fn_counter": 6, "fp_items_to_task": 6, "fn_items_to_task": 6, "add_fn": 6, "item": [6, 15, 24, 26, 65], "task": [6, 7, 15, 59, 83], "add_fp": 6, "tasks_for_fn": 6, "tasks_for_fp": 6, "fn_info": 6, "fp_info": 6, "precis": [6, 15, 24, 47, 66, 77], "recal": [6, 30, 47, 70], "sectionscor": 6, "gold_ent": [6, 83], "test_ent": 6, "calculate_linking_match": 6, "calculate_ner_match": 6, "group_mappings_by_sourc": 6, "ent": [6, 51, 54, 58, 71], "acceptance_criteria": 6, "aggregate_linking_result": 6, "class_and_scor": 6, "aggregate_ner_result": 6, "analyse_full_pipelin": [6, 83], "pipelin": [6, 26, 27, 29, 37, 40, 54, 59, 63, 67, 68, 77, 78, 80, 81, 83, 85, 86], "check_annotation_consist": 6, "cfg": [6, 17, 24, 32, 63, 75, 83, 85, 86], "check_ent_class_consist": 6, "ent_to_task_lookup": 6, "match_str": 6, "messag": [6, 32, 69], "check": [6, 21, 27, 29, 30, 46, 62, 64, 66, 70, 78, 84, 86, 88], "differ": [6, 9, 15, 24, 29, 44, 47, 48, 52, 55, 63, 84, 86], "check_ent_mapping_consist": 6, "inconsist": [6, 81], "check_ent_match_abnorm": 6, "gold": [6, 14, 83], "standard": [6, 14, 24, 83], "look": [6, 9, 27, 44, 46, 48, 66, 70, 77, 83, 84], "bit": [6, 24], "weird": 6, "check_results_meet_threshold": 6, "threshold": [6, 29, 44, 46, 52, 55, 84], "execute_full_pipeline_acceptance_test": 6, "score_sect": 6, "scorer": [6, 44, 46, 84], "per": [6, 15, 24, 29, 44, 46, 47, 49, 58], "kazutolabelstudioconvert": [7, 83], "convert": [7, 11, 12, 24, 29, 46, 55, 71, 83], "label": [7, 14, 15, 16, 24, 27, 29, 46, 48, 52, 55, 77, 80, 84], "studio": [7, 80], "sinc": [7, 16, 29, 47, 48, 53, 55, 66, 70, 81, 86], "ls": [7, 83], "region": 7, "new": [7, 24, 29, 30, 55, 62, 78, 81, 84, 86, 88], "everi": [7, 16, 24, 29, 77], "even": [7, 44, 58, 62, 70, 73], "ones": [7, 44, 46, 47, 84], "add": [7, 9, 15, 18, 24, 29, 37, 55, 60, 66, 77, 78, 86, 88], 
"etc": [7, 29, 48, 66, 78, 83, 85], "convert_docs_to_task": [7, 83], "convert_single_doc_to_task": 7, "lstokazuconvers": 7, "convert_tasks_to_doc": 7, "create_": 7, "create_map": [7, 46], "taxonomy_hit": 7, "task_id": 7, "create_sect": 7, "labelstudioannotationview": [7, 83], "ner_label": [7, 83], "i": [7, 9, 21, 24, 29, 30, 46, 47, 52, 55, 81, 84], "valid": [7, 13, 15, 24, 29, 30, 46, 77, 84], "colour": 7, "build_label": 7, "dom": 7, "build_taxonomi": 7, "create_main_view": 7, "getdom": 7, "labelstudiomanag": [7, 83], "project_nam": [7, 83], "header": [7, 48, 73, 83], "url": [7, 53, 59, 70, 83], "http": [7, 24, 29, 48, 49, 53, 59, 62, 70, 73, 83, 84], "localhost": [7, 83], "8080": [7, 83], "create_linking_project": [7, 83], "view": [7, 83], "delete_project_if_exist": 7, "export_from_l": [7, 83], "get_all_task": 7, "get_task": 7, "import_to_l": 7, "project_id": 7, "metadatadatabas": [9, 44, 46, 48, 66], "singleton": [9, 21, 43, 59, 63, 67, 71], "ontolog": [9, 24, 26, 29, 37, 39, 48, 66, 77, 79, 81, 84, 86], "purpos": [9, 70, 73, 81], "up": [9, 15, 48, 49, 66, 81], "process": [9, 15, 18, 24, 26, 27, 29, 32, 43, 47, 49, 52, 53, 55, 58, 59, 60, 63, 64, 70, 78, 80, 81, 88], "load": [9, 13, 24, 27, 32, 43, 48, 54, 66, 67, 83], "onc": [9, 46, 83], "reduc": [9, 64], "memori": [9, 32, 43, 66], "usag": [9, 32], "add_pars": 9, "note": [9, 15, 21, 24, 29, 30, 32, 44, 46, 47, 60, 62, 70, 86], "assum": [9, 11, 29, 58, 70, 77], "global": [9, 29, 84], "call": [9, 15, 16, 18, 24, 29, 30, 44, 46, 47, 49, 53, 55, 60, 63, 64, 69, 70, 84], "overrid": [9, 18, 24, 29, 46, 84, 86], "exist": [9, 27, 53, 81], "entri": [9, 15, 24, 29], "kei": [9, 15, 24, 26, 27, 29, 52, 65, 83], "get_al": 9, "get_by_idx": 9, "queri": [9, 24, 29, 39, 44, 46, 49, 52, 66, 84], "get_by_index": 9, "loaded_pars": 9, "synonymdatabas": [9, 66], "get_syns_for_id": 9, "strategy_filt": 9, "get_syns_sharing_id": 9, "syn": [9, 29, 66, 84], "parser": [9, 26, 27, 29, 43, 44, 46, 47, 48, 66, 80], "aggreg": 9, "via": [9, 
29, 43, 47, 48, 52, 55, 66, 79, 81], "strategi": [9, 18, 24, 29, 47, 66, 80], "default": [9, 15, 18, 24, 26, 29, 46, 47, 48, 60, 66, 70, 80, 81, 84], "to_unicod": 11, "unicod": 11, "alreadi": [11, 47, 49, 54, 81], "utf": 11, "8": [11, 18, 24, 51, 70], "input": [11, 15, 24, 26, 29, 49, 60, 63, 66], "byte": 11, "nerprocessor": [12, 15], "seqtagprocessor": 12, "get_aug_exampl": 12, "data_dir": [12, 15], "transform": [12, 15, 24, 52, 55, 60, 84], "inputexampl": [12, 15], "dev": 12, "get_dev_exampl": 12, "get_test_exampl": 12, "test": [12, 13, 51, 63, 83], "get_train_exampl": 12, "train": [12, 15, 16, 18, 59, 80, 81], "data": [12, 15, 24, 26, 29, 48, 52, 63, 66, 70, 73, 77, 80, 81, 83, 85, 86, 88], "sequenc": [12, 15, 24, 52, 55, 78], "tag": [12, 15, 55, 59, 77, 79], "studentmodelcheckpointio": 13, "checkpointio": 13, "plugin": 13, "save": [13, 48, 60, 66], "student": 13, "without": [13, 24, 26, 43, 67, 70, 73, 81, 84], "teacher": 13, "model_name_or_path": [13, 24], "load_checkpoint": 13, "path": [13, 15, 24, 26, 27, 29, 30, 32, 37, 43, 48, 52, 53, 54, 63, 66, 67, 68, 71, 84, 86], "storage_opt": 13, "checkpoint": 13, "resum": 13, "ckpt": 13, "predict": [13, 14, 15, 18, 24, 52], "stage": 13, "arg": [13, 15, 18, 21, 24, 57, 60, 70], "map_loc": 13, "devic": [13, 18, 24], "specifi": [13, 15, 24, 48, 53, 58, 63, 71], "how": [13, 24, 29, 44, 55, 77, 83, 84], "remap": 13, "storag": 13, "locat": [13, 24, 43, 48, 58, 66, 86], "remove_checkpoint": 13, "file": [13, 26, 29, 32, 43, 48, 62, 66, 73], "filesystem": 13, "save_checkpoint": 13, "current": [13, 18, 24, 27, 59, 66, 70, 86], "content": [13, 48, 83], "includ": [13, 15, 24, 37, 70, 73, 77, 81, 83], "state_dict": 13, "optimizer_st": 13, "callback": [13, 18, 24], "accuraci": [14, 15, 24], "pred": [14, 29], "numeric_label_f1_scor": 14, "label_list": [14, 15], "calcul": [14, 15, 21, 24, 29, 66, 84], "f1": 14, "seqev": 14, "numer": [14, 51, 70, 81], "2d": [14, 24], "arrai": 14, "mappingid": 14, "nerdataset": 15, "dataset": [15, 
24, 29, 59], "design": [15, 47, 55, 59, 70, 84], "fly": 15, "tokenis": [15, 46, 55], "speed": [15, 81], "multi": [15, 18, 24], "cach": [15, 39, 44, 46, 48, 49, 63, 66, 80], "prevent": [15, 18, 24], "repeat": 15, "exampl": [15, 18, 24, 48, 77, 78, 83, 84, 86, 88], "label_map": 15, "max_length": [15, 24, 71], "autotoken": [15, 71], "typic": [15, 51, 84], "dataprocessor": [15, 80], "maximum": [15, 47, 52, 66], "can": [15, 21, 24, 26, 29, 32, 40, 43, 44, 46, 47, 52, 53, 55, 62, 66, 67, 70, 77, 78, 81, 83, 84, 86], "longer": [15, 52], "truncat": [15, 24], "convert_single_exampl": 15, "ex_index": 15, "tensor": [15, 24, 52, 55, 66], "sequencetaggingdistillationbas": 15, "taskspecificdistil": 15, "temperatur": 15, "warmup_step": 15, "learning_r": 15, "weight_decai": [15, 24], "batch_siz": [15, 24, 49, 52], "accumulate_grad_batch": 15, "max_epoch": 15, "student_model_path": 15, "teacher_model_path": 15, "num_work": [15, 24], "schedul": [15, 24], "specif": [15, 24, 47, 48, 73, 77, 81], "step": [15, 18, 24, 27, 30, 32, 67, 77, 78, 79, 80, 81, 84, 88], "listconfig": 15, "get_training_exampl": 15, "train_dataload": [15, 24], "more": [15, 21, 24, 29, 46, 47, 55, 59, 60, 79, 84], "pytorch": [15, 16, 24, 80], "dataload": [15, 18, 24, 52], "sampl": [15, 24], "In": [15, 24, 29, 30, 35, 44, 46, 58, 62, 77, 81, 84, 86], "pleas": [15, 24, 78, 84], "you": [15, 24, 32, 47, 60, 62, 83, 84, 86], "reload": [15, 24], "unless": [15, 24, 60], "paramref": [15, 24], "pytorch_lightn": [15, 24], "trainer": [15, 18, 21, 24, 49, 52, 66], "reload_dataloaders_every_n_epoch": [15, 24], "posit": [15, 24, 27], "integ": [15, 24, 66, 70], "follow": [15, 24, 27, 29, 47, 55, 58, 70, 73, 77, 79, 81, 83, 84], "pattern": [15, 24, 29, 51], "download": [15, 24, 48, 68], "prepare_data": [15, 24], "split": [15, 24, 29, 46, 51, 52, 55, 66, 70, 79, 84], "setup": [15, 24, 46], "howev": [15, 24, 46, 47, 62, 70, 73, 77, 81, 84], "abov": [15, 24, 29, 44, 46, 55, 70, 73], "necessari": [15, 24, 47], "distribut": [15, 24, 
70, 73], "do": [15, 24, 29, 60, 77, 84], "assign": [15, 24, 46, 53, 55, 84], "state": [15, 16, 24, 44, 46, 47, 59, 63, 81], "fit": [15, 24, 52, 70, 73], "lightn": [15, 18, 24], "correct": [15, 24, 83], "sampler": [15, 24], "arbitrari": [15, 24, 58], "hardwar": [15, 24], "There": [15, 24, 55, 81, 84], "yourself": [15, 24], "def": [15, 18, 24, 83, 84, 85], "totensor": [15, 24], "normal": [15, 24, 32, 48, 53, 60, 66, 70, 84], "5": [15, 24, 47, 51, 59, 70], "mnist": [15, 24], "root": [15, 24], "loader": [15, 24, 52], "shuffl": [15, 24], "cifar": [15, 24], "mnist_load": [15, 24], "cifar_load": [15, 24], "each": [15, 18, 24, 26, 29, 32, 44, 46, 48, 55, 58, 60, 81, 84, 86], "batch": [15, 18, 24, 49, 52, 60], "batch_mnist": [15, 24], "batch_cifar": [15, 24], "val_dataload": [15, 24], "recommend": [15, 24, 26, 29, 83], "prepar": [15, 24, 44, 46, 81], "happen": [15, 18, 24], "them": [15, 16, 24, 29, 43, 46, 66, 81], "loader_a": [15, 24], "loader_b": [15, 24], "loader_n": [15, 24], "don": [15, 24, 44, 49, 52, 53, 55], "t": [15, 18, 21, 24, 26, 30, 44, 46, 49, 52, 53, 55, 70, 77], "validation_step": [15, 24], "argument": [15, 24, 26, 63], "dataloader_idx": [15, 18, 24], "which": [15, 24, 26, 29, 44, 47, 48, 55, 60, 70, 77, 79, 84, 86], "order": [15, 24, 27, 47, 48, 58, 62, 71, 77, 86], "here": [15, 24, 44, 77, 83, 84], "sequencetaggingdistillationforfinallay": 15, "layer": 15, "soft_cross_entropi": 15, "target": [15, 48, 66, 77, 81, 84], "tensor_to_jagged_arrai": 15, "attention_mask": [15, 16], "training_step": [15, 24], "batch_idx": [15, 18, 24], "comput": [15, 16, 24, 59, 60], "loss": [15, 24, 70, 73], "progress": [15, 24], "bar": [15, 24], "logger": [15, 24], "output": [15, 18, 24, 26, 29], "your": [15, 24, 77, 80, 83], "displai": [15, 24], "optimizer_idx": [15, 24], "optim": [15, 24, 59], "present": [15, 24, 30, 66, 86], "hidden": [15, 24], "core": [15, 18, 24, 27, 84], "lightningmodul": [15, 18, 24], "truncated_bptt_step": [15, 24], "must": [15, 24, 29, 46, 52, 55, 60, 
70, 73, 84], "skip": [15, 24, 39], "next": [15, 24, 47, 55, 78, 86, 88], "automat": [15, 24, 60], "support": [15, 24, 66, 81], "gpu": [15, 18, 24, 81], "tpu": [15, 18, 24], "ipu": [15, 24], "deepspe": [15, 24], "forward": [15, 16, 18, 24, 46], "fancier": [15, 24], "like": [15, 24, 29, 44, 55, 70, 81, 84], "someth": [15, 24, 29, 84], "y": [15, 24, 44], "z": [15, 24, 30], "out": [15, 24, 44, 47, 70, 73, 81], "gan": [15, 24], "decod": [15, 24], "back": [15, 24, 29, 55, 83], "propag": [15, 24], "through": [15, 24, 60], "time": [15, 24, 43, 44, 47, 60, 67, 77], "previou": [15, 24, 55], "backprop": [15, 24], "lstm": [15, 24], "shown": [15, 24], "smooth": [15, 24], "averag": [15, 24], "over": [15, 24, 47, 53, 54, 60, 77, 78, 81], "last": [15, 24, 44, 52, 58, 81], "so": [15, 24, 27, 29, 40, 43, 44, 47, 53, 55, 62, 67, 70, 77, 78, 81, 83, 86, 88], "validation_epoch_end": [15, 24], "val_step_output": 15, "epoch": [15, 18, 24], "pseudocod": [15, 24], "val_out": [15, 24], "val_batch": [15, 24], "val_data": [15, 24], "append": [15, 24, 78, 86, 88], "didn": 15, "won": [15, 18, 24], "With": 15, "outer": 15, "inner": [15, 55], "individu": [15, 43], "dataloader_output_result": 15, "dataloader_out": 15, "dataloader_i_output": 15, "log": [15, 24, 26, 32, 69, 78], "final_metr": 15, "final_valu": 15, "oper": [15, 24, 47, 55], "might": [15, 24, 29, 48, 62, 70, 77, 81], "anyth": [15, 24, 84], "interest": [15, 24, 81], "val": [15, 24], "validation_step_end": [15, 24], "own": [15, 24, 77, 83, 84, 86], "6": [15, 24, 46, 70], "imag": [15, 24], "whatev": [15, 24], "sample_img": [15, 24], "grid": [15, 24], "torchvis": [15, 24], "make_grid": [15, 24], "experi": [15, 24, 81], "add_imag": [15, 24], "example_imag": [15, 24], "acc": [15, 24], "labels_hat": [15, 24], "argmax": [15, 24], "dim": [15, 24], "val_acc": [15, 24], "sum": [15, 24], "len": [15, 24, 46, 78, 86, 88], "log_dict": [15, 24], "val_loss": [15, 24], "quickli": [15, 24], "switch": [15, 24, 70], "between": [15, 24, 44, 46, 63, 66], 
"tell": [15, 24, 48, 81, 84], "been": [15, 24, 49, 70, 78, 81, 86, 88], "put": [15, 24, 77], "eval": [15, 24], "mode": [15, 24], "gradient": [15, 24], "disabl": [15, 24, 29], "At": [15, 24], "goe": [15, 24, 84], "enabl": [15, 24, 81], "allow_zero_length_dataloader_with_multiple_devic": 15, "prepare_data_per_nod": 15, "sequencetaggingdistillationforintermediatelay": 15, "intermedi": 15, "embed": [15, 24, 49, 66, 84], "platform": 15, "configure_optim": [15, 24], "configur": [15, 24, 26, 29, 46, 52, 55, 58, 63, 66, 79, 80, 81, 84], "learn": [15, 24, 60, 70], "rate": [15, 24], "get_optimizer_grouped_paramet": 15, "student_model": 15, "bert": [16, 24, 52, 55, 64], "tinybertforsequencetag": 16, "bertpretrainedmodel": 16, "config": [16, 26, 29, 32, 52, 63, 83, 85, 86], "num_label": 16, "fit_siz": 16, "768": 16, "initi": [16, 27], "intern": [16, 55], "share": [16, 59], "nn": 16, "scriptmodul": 16, "token_type_id": 16, "is_stud": 16, "perform": [16, 27, 44, 46, 59, 79], "overridden": [16, 29], "although": [16, 81], "recip": 16, "afterward": 16, "instead": [16, 48, 66], "former": [16, 24], "take": [16, 77, 83, 84], "run": [16, 24, 27, 32, 46, 47, 49, 52, 54, 59, 60, 63, 78, 80, 83], "regist": 16, "hook": [16, 24], "latter": [16, 24, 62], "silent": 16, "ignor": [16, 55, 60], "dictconfig": [17, 24, 32, 63, 75, 83, 85], "plautomodel": 18, "veri": [18, 62, 84], "automodel": [18, 24], "predict_step": [18, 24], "dure": [18, 24], "By": [18, 24, 70], "logic": [18, 24, 55, 58], "scale": [18, 24, 80, 81], "infer": [18, 24, 49, 60], "To": [18, 24, 59, 84], "oom": [18, 24], "error": [18, 24, 60, 63], "basepredictionwrit": [18, 24], "write": [18, 24, 26, 80], "disk": [18, 24, 26, 27, 48, 66], "databas": [18, 24, 29, 44, 46, 66, 80], "after": [18, 24, 32, 46, 47, 55], "spawn": [18, 24], "acceler": [18, 24, 81], "ddp_spawn": [18, 24], "mymodel": [18, 24], "dm": [18, 24], "plautomodelfortokenclassif": 18, "automodelfortokenclassif": [18, 52], "booleanstringsimilarityscor": [21, 46, 66], 
"stringsimilarityscor": [21, 29], "protocol": [21, 29, 57, 60, 70], "entitynounmodifierstringsimilarityscor": 21, "modifi": [21, 27, 35, 62, 66, 70], "phrase": [21, 27, 29, 70, 84], "reference_term": [21, 66], "noun_modifier_phras": 21, "entitysubtypestringsimilarityscor": 21, "mention": [21, 24, 44, 49], "norm": 21, "numeric_class_phras": 21, "re": [21, 29, 70, 81, 83], "compil": [21, 29, 70], "numbermatchstringsimilarityscor": 21, "number_find": 21, "rapidfuzzstringsimilarityscor": 21, "rapid": 21, "fuzz": 21, "count": 21, "10": [21, 24, 53, 59, 70], "char": [21, 43, 58, 66, 70, 71], "token_sort_ratio": 21, "otherwis": [21, 29, 44, 66, 70, 73, 77, 84], "wratio": 21, "sapbertstringsimilarityscor": [21, 29], "inherit": 21, "sapbert": [21, 80, 84], "plsapbertmodel": [21, 24, 49, 66], "numericmetr": 21, "namedtupl": 24, "__new__": 24, "_cl": 24, "iri": [24, 29], "alia": 24, "goldstandardexampl": 24, "gold_default_label": 24, "gold_iri": 24, "hfsapbertinferencedataset": 24, "inferenc": 24, "track": [24, 47], "vector": 24, "environ": [24, 54, 86], "hfsapbertpairwisedataset": 24, "encodings_1": 24, "encodings_2": 24, "ndarrai": [24, 44], "identifi": [24, 29, 35, 46, 47, 48, 62, 84], "origin": [24, 29, 46, 47, 49, 55, 59, 62, 70, 73, 78], "github": [24, 62, 70], "com": [24, 59, 62, 70, 73], "cambridgeltl": 24, "credit": [24, 62, 70], "inproceed": [24, 59], "liu2021self": 24, "titl": [24, 53, 59, 70, 78], "align": [24, 49, 55], "pretrain": [24, 49, 86], "author": [24, 44, 53, 59, 70, 73, 83], "liu": 24, "fangyu": 24, "shareghi": 24, "ehsan": 24, "meng": 24, "zaiqiao": 24, "basaldella": 24, "marco": 24, "collier": 24, "nigel": 24, "booktitl": [24, 59], "proceed": [24, 59], "2021": [24, 49, 59], "confer": 24, "north": 24, "american": [24, 59], "chapter": 24, "linguist": [24, 59], "human": [24, 46, 53, 59], "languag": [24, 26, 27, 53, 59, 62, 80, 81], "technolog": 24, "page": [24, 59, 86], "4228": 24, "4238": 24, "month": [24, 53, 59, 70], "jun": [24, 53], "year": [24, 53, 
59, 70, 81], "sapbert_training_param": 24, "sapbert_evaluation_manag": 24, "from_pretrain": [24, 52], "sapberttrainingparam": 24, "sapbertevaluationdatamanag": 24, "choos": [24, 58, 84], "what": [24, 47, 48, 84, 86], "But": [24, 81], "two": [24, 46, 78, 84, 86, 88], "first": [24, 52, 58, 62, 70, 78, 80, 84, 88], "second": [24, 60, 70], "lr": 24, "lr_scheduler_config": 24, "lr_schedul": 24, "whose": 24, "describ": [24, 60, 70], "frequenc": [24, 77], "its": [24, 27, 60, 73], "below": [24, 58, 79], "unit": 24, "size": [24, 39, 49, 52, 66, 84], "could": [24, 29, 47, 81, 84], "updat": [24, 55, 64], "wherea": 24, "interv": 24, "mani": [24, 29, 46, 59, 77, 81, 84], "correspond": [24, 44, 46, 62], "monitor": [24, 32], "reducelronplateau": 24, "enforc": 24, "thu": [24, 29], "stop": [24, 69, 75], "found": [24, 49, 62, 64], "warn": [24, 26, 32, 58], "strict": [24, 27, 70, 73], "learningratemonitor": 24, "keyword": 24, "condit": [24, 53, 55, 70, 71, 73], "adam": 24, "metric_to_track": 24, "often": [24, 44, 77, 84, 86], "check_val_every_n_epoch": 24, "optimizer1": 24, "optimizer2": 24, "sgd": 24, "scheduler1": 24, "scheduler2": 24, "lambdalr": 24, "made": 24, "simpli": [24, 46], "metric_v": 24, "along": [24, 32, 70], "sequenti": [24, 55, 66], "given": [24, 29, 44, 46, 48, 52, 63, 70, 71, 77, 81, 84], "optimizer_on": 24, "01": 24, "optimizer_two": 24, "cycl": 24, "continu": [24, 55], "being": [24, 29, 55, 59], "1e": 24, "3": [24, 43, 46, 51, 70, 73, 84], "gen_opt": 24, "model_gen": 24, "dis_opt": 24, "model_di": 24, "02": 24, "dis_sch": 24, "cosineann": 24, "t_max": 24, "gen_sch": 24, "exponentiallr": 24, "99": 24, "procedur": 24, "improv": [24, 70], "wasserstein": 24, "algorithm": [24, 29, 35, 46, 58, 62, 73, 81, 84], "arxiv": 24, "org": [24, 29, 49, 53, 59, 70, 84], "ab": 24, "1704": 24, "00028": 24, "n_critic": 24, "know": [24, 84], "backward": 24, "lbfg": 24, "closur": 24, "control": 24, "those": 24, "optimizer_step": 24, "evaluate_topk_acc": 24, "level": [24, 46], "k": [24, 
30, 47, 58], "nearest": 24, "neighbour": 24, "get_candidate_dict": 24, "np_candid": 24, "golden_iri": 24, "row": 24, "datafram": [24, 29, 66, 84], "get_embed": 24, "come": [24, 26, 84], "get_embeddings_for_str": 24, "pl": [24, 49, 66], "get_embeddings_from_dataload": 24, "cl": [24, 29, 60], "log_result": 24, "dataset_nam": 24, "sapbertevaluationdataset": 24, "ontology_sourc": 24, "query_sourc": 24, "dataset_idx": 24, "sapbertdatacollatorwithpad": 24, "collat": [24, 52], "pad": 24, "pad_to_multiple_of": 24, "pretrainedtokenizerbas": 24, "paddingstrategi": 24, "manag": [24, 43, 47, 48, 66, 81, 83, 86], "pars": [24, 29, 59, 73, 84], "evalu": [24, 59], "maintain": [24, 39, 49, 81], "construct": [24, 48], "debug": [24, 32, 70], "datasourc": [24, 84], "space": [24, 71], "against": [24, 47, 59, 62, 63, 83], "three": [24, 84], "column": [24, 29], "basemodel": [24, 75], "miner_margin": 24, "topk": 24, "train_batch_s": 24, "train_fil": 24, "type_of_triplet": 24, "get_embedding_dataloader_from_str": 24, "50": 24, "datacollatorwithpad": [24, 52], "callabl": [24, 51, 53, 57, 60, 65], "init_hf_collate_fn": 24, "custom_token": 26, "main": [26, 49, 62, 83, 85], "output_dir": [26, 63], "curated_list": [26, 27], "span_kei": [26, 27], "raw_hit": [26, 27], "serial": [26, 27], "ontologymatch": [26, 27], "english": [26, 59], "sentenc": [26, 27, 37, 59], "written": [26, 73], "caller": 26, "built": [26, 59, 63], "try": [26, 46], "understand": [26, 84], "noisi": [26, 30, 77, 84], "raw": [26, 46, 55, 73, 77], "tend": [26, 77], "curat": [26, 27, 30, 79, 80, 81, 84], "befor": [26, 47, 66, 70], "appli": [26, 35, 46, 47, 55, 66, 70, 77], "build": [26, 44, 48, 63, 66, 77], "attempt": [26, 47, 48, 52, 54, 66, 81, 84], "directori": [26, 29, 32, 43, 48, 63, 66, 86], "jsonl": 26, "line": [26, 29], "case_sensit": [26, 27], "attribut": [26, 29, 84], "store": [26, 27, 29, 44], "recognis": [26, 81], "curatedterm": 27, "action": 27, "term_norm_map": 27, "done": [27, 44, 55], "phrasematch": [27, 77, 79], 
"match_id_sep": 27, "parser_name_to_entity_typ": 27, "create_lowercase_phrasematcher_from_pars": 27, "matcher": [27, 37], "compon": [27, 51, 81, 83, 84, 86], "deseri": 27, "lowercas": 27, "shape": 27, "create_phrasematchers_from_curated_list": 27, "redundantli": 27, "sensit": [27, 77], "go": [27, 29, 77], "redund": [27, 29], "filter_by_context": 27, "These": [27, 59, 86], "filter": [27, 44, 46, 78, 84, 86, 88], "work": [27, 29, 44, 46, 47, 55, 60, 62, 70, 77, 81, 84], "best": [27, 46, 58, 81], "segment": [27, 59], "from_disk": 27, "exclud": [27, 29, 58], "pipe": [27, 60], "place": [27, 60], "set_context_match": 27, "set_label": 27, "span_in_fp_context": 27, "ent_class": 27, "regard": [27, 81], "span_in_fp_coocc": 27, "co": [27, 47], "occ": 27, "dic": 27, "least": [27, 29, 39, 49, 81, 84], "span_in_tp_context": 27, "span_in_tp_coocc": 27, "to_disk": 27, "nr_lowercase_rul": 27, "nr_strict_rul": 27, "ontologymatcherconfig": 27, "biologicalprocessgeneontologypars": 29, "geneontologypars": 29, "in_path": [29, 84], "string_scor": [29, 84], "synonym_merge_threshold": 29, "7": [29, 44, 47, 51, 70], "data_origin": 29, "unknown": 29, "synonym_gener": [29, 80], "excluded_id": 29, "resourc": [29, 48, 63], "owl": 29, "db": [29, 84], "tsv": 29, "throughout": [29, 32, 35], "stringnorm": [29, 47, 70, 84], "appropri": [29, 46, 47, 55, 79, 84], "behaviour": [29, 84], "overal": 29, "resolv": [29, 47, 84], "symbol": [29, 46, 47, 49, 70, 77, 84], "conflict": [29, 63], "trigger": [29, 46], "merg": [29, 58, 79, 84], "further": [29, 47, 64, 78], "detail": [29, 47, 79], "hgnc": 29, "releas": [29, 59, 81, 86], "meddra": [29, 48], "24": 29, "combinatorialsynonymgener": [29, 30], "cloontologypars": 29, "rdfgraphpars": 29, "clo": [29, 79], "www": [29, 48], "ebi": [29, 48], "ac": [29, 48], "uk": [29, 48], "ol": 29, "find_kb": [29, 84], "somehow": [29, 32], "find": [29, 55, 71, 79, 84], "cellosaurusontologypars": 29, "obo": [29, 84], "cellosauru": [29, 79], "ftp": 29, "expasi": 29, 
"parse_to_datafram": [29, 84], "long": [29, 52, 62, 84], "thin": [29, 84], "pd": [29, 84], "prefer": [29, 46, 58, 60, 62, 81], "xref": [29, 48], "exactsyn": 29, "usual": 29, "respons": [29, 46, 47], "id_to_sourc": 29, "original_syn_set": 29, "treat": 29, "seper": 29, "cell": 29, "cell_line_r": 29, "ignorecas": 29, "cellularcomponentgeneontologypars": 29, "chemblontologypars": [29, 84], "sqllite": 29, "chembl": [29, 79, 84], "pub": [29, 59], "chembldb": 29, "latest": 29, "chembl_29_sqlit": 29, "tar": 29, "gz": 29, "ensemblontologypars": 29, "genenam": 29, "hgnc_complete_set": 29, "additional_syns_path": 29, "load_go": 29, "graph": [29, 81], "populate_databas": 29, "popul": [29, 54], "instances_in_db": 29, "jsonlinesontologypars": 29, "jsonlin": 29, "implemet": 29, "json_dict_to_parser_dict": 29, "json_dict_to_parser_record": 29, "jsons_gen": 29, "yield": 29, "record": 29, "compat": 29, "expect": [29, 30, 47, 54, 60], "structur": [29, 84], "superclass": 29, "read": 29, "meddraontologypars": 29, "unzip": 29, "licenc": 29, "mdhier": 29, "asc": 29, "llt": 29, "molecularfunctiongeneontologypars": 29, "parsed_datafram": 29, "mondoontologypars": [29, 48], "is_valid_iri": 29, "abc": [29, 30, 44, 48, 66], "suitabl": [29, 48, 70], "composit": [29, 81, 84], "seed": 29, "speak": [29, 60], "therefor": [29, 55, 62, 77, 81], "cox": 29, "ensg00000095303": 29, "OR": [29, 70, 73], "ensg00000198804": 29, "noun": [29, 66, 70, 84], "far": 29, "less": [29, 49], "form": [29, 47, 62, 70, 73, 84], "mulipl": 29, "subset": [29, 44, 52, 84], "accordingli": [29, 55, 64, 79], "meddra_diseas": 29, "meddra_diagnost": 29, "drop_excluded_id": 29, "export_metadata": 29, "export_synonym_term": 29, "generate_synonym": 29, "resolut": 29, "populate_metadata_databas": 29, "populate_synonym_databas": 29, "resolve_synonym": 29, "synonym_df": 29, "duplic": [29, 44], "paracetamol": 29, "confus": [29, 55, 70, 84], "manner": 29, "decid": [29, 47, 79, 84], "wai": [29, 32, 58, 70, 73], "cluster": 29, "turn": 29, 
"depend": [29, 47, 52, 53, 59, 77, 81], "whether": [29, 37, 46, 47, 52, 55, 70, 73, 77, 84], "job": [29, 44, 84], "concret": 29, "stringscor": 29, "referenc": [29, 48, 84], "group": [29, 47, 58, 80, 84], "compar": [29, 59], "comparison": 29, "upon": [29, 55], "all_synonym_column_nam": 29, "minimum_metadata_column_nam": 29, "opentargetsdiseaseontologypars": [29, 48], "look_for_mondo": 29, "ot_id": 29, "db_xref": 29, "allowed_sourc": 29, "hp": [29, 84], "mondo": [29, 48, 79, 84], "opentargetsmoleculeontologypars": 29, "opentargetstargetontologypars": 29, "gene": [29, 46, 51, 53, 70, 77, 78, 79, 83, 86, 88], "frequent": [29, 77], "reli": 29, "answer": 29, "postcard": 29, "anyon": [29, 84], "better": [29, 59], "idea": [29, 84], "annotation_field": 29, "chemicalprob": 29, "constraint": 29, "functiondescript": 29, "hallmark": 29, "pathwai": 29, "safetyli": 29, "subcellularloc": 29, "targetclass": 29, "tractabl": 29, "uri_regex": 29, "synonym_pred": 29, "include_entity_pattern": 29, "exclude_entity_pattern": 29, "node": 29, "convert_to_rdflib_ref": 29, "_uri_regex": 29, "uberonontologypars": 29, "uberon": [29, 79], "synonymgener": 30, "greeksymbolsubstitut": 30, "all_sub": 30, "alpha": [30, 70], "\u03b1": [30, 70], "beta": [30, 70], "\u03b2": [30, 70], "\u03d0": [30, 70], "chi": [30, 70], "\u03c7": [30, 70], "delta": [30, 70], "\u03b4": [30, 70], "epsilon": [30, 70], "\u03b5": [30, 70], "eta": [30, 70], "\u03b7": [30, 70], "sigma": [30, 70], "\u03c2": [30, 70], "gamma": [30, 70], "\u03b3": [30, 70], "iota": [30, 70], "\u03b9": [30, 70], "kappa": [30, 70], "\u03ba": [30, 70], "lambda": [30, 70, 78, 84, 86, 88], "\u03bb": [30, 70], "mu": [30, 70], "\u03bc": [30, 70], "nu": [30, 70], "\u03bd": [30, 70], "omega": [30, 70], "\u03c9": [30, 70], "omicron": [30, 70], "\u03bf": [30, 70], "phi": [30, 70], "\u03c6": [30, 70], "\u03d5": [30, 70], "pi": [30, 70], "\u03c0": [30, 70], "psi": [30, 70], "\u03c8": [30, 70], "rho": [30, 70], "\u03c1": [30, 70], "\u03c3": [30, 70], "tau": 
[30, 70], "\u03c4": [30, 70], "theta": [30, 70], "\u03b8": [30, 70], "upsilon": [30, 70], "\u03c5": [30, 70], "xi": [30, 70], "\u03be": [30, 70], "zeta": [30, 70], "\u03b6": [30, 70], "b": [30, 55, 58, 70], "l": 30, "m": [30, 70], "n": [30, 43, 71], "o": [30, 53, 55], "p": [30, 59, 71], "r": 30, "u": 30, "c": [30, 58, 70, 73], "f": [30, 83, 84, 86], "greek_lett": 30, "lower_greek_lett": 30, "spell": 30, "upper_greek_lett": 30, "separatorexpans": 30, "spacy_pipelin": [30, 51, 54, 80], "spacypipelin": [30, 51, 54, 67], "spellingvariationreplac": 30, "known": [30, 55, 81], "variat": [30, 53], "input_path": 30, "stopwordremov": 30, "stopword": 30, "all_stopword": 30, "caus": [30, 70, 73], "involv": [30, 60], "stringreplac": 30, "replacement_dict": 30, "digit_aware_replacement_dict": 30, "include_greek": 30, "suffixreplac": 30, "interchang": 30, "suffic": [30, 79], "word": [30, 43, 52, 55, 62, 70, 71], "high": [30, 32, 47, 77], "later": 30, "knowledg": [30, 80, 81, 84, 86], "particular": [30, 70, 73], "doesn": [30, 77], "suffix": 30, "anaemia": 30, "ia": 30, "ic": 30, "anaem": 30, "amaemi": 30, "abstract": [30, 44, 48, 59], "faileddocsfilehandl": 32, "faileddocshandl": 32, "log_dir": 32, "fail": [32, 60, 78, 86, 88], "faileddocsloghandl": 32, "failure_handl": 32, "profile_steps_dir": 32, "skip_doc_len": 32, "200000": 32, "basic": [32, 55], "help": 32, "seri": 32, "handler": 32, "profil": 32, "tensorboard": 32, "dir": [32, 48, 63, 66], "prefilter_doc": 32, "step_tim": 32, "batch_tim": 32, "batch_metrics_dict": 32, "reset": [32, 47], "update_failed_doc": 32, "failed_doc": 32, "batch_metr": 32, "calc_doc_s": 32, "load_steps_and_log_memory_usag": 32, "increas": [32, 66], "instanti": [32, 63, 83, 85, 86], "give": [32, 62, 84], "omegaconf": [32, 83, 85], "hydra": [32, 75, 79, 81, 83, 85, 86], "abbreviationfinderstep": [35, 78, 79, 86, 88], "abbrevi": [35, 62, 79, 84], "definit": [35, 62], "schwartz": [35, 62], "hearst": [35, 62], "2003": [35, 62], "version": [35, 46, 48, 62, 
63, 66, 70, 86], "scispaci": [35, 59, 62], "finder": 35, "rule": [35, 51, 70], "expand": 35, "abbreviation_detector": [35, 80], "kazuabbreviationdetector": [35, 62], "learnt": 35, "exclude_abbrv": [35, 62], "explosionstringmatchingstep": [37, 79], "linker": 37, "include_sentence_offset": 37, "offset": [37, 52], "extract_entity_data_from_span": 37, "dictionaryentitylinkingstep": [39, 79], "link_index": [39, 80], "dictionaryindex": [39, 66], "lookup_cache_s": [39, 49, 64], "5000": [39, 49, 64], "top_n": [39, 49, 66], "20": [39, 49, 81], "skip_ner_namespac": 39, "recent": [39, 49], "lookup": [39, 48, 49], "keep": [39, 49, 84], "load_or_build_cach": [39, 48, 49, 66], "mappingstep": [40, 79], "strategyrunn": [40, 44, 47], "strategy_runn": [40, 80], "tfidfscor": [43, 44], "tfidf": [43, 44], "sklearn": 43, "feature_extract": 43, "tfidfvector": 43, "filenam": 43, "relev": [43, 47, 64, 84], "build_or_load_vector": 43, "build_vector": 43, "load_vector": 43, "create_word_and_char_ngram": 43, "ngram": [43, 66, 71], "compris": 43, "annotationleveldisambiguationstrategi": 44, "disambiguationstrategi": [44, 46], "certain": 44, "colloqui": 44, "incorrect": 44, "annotation_scor": 44, "metadata_db": [44, 46, 66], "proxi": 44, "wide": [44, 59], "studi": [44, 59], "random": 44, "vs": [44, 46], "natur": [44, 59, 70, 84], "pretti": 44, "unsophist": 44, "resort": 44, "id_set": [44, 46], "preprocess": [44, 81], "in_memory_db": [44, 46, 80], "definedelsewhereindocumentdisambiguationstrategi": 44, "chang": [44, 46, 47, 70], "execut": [44, 46, 47, 63, 77], "hopefulli": 44, "smaller": 44, "event": [44, 70, 73, 78], "complex": [44, 46, 47, 81], "mappingstrategi": [44, 46, 47], "coordin": 44, "tfidfdisambiguationstrategi": 44, "retriev": 44, "matrix": 44, "regardless": 44, "sort": [44, 46, 47, 58], "accord": [44, 46, 55, 58, 79, 86], "minimum": 44, "context_threshold": 44, "relevant_aggregation_strategi": 44, "context": [44, 58, 84], "search": [44, 46, 48, 66, 70], "build_id_set_represent": 44, 
"cacheable_build_document_represent": 44, "recalcul": 44, "hashabl": [44, 47], "thrown": 44, "awai": 44, "pragmat": 44, "make": [44, 48, 59, 66, 81, 84], "context_scor": [44, 80], "definedelsewhereindocumentmappingstrategi": 46, "filter_term": 46, "ent_match": 46, "ent_match_norm": 46, "ideal": 46, "scenario": [46, 84], "carri": 46, "found_equivalent_id": 46, "exactmatchmappingstrategi": 46, "exact": [46, 84], "mappingfactori": 46, "additional_metadata": 46, "strip_url": 46, "create_mapping_from_id_set": 46, "actualis": 46, "down": 46, "user": 46, "bear": 46, "still": [46, 47], "remain": 46, "receiv": 46, "either": [46, 60, 81, 84], "disambiguate_if_requir": 46, "filtered_term": 46, "liabl": [46, 70, 73], "strongmatchmappingstrategi": 46, "highest": [46, 55], "greater": 46, "differenti": 46, "close": [46, 55, 84], "search_threshold": 46, "80": 46, "symbolic_onli": 46, "equal": [46, 58], "minu": 46, "strongmatchwithembeddingconfirmationstringmatchingstrategi": 46, "parent": [46, 63], "predefin": [46, 84], "confirm": 46, "broadli": [46, 84], "attach": [46, 47, 78, 86, 88], "refin": 46, "neck": 46, "diseas": [46, 70, 79, 83, 84], "heck": 46, "complex_string_scor": 46, "embedding_threshold": 46, "symbolmatchmappingstrategi": 46, "whitespac": 46, "k8": 46, "mapk8": 46, "shortest": 46, "match_symbol": 46, "s1": 46, "s2": 46, "termnormissubstringmappingstrategi": 46, "exactli": 46, "testin": 46, "min_term_norm_len_to_consid": 46, "length": [46, 52], "namespacestrategyexecut": 47, "role": 47, "had": [47, 81], "successfulli": 47, "applic": [47, 48, 83], "entitykei": 47, "__call__": [47, 55, 60], "docstr": 47, "ent_class_strategi": 47, "default_strategi": 47, "stop_on_success": 47, "get_strategies_for_entity_class": 47, "clear": 47, "readi": [47, 84], "anoth": [47, 81, 84], "longest_mapping_strategy_list_s": 47, "ordin": 47, "variou": [47, 60, 84], "ground": [47, 81], "success": [47, 78, 86, 88], "henc": 47, "why": [47, 80], "crucial": 47, "higher": [47, 58, 70], "lower": 
[47, 70], "beyond": 47, "itself": [47, 60, 84], "variabl": [47, 86], "vari": 47, "sub": [47, 48], "again": 47, "divid": 47, "condition": 47, "symbolic_strategi": 47, "non_symbolic_strategi": 47, "ner_namespace_processing_ord": 47, "cross_ref_manag": 47, "low": 47, "combin": 47, "info": [47, 48, 55], "deriv": [47, 73], "crossreferencemanag": [47, 48], "xrefer": 47, "execute_hit_post_processing_strategi": 47, "ents_needing_map": 47, "namespace_strategy_execut": 47, "entity_to_entity_kei": 47, "group_entities_by_symbol": 47, "separ": [47, 71, 84], "elsewher": [47, 84], "unsort": 47, "just": [47, 59], "classify_symbol": [47, 70], "source_to_parser_metadata_lookup": 48, "cross": 48, "hold": [48, 84], "superset": 48, "held": 48, "ref": 48, "build_xref_cach": 48, "xrefdatabas": 48, "create_xref_map": 48, "cache_path": [48, 66], "force_rebuild_cach": [48, 66], "xref_db": 48, "asset": [48, 66], "wa": [48, 62, 66], "oxocrossreferencemanag": 48, "oxo": 48, "servic": [48, 70, 73], "local": [48, 83], "oxo_kazu_name_map": 48, "uri_prefix": 48, "oxo_queri": 48, "covert": 48, "uri": [48, 81], "prefix": [48, 71, 73], "correctli": [48, 55], "reconstruct": 48, "api": [48, 78, 84], "request": [48, 73], "create_oxo_dump": 48, "parse_oxo_dump": 48, "oxo_dump": 48, "accept": [48, 63, 83], "oxo_url": 48, "spot": 48, "sapbertforentitylinkingstep": 49, "wrap": [49, 59, 60, 78, 81, 86], "paper": [49, 79], "aclantholog": 49, "naacl": 49, "334": 49, "pdf": [49, 59], "embedding_model": [49, 66], "min_string_length_to_trigg": 49, "ignore_high_conf": 49, "embeddingindex": [49, 66], "signal": 49, "shorter": 49, "good": [49, 70, 73, 84, 86], "techniqu": [49, 77], "perfect": 49, "process_ent": 49, "noncontiguousentitysplitt": [51, 52], "entity_condit": 51, "splitonconjunctionpattern": 51, "analys": 51, "run_conjunction_rul": 51, "splitonnumericallistpatternwithprefix": 51, "increment": 51, "splitter": 51, "brca1": 51, "print": [51, 78, 86, 88], "oncogen": 51, "brca2": 51, "brca3": 51, 
"transformersmodelfortokenclassificationnerstep": [52, 79], "slide": 52, "window": 52, "larg": [52, 77], "post": [52, 55], "tokenizedwordprocessor": [52, 55], "stride": [52, 71], "max_sequence_length": 52, "detect_subspan": [52, 55], "entity_splitt": 52, "nest": [52, 55, 58, 81, 83], "entity_post_process": [52, 80], "frame_to_tok_word": 52, "batch_encod": 52, "number_of_fram": 52, "frame_index": 52, "section_frame_index": 52, "frame": 52, "word_id": [52, 55], "rel": [52, 81], "total": 52, "whole": [52, 55], "tokenizedword": [52, 55], "get_activ": 52, "namedtuple_values_indic": 52, "consist": [52, 70, 81], "get_dataload": 52, "overflow_to_sample_map": 52, "get_list_of_batch_encoding_frames_for_sect": 52, "section_index": 52, "id2labels_from_label_list": 52, "section_frames_to_tokenised_word": 52, "sethstep": [53, 79], "snp": 53, "extract": [53, 79, 84], "tool": [53, 83], "py4j": 53, "articl": [53, 59, 70], "seth2016": 53, "genet": 53, "variant": 53, "thoma": 53, "philipp": 53, "rockt": 53, "schel": 53, "tim": 53, "hakenberg": 53, "j": 53, "rg": 53, "lichtblau": 53, "yvonn": 53, "leser": 53, "ulf": 53, "journal": [53, 59, 70], "bioinformat": [53, 70], "2016": 53, "doi": [53, 59, 70], "1093": [53, 59, 70], "btw234": 53, "eng": 53, "medlin": 53, "pst": 53, "aheadofprint": 53, "pmid": 53, "27256315": 53, "dx": 53, "seth_fatjar_path": 53, "java_hom": 53, "emerg": 53, "fatjar": 53, "slow": 53, "pre": [53, 83], "protein": 53, "spacynerstep": 54, "model_nam": 54, "instal": [54, 80], "simplespanfind": 55, "spanfind": 55, "id2label": 55, "bio": [55, 59], "get_bio_and_class_label": 55, "process_next_word": 55, "span_continue_condit": 55, "bio_and_class_label": 55, "potenti": [55, 63], "met": [55, 70, 73], "span_breaking_char": 55, "smartspanfind": 55, "complic": [55, 86], "soft": 55, "consider": 55, "wordpiec": 55, "oov": 55, "problem": [55, 70, 81], "reconstitut": 55, "ne": 55, "inprecis": 55, "art": [55, 59, 81], "closed_span": 55, "tokwordspan": 55, "close_span": 55, 
"activ": 55, "start_span": 55, "subspan": 55, "dataclass": 55, "clazz": 55, "tok_word": 55, "token_id": 55, "token_confid": 55, "token_offset": 55, "word_char_start": 55, "word_char_end": 55, "becaus": [55, 70], "inher": 55, "obscur": 55, "sometim": 55, "mayb": 55, "classic": 55, "entir": 55, "confidence_threshold": 55, "calculate_span_offset": 55, "make_span_find": 55, "spans_to_ent": 55, "ad": [55, 60, 70, 78], "cleanupact": 57, "cleanupstep": [57, 79], "cleanup_act": 57, "dropmappingsbyconfidencemappingfilt": 57, "ranks_to_drop": 57, "dropunmappedentityfilt": 57, "from_ent_namespac": 57, "entityfiltercleanupact": 57, "filter_fn": 57, "mappingfiltercleanupact": 57, "mergeoverlappingentsstep": [58, 79], "descript": 58, "ent_class_preferred_ord": 58, "ignore_non_contigu": 58, "pick": 58, "proscrib": 58, "basi": 58, "revers": 58, "alphabet": 58, "criteria": 58, "elimin": [58, 84], "prioriti": 58, "lowest": 58, "filter_ents_across_class": 58, "group_entities_by_loc": 58, "select_preferred_ent": 58, "stanzastep": 59, "genia": [59, 81], "treebank": 59, "qi2020stanza": 59, "qi": 59, "peng": 59, "zhang": 59, "yuhao": 59, "yuhui": 59, "bolton": 59, "jason": 59, "man": 59, "christoph": 59, "58th": 59, "annual": 59, "meet": 59, "demonstr": 59, "toolkit": 59, "stanford": 59, "edu": 59, "2020": 59, "jamia": 59, "ocab090": 59, "langlotz": 59, "curti": 59, "clinic": 59, "packag": 59, "librari": 59, "medic": [59, 70], "informat": 59, "volum": 59, "28": 59, "1892": 59, "1899": 59, "06": 59, "sought": 59, "develop": [59, 81], "neural": 59, "syntact": 59, "analysi": 59, "recognit": 59, "extend": [59, 81], "mix": 59, "public": 59, "craft": 59, "well": [59, 62, 81, 83, 84], "privat": 59, "corpu": [59, 63, 81], "radiolog": 59, "report": 59, "domain": [59, 81], "network": 59, "abl": [59, 81], "speech": 59, "lemmat": 59, "popular": [59, 83], "open": [59, 81], "corenlp": 59, "biobert": 59, "win": 59, "bionlp": [59, 81], "achiev": 59, "much": 59, "retrain": 59, "par": 59, "substanti": 59, 
"outperform": 59, "computation": 59, "effici": [59, 60, 81], "introduc": 59, "offer": 59, "eas": 59, "facilit": 59, "research": 59, "publicli": 59, "onlin": 59, "issn": [59, 70], "1527": 59, "974x": 59, "eprint": 59, "academ": 59, "oup": 59, "39731803": 59, "stanza_pipelin": [59, 80], "stanzapipelin": [59, 68], "__name__": [60, 83, 85], "document_batch_step": 60, "batch_doc_cal": 60, "decor": 60, "processing_except": 60, "effort": 60, "repetit": 60, "document_iterating_step": 60, "would": [60, 84], "machin": [60, 70], "standalon": 60, "mutat": [60, 79, 86], "per_doc_cal": 60, "allenai": 62, "blob": [62, 70], "py": [62, 70, 73], "top": [62, 77], "copi": [62, 79], "forc": 62, "delet": [62, 63, 66], "wise": 62, "common": 62, "nsclc": [62, 70], "alwai": [62, 81, 84], "chosen": 62, "filter_match": 62, "matcher_output": 62, "find_abbrevi": 62, "long_form_candid": 62, "short_form_candid": 62, "short": 62, "letter": 62, "_beginning_": 62, "expans": 62, "short_form_filt": 62, "modelpackbuilderror": 63, "modelpackbuild": 63, "build_all_model_pack": 63, "maybe_base_model_pack_path": 63, "maybe_base_configuration_path": 63, "custom_model_pack_param": 63, "zip_pack": 63, "run_acceptance_test": 63, "run_consistency_check": 63, "pack": [63, 80], "zip": 63, "highlight": 63, "build_cach": 63, "clear_cached_resources_from_model_pack_dir": 63, "model_path_path": 63, "process_model_pack_path": 63, "kazu_vers": 63, "uncached_model_pack_path": 63, "build_dir": 63, "consistency_check": 63, "reset_singleton": 63, "zip_model_pack": 63, "model_pack_nam": 63, "subprocess": 63, "compress": 63, "cli": 63, "move": [63, 81, 86], "build_custom_pack_param": 63, "entitylinkinglookupcach": 64, "around": [64, 66, 78], "lfucach": 64, "expens": [64, 81], "check_lookup_cach": 64, "miss": 64, "update_terms_lookup_cach": 64, "sort_then_group": 65, "key_func": 65, "cdisttensorembeddingindex": 66, "tensorembeddingindex": 66, "cosin": 66, "distanc": 66, "boolean_scor": 66, "boolean": 66, 
"apply_boolean_scor": 66, "query_term": 66, "15": 66, "ontology_partition_s": 66, "1000": 66, "enumerate_database_chunk": 66, "chunk_siz": 66, "100000": 66, "partit": 66, "partitt": 66, "predict_ontology_embed": 66, "hungri": 66, "chuck": 66, "set_embedding_model": 66, "build_ontology_cach": 66, "cache_dir": 66, "get_index_data_path": 66, "get_metadata_path": 66, "get_synonym_data_path": 66, "overwrit": 66, "column_type_dict": 66, "matmultensorembeddingindex": 66, "matmul": 66, "synonym_db": 66, "reus": 67, "across": [67, 84], "stanza_nlp": 68, "from_stanza_kwarg": 68, "simple_init": 68, "use_gpu": 68, "call_count_interv": 69, "watch": 69, "call_count": 69, "helper": 69, "benchmark": [69, 83], "anatomystringnorm": 70, "entityclassnorm": 70, "is_symbol_lik": 70, "original_str": 70, "alzheim": 70, "normalize_noun_phras": 70, "revert": 70, "defaultstringnorm": 70, "normalize_symbol": 70, "anatomi": [70, 79, 83], "theoret": 70, "superflu": 70, "anywai": 70, "deplur": 70, "depluralis": 70, "handle_lower_case_prefix": 70, "preserv": 70, "subsequ": 70, "alphanum": 70, "upper": 70, "rest": 70, "unus": 70, "erbb2": 70, "commonli": 70, "ratio": 70, "remove_non_alphanum": 70, "alphanumer": 70, "replace_greek": 70, "replac": 70, "greek": 70, "replace_substr": 70, "rang": 70, "classifi": 70, "roman": 70, "split_on_numb": 70, "sub_greek_char_abbrevi": 70, "substitut": [70, 73], "allowed_additional_char": 70, "greek_sub": 70, "greek_subs_upp": 70, "number_split_pattern": 70, "other_sub": 70, "ii": 70, "iii": 70, "iv": 70, "ix": 70, "vi": 70, "vii": 70, "viii": 70, "11": 70, "xii": 70, "12": 70, "re_sub": 70, "si": 70, "sv": 70, "sx": 70, "re_subs_2": 70, "sa": 70, "sb": 70, "symbol_number_split": 70, "trailing_lowercase_s_split": 70, "diseasestringnorm": 70, "known_disease_short_noun": 70, "flu": 70, "hiv": 70, "sti": 70, "genestringnorm": 70, "gene_token_classifi": 70, "slightli": 70, "especi": 70, "contrari": 70, "special": [70, 73], "highli": [70, 77], "unusu": 70, 
"remove_trailing_s_if_otherwise_capitalis": 70, "frustratingli": 70, "pluralis": 70, "erbb": 70, "jsut": 70, "trail": 70, "break": 70, "genuin": [70, 77], "mdh": 70, "gasp10p": 70, "strip": 70, "gene_name_suffix": 70, "ase": 70, "gen": 70, "gon": 70, "gildautil": 70, "indralab": 70, "gilda": 70, "9e383213098144fe82103a3a5aa1bf4c14059e57": 70, "gyori2022gilda": 70, "gyori": 70, "benjamin": 70, "hoyt": 70, "charl": 70, "taplei": 70, "steppi": 70, "albert": 70, "advanc": 70, "2022": 70, "05": 70, "2635": 70, "0041": 70, "bioadv": 70, "vbac034": 70, "copyright": [70, 73], "2019": 70, "harvard": 70, "school": 70, "right": [70, 73, 77], "reserv": [70, 73], "redistribut": [70, 73], "binari": [70, 73], "modif": [70, 73], "permit": [70, 73], "code": [70, 73, 81], "retain": [70, 73], "notic": [70, 73], "disclaim": [70, 73], "reproduc": [70, 73], "materi": [70, 73], "softwar": [70, 73], "IS": [70, 73], "BY": [70, 73], "THE": [70, 73], "holder": [70, 73], "AND": [70, 73], "contributor": [70, 73], "AS": [70, 73, 84], "express": [70, 73], "impli": [70, 73], "warranti": [70, 73], "BUT": [70, 73], "NOT": [70, 73], "limit": [70, 73], "TO": [70, 73], "OF": [70, 73], "merchant": [70, 73], "FOR": [70, 73], "IN": [70, 73], "NO": [70, 73], "shall": [70, 73], "BE": [70, 73], "direct": [70, 73], "indirect": [70, 73], "incident": [70, 73], "exemplari": [70, 73], "consequenti": [70, 73], "damag": [70, 73], "procur": [70, 73], "profit": [70, 73], "busi": [70, 73], "interrupt": [70, 73], "ON": [70, 73, 84], "theori": [70, 73], "liabil": [70, 73], "contract": [70, 73], "tort": [70, 73], "neglig": [70, 73], "aris": [70, 73], "IF": [70, 73], "advis": [70, 73], "SUCH": [70, 73], "statu": 70, "flag": 70, "plural": 70, "non_plur": 70, "braf": 70, "plural_o": 70, "mosquito": 70, "plural_i": 70, "antibodi": 70, "plural_": 70, "plural_cap_": 70, "mapk": 70, "receptor": [70, 78, 86, 88], "replace_dash": 70, "rep": 70, "dash": 70, "plain": 70, "ascii": 70, "entityclassfilt": 71, "required_entity_class": 
71, "assess": 71, "as_path": 71, "create_char_ngram": 71, "create_word_ngram": 71, "documents_to_document_section_batch_encodings_map": 71, "128": 71, "512": 71, "documents_to_document_section_text_map": 71, "dochash": 71, "sectionhash": 71, "documents_to_id_section_map": 71, "filter_entities_with_ontology_map": 71, "find_document_from_ent": 71, "belong": 71, "list_map": 71, "get_cache_dir": 71, "create_if_not_exist": 71, "get_cache_path": 71, "cache_id": 71, "get_match_entity_class_hash": 71, "githubusercont": 73, "amitripshto": 73, "starlett": 73, "jwt": 73, "master": 73, "starlette_jwt": 73, "middlewar": 73, "2018": 73, "amit": 73, "ripshto": 73, "neither": 73, "nor": 73, "endors": 73, "promot": 73, "prior": 73, "permiss": [73, 81], "jwtauthenticationbackend": 73, "authenticationbackend": 73, "secret_kei": 73, "hs256": 73, "bearer": 73, "username_field": 73, "usernam": 73, "audienc": 73, "async": 73, "authent": 73, "authcredenti": 73, "baseus": 73, "get_token_from_head": 73, "jwtuser": 73, "payload": 73, "display_nam": 73, "is_authent": 73, "on_auth_error": 73, "exc": 73, "sectionedwebdocu": 75, "to_kazu_docu": 75, "simplewebdocu": 75, "deploi": 75, "app": 75, "rai": [75, 80], "serv": 75, "sophist": 77, "few": 77, "mitosi": 77, "face": 77, "emploi": 77, "wholesal": 77, "ensur": [77, 81, 86], "kazu": [77, 83, 84, 85, 86, 88], "approach": [77, 81], "ontology_match": 80, "assemble_pipelin": 80, "ontology_preprocess": [80, 84], "syn_gener": [], "noisy_spacy_pipelin": [], "corpora": 77, "illustr": [], "ll": [], "joint_ner_and_link": [79, 80], "explos": [79, 80], "annotatedphras": [], "default_factori": [], "annotatedphraseencod": [], "jsonencod": [], "isinst": [], "__dict__": [], "rais": [], "typeerror": [], "w": [], "writelin": [], "get_doc": [], "noisy_step": [], "curatable_phras": [], "to_cur": [], "phrases_to_cur": [], "matter": [], "now": [83, 84, 86], "datamodel": 78, "bodi": 78, "document_post_process": [78, 79, 80, 86, 88], "abbreviation_find": [78, 79, 80, 
86, 88], "epiderm": [78, 86, 88], "growth": [78, 86, 88], "factor": [78, 86, 88], "egfr": [78, 86, 88], "failur": [78, 86, 88], "egfr_ent": [78, 86, 88], "assert": [78, 86, 88], "ve": 79, "encount": 79, "tinybern2": [79, 81], "emnlp": 79, "tba": 79, "hf_token_classif": [79, 80], "drug": [79, 81, 83, 84], "cell_lin": [79, 83], "cell_typ": [79, 83], "go_bp": [79, 83], "go_cc": [79, 83], "go_mf": [79, 83], "seth": [79, 80], "tagger": 79, "yaml": 79, "schema": 79, "opentargets_molecul": 79, "opentargets_diseas": 79, "opentargets_target": 79, "bp_gene_ontolog": 79, "mf_gene_ontolog": 79, "cc_gene_ontolog": 79, "mapping_step": [79, 80], "merge_overlapping_": [79, 80], "desir": [79, 84], "customis": [79, 83], "cleanup": [79, 80], "introduct": 80, "summari": 80, "quickstart": 80, "visualis": 80, "webservic": 80, "acceptance_test": [80, 83], "label_studio": [80, 83], "distil": [80, 84], "data_util": 80, "lightning_plugin": 80, "tiny_transform": 80, "hf_lightning_wrapp": 80, "language_phenomena": 80, "string_similarity_scor": 80, "post_process": 80, "xref_manag": 80, "spacy_n": 80, "tokenized_word_processor": 80, "stanza": 80, "build_and_test_model_pack": 80, "stopwatch": 80, "string_norm": 80, "web": 80, "jwtauth": 80, "rout": 80, "server": 80, "lightweight": 81, "framework": 81, "astrazeneca": 81, "collabor": 81, "dmi": 81, "lab": 81, "korea": 81, "univers": 81, "whilst": 81, "rework": 81, "integr": [81, 83, 84], "plethora": 81, "wider": 81, "commun": 81, "great": [81, 84], "focu": 81, "literatur": 81, "nativ": [81, 84], "phenomena": 81, "particularli": 81, "challeng": 81, "texta": 81, "recogn": 81, "conceptu": 81, "fashion": 81, "uml": 81, "date": 81, "avoid": 81, "deal": 81, "autom": 81, "clean": 81, "intent": 81, "reprocess": 81, "million": 81, "easili": 81, "sever": 81, "fast": 81, "princip": 81, "extens": 81, "isol": 81, "brought": 81, "littl": 81, "scalabl": 81, "easi": [81, 84], "cc": 81, "live": 81, "discoveri": 81, "project": 81, "bikg": 81, "apach": 81, 
"commerci": 81, "histor": 81, "workflow": 83, "config_path": [83, 85], "conf": [83, 85, 86], "config_nam": [83, 85, 86], "run_doc": [83, 85], "__main__": [83, 85], "red": 83, "darkblu": 83, "orang": 83, "yellow": 83, "green": 83, "speci": 83, "purpl": 83, "pink": 83, "grei": 83, "blue": 83, "brown": 83, "label_studio_url_and_port": 83, "finish": 83, "export": 83, "angu": 84, "robert": 84, "lot": [84, 86], "vocabulari": 84, "uncontextualis": 84, "overload": 84, "ofd": 84, "has_exact_synonym": 84, "osteofibr": 84, "dysplasia": 84, "orofaciodigit": 84, "syndrom": 84, "let": 84, "similarli": 84, "xloa": 84, "ocular": 84, "albin": 84, "recess": 84, "wors": 84, "tga": 84, "dextro": 84, "loop": 84, "transposit": 84, "arteri": 84, "mondo_0019443": 84, "0031348": 84, "sai": 84, "everyth": 84, "familiar": 84, "uncommon": 84, "reconcil": 84, "perspect": 84, "difficult": 84, "arbitrarili": 84, "seem": 84, "nevertheless": 84, "enter": 84, "relat": 84, "piec": 84, "doe": 84, "ought": 84, "enough": [77, 84], "seborrh": 84, "eczema": 84, "purl": 84, "obolibrari": 84, "hp_0001051": 84, "mondo_0006608": 84, "equivalentidsetaggregationstrategi": 84, "4532": 84, "70": 84, "decis": 84, "7426": 84, "despit": 84, "perhap": 84, "fortun": 84, "quit": 84, "sqlite3": 84, "panda": 84, "sqlite": 84, "lend": 84, "tabular": 84, "conn": 84, "connect": 84, "chembl_id": 84, "pref_nam": 84, "syn_typ": 84, "molecule_dictionari": 84, "md": 84, "molecule_synonym": 84, "ms": 84, "molregno": 84, "df": 84, "read_sql": 84, "too": [77, 84], "big": 84, "dropna": 84, "drop_dupl": 84, "inplac": 84, "secondli": 84, "mondo_xxxxx": 84, "hp_xxxxxxx": 84, "full": 84, "trivial": 84, "breviti": 84, "string_1": 84, "string_2": 84, "75": 84, "That": 84, "explor": [77, 84, 86], "capabl": 84, "repo": 86, "newer": 86, "pip": 86, "major": 86, "amount": [], "easiest": [], "intro": [], "kazu_config_dir": [], "kazu_model_pack": 86, "manual": [], "oc": [], "env": 86, "os": 86, "initialize_config_dir": 86, "config_dir": 86, 
"clontologypars": 29, "envelop": 77, "wouldn": 77, "appear": 77, "On": 77, "hand": 77, "cornifi": 77, "assembl": 77, "whenev": 77, "safe": 77, "100": 77, "000": 77, "labour": 77, "intens": 77, "pragmatic": 77, "insensit": 77, "invalid": 77, "practic": 77, "todo": 77, "circumst": 86, "point": 86, "tweak": 86, "pathlib": 86, "kept": 86, "cdir": 86, "joinpath": 86, "implic": 86, "inspect": 86, "were": 86}, "objects": {"": [[0, 0, 0, "-", "kazu"]], "kazu": [[1, 0, 0, "-", "data"], [4, 0, 0, "-", "modelling"], [31, 0, 0, "-", "pipeline"], [33, 0, 0, "-", "steps"], [61, 0, 0, "-", "utils"], [72, 0, 0, "-", "web"]], "kazu.data": [[2, 0, 0, "-", "data"], [3, 0, 0, "-", "pytorch"]], "kazu.data.data": [[2, 1, 1, "", "AutoNameEnum"], [2, 1, 1, "", "CharSpan"], [2, 1, 1, "", "Document"], [2, 1, 1, "", "DocumentJsonUtils"], [2, 1, 1, "", "Entity"], [2, 1, 1, "", "EquivalentIdAggregationStrategy"], [2, 1, 1, "", "EquivalentIdSet"], [2, 1, 1, "", "LinkRanks"], [2, 1, 1, "", "Mapping"], [2, 1, 1, "", "Section"], [2, 1, 1, "", "SynonymTerm"], [2, 1, 1, "", "SynonymTermWithMetrics"]], "kazu.data.data.CharSpan": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "end"], [2, 2, 1, "", "is_completely_overlapped"], [2, 2, 1, "", "is_partially_overlapped"], [2, 3, 1, "", "start"]], "kazu.data.data.Document": [[2, 2, 1, "", "__init__"], [2, 2, 1, "", "as_minified_dict"], [2, 2, 1, "", "create_simple_document"], [2, 2, 1, "", "from_named_section_texts"], [2, 2, 1, "", "get_entities"], [2, 3, 1, "", "idx"], [2, 2, 1, "", "json"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "sections"]], "kazu.data.data.DocumentJsonUtils": [[2, 4, 1, "", "ConversionException"], [2, 3, 1, "", "atomic_types"], [2, 2, 1, "", "doc_to_json_dict"], [2, 2, 1, "", "empty"], [2, 3, 1, "", "listlike_types"], [2, 2, 1, "", "minify_json_dict"], [2, 2, 1, "", "obj_to_dict_repr"], [2, 2, 1, "", "remove_empty_elements"]], "kazu.data.data.Entity": [[2, 2, 1, "", "__init__"], [2, 2, 1, "", "add_mapping"], [2, 2, 1, "", "as_brat"], [2, 
2, 1, "", "calc_starts_and_ends"], [2, 3, 1, "", "end"], [2, 3, 1, "", "entity_class"], [2, 2, 1, "", "from_spans"], [2, 2, 1, "", "is_completely_overlapped"], [2, 2, 1, "", "is_partially_overlapped"], [2, 2, 1, "", "load_contiguous_entity"], [2, 3, 1, "", "mappings"], [2, 3, 1, "", "match"], [2, 3, 1, "", "match_norm"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "namespace"], [2, 3, 1, "", "spans"], [2, 3, 1, "", "start"], [2, 3, 1, "", "syn_term_to_synonym_terms"], [2, 2, 1, "", "update_terms"]], "kazu.data.data.EquivalentIdAggregationStrategy": [[2, 3, 1, "", "CUSTOM"], [2, 3, 1, "", "MERGED_AS_NON_SYMBOLIC"], [2, 3, 1, "", "NO_STRATEGY"], [2, 3, 1, "", "RESOLVED_BY_SIMILARITY"], [2, 3, 1, "", "SYNONYM_IS_AMBIGUOUS"], [2, 3, 1, "", "UNAMBIGUOUS"]], "kazu.data.data.EquivalentIdSet": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "ids"], [2, 3, 1, "", "ids_to_source"]], "kazu.data.data.LinkRanks": [[2, 3, 1, "", "AMBIGUOUS"], [2, 3, 1, "", "HIGHLY_LIKELY"], [2, 3, 1, "", "POSSIBLE"], [2, 3, 1, "", "PROBABLE"]], "kazu.data.data.Mapping": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "confidence"], [2, 3, 1, "", "default_label"], [2, 3, 1, "", "disambiguation_strategy"], [2, 3, 1, "", "idx"], [2, 3, 1, "", "mapping_strategy"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "parser_name"], [2, 3, 1, "", "source"], [2, 3, 1, "", "xref_source_parser_name"]], "kazu.data.data.Section": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "entities"], [2, 2, 1, "", "get_text"], [2, 3, 1, "", "metadata"], [2, 3, 1, "", "name"], [2, 3, 1, "", "offset_map"], [2, 3, 1, "", "preprocessed_text"], [2, 5, 1, "", "sentence_spans"], [2, 3, 1, "", "text"]], "kazu.data.data.SynonymTerm": [[2, 2, 1, "", "__init__"], [2, 3, 1, "", "aggregated_by"], [2, 3, 1, "", "associated_id_sets"], [2, 5, 1, "", "is_ambiguous"], [2, 3, 1, "", "is_symbolic"], [2, 3, 1, "", "mapping_types"], [2, 3, 1, "", "parser_name"], [2, 3, 1, "", "term_norm"], [2, 3, 1, "", "terms"]], "kazu.data.data.SynonymTermWithMetrics": [[2, 2, 1, "", 
"__init__"], [2, 3, 1, "", "bool_score"], [2, 3, 1, "", "embed_score"], [2, 3, 1, "", "exact_match"], [2, 2, 1, "", "from_synonym_term"], [2, 2, 1, "", "merge_metrics"], [2, 3, 1, "", "search_score"]], "kazu.data.pytorch": [[3, 1, 1, "", "HFDataset"]], "kazu.data.pytorch.HFDataset": [[3, 2, 1, "", "__init__"]], "kazu.modelling": [[5, 0, 0, "-", "annotation"], [8, 0, 0, "-", "database"], [10, 0, 0, "-", "distillation"], [18, 0, 0, "-", "hf_lightning_wrappers"], [19, 0, 0, "-", "language"], [22, 0, 0, "-", "linking"], [25, 0, 0, "-", "ontology_matching"], [28, 0, 0, "-", "ontology_preprocessing"]], "kazu.modelling.annotation": [[6, 0, 0, "-", "acceptance_test"], [7, 0, 0, "-", "label_studio"]], "kazu.modelling.annotation.acceptance_test": [[6, 4, 1, "", "AcceptanceTestFailure"], [6, 1, 1, "", "AggregatedAccuracyResult"], [6, 1, 1, "", "SectionScorer"], [6, 6, 1, "", "acceptance_criteria"], [6, 6, 1, "", "aggregate_linking_results"], [6, 6, 1, "", "aggregate_ner_results"], [6, 6, 1, "", "analyse_full_pipeline"], [6, 6, 1, "", "check_annotation_consistency"], [6, 6, 1, "", "check_ent_class_consistency"], [6, 6, 1, "", "check_ent_mapping_consistency"], [6, 6, 1, "", "check_ent_match_abnormalities"], [6, 6, 1, "", "check_results_meet_threshold"], [6, 6, 1, "", "execute_full_pipeline_acceptance_test"], [6, 6, 1, "", "score_sections"]], "kazu.modelling.annotation.acceptance_test.AggregatedAccuracyResult": [[6, 2, 1, "", "__init__"], [6, 2, 1, "", "add_fn"], [6, 2, 1, "", "add_fp"], [6, 3, 1, "", "fn"], [6, 3, 1, "", "fn_counter"], [6, 5, 1, "", "fn_info"], [6, 3, 1, "", "fn_items_to_tasks"], [6, 3, 1, "", "fp"], [6, 3, 1, "", "fp_counter"], [6, 5, 1, "", "fp_info"], [6, 3, 1, "", "fp_items_to_tasks"], [6, 5, 1, "", "precision"], [6, 5, 1, "", "recall"], [6, 2, 1, "", "tasks_for_fn"], [6, 2, 1, "", "tasks_for_fp"], [6, 3, 1, "", "tp"]], "kazu.modelling.annotation.acceptance_test.SectionScorer": [[6, 2, 1, "", "__init__"], [6, 2, 1, "", "calculate_linking_matches"], [6, 2, 
1, "", "calculate_ner_matches"], [6, 2, 1, "", "group_mappings_by_source"]], "kazu.modelling.annotation.label_studio": [[7, 1, 1, "", "KazuToLabelStudioConverter"], [7, 1, 1, "", "LSToKazuConversion"], [7, 1, 1, "", "LabelStudioAnnotationView"], [7, 1, 1, "", "LabelStudioManager"]], "kazu.modelling.annotation.label_studio.KazuToLabelStudioConverter": [[7, 2, 1, "", "convert_docs_to_tasks"], [7, 2, 1, "", "convert_single_doc_to_tasks"]], "kazu.modelling.annotation.label_studio.LSToKazuConversion": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "convert_tasks_to_docs"], [7, 2, 1, "", "create_ents"], [7, 2, 1, "", "create_mappings"], [7, 2, 1, "", "create_section"]], "kazu.modelling.annotation.label_studio.LabelStudioAnnotationView": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "build_labels"], [7, 2, 1, "", "build_taxonomy"], [7, 2, 1, "", "create_main_view"], [7, 2, 1, "", "getDOM"]], "kazu.modelling.annotation.label_studio.LabelStudioManager": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "create_linking_project"], [7, 2, 1, "", "delete_project_if_exists"], [7, 2, 1, "", "export_from_ls"], [7, 2, 1, "", "get_all_tasks"], [7, 2, 1, "", "get_tasks"], [7, 2, 1, "", "import_to_ls"], [7, 5, 1, "", "project_id"]], "kazu.modelling.database": [[9, 0, 0, "-", "in_memory_db"]], "kazu.modelling.database.in_memory_db": [[9, 1, 1, "", "MetadataDatabase"], [9, 1, 1, "", "SynonymDatabase"]], "kazu.modelling.database.in_memory_db.MetadataDatabase": [[9, 2, 1, "", "add_parser"], [9, 2, 1, "", "get_all"], [9, 2, 1, "", "get_by_idx"], [9, 2, 1, "", "get_by_index"], [9, 3, 1, "", "loaded_parsers"]], "kazu.modelling.database.in_memory_db.SynonymDatabase": [[9, 2, 1, "", "add"], [9, 2, 1, "", "get"], [9, 2, 1, "", "get_all"], [9, 2, 1, "", "get_syns_for_id"], [9, 2, 1, "", "get_syns_sharing_id"], [9, 3, 1, "", "loaded_parsers"]], "kazu.modelling.distillation": [[11, 0, 0, "-", "data_utils"], [12, 0, 0, "-", "dataprocessor"], [13, 0, 0, "-", "lightning_plugins"], [14, 0, 0, "-", "metrics"], [15, 0, 
0, "-", "models"], [16, 0, 0, "-", "tiny_transformers"], [17, 0, 0, "-", "train"]], "kazu.modelling.distillation.data_utils": [[11, 6, 1, "", "to_unicode"]], "kazu.modelling.distillation.dataprocessor": [[12, 1, 1, "", "NerProcessor"], [12, 1, 1, "", "SeqTagProcessor"]], "kazu.modelling.distillation.dataprocessor.NerProcessor": [[12, 2, 1, "", "get_aug_examples"], [12, 2, 1, "", "get_dev_examples"], [12, 2, 1, "", "get_test_examples"], [12, 2, 1, "", "get_train_examples"]], "kazu.modelling.distillation.dataprocessor.SeqTagProcessor": [[12, 2, 1, "", "get_aug_examples"], [12, 2, 1, "", "get_dev_examples"], [12, 2, 1, "", "get_train_examples"]], "kazu.modelling.distillation.lightning_plugins": [[13, 1, 1, "", "StudentModelCheckpointIO"]], "kazu.modelling.distillation.lightning_plugins.StudentModelCheckpointIO": [[13, 2, 1, "", "__init__"], [13, 2, 1, "", "load_checkpoint"], [13, 2, 1, "", "remove_checkpoint"], [13, 2, 1, "", "save_checkpoint"]], "kazu.modelling.distillation.metrics": [[14, 6, 1, "", "accuracy"], [14, 6, 1, "", "numeric_label_f1_score"]], "kazu.modelling.distillation.models": [[15, 1, 1, "", "NerDataset"], [15, 1, 1, "", "SequenceTaggingDistillationBase"], [15, 1, 1, "", "SequenceTaggingDistillationForFinalLayer"], [15, 1, 1, "", "SequenceTaggingDistillationForIntermediateLayer"], [15, 1, 1, "", "TaskSpecificDistillation"]], "kazu.modelling.distillation.models.NerDataset": [[15, 2, 1, "", "__init__"], [15, 2, 1, "", "convert_single_example"]], "kazu.modelling.distillation.models.SequenceTaggingDistillationBase": [[15, 2, 1, "", "__init__"], [15, 2, 1, "", "get_training_examples"], [15, 2, 1, "", "train_dataloader"], [15, 3, 1, "", "training"], [15, 2, 1, "", "val_dataloader"]], "kazu.modelling.distillation.models.SequenceTaggingDistillationForFinalLayer": [[15, 2, 1, "", "__init__"], [15, 3, 1, "", "allow_zero_length_dataloader_with_multiple_devices"], [15, 3, 1, "", "precision"], [15, 3, 1, "", "prepare_data_per_node"], [15, 2, 1, "", 
"soft_cross_entropy"], [15, 2, 1, "", "tensor_to_jagged_array"], [15, 3, 1, "", "training"], [15, 2, 1, "", "training_step"], [15, 2, 1, "", "validation_epoch_end"], [15, 2, 1, "", "validation_step"]], "kazu.modelling.distillation.models.SequenceTaggingDistillationForIntermediateLayer": [[15, 2, 1, "", "__init__"], [15, 3, 1, "", "allow_zero_length_dataloader_with_multiple_devices"], [15, 3, 1, "", "precision"], [15, 3, 1, "", "prepare_data_per_node"], [15, 3, 1, "", "training"], [15, 2, 1, "", "training_step"], [15, 2, 1, "", "validation_epoch_end"], [15, 2, 1, "", "validation_step"]], "kazu.modelling.distillation.models.TaskSpecificDistillation": [[15, 2, 1, "", "__init__"], [15, 2, 1, "", "configure_optimizers"], [15, 2, 1, "", "get_optimizer_grouped_parameters"], [15, 2, 1, "", "get_training_examples"], [15, 3, 1, "", "training"]], "kazu.modelling.distillation.tiny_transformers": [[16, 1, 1, "", "TinyBertForSequenceTagging"]], "kazu.modelling.distillation.tiny_transformers.TinyBertForSequenceTagging": [[16, 2, 1, "", "__init__"], [16, 2, 1, "", "forward"], [16, 3, 1, "", "training"]], "kazu.modelling.distillation.train": [[17, 6, 1, "", "start"]], "kazu.modelling.hf_lightning_wrappers": [[18, 1, 1, "", "PLAutoModel"], [18, 1, 1, "", "PLAutoModelForTokenClassification"]], "kazu.modelling.hf_lightning_wrappers.PLAutoModel": [[18, 2, 1, "", "__init__"], [18, 2, 1, "", "predict_step"], [18, 3, 1, "", "training"]], "kazu.modelling.hf_lightning_wrappers.PLAutoModelForTokenClassification": [[18, 2, 1, "", "__init__"], [18, 2, 1, "", "predict_step"], [18, 3, 1, "", "training"]], "kazu.modelling.language": [[20, 0, 0, "-", "language_phenomena"], [21, 0, 0, "-", "string_similarity_scorers"]], "kazu.modelling.language.string_similarity_scorers": [[21, 1, 1, "", "BooleanStringSimilarityScorer"], [21, 1, 1, "", "EntityNounModifierStringSimilarityScorer"], [21, 1, 1, "", "EntitySubtypeStringSimilarityScorer"], [21, 1, 1, "", "NumberMatchStringSimilarityScorer"], [21, 1, 1, 
"", "RapidFuzzStringSimilarityScorer"], [21, 1, 1, "", "SapbertStringSimilarityScorer"], [21, 1, 1, "", "StringSimilarityScorer"]], "kazu.modelling.language.string_similarity_scorers.BooleanStringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.language.string_similarity_scorers.EntityNounModifierStringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.language.string_similarity_scorers.EntitySubtypeStringSimilarityScorer": [[21, 3, 1, "", "numeric_class_phrases"]], "kazu.modelling.language.string_similarity_scorers.NumberMatchStringSimilarityScorer": [[21, 3, 1, "", "number_finder"]], "kazu.modelling.language.string_similarity_scorers.SapbertStringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.language.string_similarity_scorers.StringSimilarityScorer": [[21, 2, 1, "", "__init__"]], "kazu.modelling.linking": [[23, 0, 0, "-", "sapbert"]], "kazu.modelling.linking.sapbert": [[24, 0, 0, "-", "train"]], "kazu.modelling.linking.sapbert.train": [[24, 1, 1, "", "Candidate"], [24, 1, 1, "", "GoldStandardExample"], [24, 1, 1, "", "HFSapbertInferenceDataset"], [24, 1, 1, "", "HFSapbertPairwiseDataset"], [24, 1, 1, "", "PLSapbertModel"], [24, 1, 1, "", "SapbertDataCollatorWithPadding"], [24, 1, 1, "", "SapbertEvaluationDataManager"], [24, 1, 1, "", "SapbertEvaluationDataset"], [24, 1, 1, "", "SapbertTrainingParams"], [24, 6, 1, "", "get_embedding_dataloader_from_strings"], [24, 6, 1, "", "init_hf_collate_fn"], [24, 6, 1, "", "start"]], "kazu.modelling.linking.sapbert.train.Candidate": [[24, 2, 1, "", "__new__"], [24, 3, 1, "", "correct"], [24, 3, 1, "", "default_label"], [24, 3, 1, "", "iri"]], "kazu.modelling.linking.sapbert.train.GoldStandardExample": [[24, 2, 1, "", "__new__"], [24, 3, 1, "", "candidates"], [24, 3, 1, "", "gold_default_label"], [24, 3, 1, "", "gold_iri"]], "kazu.modelling.linking.sapbert.train.HFSapbertInferenceDataset": [[24, 2, 1, "", "__init__"]], 
"kazu.modelling.linking.sapbert.train.HFSapbertPairwiseDataset": [[24, 2, 1, "", "__init__"]], "kazu.modelling.linking.sapbert.train.PLSapbertModel": [[24, 2, 1, "", "__init__"], [24, 2, 1, "", "configure_optimizers"], [24, 2, 1, "", "evaluate_topk_acc"], [24, 2, 1, "", "forward"], [24, 2, 1, "", "get_candidate_dict"], [24, 2, 1, "", "get_embeddings"], [24, 2, 1, "", "get_embeddings_for_strings"], [24, 2, 1, "", "get_embeddings_from_dataloader"], [24, 2, 1, "", "log_results"], [24, 2, 1, "", "predict_step"], [24, 2, 1, "", "train_dataloader"], [24, 3, 1, "", "training"], [24, 2, 1, "", "training_step"], [24, 2, 1, "", "val_dataloader"], [24, 2, 1, "", "validation_epoch_end"], [24, 2, 1, "", "validation_step"]], "kazu.modelling.linking.sapbert.train.SapbertDataCollatorWithPadding": [[24, 2, 1, "", "__init__"], [24, 3, 1, "", "max_length"], [24, 3, 1, "", "pad_to_multiple_of"], [24, 3, 1, "", "padding"], [24, 3, 1, "", "tokenizer"]], "kazu.modelling.linking.sapbert.train.SapbertEvaluationDataManager": [[24, 2, 1, "", "__init__"]], "kazu.modelling.linking.sapbert.train.SapbertEvaluationDataset": [[24, 2, 1, "", "__new__"], [24, 3, 1, "", "ontology_source"], [24, 3, 1, "", "query_source"]], "kazu.modelling.linking.sapbert.train.SapbertTrainingParams": [[24, 3, 1, "", "lr"], [24, 3, 1, "", "miner_margin"], [24, 3, 1, "", "num_workers"], [24, 3, 1, "", "topk"], [24, 3, 1, "", "train_batch_size"], [24, 3, 1, "", "train_file"], [24, 3, 1, "", "type_of_triplets"], [24, 3, 1, "", "weight_decay"]], "kazu.modelling.ontology_matching": [[26, 0, 0, "-", "assemble_pipeline"], [27, 0, 0, "-", "ontology_matcher"]], "kazu.modelling.ontology_matching.assemble_pipeline": [[26, 6, 1, "", "custom_tokenizer"], [26, 6, 1, "", "main"]], "kazu.modelling.ontology_matching.ontology_matcher": [[27, 1, 1, "", "CuratedTerm"], [27, 1, 1, "", "OntologyMatcher"], [27, 1, 1, "", "OntologyMatcherConfig"]], "kazu.modelling.ontology_matching.ontology_matcher.CuratedTerm": [[27, 2, 1, "", "__init__"], 
[27, 3, 1, "", "action"], [27, 3, 1, "", "case_sensitive"], [27, 3, 1, "", "entity_class"], [27, 3, 1, "", "term"], [27, 3, 1, "", "term_norm_mapping"]], "kazu.modelling.ontology_matching.ontology_matcher.OntologyMatcher": [[27, 2, 1, "", "__init__"], [27, 2, 1, "", "create_lowercase_phrasematcher_from_parsers"], [27, 2, 1, "", "create_phrasematchers_from_curated_list"], [27, 2, 1, "", "filter_by_contexts"], [27, 2, 1, "", "from_disk"], [27, 5, 1, "", "labels"], [27, 5, 1, "", "match_id_sep"], [27, 5, 1, "", "nr_lowercase_rules"], [27, 5, 1, "", "nr_strict_rules"], [27, 5, 1, "", "parser_name_to_entity_type"], [27, 2, 1, "", "set_context_matchers"], [27, 2, 1, "", "set_labels"], [27, 2, 1, "", "span_in_FP_context"], [27, 2, 1, "", "span_in_FP_coocc"], [27, 2, 1, "", "span_in_TP_context"], [27, 2, 1, "", "span_in_TP_coocc"], [27, 5, 1, "", "span_key"], [27, 2, 1, "", "to_disk"]], "kazu.modelling.ontology_matching.ontology_matcher.OntologyMatcherConfig": [[27, 2, 1, "", "__init__"], [27, 3, 1, "", "labels"], [27, 3, 1, "", "match_id_sep"], [27, 3, 1, "", "parser_name_to_entity_type"], [27, 3, 1, "", "span_key"]], "kazu.modelling.ontology_preprocessing": [[29, 0, 0, "-", "base"], [30, 0, 0, "-", "synonym_generation"]], "kazu.modelling.ontology_preprocessing.base": [[29, 1, 1, "", "BiologicalProcessGeneOntologyParser"], [29, 1, 1, "", "CLOOntologyParser"], [29, 1, 1, "", "CLOntologyParser"], [29, 1, 1, "", "CellosaurusOntologyParser"], [29, 1, 1, "", "CellularComponentGeneOntologyParser"], [29, 1, 1, "", "ChemblOntologyParser"], [29, 1, 1, "", "EnsemblOntologyParser"], [29, 1, 1, "", "GeneOntologyParser"], [29, 1, 1, "", "JsonLinesOntologyParser"], [29, 1, 1, "", "MeddraOntologyParser"], [29, 1, 1, "", "MolecularFunctionGeneOntologyParser"], [29, 1, 1, "", "MondoOntologyParser"], [29, 1, 1, "", "OntologyParser"], [29, 1, 1, "", "OpenTargetsDiseaseOntologyParser"], [29, 1, 1, "", "OpenTargetsMoleculeOntologyParser"], [29, 1, 1, "", "OpenTargetsTargetOntologyParser"], 
[29, 1, 1, "", "RDFGraphParser"], [29, 1, 1, "", "UberonOntologyParser"]], "kazu.modelling.ontology_preprocessing.base.BiologicalProcessGeneOntologyParser": [[29, 2, 1, "", "__init__"]], "kazu.modelling.ontology_preprocessing.base.CLOOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"]], "kazu.modelling.ontology_preprocessing.base.CLOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"]], "kazu.modelling.ontology_preprocessing.base.CellosaurusOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "cell_line_re"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "score_and_group_ids"]], "kazu.modelling.ontology_preprocessing.base.CellularComponentGeneOntologyParser": [[29, 2, 1, "", "__init__"]], "kazu.modelling.ontology_preprocessing.base.ChemblOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.EnsemblOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.GeneOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 3, 1, "", "instances"], [29, 3, 1, "", "instances_in_dbs"], [29, 2, 1, "", "load_go"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "populate_databases"]], "kazu.modelling.ontology_preprocessing.base.JsonLinesOntologyParser": [[29, 2, 1, "", "json_dict_to_parser_records"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "read"]], "kazu.modelling.ontology_preprocessing.base.MeddraOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.MolecularFunctionGeneOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.MondoOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", 
"find_kb"], [29, 2, 1, "", "is_valid_iri"], [29, 2, 1, "", "parse_to_dataframe"]], "kazu.modelling.ontology_preprocessing.base.OntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "all_synonym_column_names"], [29, 2, 1, "", "drop_excluded_ids"], [29, 2, 1, "", "export_metadata"], [29, 2, 1, "", "export_synonym_terms"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "generate_synonyms"], [29, 3, 1, "", "minimum_metadata_column_names"], [29, 2, 1, "", "parse_to_dataframe"], [29, 2, 1, "", "populate_databases"], [29, 2, 1, "", "populate_metadata_database"], [29, 2, 1, "", "populate_synonym_database"], [29, 2, 1, "", "resolve_synonyms"], [29, 2, 1, "", "score_and_group_ids"]], "kazu.modelling.ontology_preprocessing.base.OpenTargetsDiseaseOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "allowed_sources"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "json_dict_to_parser_records"], [29, 2, 1, "", "look_for_mondo"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.OpenTargetsMoleculeOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "json_dict_to_parser_records"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.OpenTargetsTargetOntologyParser": [[29, 2, 1, "", "__init__"], [29, 3, 1, "", "annotation_fields"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "json_dict_to_parser_records"], [29, 3, 1, "", "parsed_dataframe"], [29, 2, 1, "", "score_and_group_ids"]], "kazu.modelling.ontology_preprocessing.base.RDFGraphParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "convert_to_rdflib_ref"], [29, 2, 1, "", "find_kb"], [29, 2, 1, "", "is_valid_iri"], [29, 2, 1, "", "parse_to_dataframe"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.base.UberonOntologyParser": [[29, 2, 1, "", "__init__"], [29, 2, 1, "", "find_kb"], [29, 3, 1, "", "parsed_dataframe"]], "kazu.modelling.ontology_preprocessing.synonym_generation": [[30, 1, 1, "", 
"CombinatorialSynonymGenerator"], [30, 1, 1, "", "GreekSymbolSubstitution"], [30, 1, 1, "", "SeparatorExpansion"], [30, 1, 1, "", "SpellingVariationReplacement"], [30, 1, 1, "", "StopWordRemover"], [30, 1, 1, "", "StringReplacement"], [30, 1, 1, "", "SuffixReplacement"], [30, 1, 1, "", "SynonymGenerator"]], "kazu.modelling.ontology_preprocessing.synonym_generation.CombinatorialSynonymGenerator": [[30, 2, 1, "", "__init__"]], "kazu.modelling.ontology_preprocessing.synonym_generation.GreekSymbolSubstitution": [[30, 3, 1, "", "ALL_SUBS"], [30, 3, 1, "", "greek_letter"], [30, 3, 1, "", "lower_greek_letter"], [30, 3, 1, "", "spelling"], [30, 3, 1, "", "upper_greek_letter"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SeparatorExpansion": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SpellingVariationReplacement": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.StopWordRemover": [[30, 3, 1, "", "all_stopwords"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.StringReplacement": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SuffixReplacement": [[30, 2, 1, "", "__init__"], [30, 2, 1, "", "call"]], "kazu.modelling.ontology_preprocessing.synonym_generation.SynonymGenerator": [[30, 2, 1, "", "call"]], "kazu.pipeline": [[32, 0, 0, "-", "pipeline"]], "kazu.pipeline.pipeline": [[32, 1, 1, "", "FailedDocsFileHandler"], [32, 1, 1, "", "FailedDocsHandler"], [32, 1, 1, "", "FailedDocsLogHandler"], [32, 1, 1, "", "Pipeline"], [32, 6, 1, "", "batch_metrics"], [32, 6, 1, "", "calc_doc_size"], [32, 6, 1, "", "load_steps_and_log_memory_usage"]], "kazu.pipeline.pipeline.FailedDocsFileHandler": [[32, 2, 1, "", "__init__"]], "kazu.pipeline.pipeline.Pipeline": [[32, 2, 1, "", "__init__"], [32, 2, 1, "", "prefilter_docs"], [32, 2, 1, "", "profile"], 
[32, 2, 1, "", "reset"], [32, 2, 1, "", "update_failed_docs"]], "kazu.steps": [[34, 0, 0, "-", "document_post_processing"], [36, 0, 0, "-", "joint_ner_and_linking"], [38, 0, 0, "-", "linking"], [50, 0, 0, "-", "ner"], [56, 0, 0, "-", "other"], [60, 0, 0, "-", "step"]], "kazu.steps.document_post_processing": [[35, 0, 0, "-", "abbreviation_finder"]], "kazu.steps.document_post_processing.abbreviation_finder": [[35, 1, 1, "", "AbbreviationFinderStep"]], "kazu.steps.document_post_processing.abbreviation_finder.AbbreviationFinderStep": [[35, 2, 1, "", "__init__"]], "kazu.steps.joint_ner_and_linking": [[37, 0, 0, "-", "explosion"]], "kazu.steps.joint_ner_and_linking.explosion": [[37, 1, 1, "", "ExplosionStringMatchingStep"]], "kazu.steps.joint_ner_and_linking.explosion.ExplosionStringMatchingStep": [[37, 2, 1, "", "__init__"], [37, 2, 1, "", "extract_entity_data_from_spans"]], "kazu.steps.linking": [[39, 0, 0, "-", "dictionary"], [40, 0, 0, "-", "mapping_step"], [41, 0, 0, "-", "post_processing"], [49, 0, 0, "-", "sapbert"]], "kazu.steps.linking.dictionary": [[39, 1, 1, "", "DictionaryEntityLinkingStep"]], "kazu.steps.linking.dictionary.DictionaryEntityLinkingStep": [[39, 2, 1, "", "__init__"], [39, 2, 1, "", "load_or_build_caches"]], "kazu.steps.linking.mapping_step": [[40, 1, 1, "", "MappingStep"]], "kazu.steps.linking.mapping_step.MappingStep": [[40, 2, 1, "", "__init__"]], "kazu.steps.linking.post_processing": [[42, 0, 0, "-", "disambiguation"], [45, 0, 0, "-", "mapping_strategies"], [47, 0, 0, "-", "strategy_runner"], [48, 0, 0, "-", "xref_manager"]], "kazu.steps.linking.post_processing.disambiguation": [[43, 0, 0, "-", "context_scoring"], [44, 0, 0, "-", "strategies"]], "kazu.steps.linking.post_processing.disambiguation.context_scoring": [[43, 1, 1, "", "TfIdfScorer"], [43, 6, 1, "", "create_word_and_char_ngrams"]], "kazu.steps.linking.post_processing.disambiguation.context_scoring.TfIdfScorer": [[43, 2, 1, "", "__init__"], [43, 2, 1, "", 
"build_or_load_vectorizers"], [43, 2, 1, "", "build_vectorizers"], [43, 2, 1, "", "load_vectorizer"], [43, 2, 1, "", "load_vectorizers"]], "kazu.steps.linking.post_processing.disambiguation.strategies": [[44, 1, 1, "", "AnnotationLevelDisambiguationStrategy"], [44, 1, 1, "", "DefinedElsewhereInDocumentDisambiguationStrategy"], [44, 1, 1, "", "DisambiguationStrategy"], [44, 1, 1, "", "TfIdfDisambiguationStrategy"]], "kazu.steps.linking.post_processing.disambiguation.strategies.AnnotationLevelDisambiguationStrategy": [[44, 2, 1, "", "disambiguate"], [44, 3, 1, "", "metadata_db"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.disambiguation.strategies.DefinedElsewhereInDocumentDisambiguationStrategy": [[44, 2, 1, "", "__init__"], [44, 2, 1, "", "disambiguate"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.disambiguation.strategies.DisambiguationStrategy": [[44, 2, 1, "", "disambiguate"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.disambiguation.strategies.TfIdfDisambiguationStrategy": [[44, 3, 1, "", "CONTEXT_SCORE"], [44, 2, 1, "", "__init__"], [44, 2, 1, "", "build_id_set_representation"], [44, 2, 1, "", "cacheable_build_document_representation"], [44, 2, 1, "", "disambiguate"], [44, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.mapping_strategies": [[46, 0, 0, "-", "strategies"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies": [[46, 1, 1, "", "DefinedElsewhereInDocumentMappingStrategy"], [46, 1, 1, "", "ExactMatchMappingStrategy"], [46, 1, 1, "", "MappingFactory"], [46, 1, 1, "", "MappingStrategy"], [46, 1, 1, "", "StrongMatchMappingStrategy"], [46, 1, 1, "", "StrongMatchWithEmbeddingConfirmationStringMatchingStrategy"], [46, 1, 1, "", "SymbolMatchMappingStrategy"], [46, 1, 1, "", "TermNormIsSubStringMappingStrategy"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.DefinedElsewhereInDocumentMappingStrategy": [[46, 2, 1, "", "filter_terms"], [46, 3, 1, "", 
"found_equivalent_ids"], [46, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.ExactMatchMappingStrategy": [[46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.MappingFactory": [[46, 2, 1, "", "create_mapping"], [46, 2, 1, "", "create_mapping_from_id_set"], [46, 2, 1, "", "create_mapping_from_id_sets"], [46, 3, 1, "", "metadata_db"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.MappingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "disambiguate_if_required"], [46, 2, 1, "", "filter_terms"], [46, 2, 1, "", "prepare"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.StrongMatchMappingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.StrongMatchWithEmbeddingConfirmationStringMatchingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.SymbolMatchMappingStrategy": [[46, 2, 1, "", "filter_terms"], [46, 2, 1, "", "match_symbols"]], "kazu.steps.linking.post_processing.mapping_strategies.strategies.TermNormIsSubStringMappingStrategy": [[46, 2, 1, "", "__init__"], [46, 2, 1, "", "filter_terms"]], "kazu.steps.linking.post_processing.strategy_runner": [[47, 1, 1, "", "NamespaceStrategyExecution"], [47, 1, 1, "", "StrategyRunner"], [47, 6, 1, "", "entity_to_entity_key"]], "kazu.steps.linking.post_processing.strategy_runner.NamespaceStrategyExecution": [[47, 2, 1, "", "__init__"], [47, 2, 1, "", "get_strategies_for_entity_class"], [47, 5, 1, "", "longest_mapping_strategy_list_size"], [47, 2, 1, "", "reset"]], "kazu.steps.linking.post_processing.strategy_runner.StrategyRunner": [[47, 2, 1, "", "__init__"], [47, 2, 1, "", "execute_hit_post_processing_strategies"], [47, 2, 1, "", "group_entities_by_symbolism"]], "kazu.steps.linking.post_processing.xref_manager": [[48, 1, 1, 
"", "CrossReferenceManager"], [48, 1, 1, "", "OxoCrossReferenceManager"]], "kazu.steps.linking.post_processing.xref_manager.CrossReferenceManager": [[48, 2, 1, "", "__init__"], [48, 2, 1, "", "build_xref_cache"], [48, 2, 1, "", "create_xref_mappings"], [48, 2, 1, "", "load"], [48, 2, 1, "", "load_or_build_cache"], [48, 2, 1, "", "save"], [48, 3, 1, "", "xref_db"]], "kazu.steps.linking.post_processing.xref_manager.OxoCrossReferenceManager": [[48, 2, 1, "", "__init__"], [48, 2, 1, "", "build_xref_cache"], [48, 2, 1, "", "create_oxo_dump"], [48, 3, 1, "", "headers"], [48, 3, 1, "", "oxo_url"], [48, 2, 1, "", "parse_oxo_dump"], [48, 3, 1, "", "xref_db"]], "kazu.steps.linking.sapbert": [[49, 1, 1, "", "SapBertForEntityLinkingStep"]], "kazu.steps.linking.sapbert.SapBertForEntityLinkingStep": [[49, 2, 1, "", "__init__"], [49, 2, 1, "", "load_or_build_caches"], [49, 2, 1, "", "process_entities"]], "kazu.steps.ner": [[51, 0, 0, "-", "entity_post_processing"], [52, 0, 0, "-", "hf_token_classification"], [53, 0, 0, "-", "seth"], [54, 0, 0, "-", "spacy_ner"], [55, 0, 0, "-", "tokenized_word_processor"]], "kazu.steps.ner.entity_post_processing": [[51, 1, 1, "", "NonContiguousEntitySplitter"], [51, 1, 1, "", "SplitOnConjunctionPattern"], [51, 1, 1, "", "SplitOnNumericalListPatternWithPrefix"]], "kazu.steps.ner.entity_post_processing.NonContiguousEntitySplitter": [[51, 2, 1, "", "__init__"]], "kazu.steps.ner.entity_post_processing.SplitOnConjunctionPattern": [[51, 2, 1, "", "__init__"], [51, 2, 1, "", "run_conjunction_rules"]], "kazu.steps.ner.entity_post_processing.SplitOnNumericalListPatternWithPrefix": [[51, 2, 1, "", "__init__"]], "kazu.steps.ner.hf_token_classification": [[52, 1, 1, "", "TransformersModelForTokenClassificationNerStep"]], "kazu.steps.ner.hf_token_classification.TransformersModelForTokenClassificationNerStep": [[52, 2, 1, "", "__init__"], [52, 2, 1, "", "frame_to_tok_word"], [52, 2, 1, "", "get_activations"], [52, 2, 1, "", "get_dataloader"], [52, 2, 1, "", 
"get_list_of_batch_encoding_frames_for_section"], [52, 2, 1, "", "id2labels_from_label_list"], [52, 2, 1, "", "section_frames_to_tokenised_words"]], "kazu.steps.ner.seth": [[53, 1, 1, "", "SethStep"]], "kazu.steps.ner.seth.SethStep": [[53, 2, 1, "", "__init__"]], "kazu.steps.ner.spacy_ner": [[54, 1, 1, "", "SpacyNerStep"]], "kazu.steps.ner.spacy_ner.SpacyNerStep": [[54, 2, 1, "", "__init__"]], "kazu.steps.ner.tokenized_word_processor": [[55, 1, 1, "", "SimpleSpanFinder"], [55, 1, 1, "", "SmartSpanFinder"], [55, 1, 1, "", "SpanFinder"], [55, 1, 1, "", "TokWordSpan"], [55, 1, 1, "", "TokenizedWord"], [55, 1, 1, "", "TokenizedWordProcessor"]], "kazu.steps.ner.tokenized_word_processor.SimpleSpanFinder": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "get_bio_and_class_labels"], [55, 2, 1, "", "process_next_word"], [55, 2, 1, "", "span_continue_condition"]], "kazu.steps.ner.tokenized_word_processor.SmartSpanFinder": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "get_bio_and_class_labels"], [55, 2, 1, "", "process_next_word"], [55, 2, 1, "", "span_continue_condition"]], "kazu.steps.ner.tokenized_word_processor.SpanFinder": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "close_spans"], [55, 2, 1, "", "get_bio_and_class_labels"], [55, 2, 1, "", "process_next_word"], [55, 2, 1, "", "span_continue_condition"], [55, 2, 1, "", "start_span"]], "kazu.steps.ner.tokenized_word_processor.TokWordSpan": [[55, 2, 1, "", "__init__"], [55, 3, 1, "", "clazz"], [55, 3, 1, "", "subspan"], [55, 3, 1, "", "tok_words"]], "kazu.steps.ner.tokenized_word_processor.TokenizedWord": [[55, 2, 1, "", "__init__"], [55, 3, 1, "", "token_confidences"], [55, 3, 1, "", "token_ids"], [55, 3, 1, "", "token_offsets"], [55, 3, 1, "", "tokens"], [55, 3, 1, "", "word_char_end"], [55, 3, 1, "", "word_char_start"], [55, 3, 1, "", "word_id"]], "kazu.steps.ner.tokenized_word_processor.TokenizedWordProcessor": [[55, 2, 1, "", "__init__"], [55, 2, 1, "", "calculate_span_offsets"], [55, 2, 1, "", "make_span_finder"], [55, 2, 
1, "", "spans_to_entities"]], "kazu.steps.other": [[57, 0, 0, "-", "cleanup"], [58, 0, 0, "-", "merge_overlapping_ents"], [59, 0, 0, "-", "stanza"]], "kazu.steps.other.cleanup": [[57, 1, 1, "", "CleanupAction"], [57, 1, 1, "", "CleanupStep"], [57, 1, 1, "", "DropMappingsByConfidenceMappingFilter"], [57, 1, 1, "", "DropUnmappedEntityFilter"], [57, 1, 1, "", "EntityFilterCleanupAction"], [57, 1, 1, "", "MappingFilterCleanupAction"]], "kazu.steps.other.cleanup.CleanupAction": [[57, 2, 1, "", "__init__"], [57, 2, 1, "", "cleanup"]], "kazu.steps.other.cleanup.CleanupStep": [[57, 2, 1, "", "__init__"]], "kazu.steps.other.cleanup.DropMappingsByConfidenceMappingFilter": [[57, 2, 1, "", "__init__"]], "kazu.steps.other.cleanup.DropUnmappedEntityFilter": [[57, 2, 1, "", "__init__"]], "kazu.steps.other.cleanup.EntityFilterCleanupAction": [[57, 2, 1, "", "__init__"], [57, 2, 1, "", "cleanup"]], "kazu.steps.other.cleanup.MappingFilterCleanupAction": [[57, 2, 1, "", "__init__"], [57, 2, 1, "", "cleanup"]], "kazu.steps.other.merge_overlapping_ents": [[58, 1, 1, "", "MergeOverlappingEntsStep"]], "kazu.steps.other.merge_overlapping_ents.MergeOverlappingEntsStep": [[58, 2, 1, "", "__init__"], [58, 2, 1, "", "filter_ents_across_class"], [58, 2, 1, "", "group_entities_by_location"], [58, 2, 1, "", "select_preferred_entity"]], "kazu.steps.other.stanza": [[59, 1, 1, "", "StanzaStep"]], "kazu.steps.other.stanza.StanzaStep": [[59, 2, 1, "", "__init__"]], "kazu.steps.step": [[60, 1, 1, "", "Step"], [60, 6, 1, "", "document_batch_step"], [60, 6, 1, "", "document_iterating_step"]], "kazu.steps.step.Step": [[60, 2, 1, "", "__init__"], [60, 2, 1, "", "namespace"]], "kazu.utils": [[62, 0, 0, "-", "abbreviation_detector"], [63, 0, 0, "-", "build_and_test_model_packs"], [64, 0, 0, "-", "caching"], [65, 0, 0, "-", "grouping"], [66, 0, 0, "-", "link_index"], [67, 0, 0, "-", "spacy_pipeline"], [68, 0, 0, "-", "stanza_pipeline"], [69, 0, 0, "-", "stopwatch"], [70, 0, 0, "-", "string_normalizer"], [71, 
0, 0, "-", "utils"]], "kazu.utils.abbreviation_detector": [[62, 1, 1, "", "KazuAbbreviationDetector"], [62, 6, 1, "", "filter_matches"], [62, 6, 1, "", "find_abbreviation"], [62, 6, 1, "", "short_form_filter"]], "kazu.utils.abbreviation_detector.KazuAbbreviationDetector": [[62, 2, 1, "", "__init__"]], "kazu.utils.build_and_test_model_packs": [[63, 4, 1, "", "ModelPackBuildError"], [63, 1, 1, "", "ModelPackBuilder"], [63, 6, 1, "", "build_custom_pack_params"]], "kazu.utils.build_and_test_model_packs.ModelPackBuilder": [[63, 2, 1, "", "build_all_model_packs"], [63, 2, 1, "", "build_caches"], [63, 2, 1, "", "clear_cached_resources_from_model_pack_dir"], [63, 2, 1, "", "process_model_pack_path"], [63, 2, 1, "", "reset_singletons"], [63, 2, 1, "", "zip_model_pack"]], "kazu.utils.caching": [[64, 1, 1, "", "EntityLinkingLookupCache"]], "kazu.utils.caching.EntityLinkingLookupCache": [[64, 2, 1, "", "__init__"], [64, 2, 1, "", "check_lookup_cache"], [64, 2, 1, "", "update_terms_lookup_cache"]], "kazu.utils.grouping": [[65, 6, 1, "", "sort_then_group"]], "kazu.utils.link_index": [[66, 1, 1, "", "CDistTensorEmbeddingIndex"], [66, 1, 1, "", "DictionaryIndex"], [66, 1, 1, "", "EmbeddingIndex"], [66, 1, 1, "", "Index"], [66, 1, 1, "", "MatMulTensorEmbeddingIndex"], [66, 1, 1, "", "TensorEmbeddingIndex"]], "kazu.utils.link_index.DictionaryIndex": [[66, 2, 1, "", "__init__"], [66, 2, 1, "", "apply_boolean_scorers"], [66, 2, 1, "", "search"]], "kazu.utils.link_index.EmbeddingIndex": [[66, 2, 1, "", "__init__"], [66, 2, 1, "", "enumerate_database_chunks"], [66, 2, 1, "", "predict_ontology_embeddings"], [66, 2, 1, "", "search"], [66, 2, 1, "", "set_embedding_model"]], "kazu.utils.link_index.Index": [[66, 2, 1, "", "__init__"], [66, 2, 1, "", "add"], [66, 2, 1, "", "build_ontology_cache"], [66, 3, 1, "", "column_type_dict"], [66, 2, 1, "", "get_index_data_path"], [66, 2, 1, "", "get_metadata_path"], [66, 2, 1, "", "get_synonym_data_path"], [66, 2, 1, "", "load"], [66, 2, 1, "", 
"load_or_build_cache"], [66, 2, 1, "", "save"]], "kazu.utils.link_index.MatMulTensorEmbeddingIndex": [[66, 3, 1, "", "metadata_db"], [66, 3, 1, "", "synonym_db"]], "kazu.utils.link_index.TensorEmbeddingIndex": [[66, 2, 1, "", "__init__"], [66, 3, 1, "", "metadata_db"], [66, 3, 1, "", "synonym_db"]], "kazu.utils.spacy_pipeline": [[67, 1, 1, "", "SpacyPipeline"]], "kazu.utils.spacy_pipeline.SpacyPipeline": [[67, 2, 1, "", "__init__"], [67, 3, 1, "", "instance"]], "kazu.utils.stanza_pipeline": [[68, 1, 1, "", "StanzaPipeline"]], "kazu.utils.stanza_pipeline.StanzaPipeline": [[68, 2, 1, "", "__init__"], [68, 2, 1, "", "from_stanza_kwargs"], [68, 5, 1, "", "instance"], [68, 2, 1, "", "simple_init"]], "kazu.utils.stopwatch": [[69, 1, 1, "", "Stopwatch"]], "kazu.utils.stopwatch.Stopwatch": [[69, 2, 1, "", "__init__"], [69, 2, 1, "", "message"], [69, 2, 1, "", "start"]], "kazu.utils.string_normalizer": [[70, 1, 1, "", "AnatomyStringNormalizer"], [70, 1, 1, "", "DefaultStringNormalizer"], [70, 1, 1, "", "DiseaseStringNormalizer"], [70, 1, 1, "", "EntityClassNormalizer"], [70, 1, 1, "", "GeneStringNormalizer"], [70, 1, 1, "", "GildaUtils"], [70, 1, 1, "", "StringNormalizer"]], "kazu.utils.string_normalizer.AnatomyStringNormalizer": [[70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"]], "kazu.utils.string_normalizer.DefaultStringNormalizer": [[70, 3, 1, "", "allowed_additional_chars"], [70, 2, 1, "", "depluralize"], [70, 3, 1, "", "greek_subs"], [70, 3, 1, "", "greek_subs_upper"], [70, 2, 1, "", "handle_lower_case_prefixes"], [70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"], [70, 3, 1, "", "number_split_pattern"], [70, 3, 1, "", "other_subs"], [70, 3, 1, "", "re_subs"], [70, 3, 1, "", "re_subs_2"], [70, 2, 1, "", "remove_non_alphanum"], [70, 2, 1, "", "replace_greek"], [70, 2, 1, "", "replace_substrings"], [70, 2, 1, "", "split_on_numbers"], [70, 2, 1, "", 
"sub_greek_char_abbreviations"], [70, 3, 1, "", "symbol_number_split"], [70, 3, 1, "", "trailing_lowercase_s_split"]], "kazu.utils.string_normalizer.DiseaseStringNormalizer": [[70, 2, 1, "", "is_symbol_like"], [70, 3, 1, "", "known_disease_short_nouns"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"]], "kazu.utils.string_normalizer.EntityClassNormalizer": [[70, 2, 1, "", "__init__"], [70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"]], "kazu.utils.string_normalizer.GeneStringNormalizer": [[70, 3, 1, "", "gene_name_suffixes"], [70, 2, 1, "", "gene_token_classifier"], [70, 2, 1, "", "is_symbol_like"], [70, 2, 1, "", "normalize_noun_phrase"], [70, 2, 1, "", "normalize_symbol"], [70, 2, 1, "", "remove_trailing_s_if_otherwise_capitalised"]], "kazu.utils.string_normalizer.GildaUtils": [[70, 3, 1, "", "dashes"], [70, 2, 1, "", "depluralize"], [70, 2, 1, "", "replace_dashes"]], "kazu.utils.string_normalizer.StringNormalizer": [[70, 2, 1, "", "classify_symbolic"], [70, 2, 1, "", "normalize"], [70, 3, 1, "", "normalizers"]], "kazu.utils.utils": [[71, 1, 1, "", "EntityClassFilter"], [71, 1, 1, "", "Singleton"], [71, 6, 1, "", "as_path"], [71, 6, 1, "", "create_char_ngrams"], [71, 6, 1, "", "create_word_ngrams"], [71, 6, 1, "", "documents_to_document_section_batch_encodings_map"], [71, 6, 1, "", "documents_to_document_section_text_map"], [71, 6, 1, "", "documents_to_id_section_map"], [71, 6, 1, "", "filter_entities_with_ontology_mappings"], [71, 6, 1, "", "find_document_from_entity"], [71, 6, 1, "", "get_cache_dir"], [71, 6, 1, "", "get_cache_path"], [71, 6, 1, "", "get_match_entity_class_hash"]], "kazu.utils.utils.EntityClassFilter": [[71, 2, 1, "", "__init__"]], "kazu.web": [[73, 0, 0, "-", "jwtauth"], [74, 0, 0, "-", "routes"], [75, 0, 0, "-", "server"]], "kazu.web.jwtauth": [[73, 1, 1, "", "JWTAuthenticationBackend"], [73, 1, 1, "", "JWTUser"], [73, 6, 1, "", "on_auth_error"]], 
"kazu.web.jwtauth.JWTAuthenticationBackend": [[73, 2, 1, "", "__init__"], [73, 2, 1, "", "authenticate"], [73, 2, 1, "", "get_token_from_header"]], "kazu.web.jwtauth.JWTUser": [[73, 2, 1, "", "__init__"], [73, 5, 1, "", "display_name"], [73, 5, 1, "", "is_authenticated"]], "kazu.web.server": [[75, 1, 1, "", "SectionedWebDocument"], [75, 1, 1, "", "SimpleWebDocument"], [75, 6, 1, "", "start"], [75, 6, 1, "", "stop"]], "kazu.web.server.SectionedWebDocument": [[75, 3, 1, "", "sections"], [75, 2, 1, "", "to_kazu_document"]], "kazu.web.server.SimpleWebDocument": [[75, 3, 1, "", "text"], [75, 2, 1, "", "to_kazu_document"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:attribute", "4": "py:exception", "5": "py:property", "6": "py:function"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "attribute", "Python attribute"], "4": ["py", "exception", "Python exception"], "5": ["py", "property", "Python property"], "6": ["py", "function", "Python function"]}, "titleterms": {"kazu": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 78, 79, 80, 81], "data": [1, 2, 3, 78], "pytorch": 3, "model": [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 78, 86], "annot": [5, 6, 7], "acceptance_test": 6, "label_studio": 7, "databas": [8, 9], "in_memory_db": 9, "distil": [10, 11, 12, 13, 14, 15, 16, 17], "data_util": 11, "dataprocessor": 12, "lightning_plugin": 13, "metric": 14, "tiny_transform": 16, "train": [17, 24], "hf_lightning_wrapp": 18, "languag": [19, 20, 21], "language_phenomena": 20, "string_similarity_scor": 21, "link": [22, 23, 24, 38, 39, 40, 41, 42, 43, 44, 
45, 46, 47, 48, 49, 77], "sapbert": [23, 24, 49], "ontology_match": [25, 26, 27], "assemble_pipelin": 26, "ontology_preprocess": [28, 29, 30], "base": [29, 77], "synonym_gener": 30, "pipelin": [31, 32, 79], "step": [33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 86], "document_post_process": [34, 35], "abbreviation_find": 35, "joint_ner_and_link": [36, 37], "explos": 37, "dictionari": 39, "mapping_step": 40, "post_process": [41, 42, 43, 44, 45, 46, 47, 48], "disambigu": [42, 43, 44], "context_scor": 43, "strategi": [44, 46], "mapping_strategi": [45, 46], "strategy_runn": 47, "xref_manag": 48, "ner": [50, 51, 52, 53, 54, 55, 77], "entity_post_process": 51, "hf_token_classif": 52, "seth": 53, "spacy_n": 54, "tokenized_word_processor": 55, "other": [56, 57, 58, 59], "cleanup": 57, "merge_overlapping_": 58, "stanza": 59, "util": [61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71], "abbreviation_detector": 62, "build_and_test_model_pack": 63, "cach": 64, "group": 65, "link_index": 66, "spacy_pipelin": 67, "stanza_pipelin": 68, "stopwatch": 69, "string_norm": 70, "web": [72, 73, 74, 75], "jwtauth": 73, "rout": 74, "server": 75, "api": [76, 80], "refer": 76, "curat": 77, "knowledg": 77, "At": 79, "glanc": 79, "how": 79, "us": 79, "default": [79, 86], "welcom": 80, "s": 80, "document": [80, 86], "guid": 80, "tutori": 80, "site": 80, "index": 80, "introduct": 81, "why": 81, "summari": 81, "tba": [82, 87], "visualis": 83, "result": 83, "label": 83, "studio": 83, "The": 84, "ontologypars": 84, "write": 84, "custom": 84, "parser": 84, "quickstart": 86, "instal": 86, "pack": 86, "run": 86, "advanc": [], "configur": 86, "hydra": [], "process": 86, "your": 86, "first": 86}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 6, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, 
"sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx": 56}})
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index a3cc3defd..2ca5b61ae 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -97,7 +97,6 @@
 doctest_global_setup = """
 import os
 
-kazu_config_missing = os.environ.get("KAZU_CONFIG_DIR") is None
 kazu_model_pack_missing = os.environ.get("KAZU_MODEL_PACK") is None
 """
 
diff --git a/docs/curating_for_explosion.rst b/docs/curating_for_explosion.rst
index 0a446dbfb..84d56f8cc 100644
--- a/docs/curating_for_explosion.rst
+++ b/docs/curating_for_explosion.rst
@@ -7,106 +7,23 @@ Many entities in Biomedical NER do not require sophisticated NER or disambiguati
 unambiguous, and have few genuine synonyms. For instance, terms such as "Breast Cancer" and "mitosis" can be taken at face value, and
 simple string matching techniques can be employed (in our case, we use the `Spacy PhraseMatcher <https://spacy.io/api/phrasematcher>`_).
 
-However, the terms in ontologies tend to be noisy when taken 'wholesale', and need curation in order to ensure high precision matching.
+However, the string labels in ontologies tend to be noisy when taken 'wholesale', and need curation in order to ensure high precision matching.
+For instance, the `Gene Ontology reference for envelope <http://amigo.geneontology.org/amigo/term/GO:0031975>`_ is highly ambiguous -
+we wouldn't want this to be tagged every time we see the string 'envelope' appear in text. On the other hand,
+`cornified envelope assembly <http://amigo.geneontology.org/amigo/term/GO:1903575>`_ is highly specific, and whenever we see this string,
+we can safely assume it refers to this GO id.
 
-In Kazu, we take the following approach:
-
-1. generate synonym candidates from the raw ontology to build a putative pipeline.
-
-    .. code-block::
-
-        from kazu.modelling.ontology_matching import assemble_pipeline
-        from kazu.modelling.ontology_preprocessing.base import MondoOntologyParser
-        from kazu.modelling.ontology_preprocessing.synonym_generation import (
-            CombinatorialSynonymGenerator,
-            StringReplacement,
-            StopWordRemover,
-        )
-
-        syn_generator = CombinatorialSynonymGenerator(
-            [StopWordRemover(), StringReplacement(include_greek=True)]
-        )
-        parser = MondoOntologyParser(
-            in_path="", data_origin="test", synonym_generator=syn_generator
-        )
-        nlp = assemble_pipeline.main(
-            parser_name_to_entity_type={parser.name: "disease"},
-            parsers=[parser],
-            labels={"disease"},
-            output_dir="~/noisy_spacy_pipeline",
-        )
-
-
-2. we then run this pipeline over a large corpora of text, and look at the frequency of each hit. Note, the below
-   is for illustration only - you'll probably want a more sophisticated set up when doing this on a large document set!
-
-    .. code-block::
-
-        from kazu.data.data import Document
-        from kazu.steps.joint_ner_and_linking.explosion import ExplosionStringMatchingStep
-        from dataclasses import dataclass, field
-        from typing import List
-        import json
-
-
-        @dataclass
-        class AnnotatedPhrase:
-            term: str
-            action: str
-            symbolic: bool
-            case_sensitive: bool
-            term_norm_mapping: dict[str, str] = field(default_factory=dict)
-            examples: list[str] = field(default_factory=list)
+Given that an ontology can contain hundreds of thousands of labels, how do we curate these? It's too labour intensive to look at every one. Therefore, we
+apply some pragmatism in order to produce a set of precise labels we want to use for dictionary based NER and linking.
 
+In Kazu, we take the following approach:
 
-        class AnnotatedPhraseEncoder(json.JSONEncoder):
-            def default(self, obj):
-                if isinstance(obj, AnnotatedPhrase):
-                    return obj.__dict__
-                # Base class default() raises TypeError:
-                return json.JSONEncoder.default(self, obj)
-
-
-        def save(path, data):
-            with open(path, "w") as f:
-                f.writelines(json.dumps(x, cls=AnnotatedPhraseEncoder) + "\n" for x in data)
-
-
-        # get_docs represents some function to get documents relevant to you
-        docs: List[Document] = get_docs()
-        noisy_step = ExplosionStringMatchingStep(path="~/noisy_spacy_pipeline")
-
-        noisy_step(docs)
-        curatable_phrases = []
-        for doc in docs:
-            for section in doc.sections:
-                for ent in section.entities:
-                    term_norm_mapping = {
-                        term.parser_name: term.term_norm for term in ent.syn_term_to_synonym_terms
-                    }
-                    symbolic = any(x.is_symbolic for x in ent.syn_term_to_synonym_terms)
-                    to_curate = AnnotatedPhrase(
-                        term=ent.match,
-                        action="to_curate",
-                        case_sensitive=True,
-                        symbolic=symbolic,
-                        term_norm_mapping=term_norm_mapping,
-                        examples=[section.text[ent.start : ent.end]],
-                    )
-                    curatable_phrases.append(to_curate)
-
-        save("~/phrases_to_curate.jsonl", curatable_phrases)
-
-
-3. we curate the phrases_to_curate.jsonl file, according to whether they look like good matches or not for a given parser, and whether case matters.
-
-4. Now, the final pipeline can be generated as follows:
-
-    .. code-block::
+1. Generate synonym candidates from the raw ontology to build a putative list of terms we might want to use. If the term is symbolic,
+   we assume it's case sensitive. Otherwise, we assume it's case insensitive.
+2. Build a pipeline from this list, execute this pipeline over a large corpus of target data, and explore the results to get a sense of
+   which terms are 'noisy'.
+3. Curate the top x hits by frequency, to determine whether a given term is precise enough in its own right to be valid for dictionary based NER.
+   We assume here that if a term doesn't hit frequently enough to be considered in step 2, it's probably safe to include. Depending on your target
+   data, this may be invalid - so in practice, the curation approach is iterative.
 
-        nlp = assemble_pipeline.main(
-            parser_name_to_entity_type={parser.name: "disease"},
-            curated_list="~/phrases_to_curate.jsonl",
-            labels={"disease"},
-            output_dir="~/<kazu model pack>/spacy_pipeline",
-        )
+TODO: add a worked example
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
index 84d004997..e0f2a6a95 100644
--- a/docs/quickstart.rst
+++ b/docs/quickstart.rst
@@ -15,56 +15,56 @@ Ensure you are on version 21.0 or newer of pip.
 
 Model Pack
 ----------
-In order to use the majority of Kazu, you will need the model pack, which contains
-the pretrained models required by the pipeline. This is available from <TBA>
+In order to use the majority of Kazu, you will need a model pack, which contains
+the pretrained models and knowledge bases/ontologies required by the pipeline.
+These are available from the `release page <https://github.com/astrazeneca/kazu/releases>`_.
 
-Running Steps
--------------
-Components are wrapped as instances of :class:`kazu.steps.step.Step`.
-
-.. include:: single_step_example.rst
+Default configuration
+---------------------
+Kazu has a LOT of moving parts, each of which can be configured according to your requirements.
+Since this can get complicated, we use `Hydra <https://hydra.cc/docs/intro/>`_ to manage different
+configurations, and provide a 'default' configuration that is generally useful in most circumstances
+(and is also a good starting point for your own tweaks). This default configuration is located in
+the 'conf/' directory of the model pack.
 
-Advanced Pipeline configuration with Hydra
--------------------------------------------
-
-To create an NLP pipeline, you need to instantiate steps. Given the large amount
-of configuration required, the easiest way to do this is with Hydra https://hydra.cc/docs/intro/
-
-Here, you will need a hydra config directory (see kazu/conf for an example).
-
-First, export the path of your config directory to KAZU_CONFIG_DIR.
-
-To use the example kazu/conf config you will need to
-set the environment variable KAZU_MODEL_PACK to a path for a kazu model pack,
-or manually update the model paths that use the variable - search for
-`${oc.env:KAZU_MODEL_PACK}` in kazu/conf).
+Processing your first document
+------------------------------
 
 .. testcode::
-    :skipif: kazu_config_missing or kazu_model_pack_missing
+    :skipif: kazu_model_pack_missing
 
-    import os
-    from hydra import compose, initialize_config_dir
+    from hydra import initialize_config_dir, compose
     from hydra.utils import instantiate
     from kazu.data.data import Document
     from kazu.pipeline import Pipeline
-    # some text we want to process
-    text = """EGFR is a gene"""
+    from pathlib import Path
+    import os
 
-    with initialize_config_dir(config_dir=os.environ.get("KAZU_CONFIG_DIR")):
-        cfg = compose(config_name="config")
-        # instantiate a pipeline based on Hydra defaults
+    # the hydra config is kept in the model pack. Ensure this env
+    # variable is set to your model pack location
+    cdir = Path(os.environ["KAZU_MODEL_PACK"]).joinpath('conf')
+    with initialize_config_dir(config_dir=str(cdir)):
+        cfg = compose(
+            config_name="config",
+            overrides=[],
+        )
         pipeline: Pipeline = instantiate(cfg.Pipeline)
-        # create an instance of Document from our text string
+        text = "EGFR mutations are often implicated in lung cancer"
         doc = Document.create_simple_document(text)
-        # Pipeline takes a List[Document] as an argument to __call__
-        # and returns a processed List[Document]
-        result: Document = pipeline([doc])[0]
-        # a Document is composed of Sections
-        # (a Document created with create_simple_document has only one)
-        print(result.sections[0].get_text())
+        pipeline([doc])
+        print(f"{doc.sections[0].text}")
+
 
 .. testoutput::
     :hide:
-    :skipif: kazu_config_missing or kazu_model_pack_missing
+    :skipif: kazu_model_pack_missing
 
-    EGFR is a gene
+    EGFR mutations are often implicated in lung cancer
+
+You can now inspect the doc object, and explore what entities were detected in each section.
+
+Running Steps
+-------------
+Components are wrapped as instances of :class:`kazu.steps.step.Step`.
+
+.. include:: single_step_example.rst
diff --git a/setup.py b/setup.py
index 6150ff702..d3d4755c5 100644
--- a/setup.py
+++ b/setup.py
@@ -30,6 +30,7 @@
         "stanza>=1.0.0",
         "regex>=2020.1.7",
         "psutil>=5.3.0",
+        "cachetools>=5.2.0",
     ],
     extras_require={
         "webserver": webserver_dependencies,