From cca0cc939e7dca17c6c1ef1d22659d9f0346be97 Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Thu, 12 Sep 2024 23:32:03 +0300
Subject: [PATCH 1/7] Add docs boilerplate

---
 .readthedocs.yml      |  18 ++++
 docs/.gitignore       |   1 +
 docs/Makefile         |  23 +++++
 docs/conf.py          | 193 ++++++++++++++++++++++++++++++++++++++++++
 docs/index.rst        |   1 +
 docs/requirements.txt |   6 ++
 6 files changed, 242 insertions(+)
 create mode 100644 .readthedocs.yml
 create mode 100644 docs/.gitignore
 create mode 100644 docs/Makefile
 create mode 100644 docs/conf.py
 create mode 120000 docs/index.rst
 create mode 100644 docs/requirements.txt

diff --git a/.readthedocs.yml b/.readthedocs.yml
new file mode 100644
index 00000000..8f9020d6
--- /dev/null
+++ b/.readthedocs.yml
@@ -0,0 +1,18 @@
+version: 2
+
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.12"
+
+sphinx:
+  configuration: docs/conf.py
+
+formats:
+  - pdf
+  - epub
+
+python:
+  install:
+    - requirements: docs/requirements.txt
+
diff --git a/docs/.gitignore b/docs/.gitignore
new file mode 100644
index 00000000..e35d8850
--- /dev/null
+++ b/docs/.gitignore
@@ -0,0 +1 @@
+_build
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..2b54e10b
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,23 @@
+# Minimal makefile for Sphinx documentation
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SPHINXPROJ    = warcio
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help livehtml Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+# Live-reloading HTML preview via sphinx-autobuild; also rebuilds when ../warcio changes.
+livehtml:
+	sphinx-autobuild -a -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --watch ../warcio
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 00000000..f78f375a
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from subprocess import Popen, PIPE
+from beanbag_docutils.sphinx.ext.github import github_linkcode_resolve
+
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx.ext.extlinks",
+    "sphinx.ext.autodoc",
+    "sphinx_copybutton",
+    "sphinx.ext.linkcode",
+]
+autodoc_member_order = "bysource"
+autodoc_typehints = "description"
+
+extlinks = {
+    "issue": ("https://github.com/webrecorder/warcio/issues/%s", "#%s"),
+}
+
+
+def linkcode_resolve(domain, info):  # hook used by sphinx.ext.linkcode to build GitHub source links
+    return github_linkcode_resolve(
+        domain=domain,
+        info=info,
+        allowed_module_names=["warcio"],
+        github_org_id="webrecorder",
+        github_repo_id="warcio",
+        branch="master",
+    )
+
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = ".rst"
+
+# The master toctree document.
+master_doc = "index"
+
+# General information about the project.
+project = "warcio"
+copyright = "2023"
+author = "webrecorder"
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents. Both are derived from `git describe`; communicate() waits
+# for git to exit and strip() drops the trailing newline of its output. If git
+# prints nothing (e.g. not a checkout), both fall back to empty strings.
+pipe = Popen("git describe --tags --always", stdout=PIPE, shell=True)
+git_version = pipe.communicate()[0].decode("utf8").strip()
+
+if git_version:
+    version = git_version.rsplit("-", 1)[0]
+    release = git_version
+else:
+    version = ""
+    release = ""
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = "en"
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = "sphinx"
+
+# Only syntax highlight of code-block is used:
+highlight_language = "none"
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = "furo"
+html_title = "warcio"
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
+
+# html_js_files = ["js/custom.js"]
+
+# -- Options for HTMLHelp output ------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = "warcio-doc"
+
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (
+        master_doc,
+        "warcio.tex",
+        "warcio documentation",
+        "webrecorder",
+        "manual",
+    )
+]
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, "warcio", "warcio documentation", [author], 1)]
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (
+        master_doc,
+        "warcio",
+        "warcio documentation",
+        author,
+        "warcio",
+        "Python library for manipulating SQLite databases",
+        "Miscellaneous",
+    )
+]
diff --git a/docs/index.rst b/docs/index.rst
new file mode 120000
index 00000000..89a01069
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1 @@
+../README.rst
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 00000000..e798be25
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,6 @@
+furo
+sphinx-autobuild
+codespell
+sphinx-copybutton
+beanbag-docutils>=2.0
+pygments-csv-lexer
\ No newline at end of file

From 6434939e3f5ff856ce8e6dc0f4a770277be11f5e Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Thu, 12 Sep 2024 23:52:56 +0300
Subject: [PATCH 2/7] Add api reference, auto-generated from source. Also source
 is linked to GitHub

---
 docs/api.rst   |  36 +++++
 docs/conf.py   |   2 +-
 docs/index.rst | 421 ++++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 457 insertions(+), 2 deletions(-)
 create mode 100644 docs/api.rst
 mode change 120000 => 100644 docs/index.rst

diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 00000000..a406b3fa
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,36 @@
+.. _api:
+
+===============
+ API Reference
+===============
+
+.. contents:: :local:
+   :class: this-will-duplicate-information-and-it-is-still-useful-here
+
+.. _reference_statusandheaders:
+
+warcio.statusandheaders.StatusAndHeaders
+========================================
+
+.. autoclass:: warcio.statusandheaders.StatusAndHeaders
+    :members:
+    :undoc-members:
+    :special-members: __getitem__
+
+.. _reference_archiveiterator:
+
+warcio.archiveiterator.ArchiveIterator
+=======================================
+
+.. autoclass:: warcio.archiveiterator.ArchiveIterator
+    :members:
+    :undoc-members:
+
+.. _reference_warcwriter:
+
+warcio.warcwriter.WARCWriter
+============================
+
+.. autoclass:: warcio.warcwriter.WARCWriter
+    :members:
+    :undoc-members:
diff --git a/docs/conf.py b/docs/conf.py
index f78f375a..4f36fc17 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -187,7 +187,7 @@ def linkcode_resolve(domain, info):
         "warcio documentation",
         author,
         "warcio",
-        "Python library for manipulating SQLite databases",
+        "warcio description...",
         "Miscellaneous",
     )
 ]
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 120000
index 89a01069..00000000
--- a/docs/index.rst
+++ /dev/null
@@ -1 +0,0 @@
-../README.rst
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 00000000..d451c303
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,420 @@
+WARCIO: WARC (and ARC) Streaming Library
+========================================
+.. image:: https://travis-ci.org/webrecorder/warcio.svg?branch=master
+      :target: https://travis-ci.org/webrecorder/warcio
+.. image:: https://codecov.io/gh/webrecorder/warcio/branch/master/graph/badge.svg
+      :target: https://codecov.io/gh/webrecorder/warcio
+
+
+Background
+----------
+
+This library provides a fast, standalone way to read and write `WARC
+Format <https://en.wikipedia.org/wiki/Web_ARChive>`__ commonly used in
+web archives. It supports Python 3.7+ and minimally needs only
+`six <https://pythonhosted.org/six/>`__ as an external dependency.
+
+warcio supports reading and writing of WARC files compliant with both the `WARC 1.0 <http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf>`__
+and `WARC 1.1 <http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf>`__ ISO standards.
+
+Install with: ``pip install warcio`` (or ``pip install warcio[all]`` to get optional features)
+
+This library is a spin-off of the WARC reading and writing component of
+the `pywb <https://github.com/webrecorder/pywb>`__ high-fidelity replay
+library, a key component of
+`Webrecorder <https://github.com/webrecorder/webrecorder>`__.
+
+The library is designed for fast, low-level access to web archival
+content, oriented around a stream of WARC records rather than files.
+
+Reading WARC Records
+--------------------
+
+A key feature of the library is to be able to iterate over a stream of
+WARC records using the ``ArchiveIterator``.
+
+It includes the following features:
+
+- Reading a WARC 1.0, WARC 1.1 or ARC stream
+- On the fly ARC to WARC record conversion
+- Decompressing and de-chunking HTTP payload content stored in WARC/ARC files.
+
+For example, the following prints the URL for each WARC ``response``
+record:
+
+.. code:: python
+
+    from warcio.archiveiterator import ArchiveIterator
+
+    with open('path/to/file', 'rb') as stream:
+        for record in ArchiveIterator(stream):
+            if record.rec_type == 'response':
+                print(record.rec_headers.get_header('WARC-Target-URI'))
+
+The stream object could be a file on disk or a remote network stream.
+The ``ArchiveIterator`` reads the WARC content in a single pass. The
+``record`` is represented by an ``ArcWarcRecord`` object which contains
+the format (ARC or WARC), record type, the record headers, http headers
+(if any), and raw stream for reading the payload.
+
+.. code:: python
+
+    class ArcWarcRecord(object):
+        def __init__(self, *args):
+            (self.format, self.rec_type, self.rec_headers, self.raw_stream,
+             self.http_headers, self.content_type, self.length) = args
+
+Reading WARC Content
+~~~~~~~~~~~~~~~~~~~~
+
+The ``raw_stream`` can be used to read the rest of the payload directly.
+A special ``ArcWarcRecord.content_stream()`` function provides a stream that
+automatically decompresses and de-chunks the HTTP payload, if it is
+compressed and/or transfer-encoding chunked.
+
+ARC Files
+~~~~~~~~~
+
+The library provides support for reading (but not writing) ARC files.
+The ARC format is legacy but is important to support in a consistent
+manner. The ``ArchiveIterator`` can equally iterate over ARC and WARC
+files to emit ``ArcWarcRecord`` objects. The special ``arc2warc`` option
+converts ARC records to WARCs on the fly, allowing for them to be
+accessed using the same API.
+
+(Special ``WARCIterator`` and ``ARCIterator`` subclasses of ``ArchiveIterator``
+are also available to read only WARC or only ARC files).
+
+WARC and ARC Streaming
+~~~~~~~~~~~~~~~~~~~~~~
+For example, here is a snippet for reading an ARC and a WARC using the
+same API.
+
+The example streams a WARC and ARC file over HTTP using
+`requests <http://docs.python-requests.org/en/master/>`__, printing the
+``warcinfo`` record (or ARC header) and any response records (or all ARC
+records) that contain HTML:
+
+.. code:: python
+
+    import requests
+    from warcio.archiveiterator import ArchiveIterator
+
+    def print_records(url):
+        resp = requests.get(url, stream=True)
+
+        for record in ArchiveIterator(resp.raw, arc2warc=True):
+            if record.rec_type == 'warcinfo':
+                print(record.raw_stream.read())
+
+            elif record.rec_type == 'response':
+                if record.http_headers.get_header('Content-Type') == 'text/html':
+                    print(record.rec_headers.get_header('WARC-Target-URI'))
+                    print(record.content_stream().read())
+                    print('')
+
+    # WARC
+    print_records('https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.warc.gz')
+
+
+    # ARC with arc2warc
+    print_records('https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.arc.gz')
+
+
+Writing WARC Records
+--------------------
+
+Starting with 1.6, warcio introduces a way to capture HTTP/S traffic directly
+to a WARC file, by monkey-patching Python's ``http.client`` library.
+
+This approach works well with the popular ``requests`` library often used to fetch
+HTTP/S content. Note that ``requests`` must be imported after the ``capture_http`` module.
+
+Quick Start to Writing a WARC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Fetching the url ``https://example.com/`` while capturing the response and request
+into a gzip compressed WARC file named ``example.warc.gz`` can be done with the following four lines:
+
+.. code:: python
+
+    from warcio.capture_http import capture_http
+    import requests  # requests must be imported after capture_http
+
+    with capture_http('example.warc.gz'):
+        requests.get('https://example.com/')
+
+
+The WARC ``example.warc.gz`` will contain two records (the response is written first, then the request).
+
+To write to a default in-memory buffer (``BufferWARCWriter``), don't specify a filename, using ``with capture_http() as writer:``.
+
+Additional requests in the ``capture_http`` context will be appended to the WARC as expected.
+
+The ``WARC-IP-Address`` header will also be added for each record if the IP address is available.
+
+The following example (similar to a `unit test from the test suite <https://github.com/webrecorder/warcio/blob/master/test/test_capture_http.py>`__) demonstrates the resulting records created with ``capture_http``:
+
+.. code:: python
+
+    with capture_http() as writer:
+        requests.get('http://example.com/')
+        requests.get('https://google.com/')
+
+    expected = [('http://example.com/', 'response', True),
+                ('http://example.com/', 'request', True),
+                ('https://google.com/', 'response', True),
+                ('https://google.com/', 'request', True),
+                ('https://www.google.com/', 'response', True),
+                ('https://www.google.com/', 'request', True)
+               ]
+
+    actual = [
+               (record.rec_headers['WARC-Target-URI'],
+                record.rec_type,
+                'WARC-IP-Address' in record.rec_headers)
+
+               for record in ArchiveIterator(writer.get_stream())
+             ]
+
+    assert actual == expected
+        
+
+Customizing WARC Writing
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The library provides a simple and extensible interface for writing
+standards-compliant WARC files.
+
+The library comes with a basic ``WARCWriter`` class for writing to a
+single WARC file and ``BufferWARCWriter`` for writing to an in-memory
+buffer. The ``BaseWARCWriter`` can be extended to support more complex
+operations.
+
+(There is no support for writing legacy ARC files)
+
+For more flexibility, such as to use a custom ``WARCWriter`` class,
+the above example can be written as:
+
+.. code:: python
+
+    from warcio.capture_http import capture_http
+    from warcio import WARCWriter
+    import requests  # requests *must* be imported after capture_http
+
+    with open('example.warc.gz', 'wb') as fh:
+        warc_writer = WARCWriter(fh)
+        with capture_http(warc_writer):
+            requests.get('https://example.com/')
+            
+WARC/1.1 Support
+~~~~~~~~~~~~~~~~
+
+By default, warcio creates WARC 1.0 records for maximum compatibility with existing tools.
+To create WARC/1.1 records, simply specify the warc version as follows:
+
+.. code:: python
+   
+    with capture_http('example.warc.gz', warc_version='1.1'):
+        ...
+
+
+.. code:: python
+
+    WARCWriter(fh, warc_version='1.1')
+    ...
+    
+When using WARC 1.1, the main difference is that the ``WARC-Date`` timestamp header
+will be written with microsecond precision, while WARC 1.0 only supports second precision.
+
+WARC 1.0:
+
+.. code::
+ 
+    WARC/1.0
+    ...
+    WARC-Date: 2018-12-26T10:11:12Z
+
+WARC 1.1:
+
+.. code::
+
+    WARC/1.1
+    ...
+    WARC-Date: 2018-12-26T10:11:12.456789Z
+    
+    
+
+Filtering HTTP Capture
+~~~~~~~~~~~~~~~~~~~~~~
+
+When capturing via HTTP, it is possible to provide a custom filter function,
+which can be used to determine whether a particular pair of request and response records
+should be written to the WARC file or skipped.
+
+The filter function is called with the request and response record
+before they are written, and can be used to substitute a different record (for example, a revisit
+instead of a response), or to skip writing altogether by returning nothing, as shown below:
+
+.. code:: python
+
+    def filter_records(request, response, request_recorder):
+        # return None, None to indicate records should be skipped
+        if response.http_headers.get_statuscode() != '200':
+            return None, None
+            
+        # the response record can be replaced with a revisit record
+        elif check_for_dedup():
+            response = create_revisit_record(...)
+            
+        return request, response
+
+    with capture_http('example.warc.gz', filter_records):
+         requests.get('https://example.com/')
+         
+Please refer to
+`test/test_capture_http.py <https://github.com/webrecorder/warcio/blob/master/test/test_capture_http.py>`__ for additional examples
+of capturing ``requests`` traffic to WARC.
+
+Manual/Advanced WARC Writing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Before 1.6, this was the primary method for fetching a url and then
+writing to a WARC. This process is a bit more verbose,
+but provides for full control of WARC creation and avoids monkey-patching.
+
+The following example loads ``http://example.com/``, creates a WARC
+response record, and writes it, gzip compressed, to ``example.warc.gz``.
+The block and payload digests are computed automatically.
+
+.. code:: python
+
+    from warcio.warcwriter import WARCWriter
+    from warcio.statusandheaders import StatusAndHeaders
+
+    import requests
+
+    with open('example.warc.gz', 'wb') as output:
+        writer = WARCWriter(output, gzip=True)
+
+        resp = requests.get('http://example.com/',
+                            headers={'Accept-Encoding': 'identity'},
+                            stream=True)
+
+        # get raw headers from urllib3
+        headers_list = resp.raw.headers.items()
+
+        http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')
+
+        record = writer.create_warc_record('http://example.com/', 'response',
+                                            payload=resp.raw,
+                                            http_headers=http_headers)
+
+        writer.write_record(record)
+
+
+The library also includes additional semantics for:
+ - Creating ``warcinfo`` and ``revisit`` records
+ - Writing ``response`` and ``request`` records together
+ - Writing custom WARC records
+ - Reading a full WARC record from a stream
+
+Please refer to `warcwriter.py <https://github.com/webrecorder/warcio/blob/master/warcio/warcwriter.py>`__ and
+`test/test_writer.py <https://github.com/webrecorder/warcio/blob/master/test/test_writer.py>`__ for additional examples.
+
+WARCIO CLI: Indexing and Recompression
+--------------------------------------
+
+The library currently ships with a few simple command line tools.
+
+Index
+~~~~~
+
+The ``warcio index`` cmd will print a simple index of the records in the
+warc file as newline delimited JSON lines (NDJSON).
+
+WARC header fields to include in the index can be specified via the
+``-f`` flag, and are included in the JSON block (in order, for
+convenience).
+
+::
+
+    warcio index ./test/data/example-iana.org-chunked.warc -f warc-type,warc-target-uri,content-length
+    {"warc-type": "warcinfo", "content-length": "137"}
+    {"warc-type": "response", "warc-target-uri": "http://www.iana.org/", "content-length": "7566"}
+    {"warc-type": "request", "warc-target-uri": "http://www.iana.org/", "content-length": "76"}
+
+
+HTTP header fields can be included by prefixing them with the prefix
+``http:``. The special field ``offset`` refers to the record offset within
+the warc file.
+
+::
+
+    warcio index ./test/data/example-iana.org-chunked.warc -f offset,content-type,http:content-type,warc-target-uri
+    {"offset": "0", "content-type": "application/warc-fields"}
+    {"offset": "405", "content-type": "application/http;msgtype=response", "http:content-type": "text/html; charset=UTF-8", "warc-target-uri": "http://www.iana.org/"}
+    {"offset": "8379", "content-type": "application/http;msgtype=request", "warc-target-uri": "http://www.iana.org/"}
+
+(Note: this library does not produce CDX or CDXJ format indexes often
+associated with web archives. To create these indexes, please see the
+`cdxj-indexer <https://github.com/webrecorder/cdxj-indexer>`__ tool which extends warcio indexing to provide this functionality)
+
+Check
+~~~~~
+
+The ``warcio check`` command will check the payload and block digests
+of WARC records, if possible. An exit value of 1 indicates a failure.
+``warcio check -v`` will print verbose output for each record in the
+WARC file.
+
+Recompress
+~~~~~~~~~~
+
+The ``recompress`` command allows for re-compressing or normalizing WARC
+(or ARC) files to a record-compressed, gzipped WARC file.
+
+Each WARC record is compressed individually and concatenated. This is
+the 'canonical' WARC storage format used by
+`Webrecorder <https://github.com/webrecorder/webrecorder>`__ and other
+web archiving institutions, and usually stored with a ``.warc.gz``
+extension.
+
+It can be used to compress an uncompressed WARC, convert any ARC
+file to a compressed WARC, or fix an improperly compressed WARC file (e.g.
+a WARC compressed entirely instead of by record).
+
+::
+
+    warcio recompress ./input.arc.gz ./output.warc.gz
+
+
+Extract
+~~~~~~~
+
+The ``extract`` command provides a way to extract the WARC and HTTP headers and/or the payload of a WARC record
+to stdout. Given a WARC filename and an offset, ``extract`` will print the (decompressed) record at that offset
+in the file to stdout.
+
+Specifying ``--payload`` or ``--headers`` will output only the payload or only the WARC + HTTP headers (if any), respectively.
+
+::
+
+    warcio extract [--payload | --headers] filename offset
+
+
+License
+~~~~~~~
+
+``warcio`` is licensed under the Apache 2.0 License and is part of the
+Webrecorder project.
+
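+See `NOTICE <https://github.com/webrecorder/warcio/blob/master/NOTICE>`__ and `LICENSE <https://github.com/webrecorder/warcio/blob/master/LICENSE>`__ for details.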
+
+Contents
+--------
+
+.. toctree::
+   :maxdepth: 3
+
+   api
\ No newline at end of file

From b7bfee77335e67d0ae190587bbd7cebd26c0d56a Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Thu, 12 Sep 2024 23:56:15 +0300
Subject: [PATCH 3/7] Have to install the library to auto generate API docs

---
 .readthedocs.yml | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 8f9020d6..c551aeac 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -1,18 +1,20 @@
 version: 2
 
+sphinx:
+  configuration: docs/conf.py
+
 build:
   os: ubuntu-22.04
   tools:
     python: "3.12"
 
-sphinx:
-  configuration: docs/conf.py
-
-formats:
-  - pdf
-  - epub
-
 python:
   install:
-    - requirements: docs/requirements.txt
+  - method: pip
+    path: .
+    extra_requirements:
+    - docs
 
+formats:
+- pdf
+- epub
\ No newline at end of file

From e6e1935a73fbce0722dca797ef060bbfe0e53651 Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Thu, 12 Sep 2024 23:57:53 +0300
Subject: [PATCH 4/7] beanbag_docutils ->> 3.11

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index c551aeac..4f16d592 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -6,7 +6,7 @@ sphinx:
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.12"
+    python: "3.11"
 
 python:
   install:

From 57dc72298e87d1a2ef3e801a4928ecee0c7e86f0 Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Fri, 13 Sep 2024 00:00:55 +0300
Subject: [PATCH 5/7] docs extra to setup.py

---
 docs/requirements.txt | 6 ------
 setup.py              | 9 +++++++++
 2 files changed, 9 insertions(+), 6 deletions(-)
 delete mode 100644 docs/requirements.txt

diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index e798be25..00000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-furo
-sphinx-autobuild
-codespell
-sphinx-copybutton
-beanbag-docutils>=2.0
-pygments-csv-lexer
\ No newline at end of file
diff --git a/setup.py b/setup.py
index e1d8fe4f..0d48ab25 100755
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,15 @@
         ],
         'all': [
             'brotlipy',
+        ],
+
+        'docs': [
+            "furo",
+            "sphinx-autobuild",
+            "codespell",
+            "sphinx-copybutton",
+            "beanbag-docutils>=2.0",
+            "pygments-csv-lexer"
         ]
     },
     classifiers=[

From 78ac4f9c0220396f04a8373320cc9bcbba6e3e29 Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Fri, 13 Sep 2024 00:08:14 +0300
Subject: [PATCH 6/7] Simplify class paths

---
 docs/api.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index a406b3fa..b1a0db51 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -9,8 +9,8 @@
 
 .. _reference_statusandheaders:
 
-warcio.statusandheaders.StatusAndHeaders
-========================================
+warcio.StatusAndHeaders
+=======================
 
 .. autoclass:: warcio.statusandheaders.StatusAndHeaders
     :members:
@@ -19,18 +19,18 @@ warcio.statusandheaders.StatusAndHeaders
 
 .. _reference_archiveiterator:
 
-warcio.archiveiterator.ArchiveIterator
-=======================================
+warcio.ArchiveIterator
+======================
 
-.. autoclass:: warcio.archiveiterator.ArchiveIterator
+.. autoclass:: warcio.ArchiveIterator
     :members:
     :undoc-members:
 
 .. _reference_warcwriter:
 
-warcio.warcwriter.WARCWriter
-============================
+warcio.WARCWriter
+=================
 
-.. autoclass:: warcio.warcwriter.WARCWriter
+.. autoclass:: warcio.WARCWriter
     :members:
     :undoc-members:

From 0611da92f2ea93aa53c63fa8795e941dacdeb2e2 Mon Sep 17 00:00:00 2001
From: Florents Tselai <florents.tselai@gmail.com>
Date: Fri, 13 Sep 2024 00:14:19 +0300
Subject: [PATCH 7/7] Gh links work again

---
 docs/api.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index b1a0db51..a406b3fa 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -9,8 +9,8 @@
 
 .. _reference_statusandheaders:
 
-warcio.StatusAndHeaders
-=======================
+warcio.statusandheaders.StatusAndHeaders
+========================================
 
 .. autoclass:: warcio.statusandheaders.StatusAndHeaders
     :members:
@@ -19,18 +19,18 @@ warcio.StatusAndHeaders
 
 .. _reference_archiveiterator:
 
-warcio.ArchiveIterator
-======================
+warcio.archiveiterator.ArchiveIterator
+=======================================
 
-.. autoclass:: warcio.ArchiveIterator
+.. autoclass:: warcio.archiveiterator.ArchiveIterator
     :members:
     :undoc-members:
 
 .. _reference_warcwriter:
 
-warcio.WARCWriter
-=================
+warcio.warcwriter.WARCWriter
+============================
 
-.. autoclass:: warcio.WARCWriter
+.. autoclass:: warcio.warcwriter.WARCWriter
     :members:
     :undoc-members: