From cca0cc939e7dca17c6c1ef1d22659d9f0346be97 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 12 Sep 2024 23:32:03 +0300 Subject: [PATCH 1/7] Add docs boilerplate --- .readthedocs.yml | 18 ++++ docs/.gitignore | 1 + docs/Makefile | 23 +++++ docs/conf.py | 193 ++++++++++++++++++++++++++++++++++++++++++ docs/index.rst | 1 + docs/requirements.txt | 6 ++ 6 files changed, 242 insertions(+) create mode 100644 .readthedocs.yml create mode 100644 docs/.gitignore create mode 100644 docs/Makefile create mode 100644 docs/conf.py create mode 120000 docs/index.rst create mode 100644 docs/requirements.txt diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 00000000..8f9020d6 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,18 @@ +version: 2 + +build: + os: ubuntu-22.04 + tools: + python: "3.12" + +sphinx: + configuration: docs/conf.py + +formats: + - pdf + - epub + +python: + install: + - requirements: docs/requirements.txt + diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..e35d8850 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +_build diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 00000000..2b54e10b --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,23 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SPHINXPROJ = warcio +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +livehtml: + sphinx-autobuild -a -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) --watch ../warcio diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 00000000..f78f375a --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from subprocess import Popen, PIPE +from beanbag_docutils.sphinx.ext.github import github_linkcode_resolve + +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + "sphinx.ext.extlinks", + "sphinx.ext.autodoc", + "sphinx_copybutton", + "sphinx.ext.linkcode", +] +autodoc_member_order = "bysource" +autodoc_typehints = "description" + +extlinks = { + "issue": ("https://github.com/webrecorder/warcio/issues/%s", "#%s"), +} + + +def linkcode_resolve(domain, info): + return github_linkcode_resolve( + domain=domain, + info=info, + allowed_module_names=["warcio"], + github_org_id="webrecorder", + github_repo_id="warcio", + branch="master", + ) + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = ".rst" + +# The master toctree document. +master_doc = "index" + +# General information about the project. +project = "warcio" +copyright = "2023" +author = "webrecorder" + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +pipe = Popen("git describe --tags --always", stdout=PIPE, shell=True) +git_version = pipe.stdout.read().decode("utf8") + +if git_version: + version = git_version.rsplit("-", 1)[0] + release = git_version +else: + version = "" + release = "" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = "en" + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# The name of the Pygments (syntax highlighting) style to use. 
+pygments_style = "sphinx" + +# Only syntax highlight of code-block is used: +highlight_language = "none" + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "furo" +html_title = "warcio" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# html_js_files = ["js/custom.js"] + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = "warcio-doc" + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + ( + master_doc, + "warcio.tex", + "warcio documentation", + "webrecorder", + "manual", + ) +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [(master_doc, "warcio", "warcio documentation", [author], 1)] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "warcio", + "warcio documentation", + author, + "warcio", + "Python library for manipulating SQLite databases", + "Miscellaneous", + ) +] diff --git a/docs/index.rst b/docs/index.rst new file mode 120000 index 00000000..89a01069 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1 @@ +../README.rst \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..e798be25 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,6 @@ +furo +sphinx-autobuild +codespell +sphinx-copybutton +beanbag-docutils>=2.0 +pygments-csv-lexer \ No newline at end of file From 6434939e3f5ff856ce8e6dc0f4a770277be11f5e Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 12 Sep 2024 23:52:56 +0300 Subject: [PATCH 2/7] Add api reference, auto-genrated from source. Also source is linked to GitHub --- docs/api.rst | 36 +++++ docs/conf.py | 2 +- docs/index.rst | 421 ++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 457 insertions(+), 2 deletions(-) create mode 100644 docs/api.rst mode change 120000 => 100644 docs/index.rst diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 00000000..a406b3fa --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,36 @@ +.. _api: + +=============== + API Reference +=============== + +.. contents:: :local: + :class: this-will-duplicate-information-and-it-is-still-useful-here + +.. _reference_statusandheaders: + +warcio.statusandheaders.StatusAndHeaders +======================================== + +.. autoclass:: warcio.statusandheaders.StatusAndHeaders + :members: + :undoc-members: + :special-members: __getitem__ + +.. 
_reference_archiveiterator: + +warcio.archiveiterator.ArchiveIterator +======================================= + +.. autoclass:: warcio.archiveiterator.ArchiveIterator + :members: + :undoc-members: + +.. _reference_warcwriter: + +warcio.warcwriter.WARCWriter +============================ + +.. autoclass:: warcio.warcwriter.WARCWriter + :members: + :undoc-members: diff --git a/docs/conf.py b/docs/conf.py index f78f375a..4f36fc17 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -187,7 +187,7 @@ def linkcode_resolve(domain, info): "warcio documentation", author, "warcio", - "Python library for manipulating SQLite databases", + "warcio description...", "Miscellaneous", ) ] diff --git a/docs/index.rst b/docs/index.rst deleted file mode 120000 index 89a01069..00000000 --- a/docs/index.rst +++ /dev/null @@ -1 +0,0 @@ -../README.rst \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 00000000..d451c303 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,420 @@ +WARCIO: WARC (and ARC) Streaming Library +======================================== +.. image:: https://travis-ci.org/webrecorder/warcio.svg?branch=master + :target: https://travis-ci.org/webrecorder/warcio +.. image:: https://codecov.io/gh/webrecorder/warcio/branch/master/graph/badge.svg + :target: https://codecov.io/gh/webrecorder/warcio + + +Background +---------- + +This library provides a fast, standalone way to read and write `WARC +Format `__ commonly used in +web archives. Python 3.7+ (minimally only needing +`six `__ as an external dependency) + +warcio supports reading and writing of WARC files compliant with both the `WARC 1.0 `__ +and `WARC 1.1 `__ ISO standards. 
+ +Install with: ``pip install warcio`` (or ``pip install warcio[all]`` to get optional features) + +This library is a spin-off of the WARC reading and writing component of +the `pywb `__ high-fidelity replay +library, a key component of +`Webrecorder `__ + +The library is designed for fast, low-level access to web archival +content, oriented around a stream of WARC records rather than files. + +Reading WARC Records +-------------------- + +A key feature of the library is to be able to iterate over a stream of +WARC records using the ``ArchiveIterator``. + +It includes the following features: + +- Reading a WARC 1.0, WARC 1.1 or ARC stream +- On the fly ARC to WARC record conversion +- Decompressing and de-chunking HTTP payload content stored in WARC/ARC files. + +For example, the following prints the url for each WARC ``response`` +record: + +.. code:: python + + from warcio.archiveiterator import ArchiveIterator + + with open('path/to/file', 'rb') as stream: + for record in ArchiveIterator(stream): + if record.rec_type == 'response': + print(record.rec_headers.get_header('WARC-Target-URI')) + +The stream object could be a file on disk or a remote network stream. +The ``ArchiveIterator`` reads the WARC content in a single pass. The +``record`` is represented by an ``ArcWarcRecord`` object which contains +the format (ARC or WARC), record type, the record headers, http headers +(if any), and raw stream for reading the payload. + +.. code:: python + + class ArcWarcRecord(object): + def __init__(self, *args): + (self.format, self.rec_type, self.rec_headers, self.raw_stream, + self.http_headers, self.content_type, self.length) = args + +Reading WARC Content +~~~~~~~~~~~~~~~~~~~~ + +The ``raw_stream`` can be used to read the rest of the payload directly. +A special ``ArcWarcRecord.content_stream()`` function provides a stream that +automatically decompresses and de-chunks the HTTP payload, if it is +compressed and/or transfer-encoding chunked. 
+ +ARC Files +~~~~~~~~~ + +The library provides support for reading (but not writing) ARC files. +The ARC format is legacy but is important to support in a consistent +manner. The ``ArchiveIterator`` can equally iterate over ARC and WARC +files to emit ``ArcWarcRecord`` objects. The special ``arc2warc`` option +converts ARC records to WARCs on the fly, allowing for them to be +accessed using the same API. + +(Special ``WARCIterator`` and ``ARCIterator`` subclasses of ``ArchiveIterator`` +are also available to read only WARC or only ARC files). + +WARC and ARC Streaming +~~~~~~~~~~~~~~~~~~~~~~ +For example, here is a snippet for reading an ARC and a WARC using the +same API. + +The example streams a WARC and ARC file over HTTP using +`requests `__, printing the +``warcinfo`` record (or ARC header) and any response records (or all ARC +records) that contain HTML: + +.. code:: python + + import requests + from warcio.archiveiterator import ArchiveIterator + + def print_records(url): + resp = requests.get(url, stream=True) + + for record in ArchiveIterator(resp.raw, arc2warc=True): + if record.rec_type == 'warcinfo': + print(record.raw_stream.read()) + + elif record.rec_type == 'response': + if record.http_headers.get_header('Content-Type') == 'text/html': + print(record.rec_headers.get_header('WARC-Target-URI')) + print(record.content_stream().read()) + print('') + + # WARC + print_records('https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.warc.gz') + + + # ARC with arc2warc + print_records('https://archive.org/download/ExampleArcAndWarcFiles/IAH-20080430204825-00000-blackbook.arc.gz') + + +Writing WARC Records +-------------------- + +Starting with 1.6, warcio introduces a way to capture HTTP/S traffic directly +to a WARC file, by monkey-patching Python's ``http.client`` library. + +This approach works well with the popular ``requests`` library often used to fetch +HTTP/S content. 
Note that ``requests`` must be imported after the ``capture_http`` module. + +Quick Start to Writing a WARC +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fetching the url ``https://example.com/`` while capturing the response and request +into a gzip compressed WARC file named ``example.warc.gz`` can be done with the following four lines: + +.. code:: python + + from warcio.capture_http import capture_http + import requests # requests must be imported after capture_http + + with capture_http('example.warc.gz'): + requests.get('https://example.com/') + + +The WARC ``example.warc.gz`` will contain two records (the response is written first, then the request). + +To write to a default in-memory buffer (``BufferWARCWriter``), don't specify a filename, using ``with capture_http() as writer:``. + +Additional requests in the ``capture_http`` context will be appended to the WARC as expected. + +The ``WARC-IP-Address`` header will also be added for each record if the IP address is available. + +The following example (similar to a `unit test from the test suite `__) demonstrates the resulting records created with ``capture_http``: + +.. code:: python + + with capture_http() as writer: + requests.get('http://example.com/') + requests.get('https://google.com/') + + expected = [('http://example.com/', 'response', True), + ('http://example.com/', 'request', True), + ('https://google.com/', 'response', True), + ('https://google.com/', 'request', True), + ('https://www.google.com/', 'response', True), + ('https://www.google.com/', 'request', True) + ] + + actual = [ + (record.rec_headers['WARC-Target-URI'], + record.rec_type, + 'WARC-IP-Address' in record.rec_headers) + + for record in ArchiveIterator(writer.get_stream()) + ] + + assert actual == expected + + +Customizing WARC Writing +~~~~~~~~~~~~~~~~~~~~~~~~ + +The library provides a simple and extensible interface for writing +standards-compliant WARC files. 
+ +The library comes with a basic ``WARCWriter`` class for writing to a +single WARC file and ``BufferWARCWriter`` for writing to an in-memory +buffer. The ``BaseWARCWriter`` can be extended to support more complex +operations. + +(There is no support for writing legacy ARC files) + +For more flexibility, such as to use a custom ``WARCWriter`` class, +the above example can be written as: + +.. code:: python + + from warcio.capture_http import capture_http + from warcio import WARCWriter + import requests # requests *must* be imported after capture_http + + with open('example.warc.gz', 'wb') as fh: + warc_writer = WARCWriter(fh) + with capture_http(warc_writer): + requests.get('https://example.com/') + +WARC/1.1 Support +~~~~~~~~~~~~~~~~ + +By default, warcio creates WARC 1.0 records for maximum compatibility with existing tools. +To create WARC/1.1 records, simply specify the warc version as follows: + +.. code:: python + + with capture_http('example.warc.gz', warc_version='1.1'): + ... + + +.. code:: python + + WARCWriter(fh, warc_version='1.1') + ... + +When using WARC 1.1, the main difference is that the ``WARC-Date`` timestamp header +will be written with microsecond precision, while WARC 1.0 only supports second precision. + +WARC 1.0: + +.. code:: + + WARC/1.0 + ... + WARC-Date: 2018-12-26T10:11:12Z + +WARC 1.1: + +.. code:: + + WARC/1.1 + ... + WARC-Date: 2018-12-26T10:11:12.456789Z + + + +Filtering HTTP Capture +~~~~~~~~~~~~~~~~~~~~~~ + +When capturing via HTTP, it is possible to provide a custom filter function, +which can be used to determine if a particular request and response records +should be written to the WARC file or skipped. + +The filter function is called with the request and response record +before they are written, and can be used to substitute a different record (for example, a revisit +instead of a response), or to skip writing altogether by returning nothing, as shown below: + +.. 
code:: python + + def filter_records(request, response, request_recorder): + # return None, None to indicate records should be skipped + if response.http_headers.get_statuscode() != '200': + return None, None + + # the response record can be replaced with a revisit record + elif check_for_dedup(): + response = create_revisit_record(...) + + return request, response + + with capture_http('example.warc.gz', filter_records): + requests.get('https://example.com/') + +Please refer to +`test/test\_capture_http.py `__ for additional examples +of capturing ``requests`` traffic to WARC. + +Manual/Advanced WARC Writing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Before 1.6, this was the primary method for fetching a url and then +writing to a WARC. This process is a bit more verbose, +but provides for full control of WARC creation and avoids monkey-patching. + +The following example loads ``http://example.com/``, creates a WARC +response record, and writes it, gzip compressed, to ``example.warc.gz``. +The block and payload digests are computed automatically. + +.. 
code:: python + + from warcio.warcwriter import WARCWriter + from warcio.statusandheaders import StatusAndHeaders + + import requests + + with open('example.warc.gz', 'wb') as output: + writer = WARCWriter(output, gzip=True) + + resp = requests.get('http://example.com/', + headers={'Accept-Encoding': 'identity'}, + stream=True) + + # get raw headers from urllib3 + headers_list = resp.raw.headers.items() + + http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0') + + record = writer.create_warc_record('http://example.com/', 'response', + payload=resp.raw, + http_headers=http_headers) + + writer.write_record(record) + + +The library also includes additional semantics for: + - Creating ``warcinfo`` and ``revisit`` records + - Writing ``response`` and ``request`` records together + - Writing custom WARC records + - Reading a full WARC record from a stream + +Please refer to `warcwriter.py `__ and +`test/test\_writer.py `__ for additional examples. + +WARCIO CLI: Indexing and Recompression +-------------------------------------- + +The library currently ships with a few simple command line tools. + +Index +~~~~~ + +The ``warcio index`` cmd will print a simple index of the records in the +warc file as newline delimited JSON lines (NDJSON). + +WARC header fields to include in the index can be specified via the +``-f`` flag, and are included in the JSON block (in order, for +convenience). + +:: + + warcio index ./test/data/example-iana.org-chunked.warc -f warc-type,warc-target-uri,content-length + {"warc-type": "warcinfo", "content-length": "137"} + {"warc-type": "response", "warc-target-uri": "http://www.iana.org/", "content-length": "7566"} + {"warc-type": "request", "warc-target-uri": "http://www.iana.org/", "content-length": "76"} + + +HTTP header fields can be included by prefixing them with the prefix +``http:``. The special field ``offset`` refers to the record offset within +the warc file. 
+ +:: + + warcio index ./test/data/example-iana.org-chunked.warc -f offset,content-type,http:content-type,warc-target-uri + {"offset": "0", "content-type": "application/warc-fields"} + {"offset": "405", "content-type": "application/http;msgtype=response", "http:content-type": "text/html; charset=UTF-8", "warc-target-uri": "http://www.iana.org/"} + {"offset": "8379", "content-type": "application/http;msgtype=request", "warc-target-uri": "http://www.iana.org/"} + +(Note: this library does not produce CDX or CDXJ format indexes often +associated with web archives. To create these indexes, please see the +`cdxj-indexer `__ tool which extends warcio indexing to provide this functionality) + +Check +~~~~~ + +The ``warcio check`` command will check the payload and block digests +of WARC records, if possible. An exit value of 1 indicates a failure. +``warcio check -v`` will print verbose output for each record in the +WARC file. + +Recompress +~~~~~~~~~~ + +The ``recompress`` command allows for re-compressing or normalizing WARC +(or ARC) files to a record-compressed, gzipped WARC file. + +Each WARC record is compressed individually and concatenated. This is +the 'canonical' WARC storage format used by +`Webrecorder `__ and other +web archiving institutions, and usually stored with a ``.warc.gz`` +extension. + +It can be used to: - Compress an uncompressed WARC - Convert any ARC +file to a compressed WARC - Fix an improperly compressed WARC file (eg. +a WARC compressed entirely instead of by record) + +:: + + warcio recompress ./input.arc.gz ./output.warc.gz + + +Extract +~~~~~~~ + +The ``extract`` command provides a way to extract either the WARC and HTTP headers and/or payload of a WARC record +to stdout. Given a WARC filename and an offset, ``extract`` will print the (decompressed) record at that offset +in the file to stdout + +Specifying --payload or --headers will output only the payload or only the WARC + HTTP headers (if any), respectively. 
+ +:: + + warcio extract [--payload | --headers] filename offset + + +License +~~~~~~~ + +``warcio`` is licensed under the Apache 2.0 License and is part of the +Webrecorder project. + +See `NOTICE `__ and `LICENSE `__ for details. + +Contents +-------- + +.. toctree:: + :maxdepth: 3 + + api \ No newline at end of file From b7bfee77335e67d0ae190587bbd7cebd26c0d56a Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 12 Sep 2024 23:56:15 +0300 Subject: [PATCH 3/7] Have to install the library to auto generate API docs --- .readthedocs.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 8f9020d6..c551aeac 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,18 +1,20 @@ version: 2 +sphinx: + configuration: docs/conf.py + build: os: ubuntu-22.04 tools: python: "3.12" -sphinx: - configuration: docs/conf.py - -formats: - - pdf - - epub - python: install: - - requirements: docs/requirements.txt + - method: pip + path: . 
+ extra_requirements: + - docs +formats: +- pdf +- epub \ No newline at end of file From e6e1935a73fbce0722dca797ef060bbfe0e53651 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Thu, 12 Sep 2024 23:57:53 +0300 Subject: [PATCH 4/7] beanbag_docutils ->> 3.11 --- .readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index c551aeac..4f16d592 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -6,7 +6,7 @@ sphinx: build: os: ubuntu-22.04 tools: - python: "3.12" + python: "3.11" python: install: From 57dc72298e87d1a2ef3e801a4928ecee0c7e86f0 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Fri, 13 Sep 2024 00:00:55 +0300 Subject: [PATCH 5/7] docs extra to setup.py --- docs/requirements.txt | 6 ------ setup.py | 9 +++++++++ 2 files changed, 9 insertions(+), 6 deletions(-) delete mode 100644 docs/requirements.txt diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index e798be25..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -furo -sphinx-autobuild -codespell -sphinx-copybutton -beanbag-docutils>=2.0 -pygments-csv-lexer \ No newline at end of file diff --git a/setup.py b/setup.py index e1d8fe4f..0d48ab25 100755 --- a/setup.py +++ b/setup.py @@ -40,6 +40,15 @@ ], 'all': [ 'brotlipy', + ], + + 'docs': [ + "furo", + "sphinx-autobuild", + "codespell", + "sphinx-copybutton", + "beanbag-docutils>=2.0", + "pygments-csv-lexer" ] }, classifiers=[ From 78ac4f9c0220396f04a8373320cc9bcbba6e3e29 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Fri, 13 Sep 2024 00:08:14 +0300 Subject: [PATCH 6/7] Simplify class paths --- docs/api.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index a406b3fa..b1a0db51 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -9,8 +9,8 @@ .. 
_reference_statusandheaders: -warcio.statusandheaders.StatusAndHeaders -======================================== +warcio.StatusAndHeaders +======================= .. autoclass:: warcio.statusandheaders.StatusAndHeaders :members: @@ -19,18 +19,18 @@ warcio.statusandheaders.StatusAndHeaders .. _reference_archiveiterator: -warcio.archiveiterator.ArchiveIterator -======================================= +warcio.ArchiveIterator +====================== -.. autoclass:: warcio.archiveiterator.ArchiveIterator +.. autoclass:: warcio.ArchiveIterator :members: :undoc-members: .. _reference_warcwriter: -warcio.warcwriter.WARCWriter -============================ +warcio.WARCWriter +================= -.. autoclass:: warcio.warcwriter.WARCWriter +.. autoclass:: warcio.WARCWriter :members: :undoc-members: From 0611da92f2ea93aa53c63fa8795e941dacdeb2e2 Mon Sep 17 00:00:00 2001 From: Florents Tselai Date: Fri, 13 Sep 2024 00:14:19 +0300 Subject: [PATCH 7/7] Gh links work again --- docs/api.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index b1a0db51..a406b3fa 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -9,8 +9,8 @@ .. _reference_statusandheaders: -warcio.StatusAndHeaders -======================= +warcio.statusandheaders.StatusAndHeaders +======================================== .. autoclass:: warcio.statusandheaders.StatusAndHeaders :members: @@ -19,18 +19,18 @@ warcio.StatusAndHeaders .. _reference_archiveiterator: -warcio.ArchiveIterator -====================== +warcio.archiveiterator.ArchiveIterator +======================================= -.. autoclass:: warcio.ArchiveIterator +.. autoclass:: warcio.archiveiterator.ArchiveIterator :members: :undoc-members: .. _reference_warcwriter: -warcio.WARCWriter -================= +warcio.warcwriter.WARCWriter +============================ -.. autoclass:: warcio.WARCWriter +.. autoclass:: warcio.warcwriter.WARCWriter :members: :undoc-members: