From 6d04a4ee8bcbe66b7bee45c843c9c13ee7234d14 Mon Sep 17 00:00:00 2001 From: Jay Qi <2721979+jayqi@users.noreply.github.com> Date: Sat, 6 Jan 2024 13:38:33 -0500 Subject: [PATCH] Refactor tests (#5) * Refactor tests * Remove old tests file * Add conda-forge badges * Add test for mkdir --------- Co-authored-by: Jay Qi --- .github/codecov.yml | 19 +++ README.md | 14 +- pyproject.toml | 9 +- tests.py | 312 -------------------------------------- tests/__init__.py | 3 + tests/conftest.py | 24 +++ tests/test_core.py | 360 ++++++++++++++++++++++++++++++++++++++++++++ tests/utils.py | 67 +++++++++ 8 files changed, 490 insertions(+), 318 deletions(-) create mode 100644 .github/codecov.yml delete mode 100644 tests.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_core.py create mode 100644 tests/utils.py diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..937255e --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,19 @@ +codecov: + require_ci_to_pass: yes + +coverage: + precision: 1 + round: down + range: "70...100" + status: + project: # Coverage of whole project + default: + target: auto # Coverage target to pass; auto is base commit + threshold: 5% # Allow coverage to drop by this much vs. base and still pass + patch: # Coverage of lines in this change + default: + target: 80% # Coverage target to pass + threshold: 20% # Allow coverage to drop by this much vs. 
base and still pass + +comment: + layout: "diff,flags,tree" diff --git a/README.md b/README.md index ea0ca84..2bfc216 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,17 @@ # repro-zipfile [![PyPI](https://img.shields.io/pypi/v/repro-zipfile.svg)](https://pypi.org/project/repro-zipfile/) +[![Conda Version](https://img.shields.io/conda/vn/conda-forge/repro-zipfile.svg)](https://anaconda.org/conda-forge/repro-zipfile) +[![conda-forge feedstock](https://img.shields.io/badge/conda--forge-feedstock-yellowgreen)](https://github.com/conda-forge/repro-zipfile-feedstock) [![Supported Python versions](https://img.shields.io/pypi/pyversions/repro-zipfile)](https://pypi.org/project/repro-zipfile/) [![tests](https://github.com/drivendataorg/repro-zipfile/actions/workflows/tests.yml/badge.svg?branch=main)](https://github.com/drivendataorg/repro-zipfile/actions/workflows/tests.yml?query=branch%3Amain) [![codecov](https://codecov.io/gh/drivendataorg/repro-zipfile/branch/main/graph/badge.svg)](https://codecov.io/gh/drivendataorg/repro-zipfile) **A tiny, zero-dependency replacement for Python's `zipfile.ZipFile` for creating reproducible/deterministic ZIP archives.** -"Reproducible" or "deterministic" in this context means that the binary content of the ZIP archive is identical if you add files with identical binary content in the same order. This Python package provides a `ReproducibleZipFile` class that works exactly like [`zipfile.ZipFile`](https://docs.python.org/3/library/zipfile.html#zipfile-objects) from the Python standard library, except that all files written to the archive have their last-modified timestamps set to a fixed value. +"Reproducible" or "deterministic" in this context means that the binary content of the ZIP archive is identical if you add files with identical binary content in the same order. It means you can reliably check equality of the contents of two ZIP archives by simply comparing checksums of the archive using a hash function like MD5 or SHA-256. 
+ +This Python package provides a `ReproducibleZipFile` class that works exactly like [`zipfile.ZipFile`](https://docs.python.org/3/library/zipfile.html#zipfile-objects) from the Python standard library, except that all files written to the archive have their last-modified timestamps set to a fixed value. ## Installation @@ -17,6 +21,12 @@ repro-zipfile is available from PyPI. To install, run: pip install repro-zipfile ``` +It is also available from conda-forge. To install, run: + +```bash +conda install repro-zipfile -c conda-forge +``` + ## Usage Simply import `ReproducibleZipFile` and use it in the same way you would use [`zipfile.ZipFile`](https://docs.python.org/3/library/zipfile.html#zipfile-objects) from the Python standard library. @@ -37,7 +47,7 @@ See [`examples/usage.py`](./examples/usage.py) for an example script that you ca ### Set timestamp value with SOURCE_DATE_EPOCH -repro_zipfile supports setting the fixed timestamp value using the `SOURCE_DATE_EPOCH` environment variable. This should be an integer corresponding to the [Unix epoch time](https://en.wikipedia.org/wiki/Unix_time) of the timestamp you want to set. `SOURCE_DATE_EPOCH` is a [standard](https://reproducible-builds.org/docs/source-date-epoch/) created by the [Reproducible Builds project](https://reproducible-builds.org/). +repro_zipfile supports the `SOURCE_DATE_EPOCH` environment variable. If set, it will be used as a fixed value for the modified timestamps of files added to an archive. This should be an integer corresponding to the [Unix epoch time](https://en.wikipedia.org/wiki/Unix_time) of the timestamp you want to set. `SOURCE_DATE_EPOCH` is a [standard](https://reproducible-builds.org/docs/source-date-epoch/) created by the [Reproducible Builds project](https://reproducible-builds.org/) for software distributions. ## How does repro-zipfile work? 
diff --git a/pyproject.toml b/pyproject.toml index 7922062..46f337d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: System :: Archiving", "Topic :: System :: Archiving :: Compression", "Topic :: System :: Archiving :: Packaging", @@ -57,10 +58,10 @@ dependencies = ["coverage>=6.5", "pytest-cov"] template = "tests" [[tool.hatch.envs.tests.matrix]] -python = ["3.8", "3.9", "3.10", "3.11"] +python = ["3.8", "3.9", "3.10", "3.11", "3.12"] [tool.hatch.envs.tests.scripts] -test = "pytest {args:tests.py} -v --cov=repro_zipfile --cov-report=term --cov-report=html --cov-report=xml" +test = "pytest {args:tests} -v --cov=. --cov-report=term --cov-report=html --cov-report=xml" ## TOOLS ## @@ -74,7 +75,7 @@ select = [ "F", # Pycodestyle "I", # isort ] -src = ["repro_zipfile.py", "tests.py"] +src = ["*.py", "tests/*.py"] unfixable = ["F"] [tool.ruff.isort] @@ -82,4 +83,4 @@ known-first-party = ["repro_zipfile"] force-sort-within-sections = true [tool.coverage.run] -source = ["repro_zipfile.py"] +omit = ["tests/*"] diff --git a/tests.py b/tests.py deleted file mode 100644 index 82cc7be..0000000 --- a/tests.py +++ /dev/null @@ -1,312 +0,0 @@ -import hashlib -import os -from pathlib import Path -import shutil -from time import sleep -import uuid -from zipfile import ZipFile, ZipInfo - -import pytest -from pytest_cases import fixture_union - -from repro_zipfile import ReproducibleZipFile - - -def data_factory(): - """Utility function to generate random data.""" - return str(uuid.uuid4()) - - -def hash_file(path: Path): - """Utility function to calculate the hash of a file's contents.""" - return hashlib.md5(path.read_bytes()).hexdigest() - - -@pytest.fixture -def abs_path(tmp_path): - """Fixture that returns a temporary directory as an absolute Path object.""" - return 
tmp_path - - -@pytest.fixture -def rel_path(tmp_path): - """Fixture that sets a temporary directory as the current working directory and returns a - relative path to it.""" - orig_wd = Path.cwd() - os.chdir(tmp_path) - yield Path() - os.chdir(orig_wd) - - -base_path = fixture_union("base_path", ["rel_path", "abs_path"]) - - -def test_write(base_path): - """Test that write adds files to the archive in the expected way, i.e., we didn't break the - basic functionality.""" - data_file = base_path / "data.txt" - data_file.write_text(data_factory()) - data_dir = base_path / "dir" - data_dir.mkdir() - for i in range(3): - (data_dir / f"{i}.txt").write_text(data_factory()) - - # Create and extract ReproducibleZipFile - repro_zipfile_file = base_path / "repro_zipfile_file.zip" - with ReproducibleZipFile(repro_zipfile_file, "w") as zp: - zp.write(data_file) - zp.write(data_dir) - for root, dirs, files in os.walk(data_dir): - for d in dirs: - zp.write(Path(root) / d) - for f in files: - zp.write(Path(root) / f) - repro_zipfile_outdir = base_path / "repro_zipfile_out" - repro_zipfile_outdir.mkdir() - with ReproducibleZipFile(repro_zipfile_file, "r") as zp: - zp.extractall(repro_zipfile_outdir) - - # Create and extract regular ZipFile for comparison - zip_file = base_path / "zip_file.zip" - with ZipFile(zip_file, "w") as zp: - zp.write(data_file) - zp.write(data_dir) - for root, dirs, files in os.walk(data_dir): - for d in dirs: - zp.write(Path(root) / d) - for f in files: - zp.write(Path(root) / f) - zip_outdir = base_path / "zip_out" - zip_outdir.mkdir() - with ZipFile(zip_file, "r") as zp: - zp.extractall(zip_outdir) - - repro_zipfile_extracted = sorted(repro_zipfile_outdir.glob("**/*")) - zip_extracted = sorted(zip_outdir.glob("**/*")) - assert len(repro_zipfile_extracted) == len(zip_extracted) - for repro_zipfile_member, zip_member in zip(repro_zipfile_extracted, zip_extracted): - assert repro_zipfile_member.relative_to(repro_zipfile_outdir) == zip_member.relative_to( - 
zip_outdir - ) - if repro_zipfile_member.is_file(): - assert zip_member.is_file() - assert repro_zipfile_member.read_text() == zip_member.read_text() - - -def test_writestr(tmp_path): - """Test that writestr adds files to the archive in the expected way, i.e., we didn't break the - basic functionality.""" - data = data_factory() - - with ReproducibleZipFile(tmp_path / "repro_zipfile.zip", "w") as zp: - zp.writestr("data.txt", data=data) - - extract_dir = tmp_path / "extract" - extract_dir.mkdir() - - with ZipFile(tmp_path / "repro_zipfile.zip", "r") as zp: - zp.extractall(extract_dir) - - assert sorted(extract_dir.glob("**/*")) == [extract_dir / "data.txt"] - assert (extract_dir / "data.txt").read_text() == data - - -def test_write_same_file_different_mtime(base_path): - """Test that writing the same file with different mtime produces the same hash.""" - data = data_factory() - data_file = base_path / "data.txt" - - data_file.write_text(data) - with ReproducibleZipFile(base_path / "zip1.zip", "w") as zp: - zp.write(data_file) - - sleep(2) - - data_file.write_text(data) - with ReproducibleZipFile(base_path / "zip2.zip", "w") as zp: - zp.write(data_file) - - assert hash_file(base_path / "zip1.zip") == hash_file(base_path / "zip2.zip") - - -def test_write_same_file_different_mtime_source_date_epoch(base_path, monkeypatch): - """Test that writing the same file at different times with SOURCE_DATE_EPOCH set produces the - same hash.""" - monkeypatch.setenv("SOURCE_DATE_EPOCH", "1691732367") - data = data_factory() - data_file = base_path / "data.txt" - - data_file.write_text(data) - with ReproducibleZipFile(base_path / "zip1.zip", "w") as zp: - zp.write(data_file) - - sleep(2) - - data_file.write_text(data) - with ReproducibleZipFile(base_path / "zip2.zip", "w") as zp: - zp.write(data_file) - - assert hash_file(base_path / "zip1.zip") == hash_file(base_path / "zip2.zip") - - -def test_write_same_file_different_mtime_string_input(rel_path): - """Test that writing the same 
file with different mtime produces the same hash, using string - inputs instead of Path.""" - data = data_factory() - data_file = rel_path / "data.txt" - - data_file.write_text(data) - with ReproducibleZipFile(rel_path / "zip1.zip", "w") as zp: - zp.write("data.txt") - - sleep(2) - - data_file.write_text(data) - with ReproducibleZipFile(rel_path / "zip2.zip", "w") as zp: - zp.write("data.txt") - - assert hash_file(rel_path / "zip1.zip") == hash_file(rel_path / "zip2.zip") - - -def test_write_same_file_different_mtime_arcname(base_path): - """Test that writing the same file with different mtime produces the same hash.""" - data = data_factory() - data_file = base_path / "data.txt" - - data_file.write_text(data) - with ReproducibleZipFile(base_path / "zip1.zip", "w") as zp: - zp.write(data_file, arcname="lore.txt") - - sleep(2) - - data_file.write_text(data) - with ReproducibleZipFile(base_path / "zip2.zip", "w") as zp: - zp.write(data_file, arcname="lore.txt") - - assert hash_file(base_path / "zip1.zip") == hash_file(base_path / "zip2.zip") - - -def test_write_same_directory_different_mtime(base_path): - data_list = [data_factory() for _ in range(3)] - data_dir = base_path / "dir" - - data_dir.mkdir() - for data in data_list: - (data_dir / f"{data}.txt").write_text(data) - with ReproducibleZipFile(base_path / "zip1.zip", "w") as zp: - zp.write(data_dir) - for root, dirs, files in os.walk(data_dir): - for d in dirs: - zp.write(Path(root) / d) - for f in files: - zp.write(Path(root) / f) - - sleep(2) - - shutil.rmtree(data_dir) - data_dir.mkdir() - for data in data_list: - (data_dir / f"{data}.txt").write_text(data) - with ReproducibleZipFile(base_path / "zip2.zip", "w") as zp: - zp.write(data_dir) - for root, dirs, files in os.walk(data_dir): - for d in dirs: - zp.write(Path(root) / d) - for f in files: - zp.write(Path(root) / f) - - assert hash_file(base_path / "zip1.zip") == hash_file(base_path / "zip2.zip") - - -def 
test_write_same_directory_different_mtime_string_input(rel_path): - data_list = [data_factory() for _ in range(3)] - data_dir = rel_path / "dir" - - data_dir.mkdir() - for data in data_list: - (data_dir / f"{data}.txt").write_text(data) - with ReproducibleZipFile(rel_path / "zip1.zip", "w") as zp: - zp.write(data_dir.name) - for root, dirs, files in os.walk(data_dir): - for d in dirs: - zp.write(os.path.join(root, d)) - for f in files: - zp.write(os.path.join(root, f)) - - sleep(2) - - shutil.rmtree(data_dir) - data_dir.mkdir() - for data in data_list: - (data_dir / f"{data}.txt").write_text(data) - with ReproducibleZipFile("zip2.zip", "w") as zp: - zp.write(data_dir.name) - for root, dirs, files in os.walk(data_dir): - for d in dirs: - zp.write(os.path.join(root, d)) - for f in files: - zp.write(os.path.join(root, f)) - - assert hash_file(rel_path / "zip1.zip") == hash_file(rel_path / "zip2.zip") - - -def test_writestr_same_data_different_mtime(rel_path): - """Test that using writestr with the same data at different times produces the same hash.""" - data = data_factory() - - with ReproducibleZipFile(rel_path / "zip1.zip", "w") as zp: - zp.writestr("data.txt", data=data) - - sleep(2) - - with ReproducibleZipFile(rel_path / "zip2.zip", "w") as zp: - zp.writestr("data.txt", data=data) - - assert hash_file(rel_path / "zip1.zip") == hash_file(rel_path / "zip2.zip") - - -def test_writestr_same_data_different_mtime_zipinfo_input(rel_path): - """Test that using writestr with the same data at different times produces the same hash.""" - data = data_factory() - data_file = rel_path / "data.txt" - - data_file.write_text(data) - with ReproducibleZipFile(rel_path / "zip1.zip", "w") as zp: - zp.writestr(ZipInfo.from_file(data_file), data=data) - - sleep(2) - - data_file.write_text(data) - with ReproducibleZipFile(rel_path / "zip2.zip", "w") as zp: - zp.writestr(ZipInfo.from_file(data_file), data=data) - - assert hash_file(rel_path / "zip1.zip") == hash_file(rel_path / 
"zip2.zip") - - -def test_writestr_same_data_different_mtime_source_date_epoch(rel_path, monkeypatch): - """Test that using writestr with the same data at different times with SOURCE_DATE_EPOCH set - produces the same hash.""" - monkeypatch.setenv("SOURCE_DATE_EPOCH", "1691732367") - data = data_factory() - - with ReproducibleZipFile(rel_path / "zip1.zip", "w") as zp: - zp.writestr("data.txt", data=data) - - sleep(2) - - with ReproducibleZipFile(rel_path / "zip2.zip", "w") as zp: - zp.writestr("data.txt", data=data) - - assert hash_file(rel_path / "zip1.zip") == hash_file(rel_path / "zip2.zip") - - -def test_mkdir(rel_path): - with ReproducibleZipFile(rel_path / "zip1.zip", "w") as zp: - zp.mkdir("dir") - - sleep(2) - - with ReproducibleZipFile(rel_path / "zip2.zip", "w") as zp: - zp.mkdir("dir") - - assert hash_file(rel_path / "zip1.zip") == hash_file(rel_path / "zip2.zip") diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..543ab06 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,3 @@ +import pytest + +pytest.register_assert_rewrite("tests.utils") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c347a8c --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,24 @@ +import os +from pathlib import Path + +import pytest +from pytest_cases import fixture_union + + +@pytest.fixture +def abs_path(tmp_path): + """Fixture that returns a temporary directory as an absolute Path object.""" + return tmp_path + + +@pytest.fixture +def rel_path(tmp_path): + """Fixture that sets a temporary directory as the current working directory and returns a + relative path to it.""" + orig_wd = Path.cwd() + os.chdir(tmp_path) + yield Path() + os.chdir(orig_wd) + + +base_path = fixture_union("base_path", ["rel_path", "abs_path"]) diff --git a/tests/test_core.py b/tests/test_core.py new file mode 100644 index 0000000..2e0c53e --- /dev/null +++ b/tests/test_core.py @@ -0,0 +1,360 @@ +from time import sleep +from zipfile import 
ZipFile, ZipInfo + +from repro_zipfile import ReproducibleZipFile +from tests.utils import ( + assert_archive_contents_equals, + data_factory, + dir_tree_factory, + file_factory, + hash_file, +) + + +def test_write_dir_tree(base_path): + """Archiving a directory tree works.""" + dir_tree = dir_tree_factory(base_path) + + # Create base ReproducibleZipFile archive + repro_zipfile_arc1 = base_path / "repro_zipfile1.zip" + with ReproducibleZipFile(repro_zipfile_arc1, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # Create regular ZipFile archive for comparison + zipfile_arc1 = base_path / "zipfile1.zip" + with ZipFile(zipfile_arc1, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # Update modified times + sleep(2) + for path in dir_tree.glob("**/*"): + path.touch() + + # Create second ReproducibleZipFile archive after delay + repro_zipfile_arc2 = base_path / "repro_zipfile2.zip" + with ReproducibleZipFile(repro_zipfile_arc2, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # Create second regular ZipFile archive for comparison after delay + zipfile_arc2 = base_path / "zipfile2.zip" + with ZipFile(zipfile_arc2, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(path) + + # All four archives should have identical content + assert_archive_contents_equals(repro_zipfile_arc1, zipfile_arc1) + assert_archive_contents_equals(repro_zipfile_arc1, repro_zipfile_arc2) + assert_archive_contents_equals(repro_zipfile_arc1, zipfile_arc2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zipfile_arc1) == hash_file(repro_zipfile_arc2) + assert hash_file(zipfile_arc1) != hash_file(zipfile_arc2) + + +def test_write_dir_tree_string_paths(rel_path): + """Archiving a directory tree works.""" + dir_tree = dir_tree_factory(rel_path) + + # Create base ReproducibleZipFile archive + repro_zipfile_arc1 = rel_path / "repro_zipfile1.zip" + with 
ReproducibleZipFile(repro_zipfile_arc1, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(str(path)) + + # Create regular ZipFile archive for comparison + zipfile_arc1 = rel_path / "zipfile1.zip" + with ZipFile(zipfile_arc1, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(str(path)) + + # Update modified times + sleep(2) + for path in dir_tree.glob("**/*"): + path.touch() + + # Create second ReproducibleZipFile archive after delay + repro_zipfile_arc2 = rel_path / "repro_zipfile2.zip" + with ReproducibleZipFile(repro_zipfile_arc2, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(str(path)) + + # Create second regular ZipFile archive for comparison after delay + zipfile_arc2 = rel_path / "zipfile2.zip" + with ZipFile(zipfile_arc2, "w") as zp: + for path in sorted(dir_tree.glob("**/*")): + zp.write(str(path)) + + # All four archives should have identical content + assert_archive_contents_equals(repro_zipfile_arc1, zipfile_arc1) + assert_archive_contents_equals(repro_zipfile_arc1, repro_zipfile_arc2) + assert_archive_contents_equals(repro_zipfile_arc1, zipfile_arc2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zipfile_arc1) == hash_file(repro_zipfile_arc2) + assert hash_file(zipfile_arc1) != hash_file(zipfile_arc2) + + +def test_write_single_file(base_path): + """Writing the same file with different mtime produces the same hash.""" + data_file = file_factory(base_path) + + repro_zip1 = base_path / "repro_zip1.zip" + with ReproducibleZipFile(repro_zip1, "w") as zp: + zp.write(data_file) + + zip1 = base_path / "zip1.zip" + with ZipFile(zip1, "w") as zp: + zp.write(data_file) + + print(data_file.stat()) + sleep(2) + data_file.touch() + print(data_file.stat()) + + repro_zip2 = base_path / "repro_zip2.zip" + with ReproducibleZipFile(repro_zip2, "w") as zp: + zp.write(data_file) + + zip2 = base_path / "zip2.zip" + with ZipFile(zip2, "w") as zp: + zp.write(data_file) 
+ + # All four archives should have identical content + assert_archive_contents_equals(repro_zip1, zip1) + assert_archive_contents_equals(repro_zip1, repro_zip2) + assert_archive_contents_equals(repro_zip1, zip2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zip1) == hash_file(repro_zip2) + assert hash_file(zip1) != hash_file(zip2) + + +def test_write_single_file_with_source_date_epoch(base_path, monkeypatch): + """Writing the same file with different mtime with SOURCE_DATE_EPOCH set produces the + same hash.""" + + data_file = file_factory(base_path) + + arc_base = base_path / "base.zip" + with ReproducibleZipFile(arc_base, "w") as zp: + zp.write(data_file) + + monkeypatch.setenv("SOURCE_DATE_EPOCH", "1691732367") + + # With SOURCE_DATE_EPOCH set + arc_sde1 = base_path / "with_sde1.zip" + with ReproducibleZipFile(arc_sde1, "w") as zp: + zp.write(data_file) + + sleep(2) + data_file.touch() + + arc_sde2 = base_path / "with_sde2.zip" + with ReproducibleZipFile(arc_sde2, "w") as zp: + zp.write(data_file) + + # All three archives should have identical content + assert_archive_contents_equals(arc_base, arc_sde1) + assert_archive_contents_equals(arc_base, arc_sde2) + + # Base archive hash should match neither, two archives with SOURCE_DATE_EPOCH should match + assert hash_file(arc_base) != hash_file(arc_sde1) + assert hash_file(arc_sde1) == hash_file(arc_sde2) + + +def test_write_single_file_string_paths(rel_path): + """Writing the same file with different mtime produces the same hash, using string inputs + instead of Path.""" + data_file = file_factory(rel_path) + file_name = data_file.name + assert isinstance(file_name, str) + + repro_zip1 = rel_path / "repro_zip1.zip" + with ReproducibleZipFile(repro_zip1, "w") as zp: + zp.write(file_name) + + zip1 = rel_path / "zip1.zip" + with ZipFile(zip1, "w") as zp: + zp.write(file_name) + + sleep(2) + data_file.touch() + + repro_zip2 = rel_path / "repro_zip2.zip" + with
ReproducibleZipFile(repro_zip2, "w") as zp: + zp.write(file_name) + + zip2 = rel_path / "zip2.zip" + with ZipFile(zip2, "w") as zp: + zp.write(file_name) + + # All four archives should have identical content + assert_archive_contents_equals(repro_zip1, zip1) + assert_archive_contents_equals(repro_zip1, repro_zip2) + assert_archive_contents_equals(repro_zip1, zip2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zip1) == hash_file(repro_zip2) + assert hash_file(zip1) != hash_file(zip2) + + +def test_write_single_file_arcname(base_path): + """Writing a single file with explicit arcname.""" + data_file = file_factory(base_path) + + repro_zip1 = base_path / "repro_zip1.zip" + with ReproducibleZipFile(repro_zip1, "w") as zp: + zp.write(data_file, arcname="lore.txt") + + zip1 = base_path / "zip1.zip" + with ZipFile(zip1, "w") as zp: + zp.write(data_file, arcname="lore.txt") + + sleep(2) + data_file.touch() + + repro_zip2 = base_path / "repro_zip2.zip" + with ReproducibleZipFile(repro_zip2, "w") as zp: + zp.write(data_file, arcname="lore.txt") + + zip2 = base_path / "zip2.zip" + with ZipFile(zip2, "w") as zp: + zp.write(data_file, arcname="lore.txt") + + # All four archives should have identical content + assert_archive_contents_equals(repro_zip1, zip1) + assert_archive_contents_equals(repro_zip1, repro_zip2) + assert_archive_contents_equals(repro_zip1, zip2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zip1) == hash_file(repro_zip2) + assert hash_file(zip1) != hash_file(zip2) + + +def test_writestr(tmp_path): + """writestr works as expected""" + data = data_factory() + + repro_zip1 = tmp_path / "repro_zip1.zip" + with ReproducibleZipFile(repro_zip1, "w") as zp: + zp.writestr("data.txt", data=data) + + zip1 = tmp_path / "zip1.zip" + with ZipFile(zip1, "w") as zp: + zp.writestr("data.txt", data=data) + + sleep(2) + + repro_zip2 = tmp_path / "repro_zip2.zip" + with 
ReproducibleZipFile(repro_zip2, "w") as zp: + zp.writestr("data.txt", data=data) + + zip2 = tmp_path / "zip2.zip" + with ZipFile(zip2, "w") as zp: + zp.writestr("data.txt", data=data) + + # All four archives should have identical content + assert_archive_contents_equals(repro_zip1, zip1) + assert_archive_contents_equals(repro_zip1, repro_zip2) + assert_archive_contents_equals(repro_zip1, zip2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zip1) == hash_file(repro_zip2) + assert hash_file(zip1) != hash_file(zip2) + + +def test_writestr_zip_info(rel_path): + """writestr with ZipInfo input""" + data = data_factory() + data_file = rel_path / "data.txt" + data_file.touch() + + repro_zip1 = rel_path / "repro_zip1.zip" + with ReproducibleZipFile(repro_zip1, "w") as zp: + zp.writestr(ZipInfo.from_file(data_file), data=data) + + zip1 = rel_path / "zip1.zip" + with ZipFile(zip1, "w") as zp: + zp.writestr(ZipInfo.from_file(data_file), data=data) + + sleep(2) + data_file.touch() + + repro_zip2 = rel_path / "repro_zip2.zip" + with ReproducibleZipFile(repro_zip2, "w") as zp: + zp.writestr(ZipInfo.from_file(data_file), data=data) + + zip2 = rel_path / "zip2.zip" + with ZipFile(zip2, "w") as zp: + zp.writestr(ZipInfo.from_file(data_file), data=data) + + # All four archives should have identical content + assert_archive_contents_equals(repro_zip1, zip1) + assert_archive_contents_equals(repro_zip1, repro_zip2) + assert_archive_contents_equals(repro_zip1, zip2) + + # ReproducibleZipFile hashes should match; ZipFile hashes should not + assert hash_file(repro_zip1) == hash_file(repro_zip2) + assert hash_file(zip1) != hash_file(zip2) + + +def test_writestr_source_date_epoch(tmp_path, monkeypatch): + """Test that using writestr with the same data at different times with SOURCE_DATE_EPOCH set + produces the same hash.""" + + data = data_factory() + + arc_base = tmp_path / "base.zip" + with ReproducibleZipFile(arc_base, "w") as zp: + 
zp.writestr("data.txt", data=data) + + sleep(2) + + monkeypatch.setenv("SOURCE_DATE_EPOCH", "1691732367") + + arc_sde1 = tmp_path / "sde1.zip" + with ReproducibleZipFile(arc_sde1, "w") as zp: + zp.writestr("data.txt", data=data) + + arc_sde2 = tmp_path / "sde2.zip" + with ReproducibleZipFile(arc_sde2, "w") as zp: + zp.writestr("data.txt", data=data) + + # All three archives should have identical content + assert_archive_contents_equals(arc_base, arc_sde1) + assert_archive_contents_equals(arc_base, arc_sde2) + + # Base archive hash should match neither; two with SOURCE_DATE_EPOCH should match + assert hash_file(arc_base) != hash_file(arc_sde1) + assert hash_file(arc_sde1) == hash_file(arc_sde2) + + +def test_mkdir(rel_path): + """mkdir is a Python 3.11+ method that we've backported to ReproducibleZipFile for <3.11, so + that we can use the 3.11 implementation of write. Other tests don't cover direct use with a + string input. Test that this is the same as writing a directory with ZipFile. 
+ """ + arc_repro1 = rel_path / "repro1.zip" + with ReproducibleZipFile(arc_repro1, "w") as zp: + zp.mkdir("dir") + + sleep(2) + + # Hashes are the same in second copy + arc_repro2 = rel_path / "repro2.zip" + with ReproducibleZipFile(arc_repro2, "w") as zp: + zp.mkdir("dir") + + assert hash_file(arc_repro1) == hash_file(arc_repro2) + + # Produces same contents as regular ZipFile + dir_path = rel_path / "dir" + dir_path.mkdir() + arc_zip = rel_path / "zip.zip" + with ZipFile(arc_zip, "w") as zp: + zp.write(dir_path) + + assert_archive_contents_equals(arc_repro1, arc_zip) diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..067c589 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,67 @@ +import hashlib +from pathlib import Path +from tempfile import TemporaryDirectory +from zipfile import ZipFile + + +class _DataFactory: + """Utility function to generate a unique data string using an incrementing counter.""" + + def __init__(self) -> None: + self.counter = 0 + + def __call__(self) -> str: + out = f"{self.counter:04d}" + self.counter += 1 + return out + + +data_factory = _DataFactory() + + +def file_factory(parent_dir: Path) -> Path: + """Utility function to generate a file with unique data.""" + path = parent_dir / f"{data_factory()}.txt" + path.write_text(data_factory()) + return path + + +def dir_tree_factory(parent_dir: Path): + """Utility function to generate a directory tree containing files with unique data.""" + root_dir = parent_dir / data_factory() + root_dir.mkdir() + + for _ in range(3): + file_factory(root_dir) + + sub_dir = root_dir / "sub_dir" + sub_dir.mkdir() + + for _ in range(3): + file_factory(sub_dir) + + return root_dir + + +def hash_file(path: Path): + """Utility function to calculate the hash of a file's contents.""" + return hashlib.md5(path.read_bytes()).hexdigest() + + +def assert_archive_contents_equals(arc1: Path, arc2: Path): + with TemporaryDirectory() as outdir1, TemporaryDirectory() as outdir2: + with
ZipFile(arc1, "r") as zp: + zp.extractall(outdir1) + with ZipFile(arc2, "r") as zp: + zp.extractall(outdir2) + + extracted1 = sorted(Path(outdir1).glob("**/*")) + extracted2 = sorted(Path(outdir2).glob("**/*")) + + assert [p.relative_to(outdir1) for p in extracted1] == [ + p.relative_to(outdir2) for p in extracted2 + ] + for arc1_member, arc2_member in zip(extracted1, extracted2): + assert arc1_member.is_file() == arc2_member.is_file() + if arc1_member.is_file(): + assert hash_file(arc1_member) == hash_file(arc2_member)