class GitMountSource(MountSource):
    """
    Read-only mount source that exposes the tree of a single commit of a git repository.

    Reimplementation from scratch of the very barebones implementation inside fsspec
    because it is slow and "older" versions did not work with pygit2 1.15.

    https://github.com/fsspec/filesystem_spec/blob/master/fsspec/implementations/git.py
    https://github.com/fsspec/filesystem_spec/issues/1708
    """

    # False when the optional pygit2 dependency could not be imported.
    enabled = pygit2 is not None

    # pylint: disable=unused-argument
    def __init__(self, path: Optional[str] = None, reference: Optional[str] = None, **kwargs):
        """
        path: Location of the repository. The current working directory is used when empty or None.
        reference: Ref-ish handed to pygit2's resolve_refish. When empty or None, a default is chosen
                   by _getDefaultReference.
        kwargs: Accepted and ignored.
        """
        self.repository = pygit2.Repository(path if path else os.getcwd())
        self.reference = reference if reference else self._getDefaultReference(self.repository)
        commit, _ = self.repository.resolve_refish(self.reference)
        self.tree = commit.tree
        # Take the timestamp from the commit whose tree is exposed. Reading it from HEAD would report
        # a wrong time for any other reference and would require HEAD to be resolvable at all.
        self.commitTime = commit.commit_time
        # Path inside the tree that is prepended to every looked up path. May be set by the creator
        # of this object to expose only a subfolder.
        self.prefix = ""

    @staticmethod
    def _getDefaultReference(repository):
        """
        Returns the branch name to show when the user did not specify any reference.
        Order of preference: configured init.defaultBranch (if such a branch exists), the checked out
        branch, an existing 'master' or 'main' branch, and lastly the literal 'master'.
        """
        branches = repository.branches

        if 'init.defaultBranch' in repository.config:
            configuredBranch = repository.config['init.defaultBranch']
            # init.defaultBranch only names the branch for newly initialized repositories.
            # It says nothing about this repository, so only use it when the branch actually exists.
            if configuredBranch in branches:
                return configuredBranch

        # Try to find checked out branch.
        for branch in branches:
            if branches[branch].is_head():
                return branch

        for branch in ['master', 'main']:
            if branch in branches:
                return branch

        return 'master'

    def _lookUpPath(self, path: str):
        """
        Returns the git object at self.prefix + path or None if there is no such object.
        Empty path components, e.g., from leading, trailing, or doubled slashes, are skipped.
        """
        obj = self.tree
        for name in self.prefix.split("/") + path.split("/"):
            if not name:
                continue
            # Only trees have children. A path that tries to descend through a file does not exist
            # and must not silently resolve to that file.
            if not isinstance(obj, pygit2.Tree) or name not in obj:
                return None
            obj = obj[name]
        return obj

    @staticmethod
    def _convertToFileMode(obj):
        """Returns a stat-like mode: read and execute permissions for everyone plus the file type bits."""
        if obj.filemode == pygit2.enums.FileMode.LINK:
            return 0o555 | stat.S_IFLNK
        return 0o555 | (stat.S_IFDIR if isinstance(obj, pygit2.Tree) else stat.S_IFREG)

    def _convertToFileInfo(self, obj, path: str):
        """
        Builds the FileInfo for the given git object.
        The path is stored in userdata so that open() can look the object up again.
        """
        return FileInfo(
            # fmt: off
            size     = obj.size if hasattr(obj, 'size') else 0,
            mtime    = self.commitTime,
            mode     = GitMountSource._convertToFileMode(obj),
            # For symbolic links, the blob contents are the link target.
            linkname = obj.data.decode() if obj.filemode == pygit2.enums.FileMode.LINK else "",
            uid      = os.getuid(),
            gid      = os.getgid(),
            userdata = [path],
            # fmt: on
        )

    @overrides(MountSource)
    def isImmutable(self) -> bool:
        return True

    @overrides(MountSource)
    def exists(self, path: str) -> bool:
        return self._lookUpPath(path) is not None

    def _listDir(self, path: str, onlyMode: bool) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
        """
        Returns a dictionary of entry names in the folder at path or None if path is not a folder.
        The values are file modes if onlyMode is set and FileInfo objects otherwise.
        """
        tree = self._lookUpPath(path)
        if not isinstance(tree, pygit2.Tree):
            return None
        return {
            obj.name: (
                GitMountSource._convertToFileMode(obj)
                if onlyMode
                else self._convertToFileInfo(obj, path + '/' + obj.name)
            )
            for obj in tree
        }

    @overrides(MountSource)
    def listDir(self, path: str) -> Optional[Union[Iterable[str], Dict[str, FileInfo]]]:
        return self._listDir(path, onlyMode=False)

    @overrides(MountSource)
    def listDirModeOnly(self, path: str) -> Optional[Union[Iterable[str], Dict[str, int]]]:
        return self._listDir(path, onlyMode=True)

    @overrides(MountSource)
    def getFileInfo(self, path: str, fileVersion: int = 0) -> Optional[FileInfo]:
        # fileVersion is ignored because only one version per path is exposed. See fileVersions.
        obj = self._lookUpPath(path)
        return None if obj is None else self._convertToFileInfo(obj, path)

    @overrides(MountSource)
    def fileVersions(self, path: str) -> int:
        return 1

    @overrides(MountSource)
    def open(self, fileInfo: FileInfo, buffering=-1) -> IO[bytes]:
        """
        Returns an in-memory file object with the contents of the object that fileInfo was created for.
        The buffering argument is ignored.
        """
        path = fileInfo.userdata[-1]
        assert isinstance(path, str)
        # TODO Avoid high memory usage for very large files.
        #      Check whether pygit2 even has a kind of streaming API for file contents.
        return io.BytesIO(self._lookUpPath(path).data)

    @overrides(MountSource)
    def __exit__(self, exception_type, exception_value, exception_traceback):
        pass
See https://github.com/fsspec/filesystem_spec/issues/1678 import os +import stat import sys import traceback import warnings @@ -16,6 +17,7 @@ from .MountSource import MountSource from .FolderMountSource import FolderMountSource from .FSSpecMountSource import FSSpecMountSource +from .GitMountSource import GitMountSource from .RarMountSource import RarMountSource from .SingleFileMountSource import SingleFileMountSource from .SQLiteIndexedTar import SQLiteIndexedTar @@ -135,6 +137,39 @@ def openFsspec(url, options, printDebug: int) -> Optional[Union[MountSource, IO[ if protocol == 'file': return splitURI[1] + if protocol == 'git': + if not GitMountSource.enabled: + raise ValueError( + "Detected git:// URL but GitMountSource could not be loaded. Please ensure that pygit2 is installed." + ) + + remainder = splitURI[1] + + splitRepositoryPath = remainder.split(':', 1) + repositoryPath = splitRepositoryPath[0] if len(splitRepositoryPath) > 1 else None + remainder = splitRepositoryPath[-1] + + splitReference = remainder.split('@', 1) + reference = splitReference[0] if len(splitReference) > 1 else None + pathInsideRepository = splitReference[-1] + + mountSource = GitMountSource(repositoryPath, reference=reference) + if pathInsideRepository: + fileInfo = mountSource.getFileInfo(pathInsideRepository) + if not fileInfo: + raise ValueError( + f"The path {pathInsideRepository} in the git repository specified via '{url}' does not exist!" + ) + + if stat.S_ISDIR(fileInfo.mode): + mountSource.prefix = pathInsideRepository + else: + # In the future it might be necessary to extend the lifetime of mountSource by adding it as + # a member of the opened file, but not right now. + return mountSource.open(fileInfo) + + return mountSource + if not fsspec: print("[Warning] An URL was detected but fsspec is not installed. 
You may want to install it with:") print("[Warning] python3 -m pip install ratarmount[fsspec]") diff --git a/tests/.pylintrc b/tests/.pylintrc index 177eaf53..535854ff 100644 --- a/tests/.pylintrc +++ b/tests/.pylintrc @@ -5,7 +5,7 @@ init-hook='import sys; sys.path.append("./core")' # run arbitrary code. extension-pkg-whitelist=indexed_gzip,indexed_bzip2,indexed_zstd,libarchive,libarchive.ffi,lzmaffi,rapidgzip,isal, PySquashfsImage,PySquashfsImage.compressor,zstandard,lz4,deflate,pyminizip,fast_zip_decryption, - asyncssh,sshfs,fsspec + asyncssh,sshfs,fsspec,pygit2 # Specify a score threshold to be exceeded before program exits with error. fail-under=10.0