update Papis Sci-Hub plugin #37

Open · wants to merge 3 commits into master

Changes from all commits
674 changes: 674 additions & 0 deletions papis-scihub/LICENSE

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions papis-scihub/README.md
@@ -0,0 +1,17 @@
# Papis-SciHub

This [Papis](https://github.com/papis/papis/) plugin provides a Sci-Hub `Importer`.

## Installation

`pip install papis-scihub`

## Usage

```bash
papis add [--from scihub] <doi>
```

- The `Importer` registered by this plugin is named `scihub`, so passing `--from scihub` tells Papis to add files from Sci-Hub only.
- DOIs can be provided either as raw strings (e.g. `10.1101/2021.03.21.436284`) or as complete URLs (e.g. `https://doi.org/10.1101/2021.03.21.436284`), as shown below.
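
Both of the following invocations should therefore import the same document:

```bash
# raw DOI
papis add --from scihub 10.1101/2021.03.21.436284
# full doi.org URL
papis add --from scihub https://doi.org/10.1101/2021.03.21.436284
```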

8 changes: 0 additions & 8 deletions papis-scihub/README.rst

This file was deleted.

2 changes: 0 additions & 2 deletions papis-scihub/papis_scihub/__init__.py
@@ -1,2 +0,0 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
189 changes: 80 additions & 109 deletions papis-scihub/papis_scihub/plugin.py
@@ -1,117 +1,88 @@
-import doi
-import scihub
-import webbrowser
-import papis.importer
-import papis.crossref
-import tempfile
-import colorama
-import warnings
-import urllib.request
-
-
-WARNING_NOTICE = '''
-{bb}                     WARNING NOTICE                      {ns}
-{bb}                     --------------                      {ns}
-{bb} This script uses the platform {rb}SCIHUB{bb}, which may or     {ns}
-{bb} MAY NOT be in conflict with local laws in your country. {ns}
-{bb} Use it at your own risk, {rb}the author bears no            {ns}
-{bb} responsibility{bb}.                                         {ns}
-{bb}                                             papis team  {ns}
-'''.format(
-    bb=colorama.Back.BLACK + colorama.Fore.WHITE,
-    ns=colorama.Style.RESET_ALL,
-    rb=colorama.Back.RED,
-)
-
-
-class Importer(papis.importer.Importer):
-
-    """Importer that tries to get files and data first from crossref,
-    and, if no files are found there, falls back to scihub.
-    """
-
-    def __init__(self, **kwargs):
-        papis.importer.Importer.__init__(self, name='scihub', **kwargs)
-        self.doi = None
-
-    @classmethod
-    def match(cls, uri):
-        try:
-            doi.validate_doi(uri)
-        except ValueError:
-            return None
-        else:
-            return Importer(uri=uri)
-
-    def fetch(self):
-        doi_str = (
-            doi.find_doi_in_text(self.uri) or
-            doi.find_doi_in_text(
-                urllib.request.urlopen(self.uri).read().decode('utf-8')
-            ) or
-            self.uri
-        )
-        ctx = self.fetch_from_doi(doi_str)
-        if ctx:
-            if ctx.data:
-                self.ctx.data = ctx.data
-            if ctx.files:
-                self.ctx.files = ctx.files
-                return
-        self.get_files()
-
-    def fetch_from_doi(self, doi_str):
-        doi_imp = papis.importer.get_importer_by_name('doi').match(doi_str)
-        if doi_imp is not None:
-            self.logger.info('getting data through doi')
-            doi_imp.fetch()
-            return doi_imp.ctx
-
-    def get_files(self):
-        # ignore the https warnings for scihub
-        warnings.simplefilter('ignore')
-        self.logger.warning(WARNING_NOTICE)
-        sh = scihub.SciHub(self.uri)
-        try:
-            ctx = sh.fetch()
-        except scihub.CaptchaNeededException as e:
-            curl = e.captcha_url
-            self.logger.warning(
-                'You have to solve the captcha in \n\t'
-                '{c.Back.RED}{c.Fore.WHITE}{url}{c.Style.RESET_ALL}'
-                .format(url=curl, c=colorama)
-            )
-            self.logger.info('opening a browser for you...')
-            webbrowser.open(curl, new=1, autoraise=True)
-            if papis.utils.confirm('Try again?'):
-                ctx = sh.fetch()
-        except scihub.DocumentUrlNotFound:
-            self.logger.error(
-                'Sorry, it does not appear to be possible to find a URL'
-                ' for the given document using scihub'
-            )
-        except Exception as e:
-            self.logger.error(e)
-        else:
-            assert ctx is not None
-            assert ctx.url is not None
-            assert ctx.pdf is not None
-            out = tempfile.mktemp(suffix='.pdf')
-            self.logger.info('got file from: {0}'.format(ctx.url))
-            self.logger.info('writing file in: {0}'.format(out))
-            with open(out, 'wb+') as fd:
-                fd.write(ctx.pdf)
-            self.ctx.files = [out]
-            if not self.ctx.data and ctx.doi:
-                doi_ctx = self.fetch_from_doi(ctx.doi)
-                if doi_ctx.data:
-                    self.logger.info('got data from doi {0}'.format(ctx.doi))
-                    self.ctx.data = doi_ctx.data
+from urllib.parse import urlparse
+
+import doi
+import papis.downloaders
+from bs4 import BeautifulSoup
+
+
+BASE_URLS = ("http://sci-hub.ee",)
+
+
+class Downloader(papis.downloaders.Downloader):
+    def __init__(self, uri: str) -> None:
+        papis.downloaders.Downloader.__init__(self, uri=uri, name="sci-hub")
+        self.expected_document_extension = "pdf"
+        self.priority = 1
+
+        # raises if no Sci-Hub mirror responds
+        self._get_active_server_url()
+
+        self.doi = _extract_doi(uri)
+        # Sci-Hub mirrors often have broken certificates, hence verify=False
+        self._body = self.session.get(
+            f"{self.base_url}/{self.doi}",
+            verify=False
+        )
+
+    @classmethod
+    def match(cls, url: str) -> papis.downloaders.Downloader | None:
+        try:
+            _extract_doi(url)
+            return Downloader(url)
+        except Exception:
+            return None
+
+    def _get_active_server_url(self) -> None:
+        for base_url in BASE_URLS:
+            if self._ping_server(base_url):
+                self.base_url = base_url
+                return
+        raise Exception("No Sci-Hub servers can be pinged")
+
+    def _ping_server(self, base_url: str) -> bool:
+        try:
+            ping = self.session.get(base_url, timeout=1, verify=False)
+            if ping.status_code != 200:
+                self.logger.error(f"server {base_url} is down")
+                return False
+            self.logger.debug(f"server {base_url} is up")
+            return True
+        except Exception:
+            return False
+
+    def get_doi(self) -> str | None:
+        return self.doi
+
+    def get_document_url(self) -> str | None:
+        soup = BeautifulSoup(self._body.content, "html.parser")
+        iframe = soup.find("iframe")
+        if iframe is None:
+            return None
+        src = iframe.get("src")
+        if src and src.startswith("//"):
+            # Sci-Hub embeds the PDF behind a protocol-relative URL
+            src = f"https:{src}"
+        return src
+
+    def download_bibtex(self) -> None:
+        # DOI content negotiation: doi.org returns BibTeX when asked for it
+        self.bibtex_data = self.session.get(
+            f"https://doi.org/{self.doi}",
+            headers={"accept": "application/x-bibtex"}
+        ).text
+
+
+def _extract_doi(url: str) -> str:
+    parsed_url = urlparse(url)
+    if parsed_url.netloc and "doi.org" in parsed_url.netloc:
+        doi_ = doi.find_doi_in_text(url)
+    else:
+        # no recognizable host: treat the input as a raw DOI string
+        doi_ = url
+    try:
+        doi.validate_doi(doi_)
+        return doi_
+    except Exception as e:
+        raise ValueError(
+            f"Cannot extract a valid DOI from the provided URL: {url}"
+        ) from e
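
To see how the pieces above fit together, here is a hypothetical snippet (not part of this diff) exercising the new `Downloader` directly, assuming `papis-scihub` is installed and at least one mirror in `BASE_URLS` responds:

```python
# Hypothetical usage sketch, not part of this PR.
from papis_scihub.plugin import Downloader

# match() validates the DOI and, on success, builds a ready-to-use Downloader
d = Downloader.match("https://doi.org/10.1101/2021.03.21.436284")
if d is not None:
    print(d.get_doi())           # bare DOI extracted from the URL
    print(d.get_document_url())  # PDF URL scraped from the Sci-Hub page
```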

38 changes: 38 additions & 0 deletions papis-scihub/pyproject.toml
@@ -0,0 +1,38 @@
[build-system]
requires = ["setuptools", "setuptools-scm"]
build-backend = "setuptools.build_meta"

[project]
name = "papis-scihub"
description = "Sci-Hub plugin for the Papis bibliography manager"
readme = "README.md"
requires-python = ">=3.10"
license = {file = "LICENSE"}
authors = [
{name = "Raj Magesh Gauthaman", email = "[email protected]"}
]
keywords = [
"papis",
"sci-hub",
"bibliography",
]
classifiers = [
"Development Status :: 4 - Beta",
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
"Natural Language :: English",
"Operating System :: POSIX :: Linux",
"Programming Language :: Python :: 3",
"Topic :: Software Development :: Libraries :: Python Modules",
"Typing :: Typed",
]
urls = {repository = "https://github.com/papis/scripts"}
dependencies = [
"papis",
"beautifulsoup4",
"python-doi",
]
dynamic = ["version"]

[project.entry-points."papis.importer"]
scihub = "papis_scihub.plugin:Downloader"
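
This entry point is how Papis discovers the plugin at runtime. A minimal sketch of the discovery mechanism using only the standard library (Papis' own loading code may differ; this only illustrates the lookup):

```python
# Sketch of entry-point discovery via importlib.metadata (Python 3.10+);
# names follow the pyproject above, but this is not papis' actual loader.
from importlib.metadata import entry_points

for ep in entry_points(group="papis.importer"):
    if ep.name == "scihub":
        downloader_cls = ep.load()  # papis_scihub.plugin:Downloader
```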

49 changes: 0 additions & 49 deletions papis-scihub/setup.py

This file was deleted.