Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

workflows: fix errors with already downloaded documents #4355

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,12 +431,17 @@ def download_documents(obj, eng):
LOGGER.info('Downloading documents for %s', obj.id)
documents = obj.data.get('documents', [])
for document in documents:
filename = document['key']
url = document['url']
if url.startswith('/api/files'):
obj.log.info('Document already downloaded from %s', url)
continue

filename = document['key']
scheme = urlparse(url).scheme
LOGGER.info(
'Downloading document key:%s url:%s scheme:%s', document['key'], document['url'], scheme
)

if scheme == 'file':
downloaded = copy_file_to_workflow(obj, filename, url)
else:
Expand Down
2 changes: 0 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,3 @@ def assert_all_played(request, vcr_cassette):
Only if the current test has a cassette.
"""
yield
if IS_VCR_ENABLED and IS_VCR_EPISODE_OR_ERROR and vcr_cassette:
assert vcr_cassette.all_played
44 changes: 44 additions & 0 deletions tests/unit/workflows/test_workflows_tasks_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,50 @@ def test_download_documents():
assert expected_document_url == documents[0]['url']


def test_regression_download_documents_with_local_file_should_not_fail():
with requests_mock.Mocker() as requests_mocker:
requests_mocker.register_uri(
'GET', 'http://export.arxiv.org/pdf/1605.03844',
content=pkg_resources.resource_string(
__name__, os.path.join('fixtures', '1605.03844.pdf')),
)

schema = load_schema('hep')
subschema = schema['properties']['documents']

data = {
'documents': [
{
'key': '1605.03844.pdf',
'url': 'http://export.arxiv.org/pdf/1605.03844'
},
{
'original_url': 'http://export.arxiv.org/pdf/2308.04775',
'key': '2308.04775.pdf',
'url': '/api/files/c04581d8-b8b1-4b4e-9819-b17c63517ee7/2308.04775.pdf'
}
],
} # literature/1458302
extra_data = {}
files = MockFiles({})
assert validate(data['documents'], subschema) is None

obj = MockObj(data, extra_data, files=files)
eng = MockEng()

assert download_documents(obj, eng) is None

documents = obj.data['documents']
expected_document_urls = [
'/api/files/0b9dd5d1-feae-4ba5-809d-3a029b0bc110/1605.03844.pdf',
'/api/files/c04581d8-b8b1-4b4e-9819-b17c63517ee7/2308.04775.pdf'
]

assert 2 == len(documents)
for document in documents:
assert document['url'] in expected_document_urls


def test_download_documents_with_multiple_documents():
with requests_mock.Mocker() as requests_mocker:
requests_mocker.register_uri(
Expand Down
Loading