diff --git a/codemeta.json b/codemeta.json index db8dceea692c7e229c97f4d92a33c8e46d8ac9b8..71902736cb5e9e318d941523bdea6e9c1ca41035 100644 --- a/codemeta.json +++ b/codemeta.json @@ -106,6 +106,17 @@ "@type": "Organization", "name": "ECAP, FAU (Nuremberg, Germany)" } + }, + { + "@type": "Person", + "@id": "https://orcid.org/0000-0001-5226-3089", + "givenName": "Tom", + "familyName": "François", + "email": "tom.francois@lapp.in2p3.fr", + "affiliation": { + "@type": "Organization", + "name": "Univ. Savoie Mont Blanc, CNRS, LAPP" + } } ], "funder": [ diff --git a/eossr/api/ossr.py b/eossr/api/ossr.py index 7f30df8d923b01aff33d8a171bf23a5535cb84f3..89c1887d84293866e44fba7c64658fb2af470ca0 100644 --- a/eossr/api/ossr.py +++ b/eossr/api/ossr.py @@ -1,18 +1,19 @@ #!/usr/bin/env python -from .zenodo import ZenodoAPI, query_records, search_records +from .zenodo import ZenodoAPI +from .zenodo.zenodo import Record, _search __all__ = [ - 'search_ossr_records', - 'get_ossr_records', - 'get_ossr_pending_requests', + "search_ossr_records", + "get_ossr_records", + "get_ossr_pending_requests", ] -escape_community = 'escape2020' -sandbox_escape_community = 'escape2020' +escape_community = "escape2020" +sandbox_escape_community = "escape2020" -def search_ossr_records(search='', sandbox=False, **kwargs): +def search_ossr_records(search="", sandbox=False, **kwargs): """ Search the OSSR for records whose names or descriptions include the provided string `search`. The default record type is 'software' or 'record'. @@ -40,28 +41,22 @@ def search_ossr_records(search='', sandbox=False, **kwargs): :return: [Record] """ - # make sure we find all OSSR records without limit on the number - params = kwargs - params['communities'] = escape_community - response = query_records(search, sandbox=sandbox, **params) - number_of_ossr_entries = response.json( - )['aggregations']['access_status']['buckets'][0]['doc_count'] - kwargs['size'] = number_of_ossr_entries - # if another community is specified, a logical OR is applied by zenodo API, # thus potentially finding entries that are not part of escape2020 # ruling out that possibility at the moment - if 'communities' in kwargs and kwargs['communities'] != escape_community: + if "communities" in kwargs and kwargs["communities"] != escape_community: raise NotImplementedError( "Searching in another community will search outside of the OSSR" "Use `eossr.api.zenodo.search_records` to do so" ) - kwargs['communities'] = escape_community + kwargs["communities"] = escape_community # OSSR is limited to software and datasets - kwargs.setdefault('type', ['software', 'dataset']) + kwargs.setdefault("type", ["software", "dataset"]) - return search_records(search, sandbox=sandbox, **kwargs) + # Use paginated search to fetch all records without hitting Zenodo's page size limits + hits = _search("records", search=search, sandbox=sandbox, **kwargs) + return [Record(hit) for hit in hits] def get_ossr_pending_requests(zenodo_token, **params): @@ -84,4 +79,4 @@ def get_ossr_records(sandbox=False, **kwargs): :return: """ - return search_ossr_records('', sandbox=sandbox, **kwargs) + return search_ossr_records("", sandbox=sandbox, **kwargs) diff --git a/eossr/api/zenodo/tests/test_zenodo.py b/eossr/api/zenodo/tests/test_zenodo.py index 9547a7725c276debd06707edd435f78115a91d11..c4fac97dfdc60d912308d4be30fca1e75229b1a6 100644 --- a/eossr/api/zenodo/tests/test_zenodo.py +++ b/eossr/api/zenodo/tests/test_zenodo.py @@ -3,14 +3,13 @@ import json import os import shutil import tempfile +import time import unittest from pathlib import Path import pytest import requests -import time from bs4 import BeautifulSoup -from eossr.api.zenodo.zenodo import PendingRequest from eossr.api.ossr import sandbox_escape_community from eossr.api.zenodo import ( @@ -27,7 +26,7 @@ from eossr.api.zenodo import ( zenodo_url, ) from eossr.api.zenodo.http_status import HTTPStatusError -from eossr.api.zenodo.zenodo import is_live, query_records +from eossr.api.zenodo.zenodo import PendingRequest, is_live, query_records from eossr.metadata.tests import ZENODO_TEST_FILE # test deactivated at the moment @@ -47,10 +46,10 @@ def test_is_live(): def temp_dir_with_file(): with tempfile.TemporaryDirectory() as tmpdirname: print(f"tmpdir {tmpdirname}") - shutil.copy(ZENODO_TEST_FILE, Path(tmpdirname, '.zenodo.json')) + shutil.copy(ZENODO_TEST_FILE, Path(tmpdirname, ".zenodo.json")) _, filename = tempfile.mkstemp(dir=tmpdirname) - Path(filename).write_text('Hello from eossr unit tests') + Path(filename).write_text("Hello from eossr unit tests") yield tmpdirname, filename @@ -58,12 +57,12 @@ def temp_dir_with_file(): class TestZenodoApiSandbox(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestZenodoApiSandbox, self).__init__(*args, **kwargs) - self.token = 'FakeToken' + self.token = "FakeToken" self.zenodo = ZenodoAPI(access_token=self.token, sandbox=True) - # getting all records for tests purposes - self.zenodo.parameters['size'] = 1000 - self.zenodo.parameters['all_versions'] = True + # Note: Zenodo API limits page size to 25 for unauthenticated and 100 for authenticated requests + self.zenodo.parameters["size"] = 25 + self.zenodo.parameters["all_versions"] = True def test_initialization_sandbox(self): from eossr.api.zenodo import zenodo_sandbox_api_base_url @@ -73,14 +72,14 @@ class TestZenodoApiSandbox(unittest.TestCase): assert self.zenodo.access_token == self.token def test_query_community_entries(self): - community_entries = self.zenodo.query_community_records('escape2020') + community_entries = self.zenodo.query_community_records("escape2020") assert isinstance(community_entries, requests.models.Response) class TestZenodoAPINoToken(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestZenodoAPINoToken, self).__init__(*args, **kwargs) - self.token = '' + self.token = "" self.zenodo = ZenodoAPI(access_token=self.token, sandbox=False) def test_initialization(self): @@ -96,15 +95,15 @@ class TestZenodoAPINoToken(unittest.TestCase): self.zenodo._raise_token_status() -@pytest.mark.skipif(os.getenv('SANDBOX_ZENODO_TOKEN') is None, reason="SANDBOX_ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("SANDBOX_ZENODO_TOKEN") is None, reason="SANDBOX_ZENODO_TOKEN not defined") class TestZenodoAPITokenSandbox(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestZenodoAPITokenSandbox, self).__init__(*args, **kwargs) - self.token = os.getenv('SANDBOX_ZENODO_TOKEN') + self.token = os.getenv("SANDBOX_ZENODO_TOKEN") self.zenodo = ZenodoAPI(access_token=self.token, sandbox=True) def test_init(self): - assert self.zenodo.access_token == os.getenv('SANDBOX_ZENODO_TOKEN') + assert self.zenodo.access_token == os.getenv("SANDBOX_ZENODO_TOKEN") def test_raise_token_status(self): self.zenodo._raise_token_status() @@ -115,19 +114,19 @@ class TestZenodoAPITokenSandbox(unittest.TestCase): def test_create_erase_new_deposit(self): create_new_deposit = self.zenodo.create_new_deposit() assert isinstance(create_new_deposit, requests.models.Response) - record_id = create_new_deposit.json()['id'] + record_id = create_new_deposit.json()["id"] erase_deposit = self.zenodo.erase_deposit(record_id) assert isinstance(erase_deposit, requests.models.Response) def test_pending_request(self): z = self.zenodo pending_requests = z.get_community_pending_requests(sandbox_escape_community) - requests_ids = [rec.data['id'] for rec in pending_requests] + requests_ids = [rec.data["id"] for rec in pending_requests] # pending request in escape2020 sandbox community - 2023-11-13 - assert '6eaf34cf-f63d-4604-8b43-a6198bfdaa5d' in requests_ids + assert "6eaf34cf-f63d-4604-8b43-a6198bfdaa5d" in requests_ids def test_find_similar_records_sandbox(self): - self.zenodo.parameters.update({'size': 100}) + self.zenodo.parameters.update({"size": 100}) existing_record = Record.from_id(eossr_record_id) assert len(self.zenodo.find_similar_records(existing_record)) > 0 @@ -136,30 +135,27 @@ class TestZenodoAPITokenSandbox(unittest.TestCase): assert isinstance(records[0], Record) -@pytest.mark.skipif(os.getenv('SANDBOX_ZENODO_TOKEN') is None, reason="SANDBOX_ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("SANDBOX_ZENODO_TOKEN") is None, reason="SANDBOX_ZENODO_TOKEN not defined") def test_upload_package(temp_dir_with_file): tmpdirname, _ = temp_dir_with_file - zenodo = ZenodoAPI(access_token=os.getenv( - 'SANDBOX_ZENODO_TOKEN'), sandbox=True) + zenodo = ZenodoAPI(access_token=os.getenv("SANDBOX_ZENODO_TOKEN"), sandbox=True) # create new record deposit_id = zenodo.upload_dir_content(tmpdirname, publish=True) print(f"{deposit_id} created") # update existing record _, filename = tempfile.mkstemp(dir=tmpdirname) - Path(filename).write_text('2nd upload from eossr unit tests') - new_deposit_id = zenodo.upload_dir_content( - tmpdirname, record_id=deposit_id, publish=False) + Path(filename).write_text("2nd upload from eossr unit tests") + new_deposit_id = zenodo.upload_dir_content(tmpdirname, record_id=deposit_id, publish=False) zenodo.erase_deposit(new_deposit_id) print(f"{new_deposit_id} created and deleted") -@pytest.mark.skipif(os.getenv('SANDBOX_ZENODO_TOKEN') is None, reason="SANDBOX_ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("SANDBOX_ZENODO_TOKEN") is None, reason="SANDBOX_ZENODO_TOKEN not defined") def test_check_upload_to_zenodo(temp_dir_with_file): tmpdirname, _ = temp_dir_with_file - zenodo = ZenodoAPI(access_token=os.getenv( - 'SANDBOX_ZENODO_TOKEN'), sandbox=True) + zenodo = ZenodoAPI(access_token=os.getenv("SANDBOX_ZENODO_TOKEN"), sandbox=True) # 1 - Test connection test_connection = zenodo.query_user_deposits() assert test_connection.status_code == 200 @@ -169,11 +165,10 @@ def test_check_upload_to_zenodo(temp_dir_with_file): assert new_deposit.status_code == 201 # 3 - Test upload metadata - test_deposit_id = new_deposit.json()['id'] + test_deposit_id = new_deposit.json()["id"] with open(zenodo.path_zenodo_file(tmpdirname)) as file: metadata_entry = json.load(file) - updated_metadata = zenodo.set_deposit_metadata( - test_deposit_id, json_metadata=metadata_entry) + updated_metadata = zenodo.set_deposit_metadata(test_deposit_id, json_metadata=metadata_entry) assert updated_metadata.status_code == 200 # 4 - Test delete entry @@ -211,11 +206,11 @@ def test_check_upload_to_zenodo(temp_dir_with_file): # pass -@pytest.mark.skipif(os.getenv('ZENODO_TOKEN') is None, reason="ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("ZENODO_TOKEN") is None, reason="ZENODO_TOKEN not defined") class TestZenodoAPIToken(unittest.TestCase): def __init__(self, *args, **kwargs): super(TestZenodoAPIToken, self).__init__(*args, **kwargs) - self.token = os.getenv('ZENODO_TOKEN') + self.token = os.getenv("ZENODO_TOKEN") self.zenodo = ZenodoAPI(access_token=self.token, sandbox=False) def test_query_user(self): @@ -224,13 +219,14 @@ class TestZenodoAPIToken(unittest.TestCase): def test_search_records(): zenodo_records = search_records( - 'ESCAPE template project', + "ESCAPE template project", all_versions=True, + max_results=50, timeout=tests_default_timeout, ) assert len(zenodo_records) > 1 - all_dois = [r.data['doi'] for r in zenodo_records] - assert '10.5281/zenodo.4923992' in all_dois + all_dois = [r.data["doi"] for r in zenodo_records] + assert "10.5281/zenodo.4923992" in all_dois @pytest.mark.xfail(raises=HTTPStatusError) @@ -239,9 +235,9 @@ def test_get_record_42(): def test_query_record_10005007(): - answer = query_records('eossr', sandbox=True) + answer = query_records("eossr", sandbox=True) assert answer.status_code == 200 - assert len(answer.json()['hits']['hits']) >= 1 + assert len(answer.json()["hits"]["hits"]) >= 1 @pytest.fixture @@ -250,11 +246,11 @@ def record_4923992(): def test_record(record_4923992): - assert record_4923992.data['conceptdoi'] == '10.5281/zenodo.3572654' + assert record_4923992.data["conceptdoi"] == "10.5281/zenodo.3572654" record_4923992.print_info() codemeta = record_4923992.get_codemeta() assert isinstance(codemeta, dict) - assert codemeta['name'] == 'ESCAPE template project' + assert codemeta["name"] == "ESCAPE template project" record_4923992.get_mybinder_url() @@ -264,28 +260,28 @@ def test_web_url(record_4923992): """ # Test for production URL (record_4923992 is from production) expected_prod_url = f"{zenodo_url}/records/4923992" - assert record_4923992.web_url == expected_prod_url, f"Expected {expected_prod_url}, but got {record_4923992.web_url}" + assert record_4923992.web_url == expected_prod_url, ( + f"Expected {expected_prod_url}, but got {record_4923992.web_url}" + ) assert requests.get(record_4923992.web_url, timeout=10).status_code == 200 def test_get_record_sandbox(): record = get_record(test_record_sandbox, sandbox=True) - assert record.data['doi'] == f'10.5072/zenodo.{test_record_sandbox}' + assert record.data["doi"] == f"10.5072/zenodo.{test_record_sandbox}" def test_write_record_zenodo(record_4923992, tmpdir): - record_4923992._write_zenodo_deposit( - filename=tmpdir / '.zenodo.json', validate=False) - with open(tmpdir / '.zenodo.json') as file: + record_4923992._write_zenodo_deposit(filename=tmpdir / ".zenodo.json", validate=False) + with open(tmpdir / ".zenodo.json") as file: json_dict = json.load(file) - assert json_dict['title'] == 'ESCAPE template project' - assert json_dict['version'] == 'v2.2' + assert json_dict["title"] == "ESCAPE template project" + assert json_dict["version"] == "v2.2" def test_search_funders(): - funders = zenodo.search_funders( - 'name:European+Commission', timeout=tests_default_timeout) - assert len(funders) > 1 + funders = zenodo.search_funders("European Commission", timeout=tests_default_timeout) + assert len(funders) >= 1 # def test_search_grants(): @@ -295,16 +291,16 @@ def test_search_funders(): def test_search_license(): - licenses = zenodo.search_licenses('MIT', timeout=tests_default_timeout) - assert licenses[0]['title'] == {'en': 'MIT License'} + licenses = zenodo.search_licenses("MIT", timeout=tests_default_timeout) + assert licenses[0]["title"] == {"en": "MIT License"} def test_search_communities(): communities = zenodo.search_communities( - 'id:8b951469-55d0-44f2-bb91-b541501c9c8e', + "id:8b951469-55d0-44f2-bb91-b541501c9c8e", timeout=tests_default_timeout, ) - assert communities[0]['slug'] == 'escape2020' + assert communities[0]["slug"] == "escape2020" def test_get_associated_versions(): @@ -318,9 +314,8 @@ def test_get_associated_versions(): # Seven versions, to date 21/03/2022 assert len(eossr_record_versions) >= 7 for recid in eossr_record_versions.keys(): - assert eossr_record.data['conceptrecid'] == Record.from_id( - recid).data['conceptrecid'] - assert eossr_record_versions[5524913] == 'v0.2' # ID of eOSSR version v0.2 + assert eossr_record.data["conceptrecid"] == Record.from_id(recid).data["conceptrecid"] + assert eossr_record_versions[5524913] == "v0.2" # ID of eOSSR version v0.2 @pytest.mark.xfail(raises=FileNotFoundError) @@ -332,8 +327,8 @@ def test_get_codemeta_fail(): def test_get_supported_licenses(): zenodo_licenses = zenodo.get_supported_licenses() assert isinstance(zenodo_licenses, list) - assert 'mit' in zenodo_licenses - assert 'apache-2.0' in zenodo_licenses + assert "mit" in zenodo_licenses + assert "apache-2.0" in zenodo_licenses assert "apache-license-x." not in zenodo_licenses @@ -341,76 +336,74 @@ def test_download_record(): with tempfile.TemporaryDirectory() as tmpdir: record = Record.from_id(3743490) record.download(tmpdir) - assert os.path.exists(f'{tmpdir}/template_project_escape-v1.1.zip') + assert os.path.exists(f"{tmpdir}/template_project_escape-v1.1.zip") def test_get_funder(): - funder_id = '00k4n6c32' # European Commission + funder_id = "00k4n6c32" # European Commission funder = get_funder(funder_id, sandbox=True) assert isinstance(funder, dict) - assert funder['id'] == funder_id - assert funder['name'] == "European Commission" + assert funder["id"] == funder_id + assert funder["name"] == "European Commission" def test_get_license(): - license_id = 'mit' + license_id = "mit" license = get_license(license_id, sandbox=True) assert isinstance(license, dict) - assert license['id'] == license_id + assert license["id"] == license_id def test_get_community(): community_slug = sandbox_escape_community community = get_community(community_slug, sandbox=True) assert isinstance(community, dict) - assert community['slug'] == community_slug - assert community['id'] == '012c7725-ee21-4603-855d-e675842b4f7b' + assert community["slug"] == community_slug + assert community["id"] == "012c7725-ee21-4603-855d-e675842b4f7b" -@pytest.mark.skipif(os.getenv('SANDBOX_ZENODO_TOKEN') is None, reason="SANDBOX_ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("SANDBOX_ZENODO_TOKEN") is None, reason="SANDBOX_ZENODO_TOKEN not defined") def test_query_deposits(): search = "eossr" - access_token = os.getenv('SANDBOX_ZENODO_TOKEN') + access_token = os.getenv("SANDBOX_ZENODO_TOKEN") sandbox = True - deposits = query_deposits(search, access_token, - sandbox=sandbox, timeout=tests_default_timeout) + deposits = query_deposits(search, access_token, sandbox=sandbox, timeout=tests_default_timeout) assert isinstance(deposits, requests.Response) assert len(deposits.json()) > 0 -@pytest.mark.skipif(os.getenv('SANDBOX_ZENODO_TOKEN') is None, reason="SANDBOX_ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("SANDBOX_ZENODO_TOKEN") is None, reason="SANDBOX_ZENODO_TOKEN not defined") def test_get_deposit(): - deposit = get_deposit(test_record_sandbox, sandbox=True, - access_token=os.getenv('SANDBOX_ZENODO_TOKEN')) + deposit = get_deposit(test_record_sandbox, sandbox=True, access_token=os.getenv("SANDBOX_ZENODO_TOKEN")) assert isinstance(deposit, dict) - assert deposit['conceptdoi'] == f'10.5072/zenodo.{test_record_sandbox-1}' + assert deposit["conceptdoi"] == f"10.5072/zenodo.{test_record_sandbox - 1}" -@pytest.mark.skipif(os.getenv('SANDBOX_ZENODO_TOKEN') is None, reason="SANDBOX_ZENODO_TOKEN not defined") +@pytest.mark.skipif(os.getenv("SANDBOX_ZENODO_TOKEN") is None, reason="SANDBOX_ZENODO_TOKEN not defined") def test_post_message(): request_data = { - 'id': '6eaf34cf-f63d-4604-8b43-a6198bfdaa5d', - 'links': { - 'timeline': 'https://sandbox.zenodo.org/api/requests/6eaf34cf-f63d-4604-8b43-a6198bfdaa5d/timeline', - 'comments': 'https://sandbox.zenodo.org/api/requests/6eaf34cf-f63d-4604-8b43-a6198bfdaa5d/comments' + "id": "6eaf34cf-f63d-4604-8b43-a6198bfdaa5d", + "links": { + "timeline": "https://sandbox.zenodo.org/api/requests/6eaf34cf-f63d-4604-8b43-a6198bfdaa5d/timeline", + "comments": "https://sandbox.zenodo.org/api/requests/6eaf34cf-f63d-4604-8b43-a6198bfdaa5d/comments", }, - 'receiver': {'community': '012c7725-ee21-4603-855d-e675842b4f7b'}, - 'topic': {'record': '27635'} + "receiver": {"community": "012c7725-ee21-4603-855d-e675842b4f7b"}, + "topic": {"record": "27635"}, } - pending_request = PendingRequest(request_data, access_token=os.getenv('SANDBOX_ZENODO_TOKEN')) + pending_request = PendingRequest(request_data, access_token=os.getenv("SANDBOX_ZENODO_TOKEN")) - message = 'Test message' + message = "Test message" response = pending_request.post_message(message) assert response is True time.sleep(1) - timeline = pending_request.get_timeline(force_refresh=True, sort='newest')['hits']['hits'] + timeline = pending_request.get_timeline(force_refresh=True, sort="newest")["hits"]["hits"] last_comment = timeline[0] - assert last_comment['type'] == 'C' - assert BeautifulSoup(last_comment['payload']['content'], "html.parser").get_text() == message + assert last_comment["type"] == "C" + assert BeautifulSoup(last_comment["payload"]["content"], "html.parser").get_text() == message - delete_url = last_comment['links']['self'] + delete_url = last_comment["links"]["self"] response = requests.delete(delete_url, params=pending_request.parameters, timeout=10) response.raise_for_status() diff --git a/eossr/api/zenodo/zenodo.py b/eossr/api/zenodo/zenodo.py index ba574e9ae9b4c489388b6b4b09d5e16dbdd20205..9d7c17c2f77807d43b89584f547b053c24438607 100644 --- a/eossr/api/zenodo/zenodo.py +++ b/eossr/api/zenodo/zenodo.py @@ -27,44 +27,42 @@ import sys import textwrap import warnings from copy import deepcopy -from pathlib import Path -from urllib.request import urlopen -from bs4 import BeautifulSoup from datetime import datetime +from pathlib import Path from typing import Union +from urllib.request import urlopen import requests +from bs4 import BeautifulSoup from ...metadata.zenodo import write_zenodo_metadata, zenodo_filepath from ...utils import get_codemeta_from_zipurl, write_json from . import http_status - - __all__ = [ - 'zenodo_url', - 'zenodo_sandbox_url', - 'zenodo_api_base_url', - 'zenodo_sandbox_api_base_url', - 'ZenodoAPI', - 'SimilarRecordError', # noqa - 'Record', - 'search_records', - 'query_records', - 'get_record', - 'get_supported_licenses', - 'search_records', - 'search_funders', + "zenodo_url", + "zenodo_sandbox_url", + "zenodo_api_base_url", + "zenodo_sandbox_api_base_url", + "ZenodoAPI", + "SimilarRecordError", # noqa + "Record", + "search_records", + "query_records", + "get_record", + "get_supported_licenses", + "search_records", + "search_funders", # 'search_grants', - 'search_communities', - 'search_licenses', - 'is_live', - 'get_community', - 'get_license', - 'get_funder', - 'query_deposits', - 'query_deposit', - 'get_deposit', + "search_communities", + "search_licenses", + "is_live", + "get_community", + "get_license", + "get_funder", + "query_deposits", + "query_deposit", + "get_deposit", ] @@ -73,7 +71,8 @@ zenodo_api_base_url = f"{zenodo_url}/api/" zenodo_sandbox_url = "https://sandbox.zenodo.org" zenodo_sandbox_api_base_url = f"{zenodo_sandbox_url}/api/" -_default_size_query = 50 +# Default page size for queries (25 is the max for unauthenticated requests) +_default_size_query = 25 _default_timeout = 25 @@ -117,17 +116,16 @@ class ZenodoAPI: if access_token is None: warnings.warn("No access token provided, limited functionalities") self.access_token = access_token - self.parameters = {'access_token': self.access_token} - self.parameters.setdefault('size', _default_size_query) + self.parameters = {"access_token": self.access_token} + self.parameters.setdefault("size", _default_size_query) def _raise_token_status(self): """ private method to check if a valid token has been provided, called in methods requiring a token :return: """ - if self.access_token is None or self.access_token == '': - raise ValueError( - "No access token was provided. This method requires one.") + if self.access_token is None or self.access_token == "": + raise ValueError("No access token was provided. This method requires one.") def query_user_deposits(self): """ @@ -136,7 +134,7 @@ class ZenodoAPI: :return: request.response """ self._raise_token_status() - response = query_deposits('', sandbox=self.sandbox, **self.parameters) + response = query_deposits("", sandbox=self.sandbox, **self.parameters) http_status.ZenodoHTTPStatus(response) return response @@ -199,8 +197,7 @@ class ZenodoAPI: self._raise_token_status() url = f"{self.api_base_url}/deposit/depositions" headers = {"Content-Type": "application/json"} - req = requests.post(url, json={}, headers=headers, - params=self.parameters, timeout=_default_timeout) + req = requests.post(url, json={}, headers=headers, params=self.parameters, timeout=_default_timeout) http_status.ZenodoHTTPStatus(req) return req @@ -228,12 +225,11 @@ class ZenodoAPI: # 2 - Upload the files # full url is recovered from previous GET method - bucket_url = response.json()['links']['bucket'] + bucket_url = response.json()["links"]["bucket"] url = f"{bucket_url}/{name_file}" - with open(path_file, 'rb') as upload_file: - upload = requests.put( - url, data=upload_file, params=self.parameters, timeout=_default_timeout) + with open(path_file, "rb") as upload_file: + upload = requests.put(url, data=upload_file, params=self.parameters, timeout=_default_timeout) http_status.ZenodoHTTPStatus(upload) return upload @@ -279,8 +275,8 @@ class ZenodoAPI: """ req = self.query_deposit(deposit_it) data = req.json() - data['metadata'].update(metadata) - req = self.set_deposit_metadata(deposit_it, data['metadata']) + data["metadata"].update(metadata) + req = self.set_deposit_metadata(deposit_it, data["metadata"]) return req def erase_deposit(self, deposit_id): @@ -305,8 +301,7 @@ class ZenodoAPI: """ self._raise_token_status() url = f"{self.api_base_url}/deposit/depositions/{deposit_id}" - req = requests.delete(url, params=self.parameters, - timeout=_default_timeout) + req = requests.delete(url, params=self.parameters, timeout=_default_timeout) if req.status_code == 204: print("The deposit has been deleted") return req @@ -339,8 +334,7 @@ class ZenodoAPI: """ self._raise_token_status() url = f"{self.api_base_url}/deposit/depositions/{deposit_id}/files/{file_id}" - req = requests.delete(url, params=self.parameters, - timeout=_default_timeout) + req = requests.delete(url, params=self.parameters, timeout=_default_timeout) http_status.ZenodoHTTPStatus(req) return req @@ -360,8 +354,7 @@ class ZenodoAPI: """ self._raise_token_status() url = f"{self.api_base_url}/deposit/depositions/{deposit_id}/actions/publish" - req = requests.post(url, params=self.parameters, - timeout=_default_timeout) + req = requests.post(url, params=self.parameters, timeout=_default_timeout) http_status.ZenodoHTTPStatus(req) return req @@ -387,12 +380,12 @@ class ZenodoAPI: """ self._raise_token_status() url = f"{self.api_base_url}/deposit/depositions/{record_id}/actions/newversion" - parameters = {'access_token': self.access_token} + parameters = {"access_token": self.access_token} req = requests.post(url, params=parameters, timeout=_default_timeout) http_status.ZenodoHTTPStatus(req) return req - def query_community_records(self, community_name='escape2020', **kwargs): + def query_community_records(self, community_name="escape2020", **kwargs): """ Query the records within a community. @@ -411,8 +404,8 @@ class ZenodoAPI: # https://developers.zenodo.org/#list36 parameters = deepcopy(self.parameters) parameters.update(kwargs) - parameters['communities'] = str(community_name) - return query_records('', sandbox=self.sandbox, **parameters) + parameters["communities"] = str(community_name) + return query_records("", sandbox=self.sandbox, **parameters) @staticmethod def path_zenodo_file(root_dir): @@ -448,20 +441,18 @@ class ZenodoAPI: # prepare new record version if record_id is not None: new_deposit = self.new_version_deposit(record_id) - new_deposit_id = new_deposit.json( - )['links']['latest_draft'].rsplit('/')[-1] + new_deposit_id = new_deposit.json()["links"]["latest_draft"].rsplit("/")[-1] print(f" * Preparing a new version of record {record_id}") # TODO: log if erase_previous_files: - old_files_ids = [file['id'] - for file in new_deposit.json()['files']] + old_files_ids = [file["id"] for file in new_deposit.json()["files"]] for file_id in old_files_ids: self.erase_file_deposit(new_deposit_id, file_id) print(f" - file {file_id} erased") else: new_deposit = self.create_new_deposit() - new_deposit_id = new_deposit.json()['id'] - print(' * Preparing a new record') + new_deposit_id = new_deposit.json()["id"] + print(" * Preparing a new record") print(f" * New record id: {new_deposit_id}") @@ -470,8 +461,7 @@ class ZenodoAPI: if metadata is not None: print(f" * Record metadata based on provided metadata: {metadata}") elif path_zenodo_file.exists(): - print( - f" - Record metadata based on zenodo file {path_zenodo_file}") + print(f" - Record metadata based on zenodo file {path_zenodo_file}") with open(path_zenodo_file) as file: metadata = json.load(file) else: @@ -480,8 +470,7 @@ class ZenodoAPI: # upload files dir_to_upload = Path(directory) for file in dir_to_upload.iterdir(): - self.upload_file_deposit( - deposit_id=new_deposit_id, name_file=file.name, path_file=file) + self.upload_file_deposit(deposit_id=new_deposit_id, name_file=file.name, path_file=file) print(f" * {file.name} uploaded") # and update metadata @@ -492,14 +481,12 @@ class ZenodoAPI: if publish: self.publish_deposit(new_deposit_id) if record_id: - print( - f" * New version of {record_id} published at {new_deposit_id} !") + print(f" * New version of {record_id} published at {new_deposit_id} !") else: print(f" * Record {new_deposit_id} published") print(f" * The new doi should be 10.5281/{new_deposit_id}") - print( - f" * Check the upload at {self.api_base_url[:-4]}/deposit/{new_deposit_id} *") + print(f" * Check the upload at {self.api_base_url[:-4]}/deposit/{new_deposit_id} *") return new_deposit_id @@ -522,8 +509,7 @@ class ZenodoAPI: if not path_zenodo_file.exists(): raise FileNotFoundError(f"No {path_zenodo_file} file.") - print( - f"\n * Using {path_zenodo_file} file to simulate a new upload to Zenodo... \n") + print(f"\n * Using {path_zenodo_file} file to simulate a new upload to Zenodo... \n") # 1 - Test connection print("1 --> Testing communication with Zenodo...") @@ -544,19 +530,17 @@ class ZenodoAPI: # 3 - Test upload metadata print("3 --> Testing the ingestion of the Zenodo metadata...") - test_deposit_id = new_deposit.json()['id'] + test_deposit_id = new_deposit.json()["id"] with open(path_zenodo_file) as file: metadata_entry = json.load(file) - updated_metadata = self.set_deposit_metadata( - test_deposit_id, json_metadata=metadata_entry) + updated_metadata = self.set_deposit_metadata(test_deposit_id, json_metadata=metadata_entry) try: http_status.ZenodoHTTPStatus(updated_metadata) print(" * Metadata deposit status OK !") pprint.pprint(metadata_entry) except http_status.HTTPStatusError: - print(" ! ERROR while testing update of metadata\n", - updated_metadata.json()) + print(" ! ERROR while testing update of metadata\n", updated_metadata.json()) print(" ! The deposit will be deleted") # 4 - Test delete entry @@ -565,10 +549,8 @@ class ZenodoAPI: try: http_status.ZenodoHTTPStatus(delete_test_entry) except http_status.HTTPStatusError: - print( - f" !! ERROR erasing dummy test entry: {delete_test_entry.json()}") - print( - f"Please erase it manually at {self.api_base_url[:-4]}/deposit") + print(f" !! ERROR erasing dummy test entry: {delete_test_entry.json()}") + print(f"Please erase it manually at {self.api_base_url[:-4]}/deposit") sys.exit(-1) print(" * Delete test entry status OK !") @@ -587,7 +569,7 @@ class ZenodoAPI: """ request = self.query_user_deposits() - return [Record(hit) for hit in request.json() if hit['state'] == 'done'] + return [Record(hit) for hit in request.json() if hit["state"] == "done"] def find_similar_records(self, record): """ @@ -609,12 +591,9 @@ class ZenodoAPI: if user_rec.title == record.title: similar_records.append(user_rec) - if 'related_identifiers' in user_rec.data['metadata'] and 'related_identifiers' in record.data['metadata']: - - relid1 = [r['identifier'] - for r in user_rec.data['metadata']['related_identifiers']] - relid2 = [r['identifier'] - for r in record.data['metadata']['related_identifiers']] + if "related_identifiers" in user_rec.data["metadata"] and "related_identifiers" in record.data["metadata"]: + relid1 = [r["identifier"] for r in user_rec.data["metadata"]["related_identifiers"]] + relid2 = [r["identifier"] for r in record.data["metadata"]["related_identifiers"]] if set(relid1).intersection(relid2): similar_records.append(user_rec) @@ -639,16 +618,18 @@ class ZenodoAPI: """ self._raise_token_status() - community_json = get_community( - community, sandbox=self.sandbox, token=self.access_token) - community_requests_url = community_json['links']['requests'] + community_json = get_community(community, sandbox=self.sandbox, token=self.access_token) + community_requests_url = community_json["links"]["requests"] parameters = deepcopy(self.parameters) parameters.update(params) - response = requests.get(community_requests_url, - params=parameters, timeout=_default_timeout) + response = requests.get(community_requests_url, params=parameters, timeout=_default_timeout) http_status.ZenodoHTTPStatus(response) - response_json = response.json()['hits']['hits'] - response_json = [PendingRequest(hit, sandbox=self.sandbox, access_token=self.access_token) for hit in response_json if hit['is_open']] + response_json = response.json()["hits"]["hits"] + response_json = [ + PendingRequest(hit, sandbox=self.sandbox, access_token=self.access_token) + for hit in response_json + if hit["is_open"] + ] return response_json def answer_to_pending_request(self, community: str, record_id: Union[str, int], accept: bool): @@ -671,21 +652,20 @@ class ZenodoAPI: HTTPStatusError If the request is not successful. - + """ self._raise_token_status() # check that the record is in the pending requests pending_requests = self.get_community_pending_requests(community) pending_record = None for req in pending_requests: - if req['topic']['record'] == record_id: + if req["topic"]["record"] == record_id: if accept: req.accept() else: req.decline() if pending_record is None: - raise ValueError( - f"Record {record_id} is not in the pending requests of community {community}") + raise ValueError(f"Record {record_id} is not in the pending requests of community {community}") def update_record_metadata(self, record_id, metadata): """ @@ -716,14 +696,14 @@ class ZenodoAPI: http_status.ZenodoHTTPStatus(req) record = get_record(record_id, sandbox=self.sandbox) - record_metadata = record.data['metadata'] - record_metadata['upload_type'] = record_metadata['resource_type']['type'] - if 'access_right_category' in record_metadata: - record_metadata.pop('access_right_category') - if 'related_identifiers' in record_metadata: - record_metadata.pop('related_identifiers') - record_metadata.pop('relations') - record_metadata.pop('resource_type') + record_metadata = record.data["metadata"] + record_metadata["upload_type"] = record_metadata["resource_type"]["type"] + if "access_right_category" in record_metadata: + record_metadata.pop("access_right_category") + if "related_identifiers" in record_metadata: + record_metadata.pop("related_identifiers") + record_metadata.pop("relations") + record_metadata.pop("resource_type") record_metadata.update(metadata) self.set_deposit_metadata(record_id, json_metadata=record_metadata) req = self.publish_deposit(record_id) @@ -740,15 +720,14 @@ class Record: """ def __init__(self, data: dict): - for k in ['id', 'metadata']: + for k in ["id", "metadata"]: if k not in data.keys(): raise ValueError(f"key {k} not present in data") # list of keys mandatory to create a Zenodo entry. # Other keys are either optional, or can be hidden in case of Closed Access entries. - for meta_key in ['title', 'doi']: - if meta_key not in data['metadata'].keys(): - raise ValueError( - f"Mandatory key {meta_key} not in data['metadata']") + for meta_key in ["title", "doi"]: + if meta_key not in data["metadata"].keys(): + raise ValueError(f"Mandatory key {meta_key} not in data['metadata']") self.data = data def __str__(self): @@ -757,7 +736,7 @@ class Record: def __repr__(self): return f"Record({self.id})" - def _write_zenodo_deposit(self, filename='.zenodo.json', overwrite=False, validate=True): + def _write_zenodo_deposit(self, filename=".zenodo.json", overwrite=False, validate=True): """ Write the zenodo metadata to a `.zenodo.json` file, so it can be used to create a new deposit. The created file is not guaranteed to be valid, but it is a good starting point. @@ -772,22 +751,20 @@ class Record: True to validate the metadata before writing the file. Default is True. """ # Transform metadata from record to deposit first - metadata = deepcopy(self.data['metadata']) - metadata['upload_type'] = metadata['resource_type']['type'] - metadata.pop('resource_type') - if 'access_right_category' in metadata: - metadata.pop('access_right_category') - if 'relations' in metadata: - metadata.pop('relations') - if 'communities' in metadata: - metadata['communities'] = [{'identifier': c['id']} - for c in metadata['communities']] - if 'zenodo' in metadata['doi']: - metadata.pop('doi') - metadata['license'] = metadata['license']['id'] - - write_zenodo_metadata(metadata, filename=filename, - overwrite=overwrite, validate=validate) + metadata = deepcopy(self.data["metadata"]) + metadata["upload_type"] = metadata["resource_type"]["type"] + metadata.pop("resource_type") + if "access_right_category" in metadata: + metadata.pop("access_right_category") + if "relations" in metadata: + metadata.pop("relations") + if "communities" in metadata: + metadata["communities"] = [{"identifier": c["id"]} for c in metadata["communities"]] + if "zenodo" in metadata["doi"]: + metadata.pop("doi") + metadata["license"] = metadata["license"]["id"] + + write_zenodo_metadata(metadata, filename=filename, overwrite=overwrite, validate=validate) def write_metadata(self, filename, overwrite=False): """ @@ -799,20 +776,19 @@ class Record: overwrite: bool True to overwrite existing file """ - write_json(self.data['metadata'], - filename=filename, overwrite=overwrite) + write_json(self.data["metadata"], filename=filename, overwrite=overwrite) @property def id(self): - return self.data['id'] + return self.data["id"] @property def title(self): - return self.data['metadata']['title'] + return self.data["metadata"]["title"] @property def metadata(self): - return self.data['metadata'] + return self.data["metadata"] @property def filelist(self): @@ -821,7 +797,7 @@ class Record: :return: [str] """ - return [f['links']['self'] for f in self.data['files']] + return [f["links"]["self"] for f in self.data["files"]] def get_last_version(self, token=None): """ @@ -838,10 +814,10 @@ class Record: eossr.api.zenodo.Record The last version of the record. """ - if 'relations' not in self.data['metadata'] or self.data['metadata']['relations']['version'][0]['is_last']: + if "relations" not in self.data["metadata"] or self.data["metadata"]["relations"]["version"][0]["is_last"]: return self else: - conceptrecid = self.data['conceptrecid'] + conceptrecid = self.data["conceptrecid"] return get_record(conceptrecid, sandbox=self.from_sandbox, token=token) @property @@ -850,7 +826,7 @@ class Record: Is the record from sandbox? :return: bool """ - if 'sandbox' in self.data['links']['self']: + if "sandbox" in self.data["links"]["self"]: return True else: return False @@ -862,7 +838,8 @@ class Record: Parameters ---------- size : int, optional - Number of results to return. Default is 50 (`_default_size_query`) + Number of results to return per page. Default is 25 (`_default_size_query`). + Pagination is automatic, so all versions will be fetched regardless of this value. **kwargs : dict Zenodo query arguments. For an exhaustive list, see the query arguments at https://developers.zenodo.org/#list36 @@ -871,19 +848,19 @@ class Record: dict A dictionary of `{record_id: record_version}` """ - conceptrecid = self.data['conceptrecid'] - params = {'all_versions': True, **kwargs} - params.setdefault('size', size) + conceptrecid = self.data["conceptrecid"] + params = {"all_versions": True, **kwargs} + params.setdefault("size", size) versions = {} - for record in search_records(f'conceptrecid:{conceptrecid}', sandbox=self.from_sandbox, **params): - if 'version' in record.metadata: - versions[record.id] = record.metadata['version'] + for record in search_records(f"conceptrecid:{conceptrecid}", sandbox=self.from_sandbox, **params): + if "version" in record.metadata: + versions[record.id] = record.metadata["version"] else: versions[record.id] = None return versions - def _summary(self, linebreak='\n'): + def _summary(self, linebreak="\n"): """ Generate a summary of the record information. @@ -902,20 +879,19 @@ class Record: The summary string. """ lines = [f"=== Record #{self.id} ===", f"Title: {self.title}"] - version = self.metadata.get('version', 'Unknown') + version = self.metadata.get("version", "Unknown") lines.append(f"Version: {version}") lines.append(f"DOI: {self.data.get('doi', 'Unknown')}") - links = self.data.get('links', {}) - if 'html' in links: + links = self.data.get("links", {}) + if "html" in links: lines.append(f"URL: {links['html']}") - description = self.metadata.get('description', '') + description = self.metadata.get("description", "") # Replace paragraph tags with newlines - description = re.sub('
', linebreak, re.sub( - '
', linebreak, description)) + description = re.sub("", linebreak, re.sub("
", linebreak, description)) # Then strip the remaining HTML tags - stripped_description = re.sub('<[^<]+?>', '', description) + stripped_description = re.sub("<[^<]+?>", "", description) # Wrap description text to 70 characters wide wrapped_description = textwrap.fill(stripped_description, width=70) @@ -924,7 +900,7 @@ class Record: descrp = linebreak.join(lines) return descrp - def print_info(self, linebreak='\n', file=sys.stdout): + def print_info(self, linebreak="\n", file=sys.stdout): """ Print the summary of the record information to a stream, or to `sys.stdout` by default. @@ -980,13 +956,13 @@ class Record: FileNotFoundError If no `codemeta.json` file is found in the record. """ - if 'files' not in self.data: - raise FileNotFoundError( - f'The record {self.id} does not contain any file') + if "files" not in self.data: + raise FileNotFoundError(f"The record {self.id} does not contain any file") - codemeta_paths = [s for s in self.filelist if Path( - s.rsplit('/content', maxsplit=1)[0]).name == 'codemeta.json'] - ziparchives = [s for s in self.filelist if s.endswith('.zip/content')] + codemeta_paths = [ + s for s in self.filelist if Path(s.rsplit("/content", maxsplit=1)[0]).name == "codemeta.json" + ] + ziparchives = [s for s in self.filelist if s.endswith(".zip/content")] if len(codemeta_paths) >= 1: # if there are more than one codemeta file in the repository, we consider the one in the root directory, # hence the one with the shortest path @@ -998,17 +974,15 @@ class Record: return get_codemeta_from_zipurl(zipurl, **zipurl_kwargs) except FileNotFoundError: pass - raise FileNotFoundError( - f"No `codemeta.json` file found in record {self.id}") + raise FileNotFoundError(f"No `codemeta.json` file found in record {self.id}") else: - raise FileNotFoundError( - f"No `codemeta.json` file found in record {self.id}") + raise FileNotFoundError(f"No `codemeta.json` file found in record {self.id}") @property def doi(self): - if 'doi' not in self.data: + if "doi" not in self.data: raise KeyError(f"Record {self.id} does not have a doi") - return self.data['doi'] + return self.data["doi"] @property def web_url(self): @@ -1029,11 +1003,11 @@ class Record: :return: str """ - binder_zenodo_url = 'https://mybinder.org/v2/zenodo/' + binder_zenodo_url = "https://mybinder.org/v2/zenodo/" doi = self.doi return binder_zenodo_url + doi - def download(self, directory='.', max_workers=None): + def download(self, directory=".", max_workers=None): """ Download the record to a directory. @@ -1050,19 +1024,19 @@ class Record: def download_file(url, path): response = requests.get(url, stream=True, timeout=_default_timeout) if response.status_code == 200: - with open(path, 'wb') as f: + with open(path, "wb") as f: for chunk in response.iter_content(chunk_size=8192): f.write(chunk) else: raise Exception(f"Failed to download file from {url}") def remove_trailing_content(url): - return url.rsplit('/content', maxsplit=1)[0] + return url.rsplit("/content", maxsplit=1)[0] with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: future_to_url = { executor.submit( - download_file, url, f'{directory}/{os.path.basename(remove_trailing_content(url))}' + download_file, url, f"{directory}/{os.path.basename(remove_trailing_content(url))}" ): url for url in self.filelist } @@ -1070,9 +1044,9 @@ class Record: url = future_to_url[future] try: future.result() - print(f'{url} : Download complete') + print(f"{url} : Download complete") except Exception as exc: - print(f'{url} generated an exception: {exc}') + print(f"{url} generated an exception: {exc}") def query_records(search, sandbox=False, **kwargs): @@ -1112,7 +1086,7 @@ def query_records(search, sandbox=False, **kwargs): requests.response The response from the Zenodo API. """ - return _query('records', search=search, sandbox=sandbox, **kwargs) + return _query("records", search=search, sandbox=sandbox, **kwargs) def _zenodo_get_factory(endpoint): @@ -1148,7 +1122,7 @@ def _zenodo_get_factory(endpoint): base_url = zenodo_sandbox_api_base_url if sandbox else zenodo_api_base_url url = f"{base_url}{endpoint}/{id}" # Make a GET request to the Zenodo API - params = {'access_token': token} if token else {} + params = {"access_token": token} if token else {} response = requests.get(url, params=params, timeout=_default_timeout) # if response.status_code == 200: @@ -1270,26 +1244,21 @@ def get_funder(funder_id, sandbox=False, token=None) -> dict: return _zenodo_get_factory("funders")(funder_id, sandbox=sandbox, token=token) -def get_supported_licenses(size=1000): +def get_supported_licenses(): """ - Recovers the list of Zenodo supported license IDs and names. + Recovers the list of Zenodo supported license IDs. Makes a request.get() call to Zenodo. - Parameters - ---------- - size : int, optional - The number of licenses to retrieve (default is 1000) - Returns ------- list - A list of license IDs for Zenodo supported licenses + A list of license IDs for all Zenodo supported licenses """ - licenses = search_licenses(size=size) - return [license['id'] for license in licenses] + licenses = search_licenses() + return [license["id"] for license in licenses] -def _query(field, search='', sandbox=False, request_params=None, **zenodo_kwargs): +def _query(field, search="", sandbox=False, request_params=None, **zenodo_kwargs): """ Query Zenodo API to search for records, funders, grants, communities, or licenses. @@ -1358,16 +1327,21 @@ def _query(field, search='', sandbox=False, request_params=None, **zenodo_kwargs if request_params is None: request_params = {} - request_params.setdefault('timeout', _default_timeout) + request_params.setdefault("timeout", _default_timeout) # zenodo can't handle '/' in search query search = search.replace("/", " ") - params = {'q': search, **zenodo_kwargs} + params = {"q": search, **zenodo_kwargs} - params.setdefault('size', 100) + # Use a reasonable page size that works for both authenticated and unauthenticated requests + # Unauthenticated: max 25, Authenticated: max 100 + # We use 25 to ensure it works in all cases, cap at 100 for authenticated requests + if "size" in params and params["size"] > 100: + params["size"] = 100 + params.setdefault("size", 25) - for param_name in ['communities', 'type', 'file_type']: + for param_name in ["communities", "type", "file_type"]: if param_name in zenodo_kwargs: params[param_name] = lowercase(zenodo_kwargs[param_name]) @@ -1378,10 +1352,10 @@ def _query(field, search='', sandbox=False, request_params=None, **zenodo_kwargs return response -def _search(field, search='', sandbox=False, **kwargs): +def _search(field, search="", sandbox=False, request_params=None, max_results=None, **kwargs): """ - Text based search base function. - Returns a list of hits (JSON dict containing the object metadata). + Text based search base function with automatic pagination. + Returns all matching results as a list of hits (JSON dict containing the object metadata). Parameters ---------- @@ -1391,6 +1365,12 @@ def _search(field, search='', sandbox=False, **kwargs): The search query, by default '' sandbox : bool, optional True to search in the sandbox, by default False + request_params : dict, optional + Parameters for the `requests.get` function. Override the class parameters. + e.g. {'timeout': 10}. + max_results : int, optional + Maximum number of results to return. If None, returns all results. + Useful for limiting results when searching records. **kwargs Zenodo query arguments and common requests arguments. For an exhaustive list, see the query arguments at https://developers.zenodo.org/#list36 @@ -1399,8 +1379,8 @@ def _search(field, search='', sandbox=False, **kwargs): Zenodo access token May be necessary for private queries - size : int - Number of results to return - Default = 100 + Number of results to return per page. + Default = 25 (max for unauthenticated requests). - all_versions : int Show (1) or hide (0) all versions of records - type : str or list[str] @@ -1419,17 +1399,60 @@ def _search(field, search='', sandbox=False, **kwargs): Returns ------- list[dict] - A list of hits (JSON dict containing the object metadata). + A list of matching hits (JSON dict containing the object metadata). + If max_results is set, returns at most that many results. """ + if request_params is None: + request_params = {} + request_params.setdefault("timeout", _default_timeout) - query = _query(field, search=search, sandbox=sandbox, **kwargs) - http_status.ZenodoHTTPStatus(query) + # Use a reasonable page size that works for both authenticated and unauthenticated requests + # Unauthenticated: max 25, Authenticated: max 100 + # We use 25 to ensure it works in all cases + page_size = kwargs.pop("size", 25) + if page_size > 100: + page_size = 100 # Zenodo hard limit - hits = [hit for hit in query.json()["hits"]["hits"]] - return hits + # Get the access token if provided (needed for pagination requests) + access_token = kwargs.get("access_token") + + all_hits = [] + + # Make the first request + kwargs["size"] = page_size + response = _query(field, search=search, sandbox=sandbox, request_params=request_params, **kwargs) + http_status.ZenodoHTTPStatus(response) + + data = response.json() + all_hits.extend(data["hits"]["hits"]) + + # Check if we've reached max_results + if max_results is not None and len(all_hits) >= max_results: + return all_hits[:max_results] + + # Follow pagination links to fetch all results + while "next" in data.get("links", {}): + next_url = data["links"]["next"] + + # Add access token to pagination request if available + params = {} + if access_token: + params["access_token"] = access_token + + response = requests.get(next_url, params=params, **request_params) + http_status.ZenodoHTTPStatus(response) + + data = response.json() + all_hits.extend(data["hits"]["hits"]) + + # Check if we've reached max_results + if max_results is not None and len(all_hits) >= max_results: + return all_hits[:max_results] + + return all_hits -def search_records(search='', sandbox=False, **kwargs): +def search_records(search="", sandbox=False, max_results=None, **kwargs): """ Text based search of records. Returns a list of Record objects. @@ -1440,6 +1463,9 @@ def search_records(search='', sandbox=False, **kwargs): The search query. (default is '') sandbox : bool, optional True to search in the sandbox. (default is False) + max_results : int, optional + Maximum number of records to return. If None, returns all matching records. + Use this to limit results for broad searches. **kwargs : dict Additional keyword arguments for Zenodo query and common requests arguments. @@ -1454,15 +1480,15 @@ def search_records(search='', sandbox=False, **kwargs): Examples -------- - >>> search_records('data science', sandbox=True) + >>> search_records('data science', sandbox=True, max_results=50) """ - hits = _search('records', search=search, sandbox=sandbox, **kwargs) + hits = _search("records", search=search, sandbox=sandbox, max_results=max_results, **kwargs) return [Record(hit) for hit in hits] -def search_funders(search='', sandbox=False, **kwargs): +def search_funders(search="", sandbox=False, **kwargs): """ - Text based search of funders. + Text based search of funders. Returns all matching funders. Parameters ---------- @@ -1472,22 +1498,18 @@ def search_funders(search='', sandbox=False, **kwargs): True to search in the sandbox, False otherwise. **kwargs : dict Additional Zenodo query arguments. For an exhaustive list, see the query arguments at https://developers.zenodo.org/#list36. - Common arguments are: - - size : int, optional - Number of results to return. Default is 5. Returns ------- list of dict - A list of funders matching the search query. + A list of all funders matching the search query. Examples -------- - >>> search_funders('research') - [{'name': 'National Science Foundation'}, {'name': 'European Research Council'}, {'name': 'Wellcome Trust'}, {'name': 'Bill & Melinda Gates Foundation'}, {'name': 'National Institutes of Health (NIH)'}] + >>> search_funders('European Commission') + [{'id': '...', 'name': 'European Commission'}, ...] """ - kwargs.setdefault('size', 5) - hits = _search('funders', search=search, sandbox=sandbox, **kwargs) + hits = _search("funders", search=search, sandbox=sandbox, **kwargs) return hits @@ -1512,9 +1534,9 @@ def search_funders(search='', sandbox=False, **kwargs): # return hits -def search_communities(search='', sandbox=False, **kwargs): +def search_communities(search="", sandbox=False, **kwargs): """ - Text based search of communities. + Text based search of communities. Returns all matching communities. Parameters ---------- @@ -1524,23 +1546,20 @@ def search_communities(search='', sandbox=False, **kwargs): True to search in the sandbox, False otherwise. **kwargs : dict Additional query arguments. For an exhaustive list, see the query arguments at - https://developers.zenodo.org/#list36. Common arguments are: - - size : int, optional - Number of results to return. Default is 5. + https://developers.zenodo.org/#list36. Returns ------- list of dict - A list of communities matching the search query. + A list of all communities matching the search query. """ - kwargs.setdefault('size', 5) - hits = _search('communities', search=search, sandbox=sandbox, **kwargs) + hits = _search("communities", search=search, sandbox=sandbox, **kwargs) return hits -def search_licenses(search='', sandbox=False, **kwargs): +def search_licenses(search="", sandbox=False, **kwargs): """ - Text based search of licenses. + Text based search of licenses. Returns all matching licenses. Parameters ---------- @@ -1554,19 +1573,14 @@ def search_licenses(search='', sandbox=False, **kwargs): Returns ------- list of dict - A list of dictionaries representing the search results. + A list of dictionaries representing all matching licenses. Notes ----- For an exhaustive list of query arguments, see the Zenodo API documentation: https://developers.zenodo.org/#list36 - - Common query arguments include: - - size : int, optional - Number of results to return. Default is 5. """ - kwargs.setdefault('size', 5) - hits = _search('licenses', search=search, sandbox=sandbox, **kwargs) + hits = _search("licenses", search=search, sandbox=sandbox, **kwargs) return hits @@ -1585,7 +1599,7 @@ def is_live(sandbox=False): True if live """ url = zenodo_sandbox_api_base_url if sandbox else zenodo_api_base_url - req = requests.get(url + '/records?size=1', timeout=_default_timeout) + req = requests.get(url + "/records?size=1", timeout=_default_timeout) return req.status_code == 200 @@ -1607,7 +1621,7 @@ def query_deposit(deposit_id, access_token, sandbox=False): requests.response The query result. """ - return _query(f'deposit/depositions/{deposit_id}', '', sandbox=sandbox, access_token=access_token) + return _query(f"deposit/depositions/{deposit_id}", "", sandbox=sandbox, access_token=access_token) def query_deposits(search, access_token, sandbox=False, **kwargs): @@ -1653,8 +1667,8 @@ def query_deposits(search, access_token, sandbox=False, **kwargs): Records with the specified file_type. A logical OR is applied in case of a list. """ - field = 'deposit/depositions' - kwargs.setdefault('access_token', access_token) + field = "deposit/depositions" + kwargs.setdefault("access_token", access_token) return _query(field, search=search, sandbox=sandbox, **kwargs) @@ -1678,7 +1692,7 @@ def get_deposit(deposit_id, access_token, sandbox=False): dict The deposit corresponding to the given ID. """ - return _zenodo_get_factory('deposit/depositions')(deposit_id, sandbox=sandbox, token=access_token) + return _zenodo_get_factory("deposit/depositions")(deposit_id, sandbox=sandbox, token=access_token) class PendingRequest: @@ -1702,21 +1716,25 @@ class PendingRequest: """ Get the URL for this request """ - community_id = self.data.get('receiver', {}).get('community', None) + community_id = self.data.get("receiver", {}).get("community", None) if not community_id: raise KeyError("No community id found, can't reconstruct url") else: - return self.data.get('links', {}).get('self_html', '').replace('requests', f'communities/{community_id}/requests') + return ( + self.data.get("links", {}) + .get("self_html", "") + .replace("requests", f"communities/{community_id}/requests") + ) @property def parameters(self): """ Returns the parameters required for the Zenodo API. - + Returns: dict: A dictionary containing the access token. """ - return {'access_token': self.access_token} + return {"access_token": self.access_token} @property def record(self): @@ -1732,23 +1750,23 @@ class PendingRequest: @property def id(self): - return self.data.get('id') + return self.data.get("id") @property def title(self): - return self.data.get('title', None) + return self.data.get("title", None) @property def status(self): - return self.data.get('status', None) + return self.data.get("status", None) @property def is_open(self): - return self.data.get('is_open', None) + return self.data.get("is_open", None) @property def record_id(self): - return self.data.get('topic', {}).get('record', None) + return self.data.get("topic", {}).get("record", None) def get_timeline(self, force_refresh=False, expand=False, **parameters): """ @@ -1765,11 +1783,11 @@ class PendingRequest: params = self.parameters.copy() params.update(parameters) if self._timeline is None or force_refresh: - if 'links' not in self.data and 'timeline' not in self.data.get('links'): + if "links" not in self.data and "timeline" not in self.data.get("links"): raise KeyError(f"No timeline link found in pending request {self.id}") - url = self.data.get('links').get('timeline') + url = self.data.get("links").get("timeline") if expand: - url += '?expand=true' + url += "?expand=true" self._timeline = requests.get(url, params=params).json() return self._timeline @@ -1778,11 +1796,11 @@ class PendingRequest: Retrieves the discussion timeline and displays the content of each hit. """ timeline = self.get_timeline() - for hit in timeline['hits']['hits']: - if hit['type'] == 'C': - created_by = hit['created_by']['user'] - updated = datetime.fromisoformat(hit['updated']) - content = BeautifulSoup(hit['payload']['content'], "html.parser").get_text() + for hit in timeline["hits"]["hits"]: + if hit["type"] == "C": + created_by = hit["created_by"]["user"] + updated = datetime.fromisoformat(hit["updated"]) + content = BeautifulSoup(hit["payload"]["content"], "html.parser").get_text() print(f"User {created_by} ({updated.strftime('%Y-%m-%d %H:%M:%S')}): {content}") def __str__(self): @@ -1797,8 +1815,7 @@ class PendingRequest: Raises: KeyError: if no accept URL is found for the request. """ - accept_url = self.data.get('links', {}).get( - 'actions', {}).get('accept') + accept_url = self.data.get("links", {}).get("actions", {}).get("accept") if accept_url: response = requests.post(accept_url, params=self.parameters) response.raise_for_status() @@ -1810,12 +1827,11 @@ class PendingRequest: def decline(self): """ Declines the request by sending a POST request to the decline URL. - + Raises: KeyError: If no decline URL is found for this request. """ - decline_url = self.data.get('links', {}).get( - 'actions', {}).get('decline') + decline_url = self.data.get("links", {}).get("actions", {}).get("decline") if decline_url: response = requests.post(decline_url, params=self.parameters) response.raise_for_status() @@ -1834,17 +1850,14 @@ class PendingRequest: Returns: bool: True if the message was successfully posted, raises an HTTPStatusError otherwise. """ - comments_url = self.data['links']['comments'] + '?expand=1' + comments_url = self.data["links"]["comments"] + "?expand=1" data = { "payload": { "content": f"{message}
", "format": "html", } } - response = requests.post(comments_url, - params=self.parameters, - data=json.dumps(data), - timeout=_default_timeout) + response = requests.post(comments_url, params=self.parameters, data=json.dumps(data), timeout=_default_timeout) http_status.ZenodoHTTPStatus(response) if response.status_code == 201: return True