Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,44 @@ Works()["W2023271753"].ngrams()
```


### Get lists of entities based on ids

You can optimize your API requests when retrieving multiple entities from OpenAlex based on their IDs.
Entities are requested in batches of up to 100 for OpenAlex IDs and up to 50 for the other ID types (DOI, ISSN, ORCID, ROR).

The following example demonstrates how to get Works based on their OpenAlex IDs:

```python
from pyalex import Works

works_ids = ["W4409154704", "W1999167944", "https://openalex.org/W2096885696"]

works = Works().get_from_ids(works_ids, ordered=True)
```

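You can optionally order the results with `ordered=True`, because by default the OpenAlex API does not return the results in the same order as the ID list.
If you order the results, `None` is placed in the list at the position of every entity that was not found; otherwise you only get the entities found in OpenAlex. Ordering is not supported for ISSN IDs, as a single source can have multiple ISSNs.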

You can specify each ID either as a full URL (e.g. `https://openalex.org/W4409154704`) or as a bare ID string (e.g. `W4409154704`).

`get_from_ids()` can retrieve entities from different types of IDs. The supported `id_type` values are:
- `openalex_id` (default)
- `doi` (Works only)
- `issn` (Sources only)
- `orcid` (Authors only)
- `ror` (Institutions only).

The example below shows how to get Institutions based on their ROR IDs:

```python
from pyalex import Institutions

institutions_ids = ["https://ror.org/03xjwb503", "0145rpw38",]

institutions = Institutions().get_from_ids(institutions_ids, id_type="ror", ordered=True)
```


### Serialize

All results from PyAlex can be serialized. For example, save the results to a JSON file:
Expand Down Expand Up @@ -461,6 +499,20 @@ Works() \

```

### Retrieve a list of articles from their DOIs

```python
from pyalex import Works

works_ids = [
"https://doi.org/10.1016/j.cosust.2025.101526",
"10.1126/science.1259855",
"https://doi.org/10.1038/461472a",
]

works = Works().get_from_ids(works_ids, id_type="doi", ordered=True)
```

## Experimental

### Authentication
Expand Down
69 changes: 69 additions & 0 deletions pyalex/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import requests
from requests.auth import AuthBase
from tqdm import tqdm
from urllib3.util import Retry

try:
Expand Down Expand Up @@ -32,6 +33,15 @@ class AlexConfig(dict):
Backoff factor for retries.
retry_http_codes : list
List of HTTP status codes to retry on.
disable_tqdm_loading_bar : bool
Disable tqdm progress bar if true. Defaults to False.
openalex_ids_batch_size : int
Batch size for retrieving multiple entities by their OpenAlex ID.
An integer from 1 to 100 (OpenAlex limit).
external_ids_batch_size : int
Batch size for retrieving multiple entities by ID external to OpenAlex (DOI,
ISSN, ORCID, ROR, ...).
An integer from 1 to 50 (OpenAlex limit).
"""

def __getattr__(self, key):
Expand All @@ -49,6 +59,9 @@ def __setattr__(self, key, value):
max_retries=0,
retry_backoff_factor=0.1,
retry_http_codes=[429, 500, 503],
disable_tqdm_loading_bar=False,
openalex_ids_batch_size=100,
external_ids_batch_size=50,
)


Expand Down Expand Up @@ -863,6 +876,62 @@ def autocomplete(self, s, return_meta=False):
else:
return resp_list

def get_from_ids(
    self, ids: list, id_type: str = "openalex_id", ordered=False
) -> list:
    """Return the OpenAlex entities list from the requested ids.

    The ids are sent to the API in batches whose size is read from
    ``config.openalex_ids_batch_size`` (for ``id_type == "openalex_id"``) or
    ``config.external_ids_batch_size`` (for every other id type).

    Parameters
    ----------
    ids : list[str]
        IDs of the entities to get. Each id can be given as a full URL or as
        a bare id string.
    id_type : str
        ID type for the entities to retrieve. One of "openalex_id", "doi" (Works),
        "issn" (Sources), "orcid" (Authors) or "ror" (Institutions).
    ordered : bool, optional
        Whether to keep the order of the input list ``ids`` in the results.
        Defaults to False.

    Returns
    -------
    list[OpenAlexEntity | None]
        List of OpenAlex entities. If ``ordered`` is True, the list has the same
        length as ``ids`` and None is returned for not found entities.
        Otherwise only the entities found are returned, in API order.

    Raises
    ------
    NotImplementedError
        If ``ordered`` is True and ``id_type`` is "issn".
    """

    # Fail fast: reject the unsupported combination before any request is sent.
    if ordered and id_type == "issn":
        raise NotImplementedError(
            "Ordering is not supported for ISSN ids as a single source can "
            "have multiple ISSN."
        )

    ids_batch_size = (
        config.openalex_ids_batch_size
        if id_type == "openalex_id"
        else config.external_ids_batch_size
    )

    # Accumulate what the API actually returns. A batch may yield fewer
    # entities than requested (ids not found), so the results must not be
    # written into a pre-sized list: that would leave None placeholders behind.
    found = []
    with tqdm(total=len(ids), disable=config.disable_tqdm_loading_bar) as pbar:
        for start in range(0, len(ids), ids_batch_size):
            batch = ids[start : start + ids_batch_size]
            found.extend(
                self.__class__()
                .filter_or(**{id_type: batch})
                .get(per_page=len(batch))
            )
            pbar.update(len(batch))

    if not ordered:
        return found

    def _lookup_key(raw_id):
        """Reduce a URL or bare id to a case-insensitive comparison key."""
        if id_type == "doi":
            # A DOI contains "/" itself (prefix/suffix), so only the resolver
            # URL is stripped; keeping the registrant prefix avoids collisions
            # between DOIs that share the same suffix.
            lowered = raw_id.lower()
            for resolver in (
                "https://doi.org/",
                "http://doi.org/",
                "https://dx.doi.org/",
                "http://dx.doi.org/",
            ):
                if lowered.startswith(resolver):
                    raw_id = raw_id[len(resolver) :]
                    break
            return raw_id.upper()
        return raw_id.split("/")[-1].upper()

    id_field_name = "id" if id_type == "openalex_id" else id_type
    entities_by_key = {
        _lookup_key(doc[id_field_name]): doc
        for doc in found
        if doc is not None and doc[id_field_name]
    }
    # Align the results on the input ids; missing entities become None.
    return [entities_by_key.get(_lookup_key(id_)) for id_ in ids]


# The API

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ classifiers = [
"Programming Language :: Python :: 3.12"
]
license = {text = "MIT"}
dependencies = ["requests", "urllib3"]
dependencies = ["requests", "urllib3", "tqdm"]
dynamic = ["version"]
requires-python = ">=3.8"

Expand Down
70 changes: 70 additions & 0 deletions tests/test_pyalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,3 +448,73 @@ def test_premium_api():
Works().filter(from_updated_date=f"{datetime.datetime.now().year}-01-01").get()

pyalex.config.api_key = None


@pytest.mark.parametrize("openalex_ids_batch_size", [1, 2, 3, 100])
def test_get_from_ids_openalex_id(openalex_ids_batch_size):
    """
    Test get_from_ids() for openalex_id and with various values of
    openalex_ids_batch_size
    """
    # The batch size is global configuration: save it and always restore it so
    # that the other tests are not affected, even when an assertion fails.
    original_openalex_ids_batch_size = pyalex.config.openalex_ids_batch_size
    pyalex.config.openalex_ids_batch_size = openalex_ids_batch_size
    try:
        entities_ids = [
            "W4409154704",
            "W1999167944",
            "https://openalex.org/W2096885696",
        ]
        entities_names = [
            "Challenges and opportunities when assessing exposure of financial "
            "investments to ecosystem regime shifts",
            "Planetary boundaries: Guiding human development on a changing planet",
            "A safe operating space for humanity",
        ]
        res = pyalex.Works().get_from_ids(entities_ids, ordered=True)
        # Ordered results must be aligned one-to-one with the requested ids.
        assert len(res) == len(entities_names)
        for expected_name, entity in zip(entities_names, res):
            assert entity is not None
            assert entity["display_name"] == expected_name
    finally:
        pyalex.config.openalex_ids_batch_size = original_openalex_ids_batch_size


def test_get_from_ids_doi():
    """Test get_from_ids() for Works requested by DOI (URL and bare forms)."""
    entities_ids = [
        "https://doi.org/10.1016/j.cosust.2025.101526",
        "10.1126/science.1259855",
        "https://doi.org/10.1038/461472a",
    ]
    entities_names = [
        "Challenges and opportunities when assessing exposure of financial "
        "investments to ecosystem regime shifts",
        "Planetary boundaries: Guiding human development on a changing planet",
        "A safe operating space for humanity",
    ]
    res = pyalex.Works().get_from_ids(entities_ids, id_type="doi", ordered=True)
    # Ordered results must be aligned one-to-one with the requested ids.
    assert len(res) == len(entities_names)
    for expected_name, entity in zip(entities_names, res):
        assert entity is not None
        assert entity["display_name"] == expected_name


def test_get_from_ids_ror():
    """Test get_from_ids() for Institutions requested by ROR id."""
    entities_ids = [
        "https://ror.org/03xjwb503",
        "0145rpw38",
    ]
    entities_names = [
        "Université Paris-Saclay",
        "Stockholm Resilience Centre",
    ]
    res = pyalex.Institutions().get_from_ids(entities_ids, id_type="ror", ordered=True)
    # Ordered results must be aligned one-to-one with the requested ids.
    assert len(res) == len(entities_names)
    for expected_name, entity in zip(entities_names, res):
        assert entity is not None
        assert entity["display_name"] == expected_name


def test_get_from_ids_issn():
    """Test get_from_ids() for Sources requested by ISSN (unordered only)."""
    entities_ids = [
        "1877-3435",
    ]
    entities_names = [
        "Current Opinion in Environmental Sustainability",
    ]
    # Ordering is not supported for ISSN ids, so the default (unordered) is used.
    res = pyalex.Sources().get_from_ids(entities_ids, id_type="issn")
    assert len(res) == len(entities_names)
    for expected_name, entity in zip(entities_names, res):
        assert entity is not None
        assert entity["display_name"] == expected_name