Skip to content

Commit 5f08886

Browse files
mskarlin and Copilot authored
Allow for non-mutated user specified doc-ids (#977)
Co-authored-by: Copilot <[email protected]>
1 parent ce7660c commit 5f08886

File tree

4 files changed

+163
-6
lines changed

4 files changed

+163
-6
lines changed

paperqa/clients/journal_quality.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,14 @@ async def _process(
4545
# remember, if both have docnames (i.e. key) they are
4646
# wiped and re-generated with resultant data
4747
return doc_details + DocDetails(
48+
doc_id=doc_details.doc_id, # ensure doc_id is preserved
49+
dockey=doc_details.dockey, # ensure dockey is preserved
4850
source_quality=max(
4951
[
5052
self.data.get(query.journal.casefold(), DocDetails.UNDEFINED_JOURNAL_QUALITY), # type: ignore[union-attr]
5153
self.data.get("the " + query.journal.casefold(), DocDetails.UNDEFINED_JOURNAL_QUALITY), # type: ignore[union-attr]
5254
]
53-
)
55+
),
5456
)
5557

5658
def query_creator(self, doc_details: DocDetails, **kwargs) -> JournalQuery | None:

paperqa/clients/retractions.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616

1717

1818
class RetractionDataPostProcessor(MetadataPostProcessor[DOIQuery]):
19+
20+
RETRACTION_CACHE_DAYS: int = -1 # Number of days to cache, -1 is keep forever
21+
1922
def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> None:
2023

2124
if retraction_data_path is None:
@@ -37,6 +40,8 @@ def __init__(self, retraction_data_path: os.PathLike | str | None = None) -> Non
3740
]
3841

3942
def _has_cache_expired(self) -> bool:
43+
if self.RETRACTION_CACHE_DAYS < 0:
44+
return False
4045
creation_time = os.path.getctime(self.retraction_data_path)
4146
file_creation_date = datetime.datetime.fromtimestamp(creation_time).replace(
4247
tzinfo=datetime.UTC
@@ -45,7 +50,7 @@ def _has_cache_expired(self) -> bool:
4550
current_time = datetime.datetime.now(datetime.UTC)
4651
time_difference = current_time - file_creation_date
4752

48-
return time_difference > datetime.timedelta(days=30)
53+
return time_difference > datetime.timedelta(days=self.RETRACTION_CACHE_DAYS)
4954

5055
def _is_csv_cached(self) -> bool:
5156
return os.path.exists(self.retraction_data_path)
@@ -71,7 +76,11 @@ async def _process(self, query: DOIQuery, doc_details: DocDetails) -> DocDetails
7176
if not self.doi_set:
7277
await self.load_data()
7378

74-
return doc_details + DocDetails(is_retracted=query.doi in self.doi_set)
79+
return doc_details + DocDetails(
80+
doc_id=doc_details.doc_id, # ensure doc_id is preserved
81+
dockey=doc_details.dockey, # ensure dockey is preserved
82+
is_retracted=query.doi in self.doi_set,
83+
)
7584

7685
def query_creator(self, doc_details: DocDetails, **kwargs) -> DOIQuery | None:
7786
try:

paperqa/types.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -431,14 +431,15 @@ def lowercase_doi_and_populate_doc_id(cls, data: dict[str, Any]) -> dict[str, An
431431
if doi.startswith(url_prefix_to_remove):
432432
doi = doi.replace(url_prefix_to_remove, "")
433433
data["doi"] = doi.lower()
434-
data["doc_id"] = encode_id(doi.lower())
435-
else:
434+
if "doc_id" not in data or not data["doc_id"]: # keep user defined doc_ids
435+
data["doc_id"] = encode_id(doi.lower())
436+
elif "doc_id" not in data or not data["doc_id"]: # keep user defined doc_ids
436437
data["doc_id"] = encode_id(uuid4())
437438

438439
if "dockey" in data.get(
439440
"fields_to_overwrite_from_metadata",
440441
DEFAULT_FIELDS_TO_OVERWRITE_FROM_METADATA,
441-
):
442+
) and ("dockey" not in data or not data["dockey"]):
442443
data["dockey"] = data["doc_id"]
443444

444445
return data

tests/test_paperqa.py

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
from paperqa.readers import parse_pdf_to_pages, read_doc
5151
from paperqa.types import ChunkMetadata
5252
from paperqa.utils import (
53+
encode_id,
5354
extract_score,
5455
get_citenames,
5556
maybe_get_date,
@@ -1488,6 +1489,150 @@ def test_docdetails_deserialization() -> None:
14881489
), "Deserialization should not mutate input"
14891490

14901491

1492+
def test_docdetails_doc_id_roundtrip() -> None:
1493+
"""Test that DocDetails can be initialized with doc_id or doi inputs."""
1494+
test_doi = "10.1234/test.doi"
1495+
test_doi_doc_id = encode_id(test_doi.lower())
1496+
test_specified_doc_id = "abc123"
1497+
# first we test without a doc_id or doi, ensure it's still valid
1498+
doc_details_no_doi_no_doc_id = DocDetails(
1499+
docname="test_doc",
1500+
citation="Test Citation",
1501+
dockey="test_dockey",
1502+
embedding=None,
1503+
formatted_citation="Formatted Test Citation",
1504+
)
1505+
1506+
assert (
1507+
doc_details_no_doi_no_doc_id.doc_id != test_doi_doc_id
1508+
), "DocDetails without doc_id should not match test_doi_doc_id"
1509+
assert (
1510+
doc_details_no_doi_no_doc_id.doi is None
1511+
), "DocDetails without doi should have None doi"
1512+
assert doc_details_no_doi_no_doc_id.dockey == doc_details_no_doi_no_doc_id.doc_id
1513+
1514+
# now round-trip serialization should keep the same doc_id
1515+
new_no_doi_no_doc_id = DocDetails(
1516+
**doc_details_no_doi_no_doc_id.model_dump(exclude_none=True)
1517+
)
1518+
assert (
1519+
new_no_doi_no_doc_id.doc_id == doc_details_no_doi_no_doc_id.doc_id
1520+
), "DocDetails without doc_id should keep the same doc_id after serialization"
1521+
1522+
# since validation runs on assignment, make sure we can assign correctly
1523+
doc_details_no_doi_no_doc_id.doc_id = test_specified_doc_id
1524+
assert (
1525+
doc_details_no_doi_no_doc_id.doc_id == test_specified_doc_id
1526+
), "DocDetails with doc_id should match test_specified_doc_id"
1527+
assert doc_details_no_doi_no_doc_id.dockey == doc_details_no_doi_no_doc_id.doc_id
1528+
1529+
# now let's do this with a doi
1530+
doc_details_with_doi_no_doc_id = DocDetails(
1531+
doi=test_doi,
1532+
docname="test_doc",
1533+
citation="Test Citation",
1534+
dockey="test_dockey",
1535+
embedding=None,
1536+
formatted_citation="Formatted Test Citation",
1537+
)
1538+
assert (
1539+
doc_details_with_doi_no_doc_id.doc_id == test_doi_doc_id
1540+
), "DocDetails with doc_id should not match test_doi_doc_id"
1541+
assert (
1542+
doc_details_with_doi_no_doc_id.doi == test_doi
1543+
), "DocDetails with doi should match test_doi"
1544+
assert (
1545+
doc_details_with_doi_no_doc_id.dockey == doc_details_with_doi_no_doc_id.doc_id
1546+
)
1547+
1548+
# round-trip serialization should keep the same doc_id
1549+
new_with_doi_no_doc_id = DocDetails(
1550+
**doc_details_with_doi_no_doc_id.model_dump(exclude_none=True)
1551+
)
1552+
assert (
1553+
new_with_doi_no_doc_id.doc_id == doc_details_with_doi_no_doc_id.doc_id
1554+
), "DocDetails with doc_id should keep the same doc_id after serialization"
1555+
1556+
# since validation runs on assignment, make sure we can assign correctly
1557+
doc_details_with_doi_no_doc_id.doc_id = test_specified_doc_id
1558+
assert (
1559+
doc_details_with_doi_no_doc_id.doc_id == test_specified_doc_id
1560+
), "DocDetails with doc_id should match test_specified_doc_id"
1561+
assert (
1562+
doc_details_with_doi_no_doc_id.dockey == doc_details_with_doi_no_doc_id.doc_id
1563+
)
1564+
1565+
# let's specify the doc_id directly
1566+
doc_details_no_doi_with_doc_id = DocDetails(
1567+
doc_id=test_specified_doc_id,
1568+
docname="test_doc",
1569+
citation="Test Citation",
1570+
dockey="test_dockey",
1571+
embedding=None,
1572+
formatted_citation="Formatted Test Citation",
1573+
)
1574+
assert (
1575+
doc_details_no_doi_with_doc_id.doc_id == test_specified_doc_id
1576+
), "DocDetails with doc_id should not match test_specified_doc_id"
1577+
assert (
1578+
doc_details_no_doi_with_doc_id.doi is None
1579+
), "DocDetails without doi should be None"
1580+
assert (
1581+
doc_details_no_doi_with_doc_id.dockey == doc_details_no_doi_with_doc_id.doc_id
1582+
), "DocDetails dockey should match doc_id for the same object"
1583+
1584+
# round-trip serialization should keep the same doc_id
1585+
new_no_doi_with_doc_id = DocDetails(
1586+
**doc_details_no_doi_with_doc_id.model_dump(exclude_none=True)
1587+
)
1588+
assert (
1589+
new_no_doi_with_doc_id.doc_id == doc_details_with_doi_no_doc_id.doc_id
1590+
), "DocDetails with doc_id should keep the same doc_id after serialization"
1591+
1592+
# since validation runs on assignment, make sure we can assign correctly
1593+
new_no_doi_with_doc_id.doc_id = test_doi_doc_id
1594+
assert (
1595+
new_no_doi_with_doc_id.doc_id == test_doi_doc_id
1596+
), "DocDetails with doc_id should match test_specified_doc_id"
1597+
assert new_no_doi_with_doc_id.dockey == new_no_doi_with_doc_id.doc_id
1598+
1599+
# now we specify both doi and doc_id, ensuring doc_id takes precedence
1600+
doc_details_with_doi_with_doc_id = DocDetails(
1601+
doc_id=test_specified_doc_id,
1602+
doi=test_doi,
1603+
docname="test_doc",
1604+
citation="Test Citation",
1605+
dockey="test_dockey",
1606+
embedding=None,
1607+
formatted_citation="Formatted Test Citation",
1608+
)
1609+
assert (
1610+
doc_details_with_doi_with_doc_id.doc_id == test_specified_doc_id
1611+
), "DocDetails with doc_id should not match test_specified_doc_id"
1612+
assert (
1613+
doc_details_with_doi_with_doc_id.doi == test_doi
1614+
), "DocDetails without doi should match test_doi"
1615+
assert (
1616+
doc_details_with_doi_with_doc_id.dockey
1617+
== doc_details_with_doi_with_doc_id.doc_id
1618+
)
1619+
1620+
# round-trip serialization should keep the same doc_id
1621+
new_with_doi_with_doc_id = DocDetails(
1622+
**doc_details_with_doi_with_doc_id.model_dump(exclude_none=True)
1623+
)
1624+
assert (
1625+
new_with_doi_with_doc_id.doc_id == doc_details_with_doi_with_doc_id.doc_id
1626+
), "DocDetails with doc_id should keep the same doc_id after serialization"
1627+
1628+
# since validation runs on assignment, make sure we can assign correctly
1629+
new_with_doi_with_doc_id.doc_id = test_doi_doc_id
1630+
assert (
1631+
new_with_doi_with_doc_id.doc_id == test_doi_doc_id
1632+
), "DocDetails with doc_id should match test_specified_doc_id"
1633+
assert new_with_doi_with_doc_id.dockey == new_with_doi_with_doc_id.doc_id
1634+
1635+
14911636
@pytest.mark.vcr
14921637
@pytest.mark.parametrize("use_partition", [True, False])
14931638
@pytest.mark.asyncio

0 commit comments

Comments
 (0)