|
50 | 50 | from paperqa.readers import parse_pdf_to_pages, read_doc
|
51 | 51 | from paperqa.types import ChunkMetadata
|
52 | 52 | from paperqa.utils import (
|
| 53 | + encode_id, |
53 | 54 | extract_score,
|
54 | 55 | get_citenames,
|
55 | 56 | maybe_get_date,
|
@@ -1488,6 +1489,150 @@ def test_docdetails_deserialization() -> None:
|
1488 | 1489 | ), "Deserialization should not mutate input"
|
1489 | 1490 |
|
1490 | 1491 |
|
def test_docdetails_doc_id_roundtrip() -> None:
    """Test that DocDetails can be initialized with doc_id or doi inputs.

    Exercises all four combinations of doi and doc_id being given or omitted.
    For each combination this checks that:

    1. doc_id, doi and dockey hold the expected values after construction.
    2. Rebuilding from ``model_dump(exclude_none=True)`` keeps the same doc_id.
    3. Assigning a new doc_id is retained, and dockey equals the new doc_id.

    Every instance is built with ``dockey="test_dockey"``, yet dockey is always
    asserted to equal doc_id rather than that input value.
    """
    test_doi = "10.1234/test.doi"
    test_doi_doc_id = encode_id(test_doi.lower())
    test_specified_doc_id = "abc123"

    # Case 1: neither doi nor doc_id given, ensure it's still valid
    doc_details_no_doi_no_doc_id = DocDetails(
        docname="test_doc",
        citation="Test Citation",
        dockey="test_dockey",
        embedding=None,
        formatted_citation="Formatted Test Citation",
    )

    assert (
        doc_details_no_doi_no_doc_id.doc_id != test_doi_doc_id
    ), "DocDetails without doi should not match test_doi_doc_id"
    assert (
        doc_details_no_doi_no_doc_id.doi is None
    ), "DocDetails without doi should have None doi"
    assert doc_details_no_doi_no_doc_id.dockey == doc_details_no_doi_no_doc_id.doc_id

    # round-trip serialization should keep the same doc_id
    new_no_doi_no_doc_id = DocDetails(
        **doc_details_no_doi_no_doc_id.model_dump(exclude_none=True)
    )
    assert (
        new_no_doi_no_doc_id.doc_id == doc_details_no_doi_no_doc_id.doc_id
    ), "DocDetails without doc_id should keep the same doc_id after serialization"

    # since validation runs on assignment, make sure we can assign correctly
    doc_details_no_doi_no_doc_id.doc_id = test_specified_doc_id
    assert (
        doc_details_no_doi_no_doc_id.doc_id == test_specified_doc_id
    ), "DocDetails doc_id should match test_specified_doc_id after assignment"
    assert doc_details_no_doi_no_doc_id.dockey == doc_details_no_doi_no_doc_id.doc_id

    # Case 2: doi given, doc_id omitted
    doc_details_with_doi_no_doc_id = DocDetails(
        doi=test_doi,
        docname="test_doc",
        citation="Test Citation",
        dockey="test_dockey",
        embedding=None,
        formatted_citation="Formatted Test Citation",
    )
    assert (
        doc_details_with_doi_no_doc_id.doc_id == test_doi_doc_id
    ), "DocDetails with doi and no doc_id should match test_doi_doc_id"
    assert (
        doc_details_with_doi_no_doc_id.doi == test_doi
    ), "DocDetails with doi should match test_doi"
    assert (
        doc_details_with_doi_no_doc_id.dockey == doc_details_with_doi_no_doc_id.doc_id
    )

    # round-trip serialization should keep the same doc_id
    new_with_doi_no_doc_id = DocDetails(
        **doc_details_with_doi_no_doc_id.model_dump(exclude_none=True)
    )
    assert (
        new_with_doi_no_doc_id.doc_id == doc_details_with_doi_no_doc_id.doc_id
    ), "DocDetails with doi should keep the same doc_id after serialization"

    # since validation runs on assignment, make sure we can assign correctly
    doc_details_with_doi_no_doc_id.doc_id = test_specified_doc_id
    assert (
        doc_details_with_doi_no_doc_id.doc_id == test_specified_doc_id
    ), "DocDetails doc_id should match test_specified_doc_id after assignment"
    assert (
        doc_details_with_doi_no_doc_id.dockey == doc_details_with_doi_no_doc_id.doc_id
    )

    # Case 3: doc_id given directly, doi omitted
    doc_details_no_doi_with_doc_id = DocDetails(
        doc_id=test_specified_doc_id,
        docname="test_doc",
        citation="Test Citation",
        dockey="test_dockey",
        embedding=None,
        formatted_citation="Formatted Test Citation",
    )
    assert (
        doc_details_no_doi_with_doc_id.doc_id == test_specified_doc_id
    ), "DocDetails with doc_id should match test_specified_doc_id"
    assert (
        doc_details_no_doi_with_doc_id.doi is None
    ), "DocDetails without doi should have None doi"
    assert (
        doc_details_no_doi_with_doc_id.dockey == doc_details_no_doi_with_doc_id.doc_id
    ), "DocDetails dockey should match doc_id for the same object"

    # round-trip serialization should keep the same doc_id; compare against the
    # object that was actually serialized, not an instance from another case
    new_no_doi_with_doc_id = DocDetails(
        **doc_details_no_doi_with_doc_id.model_dump(exclude_none=True)
    )
    assert (
        new_no_doi_with_doc_id.doc_id == doc_details_no_doi_with_doc_id.doc_id
    ), "DocDetails with doc_id should keep the same doc_id after serialization"

    # since validation runs on assignment, make sure we can assign correctly
    new_no_doi_with_doc_id.doc_id = test_doi_doc_id
    assert (
        new_no_doi_with_doc_id.doc_id == test_doi_doc_id
    ), "DocDetails doc_id should match test_doi_doc_id after assignment"
    assert new_no_doi_with_doc_id.dockey == new_no_doi_with_doc_id.doc_id

    # Case 4: both doi and doc_id given, ensuring doc_id takes precedence
    doc_details_with_doi_with_doc_id = DocDetails(
        doc_id=test_specified_doc_id,
        doi=test_doi,
        docname="test_doc",
        citation="Test Citation",
        dockey="test_dockey",
        embedding=None,
        formatted_citation="Formatted Test Citation",
    )
    assert (
        doc_details_with_doi_with_doc_id.doc_id == test_specified_doc_id
    ), "DocDetails with doi and doc_id should match test_specified_doc_id"
    assert (
        doc_details_with_doi_with_doc_id.doi == test_doi
    ), "DocDetails with doi should match test_doi"
    assert (
        doc_details_with_doi_with_doc_id.dockey
        == doc_details_with_doi_with_doc_id.doc_id
    )

    # round-trip serialization should keep the same doc_id
    new_with_doi_with_doc_id = DocDetails(
        **doc_details_with_doi_with_doc_id.model_dump(exclude_none=True)
    )
    assert (
        new_with_doi_with_doc_id.doc_id == doc_details_with_doi_with_doc_id.doc_id
    ), "DocDetails with doc_id should keep the same doc_id after serialization"

    # since validation runs on assignment, make sure we can assign correctly
    new_with_doi_with_doc_id.doc_id = test_doi_doc_id
    assert (
        new_with_doi_with_doc_id.doc_id == test_doi_doc_id
    ), "DocDetails doc_id should match test_doi_doc_id after assignment"
    assert new_with_doi_with_doc_id.dockey == new_with_doi_with_doc_id.doc_id
| 1634 | + |
| 1635 | + |
1491 | 1636 | @pytest.mark.vcr
|
1492 | 1637 | @pytest.mark.parametrize("use_partition", [True, False])
|
1493 | 1638 | @pytest.mark.asyncio
|
|
0 commit comments