Skip to content

Commit 0697cc4

Browse files
committed
Add tind.io harvester for AgEcon with MODS
1 parent 9363699 commit 0697cc4

File tree

6 files changed

+107
-269
lines changed

6 files changed

+107
-269
lines changed

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
'com.peerj.xml = share.transformers.com_peerj_xml:PeerJXMLTransformer',
2020
'com.researchregistry = share.transformers.com_researchregistry:RRTransformer',
2121
'com.springer = share.transformers.com_springer:SpringerTransformer',
22-
'edu.ageconsearch = share.transformers.edu_ageconsearch:AgeconTransformer',
2322
'edu.gwu = share.transformers.edu_gwu:GWScholarSpaceTransformer',
2423
'edu.harvarddataverse = share.transformers.edu_harvarddataverse:HarvardTransformer',
2524
'gov.clinicaltrials = share.transformers.gov_clinicaltrials:ClinicalTrialsTransformer',
@@ -57,7 +56,6 @@
5756
'com.peerj = share.harvesters.com_peerj:PeerJHarvester',
5857
'com.researchregistry = share.harvesters.com_researchregistry:ResearchRegistryHarvester',
5958
'com.springer = share.harvesters.com_springer:SpringerHarvester',
60-
'edu.ageconsearch = share.harvesters.edu_ageconsearch:AgEconHarvester',
6159
'edu.gwu = share.harvesters.edu_gwu:GWScholarSpaceHarvester',
6260
'edu.harvarddataverse = share.harvesters.edu_harvarddataverse:HarvardDataverseHarvester',
6361
'gov.clinicaltrials = share.harvesters.gov_clinicaltrials:ClinicalTrialsHarvester',
@@ -67,6 +65,7 @@
6765
'gov.scitech = share.harvesters.gov_scitech:SciTechHarvester',
6866
'gov.usgs = share.harvesters.gov_usgs:USGSHarvester',
6967
'io.osf = share.harvesters.io_osf:OSFHarvester',
68+
'io.tind = share.harvesters.io_tind:TindHarvester',
7069
'oai = share.harvesters.oai:OAIHarvester',
7170
'org.arxiv = share.harvesters.org_arxiv:ArxivHarvester',
7271
'org.biorxiv = share.harvesters.org_biorxiv:BiorxivHarvester',

share/harvesters/edu_ageconsearch.py

Lines changed: 0 additions & 117 deletions
This file was deleted.

share/harvesters/io_tind.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
import logging
2+
import dateutil
3+
4+
from furl import furl
5+
from lxml import etree
6+
import pendulum
7+
8+
from share.harvest import BaseHarvester
9+
10+
# Name the logger after this module so its records can be filtered and
# attributed per module; the quoted literal '__name__' would instead create a
# single logger literally called "__name__".
logger = logging.getLogger(__name__)
11+
12+
13+
class TindHarvester(BaseHarvester):
    """Harvest records from a TIND (tind.io) search endpoint, one page at a time.

    Expected harvester kwargs:
        collection: collection name to harvest
        page_size: records per request
        format_code:
            'xo': MODS XML
            'xd': Dublin Core-ish XML
            'xm': MARC XML
            'hm': MARC
            'hb': HTML

    NOTE: ``do_harvest`` extracts records with a MODS-specific XPath, so only
    responses shaped like a MODS collection produce any records.

    API Query Parameters:
        dt (type of date filter: 'm' for date modified)
        d1d (start of date range day)
        d1m (start of date range month)
        d1y (start of date range year)
        d2d (end of date range day)
        d2m (end of date range month)
        d2y (end of date range year)
        sc (split by collection: 0 or 1)
        sf (sort field: e.g. 'latest first')
        so (sort order: 'a' for ascending, 'd' for descending)
        rg (page size)
        jrec (offset)
        of (format code, see above)
    """
    VERSION = 1

    # Prefix map used by every XPath query in this class.
    namespaces = {
        'mods': 'http://www.loc.gov/mods/v3',
    }

    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
        """Yield ``(record_id, record_xml)`` for records modified in the date range.

        Pages through the search endpoint at ``self.config.base_url`` using
        the ``collection``, ``format_code`` and ``page_size`` harvester kwargs,
        and stops at the first page that contains no MODS records.

        Args:
            start_date: start of the modified-date filter; only its day, month
                and year are sent to the API.
            end_date: end of the modified-date filter; only its day, month and
                year are sent to the API.

        Yields:
            Tuples of the record's ``mods:recordIdentifier`` text and the
            record element serialized to a ``str``. Records without a
            non-empty identifier are skipped with a warning.

        Raises:
            Whatever ``resp.raise_for_status()`` raises for an unsuccessful
            HTTP response.
        """
        page_size = self.kwargs['page_size']
        offset = 1
        url = furl(self.config.base_url)
        url.args.update({
            'c': self.kwargs['collection'],
            'of': self.kwargs['format_code'],
            'rg': page_size,
            'dt': 'm',
            'd1d': start_date.day,
            'd1m': start_date.month,
            'd1y': start_date.year,
            'd2d': end_date.day,
            'd2m': end_date.month,
            'd2y': end_date.year,
            'sc': 0,  # Splitting by collection screws up the page size
            'sf': 'latest first',
            'so': 'd',
        })

        # The parser configuration never changes, so build it once instead of
        # once per page. recover=True lets lxml tolerate malformed markup.
        parser = etree.XMLParser(recover=True)

        while True:
            logger.debug('Making request to %s', url.url)
            resp = self.requests.get(url.url)
            resp.raise_for_status()

            parsed = etree.fromstring(resp.content, parser=parser)
            records = parsed.xpath('/modsCollection/mods:mods', namespaces=self.namespaces)
            if not records:
                # An empty page means we have walked past the last result.
                break

            for record in records:
                identifiers = record.xpath(
                    'mods:recordInfo/mods:recordIdentifier',
                    namespaces=self.namespaces,
                )
                record_id = identifiers[0].text if identifiers else None
                if not record_id:
                    # Without an identifier the record cannot be keyed; skip it
                    # rather than abort the whole harvest with an IndexError.
                    logger.warning('Skipping record without a recordIdentifier from %s', url.url)
                    continue
                yield (record_id, etree.tostring(record, encoding=str))

            # The first request omits 'jrec'; later requests advance the
            # offset by one page each time (1, 1 + page_size, ...).
            offset += page_size
            url.args['jrec'] = offset

share/sources/edu.ageconsearch/source.yaml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
configs:
2-
- base_url: http://ageconsearch.umn.edu/browse-date
2+
- base_url: http://ageconsearch.tind.io/search
33
disabled: false
44
earliest_date: null
5-
harvester: edu.ageconsearch
6-
harvester_kwargs: {}
7-
label: edu.ageconsearch
5+
harvester: io.tind
6+
harvester_kwargs:
7+
collection: AgEcon Search
8+
page_size: 100
9+
format_code: xo
10+
label: edu.ageconsearch.tind
811
rate_limit_allowance: 1
912
rate_limit_period: 2
10-
transformer: edu.ageconsearch
11-
transformer_kwargs: {}
13+
transformer: mods
14+
transformer_kwargs:
15+
emitted_type: Preprint
1216
home_page: http://ageconsearch.umn.edu/
1317
long_title: AgEcon Search
1418
name: edu.ageconsearch

share/transformers/edu_ageconsearch.py

Lines changed: 0 additions & 138 deletions
This file was deleted.

0 commit comments

Comments
 (0)