Commit cd9911c

Cleanup hydroshare provider
- Cleanup how the provider is detected, as we were simply doing a domain check but with many extra steps
- Move the tests to be real integration tests
- Test detection, not content_id

1 parent aec0e02 commit cd9911c
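The gist of the detection change, as a minimal sketch rather than the exact code in the diff below: resolve the spec to a URL if it is a DOI, then claim it only when the URL's host is a known HydroShare domain. The helper name and the injected is_doi/doi2url callables here are illustrative stand-ins for the provider's own machinery.

from urllib.parse import urlparse

HYDROSHARE_DOMAINS = ["www.hydroshare.org"]


def looks_like_hydroshare(spec, is_doi, doi2url):
    """Illustrative stand-in for the provider's detect(): return the resolved
    URL when its host is a HydroShare domain, otherwise None."""
    # A spec may be a DOI that resolves to a HydroShare URL, or already be a URL.
    url = doi2url(spec) if is_doi(spec) else spec
    if urlparse(url).netloc in HYDROSHARE_DOMAINS:
        return url
    return None


# Example: a plain HydroShare URL needs no DOI resolution.
print(looks_like_hydroshare(
    "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
    is_doi=lambda s: False,
    doi2url=lambda s: s,
))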

File tree

3 files changed: +100, -216 lines
repo2docker/contentproviders/hydroshare.py

Lines changed: 41 additions & 28 deletions
@@ -2,20 +2,29 @@
 import os
 import shutil
 import time
+import tempfile
 import zipfile
 from datetime import datetime, timedelta, timezone
 from urllib.request import urlretrieve
+from urllib.parse import urlparse, urlunparse
 
 from .base import ContentProviderException
 from .doi import DoiProvider
+from ..utils import is_doi
 
 
 class Hydroshare(DoiProvider):
     """Provide contents of a Hydroshare resource."""
 
-    def _fetch_version(self, host):
-        """Fetch resource modified date and convert to epoch"""
-        json_response = self.session.get(host["version"].format(self.resource_id)).json()
+    HYDROSHARE_DOMAINS = ["www.hydroshare.org"]
+
+    def get_version(self, resource_id: str) -> str:
+        """
+        Get current version of given resource_id
+        """
+        api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements"
+
+        json_response = self.session.get(api_url).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
@@ -26,7 +35,7 @@ def _fetch_version(self, host):
         # truncate the timestamp
         return str(int(epoch))
 
-    def detect(self, doi, ref=None, extra_args=None):
+    def detect(self, spec, ref=None, extra_args=None):
         """Trigger this provider for things that resolve to a Hydroshare resource"""
         hosts = [
             {
@@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None):
                     "http://www.hydroshare.org/resource/",
                 ],
                 "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
+                "version": "",
             }
         ]
-        url = self.doi2url(doi)
-
-        for host in hosts:
-            if any([url.startswith(s) for s in host["hostname"]]):
-                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
-                self.version = self._fetch_version(host)
-                return {
-                    "resource": self.resource_id,
-                    "host": host,
-                    "version": self.version,
-                }
+
+        # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
+
+        parsed = urlparse(url)
+
+        print(url)
+        if parsed.netloc in self.HYDROSHARE_DOMAINS:
+            return url
 
     def _urlretrieve(self, bag_url):
        return urlretrieve(bag_url)
 
     def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         """Fetch and unpack a Hydroshare resource"""
-        resource_id = spec["resource"]
-        host = spec["host"]
+        url = spec
+        print(url)
+        parts = urlparse(url)
+        self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]
 
-        bag_url = f'{host["django_irods"]}{resource_id}'
+        bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}"))
 
         yield f"Downloading {bag_url}.\n"
 
@@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         filehandle, _ = self._urlretrieve(bag_url)
         zip_file_object = zipfile.ZipFile(filehandle, "r")
         yield "Downloaded, unpacking contents.\n"
-        zip_file_object.extractall("temp")
-        # resources store the contents in the data/contents directory, which is all we want to keep
-        contents_dir = os.path.join("temp", self.resource_id, "data", "contents")
-        files = os.listdir(contents_dir)
-        for f in files:
-            shutil.move(os.path.join(contents_dir, f), output_dir)
-        yield "Finished, cleaning up.\n"
-        shutil.rmtree("temp")
+
+        with tempfile.TemporaryDirectory() as d:
+            zip_file_object.extractall(d)
+            # resources store the contents in the data/contents directory, which is all we want to keep
+            contents_dir = os.path.join(d, self.resource_id, "data", "contents")
+            files = os.listdir(contents_dir)
+            for f in files:
+                shutil.move(os.path.join(contents_dir, f), output_dir)
+            yield "Finished, cleaning up.\n"
 
     @property
     def content_id(self):
         """The HydroShare resource ID"""
-        return f"{self.resource_id}.v{self.version}"
+        return f"{self.resource_id}"
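
For orientation, a minimal usage sketch of the reworked contract, assuming network access and reusing the DOI from the tests below: detect() now returns the resolved resource URL (or None) instead of a dict, and that URL is what fetch() consumes.

from tempfile import TemporaryDirectory

from repo2docker.contentproviders import Hydroshare

provider = Hydroshare()
spec = "10.4211/hs.b8f6eae9d89241cf8b5904033460af61"  # DOI reused from the tests below

# detect() resolves the DOI and returns the HydroShare URL, or None for non-HydroShare specs.
url = provider.detect(spec)
if url is not None:
    with TemporaryDirectory() as output_dir:
        # fetch() is a generator that yields progress messages while it downloads
        # and unpacks the resource bag into output_dir.
        for message in provider.fetch(url, output_dir):
            print(message, end="")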
New integration test file

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+import os
+import hashlib
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from repo2docker.contentproviders import Hydroshare
+
+
+@pytest.mark.parametrize(
+    ("spec", "url"),
+    [
+        # Test a hydroshare DOI
+        ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Hydroshare DOI in a different form
+        ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a non-hydroshare DOI
+        ("doi:10.7910/DVN/TJCLKP", None),
+        # Test a hydroshare URL
+        ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a random URL
+        ("https://www.eff.org/cyberspace-independence", None)
+    ]
+)
+def test_detect(spec, url):
+    assert Hydroshare().detect(spec) == url
+
+
+@pytest.mark.parametrize(
+    ("specs", "md5tree"),
+    [
+        (
+            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ),
+            {
+                "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
+                "README.md": "88ac547c3a5f616f6d26e0689d63a113",
+                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6"
+            },
+        ),
+    ],
+)
+def test_fetch(specs: list[str], md5tree):
+    dv = Hydroshare()
+
+    for spec in specs:
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(dv.detect(spec), d):
+                output.append(l)
+
+            # Verify md5 sum of the files we expect to find
+            # We are using md5 instead of something more secure because that is what
+            # dataverse itself uses
+            for subpath, expected_sha in md5tree.items():
+                with open(os.path.join(d, subpath), "rb") as f:
+                    h = hashlib.md5()
+                    h.update(f.read())
+                    assert h.hexdigest() == expected_sha
+

tests/unit/contentproviders/test_hydroshare.py

Lines changed: 0 additions & 188 deletions
This file was deleted.
