Commit cd9911c

Cleanup hydroshare provider
- Cleanup how the provider is detected, as we were simply doing a domain check but with many extra steps
- Move the tests to be real integration tests
- Test detection, not content_id

1 parent aec0e02 commit cd9911c
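The gist of the detection change, as a minimal sketch rather than the exact code in the diff below: resolve the spec to a URL if it is a DOI, then claim it only when the URL's host is a known HydroShare domain. The helper name and the injected is_doi/doi2url callables here are illustrative stand-ins for the provider's own machinery.

from urllib.parse import urlparse

HYDROSHARE_DOMAINS = ["www.hydroshare.org"]


def looks_like_hydroshare(spec, is_doi, doi2url):
    """Illustrative stand-in for the provider's detect(): return the resolved
    URL when its host is a HydroShare domain, otherwise None."""
    # A spec may be a DOI that resolves to a HydroShare URL, or already be a URL.
    url = doi2url(spec) if is_doi(spec) else spec
    if urlparse(url).netloc in HYDROSHARE_DOMAINS:
        return url
    return None


# Example: a plain HydroShare URL needs no DOI resolution.
print(looks_like_hydroshare(
    "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61",
    is_doi=lambda s: False,
    doi2url=lambda s: s,
))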

File tree

3 files changed: +100, -216 lines
repo2docker/contentproviders/hydroshare.py

Lines changed: 41 additions & 28 deletions
@@ -2,20 +2,29 @@
 import os
 import shutil
 import time
+import tempfile
 import zipfile
 from datetime import datetime, timedelta, timezone
 from urllib.request import urlretrieve
+from urllib.parse import urlparse, urlunparse
 
 from .base import ContentProviderException
 from .doi import DoiProvider
+from ..utils import is_doi
 
 
 class Hydroshare(DoiProvider):
     """Provide contents of a Hydroshare resource."""
 
-    def _fetch_version(self, host):
-        """Fetch resource modified date and convert to epoch"""
-        json_response = self.session.get(host["version"].format(self.resource_id)).json()
+    HYDROSHARE_DOMAINS = ["www.hydroshare.org"]
+
+    def get_version(self, resource_id: str) -> str:
+        """
+        Get current version of given resource_id
+        """
+        api_url = f"https://{self.HYDROSHARE_DOMAIN}/hsapi/resource/{resource_id}/scimeta/elements"
+
+        json_response = self.session.get(api_url).json()
         date = next(
             item for item in json_response["dates"] if item["type"] == "modified"
         )["start_date"]
@@ -26,7 +35,7 @@ def _fetch_version(self, host):
         # truncate the timestamp
         return str(int(epoch))
 
-    def detect(self, doi, ref=None, extra_args=None):
+    def detect(self, spec, ref=None, extra_args=None):
         """Trigger this provider for things that resolve to a Hydroshare resource"""
         hosts = [
             {
@@ -35,30 +44,33 @@ def detect(self, doi, ref=None, extra_args=None):
                     "http://www.hydroshare.org/resource/",
                 ],
                 "django_irods": "https://www.hydroshare.org/django_irods/download/bags/",
-                "version": "https://www.hydroshare.org/hsapi/resource/{}/scimeta/elements",
+                "version": "",
             }
         ]
-        url = self.doi2url(doi)
-
-        for host in hosts:
-            if any([url.startswith(s) for s in host["hostname"]]):
-                self.resource_id = url.strip("/").rsplit("/", maxsplit=1)[1]
-                self.version = self._fetch_version(host)
-                return {
-                    "resource": self.resource_id,
-                    "host": host,
-                    "version": self.version,
-                }
+
+        # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
+        if is_doi(spec):
+            url = self.doi2url(spec)
+        else:
+            url = spec
+
+        parsed = urlparse(url)
+
+        print(url)
+        if parsed.netloc in self.HYDROSHARE_DOMAINS:
+            return url
 
     def _urlretrieve(self, bag_url):
        return urlretrieve(bag_url)
 
     def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         """Fetch and unpack a Hydroshare resource"""
-        resource_id = spec["resource"]
-        host = spec["host"]
+        url = spec
+        print(url)
+        parts = urlparse(url)
+        self.resource_id = parts.path.strip("/").rsplit("/", maxsplit=1)[1]
 
-        bag_url = f'{host["django_irods"]}{resource_id}'
+        bag_url = urlunparse(parts._replace(path=f"django_irods/download/bags/{self.resource_id}"))
 
         yield f"Downloading {bag_url}.\n"
 
@@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
         filehandle, _ = self._urlretrieve(bag_url)
         zip_file_object = zipfile.ZipFile(filehandle, "r")
         yield "Downloaded, unpacking contents.\n"
-        zip_file_object.extractall("temp")
-        # resources store the contents in the data/contents directory, which is all we want to keep
-        contents_dir = os.path.join("temp", self.resource_id, "data", "contents")
-        files = os.listdir(contents_dir)
-        for f in files:
-            shutil.move(os.path.join(contents_dir, f), output_dir)
-        yield "Finished, cleaning up.\n"
-        shutil.rmtree("temp")
+
+        with tempfile.TemporaryDirectory() as d:
+            zip_file_object.extractall(d)
+            # resources store the contents in the data/contents directory, which is all we want to keep
+            contents_dir = os.path.join(d, self.resource_id, "data", "contents")
+            files = os.listdir(contents_dir)
+            for f in files:
+                shutil.move(os.path.join(contents_dir, f), output_dir)
+            yield "Finished, cleaning up.\n"
 
     @property
     def content_id(self):
         """The HydroShare resource ID"""
-        return f"{self.resource_id}.v{self.version}"
+        return f"{self.resource_id}"
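
For orientation, a minimal usage sketch of the reworked contract, assuming network access and reusing the DOI from the tests below: detect() now returns the resolved resource URL (or None) instead of a dict, and that URL is what fetch() consumes.

from tempfile import TemporaryDirectory

from repo2docker.contentproviders import Hydroshare

provider = Hydroshare()
spec = "10.4211/hs.b8f6eae9d89241cf8b5904033460af61"  # DOI reused from the tests below

# detect() resolves the DOI and returns the HydroShare URL, or None for non-HydroShare specs.
url = provider.detect(spec)
if url is not None:
    with TemporaryDirectory() as output_dir:
        # fetch() is a generator that yields progress messages while it downloads
        # and unpacks the resource bag into output_dir.
        for message in provider.fetch(url, output_dir):
            print(message, end="")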
New integration test file

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+import os
+import hashlib
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from repo2docker.contentproviders import Hydroshare
+
+
+@pytest.mark.parametrize(
+    ("spec", "url"),
+    [
+        # Test a hydroshare DOI
+        ("10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Hydroshare DOI in a different form
+        ("https://doi.org/10.4211/hs.b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a non-hydroshare DOI
+        ("doi:10.7910/DVN/TJCLKP", None),
+        # Test a hydroshare URL
+        ("http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61", "http://www.hydroshare.org/resource/b8f6eae9d89241cf8b5904033460af61"),
+        # Test a random URL
+        ("https://www.eff.org/cyberspace-independence", None)
+    ]
+)
+def test_detect(spec, url):
+    assert Hydroshare().detect(spec) == url
+
+
+@pytest.mark.parametrize(
+    ("specs", "md5tree"),
+    [
+        (
+            ("https://www.hydroshare.org/resource/8f7c2f0341ef4180b0dbe97f59130756/", ),
+            {
+                "binder/Dockerfile": "872ab0ef22645a42a5560eae640cdc77",
+                "README.md": "88ac547c3a5f616f6d26e0689d63a113",
+                "notebooks/sanity-check.ipynb": "7fc4c455bc8cd244479f4d2282051ee6"
+            },
+        ),
+    ],
+)
+def test_fetch(specs: list[str], md5tree):
+    dv = Hydroshare()
+
+    for spec in specs:
+        with TemporaryDirectory() as d:
+            output = []
+            for l in dv.fetch(dv.detect(spec), d):
+                output.append(l)
+
+            # Verify md5 sum of the files we expect to find
+            # We are using md5 instead of something more secure because that is what
+            # dataverse itself uses
+            for subpath, expected_sha in md5tree.items():
+                with open(os.path.join(d, subpath), "rb") as f:
+                    h = hashlib.md5()
+                    h.update(f.read())
+                    assert h.hexdigest() == expected_sha
+

tests/unit/contentproviders/test_hydroshare.py

Lines changed: 0 additions & 188 deletions
This file was deleted.
