2
2
import os
3
3
import shutil
4
4
import time
5
+ import tempfile
5
6
import zipfile
6
7
from datetime import datetime , timedelta , timezone
7
8
from urllib .request import urlretrieve
9
+ from urllib .parse import urlparse , urlunparse
8
10
9
11
from .base import ContentProviderException
10
12
from .doi import DoiProvider
13
+ from ..utils import is_doi
11
14
12
15
13
16
class Hydroshare (DoiProvider ):
14
17
"""Provide contents of a Hydroshare resource."""
15
18
16
- def _fetch_version (self , host ):
17
- """Fetch resource modified date and convert to epoch"""
18
- json_response = self .session .get (host ["version" ].format (self .resource_id )).json ()
19
+ HYDROSHARE_DOMAINS = ["www.hydroshare.org" ]
20
+
21
+ def get_version (self , resource_id : str ) -> str :
22
+ """
23
+ Get current version of given resource_id
24
+ """
25
+ api_url = f"https://{ self .HYDROSHARE_DOMAIN } /hsapi/resource/{ resource_id } /scimeta/elements"
26
+
27
+ json_response = self .session .get (api_url ).json ()
19
28
date = next (
20
29
item for item in json_response ["dates" ] if item ["type" ] == "modified"
21
30
)["start_date" ]
@@ -26,7 +35,7 @@ def _fetch_version(self, host):
26
35
# truncate the timestamp
27
36
return str (int (epoch ))
28
37
29
def detect(self, spec, ref=None, extra_args=None):
    """Trigger this provider for things that resolve to a Hydroshare resource.

    ``spec`` is either a DOI, which is resolved through ``self.doi2url``,
    or a URL, which is used as given. ``ref`` and ``extra_args`` are
    accepted but not used by this method.

    Returns the URL when its network location is listed in
    ``self.HYDROSHARE_DOMAINS``; returns ``None`` otherwise.
    """
    # Our spec could be a doi that resolves to a hydroshare URL, or a hydroshare URL
    url = self.doi2url(spec) if is_doi(spec) else spec

    # Only the host part decides the match; the path is parsed later, when
    # the resource is actually fetched.
    if urlparse(url).netloc in self.HYDROSHARE_DOMAINS:
        return url
    return None
52
62
53
63
def _urlretrieve (self , bag_url ):
54
64
return urlretrieve (bag_url )
55
65
56
66
def fetch (self , spec , output_dir , yield_output = False , timeout = 120 ):
57
67
"""Fetch and unpack a Hydroshare resource"""
58
- resource_id = spec ["resource" ]
59
- host = spec ["host" ]
68
+ url = spec
69
+ print (url )
70
+ parts = urlparse (url )
71
+ self .resource_id = parts .path .strip ("/" ).rsplit ("/" , maxsplit = 1 )[1 ]
60
72
61
- bag_url = f' { host [ "django_irods" ] } { resource_id } '
73
+ bag_url = urlunparse ( parts . _replace ( path = f "django_irods/download/bags/ { self . resource_id } " ))
62
74
63
75
yield f"Downloading { bag_url } .\n "
64
76
@@ -87,16 +99,17 @@ def fetch(self, spec, output_dir, yield_output=False, timeout=120):
87
99
filehandle , _ = self ._urlretrieve (bag_url )
88
100
zip_file_object = zipfile .ZipFile (filehandle , "r" )
89
101
yield "Downloaded, unpacking contents.\n "
90
- zip_file_object .extractall ("temp" )
91
- # resources store the contents in the data/contents directory, which is all we want to keep
92
- contents_dir = os .path .join ("temp" , self .resource_id , "data" , "contents" )
93
- files = os .listdir (contents_dir )
94
- for f in files :
95
- shutil .move (os .path .join (contents_dir , f ), output_dir )
96
- yield "Finished, cleaning up.\n "
97
- shutil .rmtree ("temp" )
102
+
103
+ with tempfile .TemporaryDirectory () as d :
104
+ zip_file_object .extractall (d )
105
+ # resources store the contents in the data/contents directory, which is all we want to keep
106
+ contents_dir = os .path .join (d , self .resource_id , "data" , "contents" )
107
+ files = os .listdir (contents_dir )
108
+ for f in files :
109
+ shutil .move (os .path .join (contents_dir , f ), output_dir )
110
+ yield "Finished, cleaning up.\n "
98
111
99
112
@property
def content_id(self):
    """Identifier for the fetched content: the HydroShare resource ID.

    This is the string form of ``self.resource_id``; nothing else is
    appended to it.
    """
    return str(self.resource_id)
0 commit comments