Skip to content

Commit 35931df

Browse files
committed
BREAKING: remove old functions and refactor others to solely use S3
1 parent 3607ec8 commit 35931df

File tree

7 files changed

+17
-105
lines changed

7 files changed

+17
-105
lines changed

CHANGELOG

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
0.13.0
2+
- BREAKING: remove `get_resource_path`, `get_dcor_depot_path`,
3+
`get_dcor_users_depot_path`
4+
- BREAKING: `wait_for_resource` and `get_dc_instance` now only check S3 data
25
- feat: add `is_resource_private` method
36
- enh: add "artifact" kwarg to `get_s3_dc_handle`
47
- enh: partial resource ID in basin names for `get_s3_dc_handle_basin_based`

dcor_shared/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# flake8: noqa: F401
22
from .ckan import (
3-
get_ckan_config_option, get_resource_dc_config, get_resource_path,
3+
get_ckan_config_option, get_resource_dc_config,
44
get_resource_info, is_resource_private
55
)
6-
from .data import DUMMY_BYTES, wait_for_resource
6+
from .data import wait_for_resource
77
from .dcinst import get_dc_instance
88
from .mime import DC_MIME_TYPES, VALID_FORMATS
99
from . import paths

dcor_shared/ckan.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
22
import pathlib
3-
import warnings
43

54
from .parse import get_ini_config_option
65

@@ -73,30 +72,6 @@ def get_resource_info(resource_id):
7372
return ds_dict, res_dict
7473

7574

76-
def get_resource_path(resource_id, create_dirs=False):
77-
"""Return the expected local path for a resource identifier
78-
79-
If `create_dirs` is True, create the parent directory tree.
80-
"""
81-
warnings.warn("`get_resource_path` should not be used since DCOR moved "
82-
"to storing data solely on S3",
83-
DeprecationWarning)
84-
rid = resource_id
85-
resources_path = get_ckan_storage_path() / "resources"
86-
pdir = resources_path / rid[:3] / rid[3:6]
87-
path = pdir / rid[6:]
88-
if create_dirs:
89-
try:
90-
pdir.mkdir(parents=True, exist_ok=True)
91-
os.makedirs(pdir)
92-
os.chown(pdir,
93-
os.stat(resources_path).st_uid,
94-
os.stat(resources_path).st_gid)
95-
except OSError:
96-
pass
97-
return pathlib.Path(path)
98-
99-
10075
def is_resource_private(resource_id):
10176
"""Return true if a resource is private"""
10277
from ckan import model

dcor_shared/data.py

Lines changed: 8 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,10 @@
11
import functools
2-
import pathlib
32
import time
43

5-
from .ckan import get_resource_path
64
from . import s3cc
75
from .util import sha256sum # noqa: F401
86

97

10-
#: Content of the dummy file created when importing data.
11-
DUMMY_BYTES = b"[Data import pending]"
12-
13-
148
@functools.lru_cache(maxsize=100)
159
def wait_for_resource(resource_id: str,
1610
timeout: float = 10):
@@ -19,35 +13,21 @@ def wait_for_resource(resource_id: str,
1913
This function can be used by other plugins to ensure that
2014
a resource is available for processing.
2115
22-
There are multiple ways for data to become available:
23-
24-
1. The ckanext-dcor_depot plugin imports data by touching
25-
dummy files and then sym-linking to data on disk. Here
26-
we just check that the file is not a dummy file anymore.
27-
2. Legacy uploads via nginx/uwsgi directly into CKAN and onto
28-
the local block storage worked the same way. We have to wait
29-
for the dummy file to be replaced.
30-
3. The new (2024) way of uploading data is via pre-signed URLs
31-
to an S3 instance. Here, we have to make sure that the
32-
upload is complete and the file exists. If this is the case,
33-
then uploads should have already completed when this function
34-
is called, so we only check for the existence of the resource
35-
in ckan and whether the `s3_available` attribute is defined.
16+
The only way (since 2024) of uploading data is via pre-signed URLs
17+
to an S3 instance. Here, we have to make sure that the
18+
upload is complete and the file exists. If this is the case,
19+
then uploads should have already completed when this function
20+
is called, so we only check for the existence of the resource
21+
in ckan and whether the `s3_available` attribute is defined.
3622
"""
37-
from ckan.common import config
3823
from ckan import logic
3924

4025
if len(resource_id) != 36:
4126
raise ValueError(f"Invalid resource id: {resource_id}")
4227

4328
resource_show = logic.get_action("resource_show")
44-
path = pathlib.Path(get_resource_path(resource_id))
4529

46-
dcor_depot_available = "dcor_depot" in config.get('ckan.plugins', "")
47-
# Initially this was set to 10s, but if `/data` is mounted on a
48-
# network share then this part here just takes too long.
4930
t0 = time.time()
50-
ld = len(DUMMY_BYTES)
5131
while True:
5232
try:
5333
res_dict = resource_show(context={'ignore_auth': True,
@@ -74,21 +54,5 @@ def wait_for_resource(resource_id: str,
7454
break
7555

7656
if time.time() - t0 > timeout:
77-
raise OSError("Data import seems to take too long "
78-
"for '{}'!".format(path))
79-
elif not path.exists():
80-
time.sleep(5)
81-
continue
82-
elif dcor_depot_available and not path.is_symlink():
83-
# Resource is only available when it is symlinked by
84-
# the ckanext.dcor_depot `symlink_user_dataset` job
85-
# (or by the ckanext.dcor_depot importers).
86-
time.sleep(5)
87-
continue
88-
elif path.stat().st_size == ld and path.read_bytes() == DUMMY_BYTES:
89-
# wait a bit
90-
time.sleep(5)
91-
continue
92-
else:
93-
# not a dummy file
94-
break
57+
raise OSError(f"Data import seems to take too long "
58+
f"for '{resource_id}'!")

dcor_shared/dcinst.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,10 @@
1-
from dclab.rtdc_dataset import RTDC_HDF5
2-
3-
from .ckan import get_resource_path
41
from . import s3cc
52

63

74
def get_dc_instance(rid):
85
"""Return an instance of dclab's `RTDCBase` for a resource identifier"""
9-
# Try local file first
10-
path = get_resource_path(rid)
11-
if path.is_file():
12-
# Disable basins, because they could point to files on the local
13-
# file system (security).
14-
return RTDC_HDF5(path, enable_basins=False)
6+
# The resource must be on S3
7+
if s3cc.artifact_exists(rid):
8+
return s3cc.get_s3_dc_handle(rid)
159
else:
16-
# The resource must be on S3
17-
if s3cc.artifact_exists(rid):
18-
return s3cc.get_s3_dc_handle(rid)
19-
else:
20-
raise ValueError(f"Could not find resource {rid} anywhere")
10+
raise ValueError(f"Could not find resource {rid} anywhere")

dcor_shared/paths.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,7 @@
11
import pathlib
22

3-
from .ckan import get_ckan_config_path, get_ckan_config_option
43
from .ckan import get_ckan_storage_path, get_ckan_webassets_path # noqa: F401
54

6-
from .parse import get_ini_config_option
7-
8-
9-
def get_dcor_depot_path():
10-
return pathlib.Path(get_ckan_config_option(
11-
"ckanext.dcor_depot.depots_path"))
12-
13-
14-
def get_dcor_users_depot_path():
15-
depot = get_dcor_depot_path()
16-
return depot / get_ini_config_option(
17-
"ckanext.dcor_depot.users_depot_name",
18-
get_ckan_config_path())
19-
205

216
def get_nginx_config_path():
227
return pathlib.Path("/etc/nginx/sites-enabled/ckan")

tests/test_basic.py

Lines changed: 0 additions & 5 deletions
This file was deleted.

0 commit comments

Comments
 (0)