From fb954006764a50b82f28c0dc07a8695b6aab4635 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 31 Jan 2022 12:08:44 +0000
Subject: [PATCH 1/4] added wildcard search for a given endpoint and key

---
 fairdatapipeline/__init__.py |  3 +-
 fairdatapipeline/pipeline.py | 59 ++++++++++++++++++++++++++++++++++++
 tests/ext/find_csv.yaml      | 20 ++++++++++++
 tests/test_raise_issue.py    | 32 +++++++++++++++++++
 4 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 tests/ext/find_csv.yaml

diff --git a/fairdatapipeline/__init__.py b/fairdatapipeline/__init__.py
index 46217a6e..b4302fe4 100644
--- a/fairdatapipeline/__init__.py
+++ b/fairdatapipeline/__init__.py
@@ -3,6 +3,7 @@
     "link_read",
     "link_write",
     "finalise",
+    "search",
     "raise_issue_by_data_product",
     "raise_issue_by_index",
     "raise_issue_with_config",
@@ -15,7 +16,7 @@
 from .fdp_utils import get_handle_index_from_path
 from .link import link_read, link_write
 
-from .pipeline import finalise, initialise
+from .pipeline import finalise, initialise, search
 from .raise_issue import (
     raise_issue_by_data_product,
     raise_issue_by_existing_data_product,

diff --git a/fairdatapipeline/pipeline.py b/fairdatapipeline/pipeline.py
index c0a71e58..48e790cb 100644
--- a/fairdatapipeline/pipeline.py
+++ b/fairdatapipeline/pipeline.py
@@ -1,12 +1,71 @@
 import datetime
+import json
 import logging
 import os
 
+import requests
 import yaml
 
 from fairdatapipeline import fdp_utils
 
 
+def search(
+    token: str,
+    config: str,
+    script: str,
+    wildcard: str,
+    endpoint: str,
+    key: str,
+) -> str:
+    """Reads in token, config file, script, wildcard, endpoint and key and returns a list of items found at the endpoint where the key contains the wildcard.
+
+    Args:
+        | token: registry token
+        | config: Path to config file
+        | script: Path to script file
+        | wildcard: string to search for
+        | endpoint: api endpoint where the search is performed
+        | key: key of the results dictionary where the wildcard is searched
+
+    Returns:
+        | json: a list of dictionaries where the wildcard was found
+    """
+    # Validate YAMLs
+    if not fdp_utils.is_valid_yaml(config):
+        raise ValueError("Config is not a valid YAML file")
+    if not fdp_utils.is_file(script):
+        raise ValueError("Script does not exist")
+
+    # Read config file and extract run metadata
+    with open(config, "r") as data:
+        config_yaml = yaml.safe_load(data)
+    run_metadata = config_yaml["run_metadata"]
+    registry_url = run_metadata["local_data_registry_url"]
+    if registry_url[-1] != "/":
+        registry_url += "/"
+    filename = os.path.basename(config)
+
+    # @todo to be set from config
+    if "api_version" not in config_yaml["run_metadata"].keys():
+        config_yaml["run_metadata"]["api_version"] = "1.0.0"
+
+    api_version = config_yaml["run_metadata"]["api_version"]
+
+    logging.info("Reading {} from local filestore".format(filename))
+
+    headers = fdp_utils.get_headers(token=token, api_version=api_version)
+    if registry_url[-1] != "/":
+        registry_url += "/"
+    registry_url += endpoint + "/?"
+
+    response = requests.get(registry_url, headers=headers)
+    data = response.json()
+
+    res = [obj for obj in data["results"] if wildcard in obj[key]]  # type: ignore
+
+    return json.dumps(res, indent=4, sort_keys=True)
+
+
 def initialise(token: str, config: str, script: str) -> dict:
     """Reads in token, config file and script, creates necessary registry items and creates new code run.
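For orientation, a minimal usage sketch of the new search() export as introduced by this patch. The token value and the script path are placeholders (not taken from the patch); the "data_product" endpoint and "name" key are the ones exercised by the tests added below:

    import json

    import fairdatapipeline as pipeline

    token = "<registry token>"  # placeholder: a valid local-registry token
    results = pipeline.search(
        token,
        "tests/ext/find_csv.yaml",
        "path/to/script.sh",  # placeholder submission script
        wildcard="find",
        endpoint="data_product",
        key="name",
    )
    matches = json.loads(results)  # search() returns a JSON-encoded list
    for match in matches:
        print(match["name"])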
diff --git a/tests/ext/find_csv.yaml b/tests/ext/find_csv.yaml
new file mode 100644
index 00000000..110ef979
--- /dev/null
+++ b/tests/ext/find_csv.yaml
@@ -0,0 +1,20 @@
+run_metadata:
+  description: Write csv file
+  local_data_registry_url: http://127.0.0.1:8000/api/
+  remote_data_registry_url: https://data.scrc.uk/api/
+  default_input_namespace: testing
+  default_output_namespace: testing
+  write_data_store: tmp
+  local_repo: ./
+  script: |-
+    python3 py.test
+  public: true
+  latest_commit: 221bfe8b52bbfb3b2dbdc23037b7dd94b49aaa70
+  remote_repo: https://github.com/FAIRDataPipeline/pyDataPipeline
+
+write:
+- data_product: find/csv
+  description: test csv to test find data products
+  file_type: csv
+  use:
+    version: 0.0.1

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 01fabdce..27ff6ef9 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -1,3 +1,4 @@
+import json
 import os
 import shutil
 
@@ -29,6 +30,11 @@ def config(test_dir: str) -> str:
     return os.path.join(test_dir, "write_csv.yaml")
 
 
+@pytest.fixture
+def fconfig(test_dir: str) -> str:
+    return os.path.join(test_dir, "find_csv.yaml")
+
+
 @pytest.mark.pipeline
 def test_initialise(token: str, config: str, script: str) -> None:
     handle = pipeline.initialise(token, config, script)
@@ -122,6 +128,32 @@ def test_link_read(
     assert type(link_read_1) == str and type(link_read_2) == str
 
 
+@pytest.mark.pipeline
+def test_find_data_product(
+    token: str,
+    fconfig: str,
+    script: str,
+    test_dir: str,
+    query: str = "find",
+    key: str = "name",
+) -> None:
+
+    handle = pipeline.initialise(token, fconfig, script)
+    tmp_csv = os.path.join(test_dir, "test.csv")
+    link_write = pipeline.link_write(handle, "find/csv")
+    shutil.copy(tmp_csv, link_write)
+    pipeline.finalise(token, handle)
+
+    config = os.path.join(test_dir, "read_csv.yaml")
+    results = pipeline.search(
+        token, config, script, query, "data_product", "name"
+    )
+    res = json.loads(results)
+    assert len(res) == 1
+    result = fdp_utils.get_first_entry(res)
+    assert query in result[key]
+
+
 @pytest.mark.pipeline
 def test_link_read_data_product_exists(
     token: str, config: str, script: str, test_dir: str
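Worth noting: the "wildcard" here is a plain substring test (`wildcard in obj[key]`), not a glob or regex, and the filtering happens client-side after fetching the full endpoint listing. A standalone sketch of the underlying query, assuming the local registry address used by the test configs and a placeholder token:

    import requests

    from fairdatapipeline import fdp_utils

    token = "<registry token>"  # placeholder
    headers = fdp_utils.get_headers(token=token, api_version="1.0.0")
    data = requests.get(
        "http://127.0.0.1:8000/api/data_product/?", headers=headers
    ).json()
    # keep only entries whose "name" contains the substring "find"
    hits = [obj for obj in data["results"] if "find" in obj["name"]]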
From b88b63d11727a9a48c42b4aed2be445ff0940d11 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 28 Feb 2022 12:16:19 +0000
Subject: [PATCH 2/4] updated search data product

now the search function uses a handle, token, wildcard, endpoint and key
---
 data_pipeline_api/pipeline.py | 39 ++++++------------
 pyproject.toml                | 78 ++++++++++++++++-------------------
 tests/test_raise_issue.py     | 11 +++--
 3 files changed, 54 insertions(+), 74 deletions(-)

diff --git a/data_pipeline_api/pipeline.py b/data_pipeline_api/pipeline.py
index f2925fa3..a6eaec7e 100644
--- a/data_pipeline_api/pipeline.py
+++ b/data_pipeline_api/pipeline.py
@@ -10,19 +10,18 @@
 
 
 def search(
+    handle: dict,
     token: str,
-    config: str,
-    script: str,
     wildcard: str,
     endpoint: str,
     key: str,
 ) -> str:
-    """Reads in token, config file, script, wildcard, endpoint and key and returns a list of items found at the endpoint where the key contains the wildcard.
+    """
+    Reads registry information from the handle and returns a list of items found at the endpoint where the key contains the wildcard.
 
     Args:
+        | handle: pipeline handle
         | token: registry token
-        | config: Path to config file
-        | script: Path to script file
         | wildcard: string to search for
         | endpoint: api endpoint where the search is performed
         | key: key of the results dictionary where the wildcard is searched
@@ -30,28 +29,14 @@ def search(
 
     Returns:
         | json: a list of dictionaries where the wildcard was found
     """
-    # Validate YAMLs
-    if not fdp_utils.is_valid_yaml(config):
-        raise ValueError("Config is not a valid YAML file")
-    if not fdp_utils.is_file(script):
-        raise ValueError("Script does not exist")
-
-    # Read config file and extract run metadata
-    with open(config, "r") as data:
-        config_yaml = yaml.safe_load(data)
-    run_metadata = config_yaml["run_metadata"]
-    registry_url = run_metadata["local_data_registry_url"]
-    if registry_url[-1] != "/":
-        registry_url += "/"
-    filename = os.path.basename(config)
-
-    # @todo to be set from config
-    if "api_version" not in config_yaml["run_metadata"].keys():
-        config_yaml["run_metadata"]["api_version"] = "1.0.0"
-
-    api_version = config_yaml["run_metadata"]["api_version"]
-
-    logging.info("Reading {} from local filestore".format(filename))
+    try:
+        api_version = handle["yaml"]["run_metadata"]["api_version"]
+        registry_url = handle["yaml"]["run_metadata"][
+            "local_data_registry_url"
+        ]
+    except KeyError:
+        return json.dumps([], indent=4, sort_keys=True)
 
     headers = fdp_utils.get_headers(token=token, api_version=api_version)
     if registry_url[-1] != "/":
@@ -59,6 +44,8 @@ def search(
     registry_url += endpoint + "/?"
 
     response = requests.get(registry_url, headers=headers)
+    if response.status_code != 200:
+        return json.dumps([], indent=4, sort_keys=True)
     data = response.json()
 
     res = [obj for obj in data["results"] if wildcard in obj[key]]  # type: ignore
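With this change the call site no longer reads the config from disk: the registry URL and API version come from the handle returned by initialise(). A sketch of the new calling convention (token, config and script values are placeholders as before):

    import json

    import data_pipeline_api as pipeline

    token = "<registry token>"  # placeholder
    handle = pipeline.initialise(token, "config.yaml", "script.sh")
    results = pipeline.search(handle, token, "find", "data_product", "name")
    matches = json.loads(results)  # "[]" comes back if the lookup fails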
diff --git a/pyproject.toml b/pyproject.toml
index 316d5562..95439309 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,14 @@
 [tool.poetry]
+authors = [
+  "Ryan J Field ",
+  "Dennis Reddyhoff ",
+  "Robert D Turner ",
+  "Bruno Viola ",
+  "Kristian Zarebski ",
+]
+description = "Python api to interact with the Fair Data Pipeline"
 name = "data_pipeline_api"
 version = "0.7.5"
-description = "Python api to interact with the Fair Data Pipeline"
-authors = [
-    "Ryan J Field ",
-    "Dennis Reddyhoff ",
-    "Robert D Turner ",
-    "Bruno Viola ",
-    "Kristian Zarebski "
-    ]
-
 homepage = "https://www.fairdatapipeline.org/"
 
 repository = "https://github.com/FAIRDataPipeline/pyDataPipeline"
 
 license = "GNU General Public License v3.0"
 
 packages = [
     {include = "data_pipeline_api"},
 ]
 
 readme = "README.md"
 
 keywords = [
     "FAIR Data Pipeline",
     "FAIR",
     "Data Management",
     "Provenance",
 ]
 
 [tool.poetry.dependencies]
+PyYAML = "^6.0"
+h5py = "^3.6.0"
+matplotlib = "^3.5.1"
 python = "^3.7.1,<3.11"
 requests = "^2.27.1"
-matplotlib = "^3.5.1"
 scipy = "^1.7.3"
-h5py = "^3.6.0"
-PyYAML = "^6.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^7.0.0"
+Sphinx = "^4.3.2"
 black = "^22.1"
-mypy = "^0.931"
 flake8 = "^4.0.1"
+isort = "^5.10.1"
+mypy = "^0.931"
 poetry = "^1.1.6"
-pytest-mock = "^3.7.0"
-pytest-dependency = "^0.5.1"
-pytest-cov = "^3.0.0"
 pre-commit = "^2.17.0"
-isort = "^5.10.1"
-Sphinx = "^4.3.2"
+pytest = "^7.0.0"
+pytest-cov = "^3.0.0"
+pytest-dependency = "^0.5.1"
+pytest-mock = "^3.7.0"
 sphinx-rtd-theme = "^1.0.0"
 
 [build-system]
-requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
-
+requires = ["poetry-core>=1.0.0"]
 
 [tool.poetry.urls]
 "Issue Tracker" = "https://github.com/FAIRDataPipeline/pyDataPipeline/issues"
-
-
 [tool.isort]
-profile = 'black'
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-use_parentheses = true
 ensure_newline_before_comments = true
+force_grid_wrap = 0
+include_trailing_comma = true
 line_length = 79
-
-
+multi_line_output = 3
+profile = 'black'
+use_parentheses = true
 
 [tool.black]
 line-length = 79
 
 [tool.pytest.ini_options]
-addopts = '-s -v --cov=fairdatapipeline --cov-report=html --cov-report=term'
+addopts = '-s -v --cov=data_pipeline_api --cov-report=html --cov-report=term'
 markers = [
     "pipeline: tests for 'pipeline' module ",
     "issue: tests for raising issues ",
     "utilities: tests for 'utilities' functions ",
     "apiversion: tests for api versioning ",
 ]
 
 [tool.mypy]
-ignore_missing_imports = true
-disallow_untyped_defs = true
 disallow_untyped_calls = true
+disallow_untyped_defs = true
+ignore_missing_imports = true

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 701122ec..1a09ccb9 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -134,7 +134,7 @@ def test_find_data_product(
     fconfig: str,
     script: str,
     test_dir: str,
-    query: str = "find",
+    wildcard: str = "find",
     key: str = "name",
 ) -> None:
 
@@ -143,15 +143,14 @@ def test_find_data_product(
     link_write = pipeline.link_write(handle, "find/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
-
     config = os.path.join(test_dir, "read_csv.yaml")
-    results = pipeline.search(
-        token, config, script, query, "data_product", "name"
-    )
+    handle = pipeline.initialise(token, config, script)
+
+    results = pipeline.search(handle, token, wildcard, "data_product", "name")
     res = json.loads(results)
     assert len(res) == 1
     result = fdp_utils.get_first_entry(res)
-    assert query in result[key]
+    assert wildcard in result[key]
 
 
 @pytest.mark.pipeline
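The markers registered above keep the suite filterable. A small sketch of driving the same selection programmatically rather than from the shell (pytest.main mirrors the CLI, and the addopts from pyproject.toml still apply):

    import pytest

    # Run only the tests tagged @pytest.mark.pipeline; equivalent to
    # `pytest -m pipeline tests/test_raise_issue.py` on the command line.
    pytest.main(["-m", "pipeline", "tests/test_raise_issue.py"])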
From 7d76fbc63911805b896fcac0166a4fa7b247a876 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 28 Feb 2022 15:12:44 +0000
Subject: [PATCH 3/4] added wrapper function to search data product names
 using wildcards

added more tests
---
 data_pipeline_api/__init__.py |  3 ++-
 data_pipeline_api/pipeline.py | 33 ++++++++++++++++++++++++---
 tests/test_raise_issue.py     | 42 ++++++++++++++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/data_pipeline_api/__init__.py b/data_pipeline_api/__init__.py
index b4302fe4..d85fe5a1 100644
--- a/data_pipeline_api/__init__.py
+++ b/data_pipeline_api/__init__.py
@@ -4,6 +4,7 @@
     "link_write",
     "finalise",
     "search",
+    "search_data_products",
     "raise_issue_by_data_product",
     "raise_issue_by_index",
     "raise_issue_with_config",
@@ -16,7 +17,7 @@
 from .fdp_utils import get_handle_index_from_path
 from .link import link_read, link_write
 
-from .pipeline import finalise, initialise, search
+from .pipeline import finalise, initialise, search, search_data_products
 from .raise_issue import (
     raise_issue_by_data_product,
     raise_issue_by_existing_data_product,

diff --git a/data_pipeline_api/pipeline.py b/data_pipeline_api/pipeline.py
index a6eaec7e..a6d79bb0 100644
--- a/data_pipeline_api/pipeline.py
+++ b/data_pipeline_api/pipeline.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+from typing import Optional
 
 import requests
 import yaml
@@ -9,12 +10,36 @@
 from data_pipeline_api import fdp_utils
 
 
+def search_data_products(
+    handle: dict, wildcard: str, token: Optional[str] = None
+) -> str:
+    """
+    Wrapper around search() that looks up data products by name using a wildcard.
+
+    Parameters
+    ----------
+    handle : dict
+        pipeline handle
+    wildcard : str
+        search parameter
+    token : str, optional
+        registry token, by default None
+
+    Returns
+    -------
+    str
+        JSON-encoded list of the data products that meet the search criteria
+    """
+    return search(handle, wildcard, "data_product", "name", token=token)
+
+
 def search(
     handle: dict,
-    token: str,
     wildcard: str,
     endpoint: str,
     key: str,
+    token: Optional[str] = None,
 ) -> str:
     """
     Reads registry information from the handle and returns a list of items found at the endpoint where the key contains the wildcard.
@@ -36,16 +61,18 @@ def search(
             "local_data_registry_url"
         ]
     except KeyError:
-        return json.dumps([], indent=4, sort_keys=True)
+        return json.dumps(None, indent=4, sort_keys=True)
 
     headers = fdp_utils.get_headers(token=token, api_version=api_version)
+    if not registry_url:
+        return json.dumps(None, indent=4, sort_keys=True)
     if registry_url[-1] != "/":
         registry_url += "/"
     registry_url += endpoint + "/?"
 
     response = requests.get(registry_url, headers=headers)
     if response.status_code != 200:
-        return json.dumps([], indent=4, sort_keys=True)
+        return json.dumps(None, indent=4, sort_keys=True)
     data = response.json()
 
     res = [obj for obj in data["results"] if wildcard in obj[key]]  # type: ignore

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 1a09ccb9..52c8cef6 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import os
 import shutil
 
@@ -146,13 +147,52 @@ def test_find_data_product(
     config = os.path.join(test_dir, "read_csv.yaml")
     handle = pipeline.initialise(token, config, script)
 
-    results = pipeline.search(handle, token, wildcard, "data_product", "name")
+    results = pipeline.search_data_products(handle, wildcard)
     res = json.loads(results)
     assert len(res) == 1
     result = fdp_utils.get_first_entry(res)
     assert wildcard in result[key]
 
 
+@pytest.mark.pipeline
+def test_find_data_product_empty_handle(
+    wildcard: str = "find",
+) -> None:
+    handle: dict = {}
+
+    results = pipeline.search_data_products(handle, wildcard)
+    res = json.loads(results)
+    assert res is None
+
+
+@pytest.mark.pipeline
+def test_find_data_product_wrong_registry(
+    token: str,
+    fconfig: str,
+    script: str,
+    test_dir: str,
+    wildcard: str = "find",
+    key: str = "name",
+) -> None:
+
+    handle = pipeline.initialise(token, fconfig, script)
+    tmp_csv = os.path.join(test_dir, "test.csv")
+    link_write = pipeline.link_write(handle, "find/csv")
+    shutil.copy(tmp_csv, link_write)
+    pipeline.finalise(token, handle)
+    config = os.path.join(test_dir, "read_csv.yaml")
+    handle = pipeline.initialise(token, config, script)
+    wrong_handle = copy.deepcopy(handle)
+    wrong_handle["yaml"]["run_metadata"]["local_data_registry_url"] = ""
+    results = pipeline.search_data_products(wrong_handle, wildcard)
+    res = json.loads(results)
+    assert res is None
+    handle["yaml"]["run_metadata"]["local_data_registry_url"] += "-error"
+    results = pipeline.search_data_products(handle, wildcard)
+    res = json.loads(results)
+    assert res is None
+
+
 @pytest.mark.pipeline
 def test_link_read_data_product_exists(
     token: str, config: str, script: str, test_dir: str
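A sketch of the wrapper added above, including the failure contract the new tests pin down: a JSON "null" (so json.loads gives None) when the handle lacks registry metadata, when the registry URL is empty, or when the request fails (token, config and script values are placeholders as before):

    import json

    import data_pipeline_api as pipeline

    token = "<registry token>"  # placeholder
    handle = pipeline.initialise(token, "config.yaml", "script.sh")
    results = pipeline.search_data_products(handle, "find", token=token)
    matches = json.loads(results)
    if matches is None:
        print("registry lookup failed")
    else:
        print([m["name"] for m in matches])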
From 14c8012a9d78a558c7dc1284ffb2254c673b84f4 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 28 Feb 2022 15:19:58 +0000
Subject: [PATCH 4/4] fix code smell

---
 tests/test_raise_issue.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 52c8cef6..6a36072b 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -8,6 +8,8 @@
 import data_pipeline_api as pipeline
 import data_pipeline_api.fdp_utils as fdp_utils
 
+TEST_CSV = "test.csv"
+
 
 @pytest.fixture
 def test_dir() -> str:
@@ -116,7 +118,7 @@ def test_link_read(
     token: str, config: str, script: str, test_dir: str
 ) -> None:
     handle = pipeline.initialise(token, config, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "test/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
@@ -140,7 +142,7 @@ def test_find_data_product(
 ) -> None:
 
     handle = pipeline.initialise(token, fconfig, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "find/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
@@ -176,7 +178,7 @@ def test_find_data_product_wrong_registry(
 ) -> None:
 
     handle = pipeline.initialise(token, fconfig, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "find/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
@@ -198,7 +200,7 @@ def test_link_read_data_product_exists(
     token: str, config: str, script: str, test_dir: str
 ) -> None:
     handle = pipeline.initialise(token, config, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "test/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)