From fb954006764a50b82f28c0dc07a8695b6aab4635 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 31 Jan 2022 12:08:44 +0000
Subject: [PATCH 1/4] added wildcard search for a given endpoint and key

---
 fairdatapipeline/__init__.py |  3 +-
 fairdatapipeline/pipeline.py | 59 ++++++++++++++++++++++++++++++++++++
 tests/ext/find_csv.yaml      | 20 ++++++++++++
 tests/test_raise_issue.py    | 32 +++++++++++++++++++
 4 files changed, 113 insertions(+), 1 deletion(-)
 create mode 100644 tests/ext/find_csv.yaml

diff --git a/fairdatapipeline/__init__.py b/fairdatapipeline/__init__.py
index 46217a6e..b4302fe4 100644
--- a/fairdatapipeline/__init__.py
+++ b/fairdatapipeline/__init__.py
@@ -3,6 +3,7 @@
     "link_read",
     "link_write",
     "finalise",
+    "search",
     "raise_issue_by_data_product",
     "raise_issue_by_index",
     "raise_issue_with_config",
@@ -15,7 +16,7 @@
 from .fdp_utils import get_handle_index_from_path
 from .link import link_read, link_write
 
-from .pipeline import finalise, initialise
+from .pipeline import finalise, initialise, search
 from .raise_issue import (
     raise_issue_by_data_product,
     raise_issue_by_existing_data_product,

diff --git a/fairdatapipeline/pipeline.py b/fairdatapipeline/pipeline.py
index c0a71e58..48e790cb 100644
--- a/fairdatapipeline/pipeline.py
+++ b/fairdatapipeline/pipeline.py
@@ -1,12 +1,71 @@
 import datetime
+import json
 import logging
 import os
 
+import requests
 import yaml
 
 from fairdatapipeline import fdp_utils
 
 
+def search(
+    token: str,
+    config: str,
+    script: str,
+    wildcard: str,
+    endpoint: str,
+    key: str,
+) -> str:
+    """Reads in token, config file, script, wildcard, endpoint and key and returns a list of items found at the endpoint where the key contains the wildcard.
+
+    Args:
+        | token: registry token
+        | config: Path to config file
+        | script: Path to script file
+        | wildcard: string to search for
+        | endpoint: api endpoint where the search is performed
+        | key: key of the results dictionary where the wildcard is searched
+
+    Returns:
+        | json: a list of dictionaries where the wildcard was found
+    """
+    # Validate YAMLs
+    if not fdp_utils.is_valid_yaml(config):
+        raise ValueError("Config is not a valid YAML file")
+    if not fdp_utils.is_file(script):
+        raise ValueError("Script does not exist")
+
+    # Read config file and extract run metadata
+    with open(config, "r") as data:
+        config_yaml = yaml.safe_load(data)
+    run_metadata = config_yaml["run_metadata"]
+    registry_url = run_metadata["local_data_registry_url"]
+    if registry_url[-1] != "/":
+        registry_url += "/"
+    filename = os.path.basename(config)
+
+    # @todo to be set from config
+    if "api_version" not in config_yaml["run_metadata"].keys():
+        config_yaml["run_metadata"]["api_version"] = "1.0.0"
+
+    api_version = config_yaml["run_metadata"]["api_version"]
+
+    logging.info("Reading {} from local filestore".format(filename))
+
+    headers = fdp_utils.get_headers(token=token, api_version=api_version)
+    if registry_url[-1] != "/":
+        registry_url += "/"
+    registry_url += endpoint + "/?"
+
+    response = requests.get(registry_url, headers=headers)
+    data = response.json()
+
+    res = [obj for obj in data["results"] if wildcard in obj[key]]  # type: ignore
+
+    return json.dumps(res, indent=4, sort_keys=True)
+
+
 def initialise(token: str, config: str, script: str) -> dict:
     """Reads in token, config file and script, creates necessary registry items and creates new code run.
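For orientation, a minimal usage sketch of the new search() export as introduced by this patch. The token value and the script path are placeholders (not taken from the patch); the "data_product" endpoint and "name" key are the ones exercised by the tests added below:

    import json

    import fairdatapipeline as pipeline

    token = "<registry token>"  # placeholder: a valid local-registry token
    results = pipeline.search(
        token,
        "tests/ext/find_csv.yaml",
        "path/to/script.sh",  # placeholder submission script
        wildcard="find",
        endpoint="data_product",
        key="name",
    )
    matches = json.loads(results)  # search() returns a JSON-encoded list
    for match in matches:
        print(match["name"])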
diff --git a/tests/ext/find_csv.yaml b/tests/ext/find_csv.yaml
new file mode 100644
index 00000000..110ef979
--- /dev/null
+++ b/tests/ext/find_csv.yaml
@@ -0,0 +1,20 @@
+run_metadata:
+  description: Write csv file
+  local_data_registry_url: http://127.0.0.1:8000/api/
+  remote_data_registry_url: https://data.scrc.uk/api/
+  default_input_namespace: testing
+  default_output_namespace: testing
+  write_data_store: tmp
+  local_repo: ./
+  script: |-
+    python3 py.test
+  public: true
+  latest_commit: 221bfe8b52bbfb3b2dbdc23037b7dd94b49aaa70
+  remote_repo: https://github.com/FAIRDataPipeline/pyDataPipeline
+
+write:
+- data_product: find/csv
+  description: test csv to test find data products
+  file_type: csv
+  use:
+    version: 0.0.1

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 01fabdce..27ff6ef9 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -1,3 +1,4 @@
+import json
 import os
 import shutil
 
@@ -29,6 +30,11 @@ def config(test_dir: str) -> str:
     return os.path.join(test_dir, "write_csv.yaml")
 
 
+@pytest.fixture
+def fconfig(test_dir: str) -> str:
+    return os.path.join(test_dir, "find_csv.yaml")
+
+
 @pytest.mark.pipeline
 def test_initialise(token: str, config: str, script: str) -> None:
     handle = pipeline.initialise(token, config, script)
@@ -122,6 +128,32 @@ def test_link_read(
     assert type(link_read_1) == str and type(link_read_2) == str
 
 
+@pytest.mark.pipeline
+def test_find_data_product(
+    token: str,
+    fconfig: str,
+    script: str,
+    test_dir: str,
+    query: str = "find",
+    key: str = "name",
+) -> None:
+
+    handle = pipeline.initialise(token, fconfig, script)
+    tmp_csv = os.path.join(test_dir, "test.csv")
+    link_write = pipeline.link_write(handle, "find/csv")
+    shutil.copy(tmp_csv, link_write)
+    pipeline.finalise(token, handle)
+
+    config = os.path.join(test_dir, "read_csv.yaml")
+    results = pipeline.search(
+        token, config, script, query, "data_product", "name"
+    )
+    res = json.loads(results)
+    assert len(res) == 1
+    result = fdp_utils.get_first_entry(res)
+    assert query in result[key]
+
+
 @pytest.mark.pipeline
 def test_link_read_data_product_exists(
     token: str, config: str, script: str, test_dir: str
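Worth noting: the "wildcard" here is a plain substring test (`wildcard in obj[key]`), not a glob or regex, and the filtering happens client-side after fetching the full endpoint listing. A standalone sketch of the underlying query, assuming the local registry address used by the test configs and a placeholder token:

    import requests

    from fairdatapipeline import fdp_utils

    token = "<registry token>"  # placeholder
    headers = fdp_utils.get_headers(token=token, api_version="1.0.0")
    data = requests.get(
        "http://127.0.0.1:8000/api/data_product/?", headers=headers
    ).json()
    # keep only entries whose "name" contains the substring "find"
    hits = [obj for obj in data["results"] if "find" in obj["name"]]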
From b88b63d11727a9a48c42b4aed2be445ff0940d11 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 28 Feb 2022 12:16:19 +0000
Subject: [PATCH 2/4] updated search data product

now the search function uses a handle, token, wildcard, endpoint and key
---
 data_pipeline_api/pipeline.py | 39 ++++++------------
 pyproject.toml                | 78 ++++++++++++++++-------------------
 tests/test_raise_issue.py     | 11 +++--
 3 files changed, 54 insertions(+), 74 deletions(-)

diff --git a/data_pipeline_api/pipeline.py b/data_pipeline_api/pipeline.py
index f2925fa3..a6eaec7e 100644
--- a/data_pipeline_api/pipeline.py
+++ b/data_pipeline_api/pipeline.py
@@ -10,19 +10,18 @@
 
 
 def search(
+    handle: dict,
     token: str,
-    config: str,
-    script: str,
     wildcard: str,
     endpoint: str,
     key: str,
 ) -> str:
-    """Reads in token, config file, script, wildcard, endpoint and key and returns a list of items found at the endpoint where the key contains the wildcard.
+    """
+    Reads registry information from the handle and returns a list of items found at the endpoint where the key contains the wildcard.
 
     Args:
+        | handle: pipeline handle
         | token: registry token
-        | config: Path to config file
-        | script: Path to script file
         | wildcard: string to search for
         | endpoint: api endpoint where the search is performed
         | key: key of the results dictionary where the wildcard is searched
@@ -30,28 +29,14 @@ def search(
 
     Returns:
         | json: a list of dictionaries where the wildcard was found
     """
-    # Validate YAMLs
-    if not fdp_utils.is_valid_yaml(config):
-        raise ValueError("Config is not a valid YAML file")
-    if not fdp_utils.is_file(script):
-        raise ValueError("Script does not exist")
-
-    # Read config file and extract run metadata
-    with open(config, "r") as data:
-        config_yaml = yaml.safe_load(data)
-    run_metadata = config_yaml["run_metadata"]
-    registry_url = run_metadata["local_data_registry_url"]
-    if registry_url[-1] != "/":
-        registry_url += "/"
-    filename = os.path.basename(config)
-
-    # @todo to be set from config
-    if "api_version" not in config_yaml["run_metadata"].keys():
-        config_yaml["run_metadata"]["api_version"] = "1.0.0"
-
-    api_version = config_yaml["run_metadata"]["api_version"]
-
-    logging.info("Reading {} from local filestore".format(filename))
+    try:
+        api_version = handle["yaml"]["run_metadata"]["api_version"]
+        registry_url = handle["yaml"]["run_metadata"][
+            "local_data_registry_url"
+        ]
+    except KeyError:
+        return json.dumps([], indent=4, sort_keys=True)
 
     headers = fdp_utils.get_headers(token=token, api_version=api_version)
     if registry_url[-1] != "/":
@@ -59,6 +44,8 @@ def search(
     registry_url += endpoint + "/?"
 
     response = requests.get(registry_url, headers=headers)
+    if response.status_code != 200:
+        return json.dumps([], indent=4, sort_keys=True)
     data = response.json()
 
     res = [obj for obj in data["results"] if wildcard in obj[key]]  # type: ignore
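With this change the call site no longer reads the config from disk: the registry URL and API version come from the handle returned by initialise(). A sketch of the new calling convention (token, config and script values are placeholders as before):

    import json

    import data_pipeline_api as pipeline

    token = "<registry token>"  # placeholder
    handle = pipeline.initialise(token, "config.yaml", "script.sh")
    results = pipeline.search(handle, token, "find", "data_product", "name")
    matches = json.loads(results)  # "[]" comes back if the lookup fails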
diff --git a/pyproject.toml b/pyproject.toml
index 316d5562..95439309 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,15 +1,14 @@
 [tool.poetry]
+authors = [
+  "Ryan J Field ",
+  "Dennis Reddyhoff ",
+  "Robert D Turner ",
+  "Bruno Viola ",
+  "Kristian Zarebski ",
+]
+description = "Python api to interact with the Fair Data Pipeline"
 name = "data_pipeline_api"
 version = "0.7.5"
-description = "Python api to interact with the Fair Data Pipeline"
-authors = [
-    "Ryan J Field ",
-    "Dennis Reddyhoff ",
-    "Robert D Turner ",
-    "Bruno Viola ",
-    "Kristian Zarebski "
-    ]
-
 homepage = "https://www.fairdatapipeline.org/"
 
 repository = "https://github.com/FAIRDataPipeline/pyDataPipeline"
 
 license = "GNU General Public License v3.0"
 
 packages = [
     {include = "data_pipeline_api"},
 ]
 
 readme = "README.md"
 
 keywords = [
     "FAIR Data Pipeline",
     "FAIR",
     "Data Management",
     "Provenance",
 ]
 
 [tool.poetry.dependencies]
+PyYAML = "^6.0"
+h5py = "^3.6.0"
+matplotlib = "^3.5.1"
 python = "^3.7.1,<3.11"
 requests = "^2.27.1"
-matplotlib = "^3.5.1"
 scipy = "^1.7.3"
-h5py = "^3.6.0"
-PyYAML = "^6.0"
 
 [tool.poetry.dev-dependencies]
-pytest = "^7.0.0"
+Sphinx = "^4.3.2"
 black = "^22.1"
-mypy = "^0.931"
 flake8 = "^4.0.1"
+isort = "^5.10.1"
+mypy = "^0.931"
 poetry = "^1.1.6"
-pytest-mock = "^3.7.0"
-pytest-dependency = "^0.5.1"
-pytest-cov = "^3.0.0"
 pre-commit = "^2.17.0"
-isort = "^5.10.1"
-Sphinx = "^4.3.2"
+pytest = "^7.0.0"
+pytest-cov = "^3.0.0"
+pytest-dependency = "^0.5.1"
+pytest-mock = "^3.7.0"
 sphinx-rtd-theme = "^1.0.0"
 
 [build-system]
-requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
-
+requires = ["poetry-core>=1.0.0"]
 
 [tool.poetry.urls]
 "Issue Tracker" = "https://github.com/FAIRDataPipeline/pyDataPipeline/issues"
-
-
 [tool.isort]
-profile = 'black'
-multi_line_output = 3
-include_trailing_comma = true
-force_grid_wrap = 0
-use_parentheses = true
 ensure_newline_before_comments = true
+force_grid_wrap = 0
+include_trailing_comma = true
 line_length = 79
-
-
+multi_line_output = 3
+profile = 'black'
+use_parentheses = true
 
 [tool.black]
 line-length = 79
 
 [tool.pytest.ini_options]
-addopts = '-s -v --cov=fairdatapipeline --cov-report=html --cov-report=term'
+addopts = '-s -v --cov=data_pipeline_api --cov-report=html --cov-report=term'
 markers = [
     "pipeline: tests for 'pipeline' module ",
     "issue: tests for raising issues ",
     "utilities: tests for 'utilities' functions ",
     "apiversion: tests for api versioning ",
 ]
 
 [tool.mypy]
-ignore_missing_imports = true
-disallow_untyped_defs = true
 disallow_untyped_calls = true
+disallow_untyped_defs = true
+ignore_missing_imports = true

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 701122ec..1a09ccb9 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -134,7 +134,7 @@ def test_find_data_product(
     fconfig: str,
     script: str,
     test_dir: str,
-    query: str = "find",
+    wildcard: str = "find",
     key: str = "name",
 ) -> None:
 
@@ -143,15 +143,14 @@ def test_find_data_product(
     link_write = pipeline.link_write(handle, "find/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
-
     config = os.path.join(test_dir, "read_csv.yaml")
-    results = pipeline.search(
-        token, config, script, query, "data_product", "name"
-    )
+    handle = pipeline.initialise(token, config, script)
+
+    results = pipeline.search(handle, token, wildcard, "data_product", "name")
     res = json.loads(results)
     assert len(res) == 1
     result = fdp_utils.get_first_entry(res)
-    assert query in result[key]
+    assert wildcard in result[key]
 
 
 @pytest.mark.pipeline
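The markers registered above keep the suite filterable. A small sketch of driving the same selection programmatically rather than from the shell (pytest.main mirrors the CLI, and the addopts from pyproject.toml still apply):

    import pytest

    # Run only the tests tagged @pytest.mark.pipeline; equivalent to
    # `pytest -m pipeline tests/test_raise_issue.py` on the command line.
    pytest.main(["-m", "pipeline", "tests/test_raise_issue.py"])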
From 7d76fbc63911805b896fcac0166a4fa7b247a876 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 28 Feb 2022 15:12:44 +0000
Subject: [PATCH 3/4] added wrapper function to search data product names
 using wildcards

added more tests
---
 data_pipeline_api/__init__.py |  3 ++-
 data_pipeline_api/pipeline.py | 33 ++++++++++++++++++++++++---
 tests/test_raise_issue.py     | 42 ++++++++++++++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/data_pipeline_api/__init__.py b/data_pipeline_api/__init__.py
index b4302fe4..d85fe5a1 100644
--- a/data_pipeline_api/__init__.py
+++ b/data_pipeline_api/__init__.py
@@ -4,6 +4,7 @@
     "link_write",
     "finalise",
     "search",
+    "search_data_products",
     "raise_issue_by_data_product",
     "raise_issue_by_index",
     "raise_issue_with_config",
@@ -16,7 +17,7 @@
 from .fdp_utils import get_handle_index_from_path
 from .link import link_read, link_write
 
-from .pipeline import finalise, initialise, search
+from .pipeline import finalise, initialise, search, search_data_products
 from .raise_issue import (
     raise_issue_by_data_product,
     raise_issue_by_existing_data_product,

diff --git a/data_pipeline_api/pipeline.py b/data_pipeline_api/pipeline.py
index a6eaec7e..a6d79bb0 100644
--- a/data_pipeline_api/pipeline.py
+++ b/data_pipeline_api/pipeline.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import os
+from typing import Optional
 
 import requests
 import yaml
@@ -9,12 +10,36 @@
 from data_pipeline_api import fdp_utils
 
 
+def search_data_products(
+    handle: dict, wildcard: str, token: Optional[str] = None
+) -> str:
+    """
+    Wrapper around search() that looks up data products by name using a wildcard.
+
+    Parameters
+    ----------
+    handle : dict
+        pipeline handle
+    wildcard : str
+        search parameter
+    token : str, optional
+        registry token, by default None
+
+    Returns
+    -------
+    str
+        JSON-encoded list of the data products that meet the search criteria
+    """
+    return search(handle, wildcard, "data_product", "name", token=token)
+
+
 def search(
     handle: dict,
-    token: str,
     wildcard: str,
     endpoint: str,
     key: str,
+    token: Optional[str] = None,
 ) -> str:
     """
     Reads registry information from the handle and returns a list of items found at the endpoint where the key contains the wildcard.
@@ -36,16 +61,18 @@ def search(
             "local_data_registry_url"
         ]
     except KeyError:
-        return json.dumps([], indent=4, sort_keys=True)
+        return json.dumps(None, indent=4, sort_keys=True)
 
     headers = fdp_utils.get_headers(token=token, api_version=api_version)
+    if not registry_url:
+        return json.dumps(None, indent=4, sort_keys=True)
     if registry_url[-1] != "/":
         registry_url += "/"
     registry_url += endpoint + "/?"
 
     response = requests.get(registry_url, headers=headers)
     if response.status_code != 200:
-        return json.dumps([], indent=4, sort_keys=True)
+        return json.dumps(None, indent=4, sort_keys=True)
     data = response.json()
 
     res = [obj for obj in data["results"] if wildcard in obj[key]]  # type: ignore

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 1a09ccb9..52c8cef6 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import os
 import shutil
 
@@ -146,13 +147,52 @@ def test_find_data_product(
     config = os.path.join(test_dir, "read_csv.yaml")
     handle = pipeline.initialise(token, config, script)
 
-    results = pipeline.search(handle, token, wildcard, "data_product", "name")
+    results = pipeline.search_data_products(handle, wildcard)
     res = json.loads(results)
     assert len(res) == 1
     result = fdp_utils.get_first_entry(res)
     assert wildcard in result[key]
 
 
+@pytest.mark.pipeline
+def test_find_data_product_empty_handle(
+    wildcard: str = "find",
+) -> None:
+    handle: dict = {}
+
+    results = pipeline.search_data_products(handle, wildcard)
+    res = json.loads(results)
+    assert res is None
+
+
+@pytest.mark.pipeline
+def test_find_data_product_wrong_registry(
+    token: str,
+    fconfig: str,
+    script: str,
+    test_dir: str,
+    wildcard: str = "find",
+    key: str = "name",
+) -> None:
+
+    handle = pipeline.initialise(token, fconfig, script)
+    tmp_csv = os.path.join(test_dir, "test.csv")
+    link_write = pipeline.link_write(handle, "find/csv")
+    shutil.copy(tmp_csv, link_write)
+    pipeline.finalise(token, handle)
+    config = os.path.join(test_dir, "read_csv.yaml")
+    handle = pipeline.initialise(token, config, script)
+    wrong_handle = copy.deepcopy(handle)
+    wrong_handle["yaml"]["run_metadata"]["local_data_registry_url"] = ""
+    results = pipeline.search_data_products(wrong_handle, wildcard)
+    res = json.loads(results)
+    assert res is None
+    handle["yaml"]["run_metadata"]["local_data_registry_url"] += "-error"
+    results = pipeline.search_data_products(handle, wildcard)
+    res = json.loads(results)
+    assert res is None
+
+
 @pytest.mark.pipeline
 def test_link_read_data_product_exists(
     token: str, config: str, script: str, test_dir: str
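A sketch of the wrapper added above, including the failure contract the new tests pin down: a JSON "null" (so json.loads gives None) when the handle lacks registry metadata, when the registry URL is empty, or when the request fails (token, config and script values are placeholders as before):

    import json

    import data_pipeline_api as pipeline

    token = "<registry token>"  # placeholder
    handle = pipeline.initialise(token, "config.yaml", "script.sh")
    results = pipeline.search_data_products(handle, "find", token=token)
    matches = json.loads(results)
    if matches is None:
        print("registry lookup failed")
    else:
        print([m["name"] for m in matches])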
From 14c8012a9d78a558c7dc1284ffb2254c673b84f4 Mon Sep 17 00:00:00 2001
From: bruvio
Date: Mon, 28 Feb 2022 15:19:58 +0000
Subject: [PATCH 4/4] fix code smell

---
 tests/test_raise_issue.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test_raise_issue.py b/tests/test_raise_issue.py
index 52c8cef6..6a36072b 100644
--- a/tests/test_raise_issue.py
+++ b/tests/test_raise_issue.py
@@ -8,6 +8,8 @@
 import data_pipeline_api as pipeline
 import data_pipeline_api.fdp_utils as fdp_utils
 
+TEST_CSV = "test.csv"
+
 
 @pytest.fixture
 def test_dir() -> str:
@@ -116,7 +118,7 @@ def test_link_read(
     token: str, config: str, script: str, test_dir: str
 ) -> None:
     handle = pipeline.initialise(token, config, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "test/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
@@ -140,7 +142,7 @@ def test_find_data_product(
 ) -> None:
 
     handle = pipeline.initialise(token, fconfig, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "find/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
@@ -176,7 +178,7 @@ def test_find_data_product_wrong_registry(
 ) -> None:
 
     handle = pipeline.initialise(token, fconfig, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "find/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)
@@ -198,7 +200,7 @@ def test_link_read_data_product_exists(
     token: str, config: str, script: str, test_dir: str
 ) -> None:
     handle = pipeline.initialise(token, config, script)
-    tmp_csv = os.path.join(test_dir, "test.csv")
+    tmp_csv = os.path.join(test_dir, TEST_CSV)
     link_write = pipeline.link_write(handle, "test/csv")
     shutil.copy(tmp_csv, link_write)
     pipeline.finalise(token, handle)