From 1295e91d8bf57039627c7cd74e096d7eaeffbf68 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 8 Aug 2025 17:19:37 -0500 Subject: [PATCH 01/28] start adding functions --- dataretrieval/waterdata.py | 46 ++++- dataretrieval/waterdata_helpers.py | 265 +++++++++++++++++++++++++++++ 2 files changed, 306 insertions(+), 5 deletions(-) create mode 100644 dataretrieval/waterdata_helpers.py diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index ceed581..f28d529 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -7,13 +7,14 @@ import json from io import StringIO -from typing import TYPE_CHECKING, Literal, get_args +from typing import TYPE_CHECKING, Literal, List, get_args import pandas as pd import requests from requests.models import PreparedRequest from dataretrieval.utils import BaseMetadata, to_str +import dataretrieval.waterdata_helpers if TYPE_CHECKING: from typing import Optional, Tuple, Union @@ -21,7 +22,9 @@ from pandas import DataFrame -_BASE_URL = "https://api.waterdata.usgs.gov/samples-data" +_BASE_URL = "https://api.waterdata.usgs.gov/" + +_SAMPLES_URL = _BASE_URL + "samples-data" _CODE_SERVICES = Literal[ "characteristicgroup", @@ -34,7 +37,6 @@ "states", ] - _SERVICES = Literal["activities", "locations", "organizations", "projects", "results"] _PROFILES = Literal[ @@ -72,6 +74,40 @@ ], } +def get_daily( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + properties: Optional[List[str]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + daily_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[Union[str, List[str]]] = None, + last_modified: Optional[str] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: + + service = "daily" + output_id = "daily_id" + + return_list = _get_ogc_data( + + ) + +def get_monitoring_locations(): + +def get_ts_meta(): + +def get_latest_continuous(): + +def get_field_measurements(): def get_codes(code_service: _CODE_SERVICES) -> DataFrame: """Return codes from a Samples code service. @@ -90,7 +126,7 @@ def get_codes(code_service: _CODE_SERVICES) -> DataFrame: f"Valid options are: {valid_code_services}." 
) - url = f"{_BASE_URL}/codeservice/{code_service}?mimeType=application%2Fjson" + url = f"{_SAMPLES_URL}/codeservice/{code_service}?mimeType=application%2Fjson" response = requests.get(url) @@ -305,7 +341,7 @@ def get_samples( if "boundingBox" in params: params["boundingBox"] = to_str(params["boundingBox"]) - url = f"{_BASE_URL}/{service}/{profile}" + url = f"{_SAMPLES_URL}/{service}/{profile}" req = PreparedRequest() req.prepare_url(url, params=params) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py new file mode 100644 index 0000000..6ea1423 --- /dev/null +++ b/dataretrieval/waterdata_helpers.py @@ -0,0 +1,265 @@ +import httpx +import os +import warnings +from typing import List, Dict, Any, Optional, Union +from datetime import datetime +import pytz +import pandas as pd + +BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" +API_VERSION = "v0" + +# --- Caching for repeated calls --- +_cached_base_url = None +def _base_url(): + global _cached_base_url + if _cached_base_url is None: + _cached_base_url = f"{BASE_API}{API_VERSION}/" + return _cached_base_url + +def _setup_api(service: str): + return f"{_base_url()}collections/{service}/items" + +def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): + service_id = service.replace("-", "_") + "_id" + ls.setdefault("id", ls.pop(service_id, ls.pop(id_name, None))) + return ls + +def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str): + if not properties: + return [] + service_id = service.replace("-", "_") + "_id" + last_letter = service[-1] + service_id_singular = "" + if last_letter == "s": + service_singular = service[:-1] + service_id_singular = service_singular.replace("-", "_") + "_id" + # Replace id fields with "id" + id_fields = [service_id, service_id_singular, id_name] + properties = ["id" if p in id_fields else p.replace("-", "_") for p in properties] + # Remove unwanted fields + return [p for p in properties if p not in ["geometry", service_id]] + +def _format_api_dates(datetime_list: Union[str, List[Union[str, datetime]]], date: bool = False): + def _iso8601(dt): + if isinstance(dt, str): + return dt + elif isinstance(dt, datetime): + if dt.tzinfo is None: + dt = pytz.UTC.localize(dt) + return dt.isoformat() + return str(dt) + + if isinstance(datetime_list, str): + if not datetime_list: + return None + if "P" in datetime_list or "/" in datetime_list: + return datetime_list + return datetime_list + if isinstance(datetime_list, list): + datetime_list = [None if not d else d for d in datetime_list] + if all(d is None for d in datetime_list): + return None + if len(datetime_list) == 1: + d = datetime_list[0] + if isinstance(d, str) and ("P" in d or "/" in d): + return d + return datetime.strptime(d, "%Y-%m-%d").strftime("%Y-%m-%d") if date else _iso8601(d) + elif len(datetime_list) == 2: + dates = [datetime.strptime(str(d), "%Y-%m-%d").strftime("%Y-%m-%d") if date and d else _iso8601(d) if d else "" for d in datetime_list] + return "/".join(dates).replace("NA", "..") + else: + raise ValueError("datetime should only include 1-2 values") + return None + +def _explode_post(ls: Dict[str, Any]): + return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} + +def _cql2_param(parameter: Dict[str, List[str]]): + property_name = next(iter(parameter)) + parameters = [str(x) for x in parameter[property_name]] + return {"property": property_name, "parameter": parameters} + +def _default_headers(): + headers = { + 
"Accept-Encoding": "compress, gzip", + "Accept": "application/json", + "User-Agent": "python-dataretrieval/1.0", + "lang": "en-US" + } + token = os.getenv("API_USGS_PAT", "") + if token: + headers["X-Api-Key"] = token + return headers + +def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): + assert req_type in ["queryables", "schema"] + url = f"{_base_url()}collections/{endpoint}/{req_type}" + resp = httpx.get(url, headers=_default_headers()) + resp.raise_for_status() + return resp.json() + +def _error_body(resp: httpx.Response): + if resp.status_code == 429: + return resp.json().get('error', {}).get('message') + elif resp.status_code == 403: + return "Query request denied. Possible reasons include query exceeding server limits." + return resp.text + +def _get_collection(): + url = f"{_base_url()}openapi?f=json" + resp = httpx.get(url, headers=_default_headers()) + resp.raise_for_status() + return resp.json() + +def _get_description(service: str): + tags = _get_collection().get("tags", []) + for tag in tags: + if tag.get("name") == service: + return tag.get("description") + return None + +def _get_params(service: str): + url = f"{_base_url()}collections/{service}/schema" + resp = httpx.get(url, headers=_default_headers()) + resp.raise_for_status() + properties = resp.json().get("properties", {}) + return {k: v.get("description") for k, v in properties.items()} + +def construct_api_requests( + service: str, + properties: Optional[List[str]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + skipGeometry: bool = False, + **kwargs +): + baseURL = _setup_api(service) + single_params = {"datetime", "last_modified", "begin", "end", "time"} + params = {k: v for k, v in kwargs.items() if k in single_params} + params["skipGeometry"] = skipGeometry + # Limit logic + params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + if max_results is not None and limit is not None and limit > max_results: + raise ValueError("limit cannot be greater than max_result") + post_params = _explode_post({k: v for k, v in kwargs.items() if k not in single_params}) + POST = bool(post_params) + + time_periods = {"last_modified", "datetime", "time", "begin", "end"} + for i in time_periods: + if i in params: + dates = service == "daily" and i != "last_modified" + params[i] = _format_api_dates(params[i], date=dates) + kwargs[i] = _format_api_dates(kwargs[i], date=dates) + + if bbox: + params["bbox"] = ",".join(map(str, bbox)) + if properties: + params["properties"] = ",".join(_switch_properties_id(properties, "monitoring_location_id", service)) + + headers = _default_headers() + if POST: + headers["Content-Type"] = "application/query-cql-json" + resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + else: + resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + if resp.status_code != 200: + raise Exception(_error_body(resp)) + return resp.json() + +def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: + if return_list.empty: + if not properties or all(pd.isna(properties)): + schema = _check_OGC_requests(endpoint=service, req_type="schema") + properties = list(schema.get("properties", {}).keys()) + return pd.DataFrame(columns=properties) + return return_list + +def _rejigger_cols(df: pd.DataFrame, properties: 
Optional[List[str]], output_id: str) -> pd.DataFrame: + if properties and not all(pd.isna(properties)): + if "id" not in properties: + if output_id in properties: + df = df.rename(columns={"id": output_id}) + else: + plural = output_id.replace("_id", "s_id") + if plural in properties: + df = df.rename(columns={"id": plural}) + return df.loc[:, [col for col in properties if col in df.columns]] + else: + return df.rename(columns={"id": output_id}) + +def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: + if "qualifier" in df.columns: + df["qualifier"] = df["qualifier"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x) + if "time" in df.columns and service == "daily": + df["time"] = pd.to_datetime(df["time"]).dt.date + for col in ["value", "contributing_drainage_area"]: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors="coerce") + return df + +def _next_req_url(resp: httpx.Response, req_url: str) -> Optional[str]: + body = resp.json() + if not body.get("numberReturned"): + return None + header_info = resp.headers + if os.getenv("API_USGS_PAT", ""): + print("Remaining requests this hour:", header_info.get("x-ratelimit-remaining", "")) + for link in body.get("links", []): + if link.get("rel") == "next": + return link.get("href") + return None + +def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: + body = resp.json() + if not body.get("numberReturned"): + return pd.DataFrame() + df = pd.DataFrame(body.get("features", [])) + for col in ["geometry", "AsGeoJSON(geometry)"]: + if col in df.columns: + df = df.drop(columns=[col]) + return df + +def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: + print(f"Requesting:\n{req_url}") + client = client or httpx.Client() + if max_results is None or pd.isna(max_results): + dfs = [] + curr_url = req_url + failures = [] + while curr_url: + try: + resp = client.get(curr_url) + resp.raise_for_status() + df1 = _get_resp_data(resp) + dfs.append(df1) + curr_url = _next_req_url(resp, curr_url) + except Exception: + failures.append(curr_url) + curr_url = None + if failures: + print(f"There were {len(failures)} failed requests.") + return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() + else: + resp = client.get(req_url) + resp.raise_for_status() + return _get_resp_data(resp) + +def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: + args = args.copy() # Don't mutate input + args["service"] = service + max_results = args.pop("max_results", None) + args = _switch_arg_id(args, id_name=output_id, service=service) + properties = args.get("properties") + args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) + convertType = args.pop("convertType", False) + req_url = construct_api_requests(**args) + return_list = _walk_pages(req_url, max_results) + return_list = _deal_with_empty(return_list, properties, service) + if convertType: + return_list = _cleanup_cols(return_list, service=service) + return_list = _rejigger_cols(return_list, properties, output_id) + # Metadata + return_list.attrs.update(request=req_url, queryTime=pd.Timestamp.now()) + return return_list \ No newline at end of file From c4b0b9ae1e02f15d2c5f1e3407ba390621e7ad90 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Wed, 27 Aug 2025 17:26:12 -0500 Subject: [PATCH 02/28] start adding documentation and going through functions --- dataretrieval/waterdata.py | 30 +++++++--- dataretrieval/waterdata_helpers.py | 93 
+++++++++++++++++++++++++----- 2 files changed, 100 insertions(+), 23 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index f28d529..cb0ec59 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -14,7 +14,7 @@ from requests.models import PreparedRequest from dataretrieval.utils import BaseMetadata, to_str -import dataretrieval.waterdata_helpers +from dataretrieval import waterdata_helpers if TYPE_CHECKING: from typing import Optional, Tuple, Union @@ -97,17 +97,33 @@ def get_daily( service = "daily" output_id = "daily_id" - return_list = _get_ogc_data( + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers._get_ogc_data(args, output_id, service) + +# def get_monitoring_locations(): +# service = "monitoring-locations" +# output_id = "monitoring_location_id" - ) +# # Build argument dictionary, omitting None values +# args = { +# k: v for k, v in locals().items() +# if k not in {"service", "output_id"} and v is not None +# } +# args["convertType"] = False -def get_monitoring_locations(): +# return _get_ogc_data(args, output_id, service) -def get_ts_meta(): +# def get_ts_meta(): -def get_latest_continuous(): +# def get_latest_continuous(): -def get_field_measurements(): +# def get_field_measurements(): def get_codes(code_service: _CODE_SERVICES) -> DataFrame: """Return codes from a Samples code service. diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 6ea1423..49b4b3d 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -12,20 +12,79 @@ # --- Caching for repeated calls --- _cached_base_url = None def _base_url(): + """ + Returns the base URL for the USGS Water Data OGC API. + + Uses a cached value to avoid repeated string formatting. If the cached value + is not set, it constructs the base URL using the BASE_API and API_VERSION constants. + + Returns: + str: The base URL for the API (e.g., "https://api.waterdata.usgs.gov/ogcapi/v0/"). + """ global _cached_base_url if _cached_base_url is None: _cached_base_url = f"{BASE_API}{API_VERSION}/" return _cached_base_url def _setup_api(service: str): + """ + Constructs and returns the API endpoint URL for a specified service. + + Args: + service (str): The name of the service to be used in the API endpoint. + + Returns: + str: The full URL for the API endpoint corresponding to the given service. + + Example: + >>> _setup_api("daily") + 'https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items' + """ return f"{_base_url()}collections/{service}/items" def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): + """ + Switch argument id from its package-specific identifier to the standardized "id" key + that the API recognizes. + + Sets the "id" key in the provided dictionary `ls` + with the value from either the service name or the expected id column name. + If neither key exists, "id" will be set to None. + + Example: for service "time-series-metadata", the function will look for either "time_series_metadata_id" + or "time_series_id" and change the key to simply "id". + + Args: + ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized. + id_name (str): The name of the specific identifier key to look for. + service (str): The service name. 
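The identifier switch described above can be made concrete with a small, hypothetical sketch (assuming the helper is importable from dataretrieval.waterdata_helpers):

```python
from dataretrieval.waterdata_helpers import _switch_arg_id

# Hypothetical query arguments for the "time-series-metadata" service
args = {"time_series_id": "0123abcd", "parameter_code": "00060"}

# The package-specific id key is renamed to the plain "id" key the API expects
print(_switch_arg_id(args, id_name="time_series_id", service="time-series-metadata"))
# {'parameter_code': '00060', 'id': '0123abcd'}
```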
+ + Returns: + Dict[str, Any]: The modified dictionary with the "id" key set appropriately. + """ service_id = service.replace("-", "_") + "_id" ls.setdefault("id", ls.pop(service_id, ls.pop(id_name, None))) return ls def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str): + """ + Switch properties id from its package-specific identifier to the standardized "id" key + that the API recognizes. + + Sets the "id" key in the provided dictionary `ls` with the value from either the service name + or the expected id column name. If neither key exists, "id" will be set to None. + + Example: for service "monitoring-locations", it will look for "monitoring_location_id" and change + it to "id". + + Args: + ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized. + id_name (str): The name of the specific identifier key to look for. + service (str): The service name. + + Returns: + Dict[str, Any]: The modified dictionary with the "id" key set appropriately. + """ if not properties: return [] service_id = service.replace("-", "_") + "_id" @@ -87,7 +146,7 @@ def _default_headers(): "User-Agent": "python-dataretrieval/1.0", "lang": "en-US" } - token = os.getenv("API_USGS_PAT", "") + token = os.getenv("API_USGS_PAT") if token: headers["X-Api-Key"] = token return headers @@ -112,20 +171,6 @@ def _get_collection(): resp.raise_for_status() return resp.json() -def _get_description(service: str): - tags = _get_collection().get("tags", []) - for tag in tags: - if tag.get("name") == service: - return tag.get("description") - return None - -def _get_params(service: str): - url = f"{_base_url()}collections/{service}/schema" - resp = httpx.get(url, headers=_default_headers()) - resp.raise_for_status() - properties = resp.json().get("properties", {}) - return {k: v.get("description") for k, v in properties.items()} - def construct_api_requests( service: str, properties: Optional[List[str]] = None, @@ -159,6 +204,7 @@ def construct_api_requests( params["properties"] = ",".join(_switch_properties_id(properties, "monitoring_location_id", service)) headers = _default_headers() + print({**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) if POST: headers["Content-Type"] = "application/query-cql-json" resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) @@ -262,4 +308,19 @@ def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.Data return_list = _rejigger_cols(return_list, properties, output_id) # Metadata return_list.attrs.update(request=req_url, queryTime=pd.Timestamp.now()) - return return_list \ No newline at end of file + return return_list + + +# def _get_description(service: str): +# tags = _get_collection().get("tags", []) +# for tag in tags: +# if tag.get("name") == service: +# return tag.get("description") +# return None + +# def _get_params(service: str): +# url = f"{_base_url()}collections/{service}/schema" +# resp = httpx.get(url, headers=_default_headers()) +# resp.raise_for_status() +# properties = resp.json().get("properties", {}) +# return {k: v.get("description") for k, v in properties.items()} \ No newline at end of file From c32ded583cc4890f9a8abef2b67411b0b4e94b9b Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 28 Aug 2025 17:06:24 -0500 Subject: [PATCH 03/28] adjust date function --- dataretrieval/waterdata_helpers.py | 81 ++++++++++++++++++------------ 1 file changed, 50 insertions(+), 31 deletions(-) diff --git 
a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 49b4b3d..9cb6a45 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -5,6 +5,9 @@ from datetime import datetime import pytz import pandas as pd +from datetime import datetime +from zoneinfo import ZoneInfo +import re BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" API_VERSION = "v0" @@ -78,12 +81,12 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service it to "id". Args: - ls (Dict[str, Any]): The dictionary containing identifier keys to be standardized. + properties (List[str]): A list containing the properties or column names to be pulled from the service. id_name (str): The name of the specific identifier key to look for. service (str): The service name. Returns: - Dict[str, Any]: The modified dictionary with the "id" key set appropriately. + List[str]: The modified list with the "id" key set appropriately. """ if not properties: return [] @@ -99,37 +102,53 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service # Remove unwanted fields return [p for p in properties if p not in ["geometry", service_id]] -def _format_api_dates(datetime_list: Union[str, List[Union[str, datetime]]], date: bool = False): - def _iso8601(dt): - if isinstance(dt, str): +def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: + # Get timezone + local_timezone = ZoneInfo.local() + + # Return empty strings as None + if isinstance(datetime_input, str) and datetime_input.strip() == "": + return None + + # Convert single string to list for uniform processing + if isinstance(datetime_input, str): + datetime_input = [datetime_input] + + # Check for null or all NA and return None + if all(pd.isna(dt) or dt == "" for dt in datetime_input): + return None + # If the list is of length 1, first look for things like "P7D" or dates + # already formatted in ISO08601. Otherwise, try to coerce to datetime + if len(datetime_input) == 1: + dt = datetime_input[0] + if re.search(r"P", dt, re.IGNORECASE) or "/" in dt: return dt - elif isinstance(dt, datetime): - if dt.tzinfo is None: - dt = pytz.UTC.localize(dt) - return dt.isoformat() - return str(dt) - - if isinstance(datetime_list, str): - if not datetime_list: - return None - if "P" in datetime_list or "/" in datetime_list: - return datetime_list - return datetime_list - if isinstance(datetime_list, list): - datetime_list = [None if not d else d for d in datetime_list] - if all(d is None for d in datetime_list): - return None - if len(datetime_list) == 1: - d = datetime_list[0] - if isinstance(d, str) and ("P" in d or "/" in d): - return d - return datetime.strptime(d, "%Y-%m-%d").strftime("%Y-%m-%d") if date else _iso8601(d) - elif len(datetime_list) == 2: - dates = [datetime.strptime(str(d), "%Y-%m-%d").strftime("%Y-%m-%d") if date and d else _iso8601(d) if d else "" for d in datetime_list] - return "/".join(dates).replace("NA", "..") else: - raise ValueError("datetime should only include 1-2 values") - return None + try: + parsed_dt = pd.to_datetime(dt) + # If the service only accepts dates for this input, not datetimes (e.g. "daily"), + # return just the date, otherwise, return the datetime in UTC format. 
+ if date: + return parsed_dt.strftime("%Y-%m-%d") + else: + parsed_dt.strftime("%Y-%m-%dT%H:%M:%SZ") + parsed_dt.replace(tzinfo=local_timezone) + return parsed_dt.astimezone(pytz.UTC) + except Exception: + return None + + elif len(datetime_input) == 2: + try: + parsed_dates = [pd.to_datetime(dt) for dt in datetime_input] + if date: + formatted = "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) + else: + formatted = "/".join(dt.strftime("%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=local_timezone).astimezone(pytz.UTC) for dt in parsed_dates) + return formatted.replace("", "..") + except Exception: + return None + else: + raise ValueError("datetime_input should only include 1-2 values") def _explode_post(ls: Dict[str, Any]): return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} From 99e949cadb665b22706256302d1f9954e206d23f Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 29 Aug 2025 12:30:21 -0500 Subject: [PATCH 04/28] fix dates function --- dataretrieval/waterdata_helpers.py | 32 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 9cb6a45..6793b09 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -5,6 +5,7 @@ from datetime import datetime import pytz import pandas as pd +import numpy as np from datetime import datetime from zoneinfo import ZoneInfo import re @@ -104,19 +105,19 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: # Get timezone - local_timezone = ZoneInfo.local() - - # Return empty strings as None - if isinstance(datetime_input, str) and datetime_input.strip() == "": - return None - + local_timezone = datetime.now().astimezone().tzinfo + # Convert single string to list for uniform processing if isinstance(datetime_input, str): datetime_input = [datetime_input] - + # Check for null or all NA and return None - if all(pd.isna(dt) or dt == "" for dt in datetime_input): + if all(pd.isna(dt) or dt == "" or dt == None for dt in datetime_input): return None + + # Replace all blanks with "nan" + datetime_input = ["nan" if x == "" else x for x in datetime_input] + # If the list is of length 1, first look for things like "P7D" or dates # already formatted in ISO08601. Otherwise, try to coerce to datetime if len(datetime_input) == 1: @@ -125,26 +126,27 @@ def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) return dt else: try: - parsed_dt = pd.to_datetime(dt) + # Parse to naive datetime + parsed_dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") # If the service only accepts dates for this input, not datetimes (e.g. "daily"), # return just the date, otherwise, return the datetime in UTC format. 
if date: return parsed_dt.strftime("%Y-%m-%d") else: - parsed_dt.strftime("%Y-%m-%dT%H:%M:%SZ") - parsed_dt.replace(tzinfo=local_timezone) - return parsed_dt.astimezone(pytz.UTC) + dt_local = parsed_dt.replace(tzinfo=local_timezone) + # Convert to UTC and format as ISO 8601 with 'Z' + return dt_local.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") except Exception: return None elif len(datetime_input) == 2: try: - parsed_dates = [pd.to_datetime(dt) for dt in datetime_input] + parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] if date: formatted = "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) else: - formatted = "/".join(dt.strftime("%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=local_timezone).astimezone(pytz.UTC) for dt in parsed_dates) - return formatted.replace("", "..") + formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_dates) + return formatted.replace("nan", "..") except Exception: return None else: From 1641e851b2229814051b5355d9fcf74444979f06 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 29 Aug 2025 14:52:19 -0500 Subject: [PATCH 05/28] keep working out issues with api calls --- dataretrieval/waterdata_helpers.py | 58 +++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 6793b09..ce8413b 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -66,10 +66,22 @@ def _switch_arg_id(ls: Dict[str, Any], id_name: str, service: str): Returns: Dict[str, Any]: The modified dictionary with the "id" key set appropriately. """ + service_id = service.replace("-", "_") + "_id" - ls.setdefault("id", ls.pop(service_id, ls.pop(id_name, None))) + + if "id" not in ls: + if service_id in ls: + ls["id"] = ls[service_id] + elif id_name in ls: + ls["id"] = ls[id_name] + + # Remove the original keys regardless of whether they were used + ls.pop(service_id, None) + ls.pop(id_name, None) + return ls + def _switch_properties_id(properties: Optional[List[str]], id_name: str, service: str): """ Switch properties id from its package-specific identifier to the standardized "id" key @@ -103,7 +115,7 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service # Remove unwanted fields return [p for p in properties if p not in ["geometry", service_id]] -def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: +def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: # Get timezone local_timezone = datetime.now().astimezone().tzinfo @@ -138,7 +150,8 @@ def format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) return dt_local.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") except Exception: return None - + # If the list is of length 2, parse the dates and if necessary, combine them together into + # the date range format accepted by the API elif len(datetime_input) == 2: try: parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] @@ -186,13 +199,7 @@ def _error_body(resp: httpx.Response): return "Query request denied. Possible reasons include query exceeding server limits." 
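With the date fixes above, the intended behavior of the helper can be sketched as follows (hypothetical values; single-value datetime output depends on the local timezone, so only the timezone-independent cases are shown):

```python
# Period strings and ISO 8601 intervals pass through unchanged
_format_api_dates(["P7D"])                             # -> "P7D"

# Date-only services such as "daily" get plain dates
_format_api_dates(["2024-01-01 00:00:00"], date=True)  # -> "2024-01-01"

# Two values collapse into the range form the API accepts
_format_api_dates(["2024-01-01 00:00:00", "2024-06-30 00:00:00"], date=True)
# -> "2024-01-01/2024-06-30"
```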
return resp.text -def _get_collection(): - url = f"{_base_url()}openapi?f=json" - resp = httpx.get(url, headers=_default_headers()) - resp.raise_for_status() - return resp.json() - -def construct_api_requests( +def _construct_api_requests( service: str, properties: Optional[List[str]] = None, bbox: Optional[List[float]] = None, @@ -209,9 +216,18 @@ def construct_api_requests( params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") - post_params = _explode_post({k: v for k, v in kwargs.items() if k not in single_params}) + + # Create post calls for any input parameters that are not in the single_params list + # and have more than one element associated with the list or tuple. + post_params = _explode_post({ + k: v for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + }) + + # Indicate if function needs to perform POST conversion POST = bool(post_params) + # Convert dates to ISO08601 format time_periods = {"last_modified", "datetime", "time", "begin", "end"} for i in time_periods: if i in params: @@ -219,18 +235,21 @@ def construct_api_requests( params[i] = _format_api_dates(params[i], date=dates) kwargs[i] = _format_api_dates(kwargs[i], date=dates) + # String together bbox elements from a list to a comma-separated string, + # and string together properties if provided if bbox: params["bbox"] = ",".join(map(str, bbox)) if properties: - params["properties"] = ",".join(_switch_properties_id(properties, "monitoring_location_id", service)) + params["properties"] = ",".join(properties) headers = _default_headers() - print({**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + if POST: headers["Content-Type"] = "application/query-cql-json" resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) else: resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + print(resp.url) if resp.status_code != 200: raise Exception(_error_body(resp)) return resp.json() @@ -313,7 +332,7 @@ def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx resp.raise_for_status() return _get_resp_data(resp) -def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: +def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: args = args.copy() # Don't mutate input args["service"] = service max_results = args.pop("max_results", None) @@ -321,7 +340,8 @@ def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.Data properties = args.get("properties") args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) convertType = args.pop("convertType", False) - req_url = construct_api_requests(**args) + args = {k: v for k, v in args.items() if v is not None} + req_url = _construct_api_requests(**args) return_list = _walk_pages(req_url, max_results) return_list = _deal_with_empty(return_list, properties, service) if convertType: @@ -344,4 +364,10 @@ def _get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.Data # resp = httpx.get(url, headers=_default_headers()) # resp.raise_for_status() # properties = resp.json().get("properties", {}) -# return {k: v.get("description") for k, v in properties.items()} \ No newline at end of file +# 
return {k: v.get("description") for k, v in properties.items()} + +# def _get_collection(): +# url = f"{_base_url()}openapi?f=json" +# resp = httpx.get(url, headers=_default_headers()) +# resp.raise_for_status() +# return resp.json() \ No newline at end of file From 7bc6c6f884631e09792b1884968362fb6830b869 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 29 Aug 2025 15:05:39 -0500 Subject: [PATCH 06/28] add documentation --- dataretrieval/waterdata_helpers.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index ce8413b..841e1ed 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -116,6 +116,31 @@ def _switch_properties_id(properties: Optional[List[str]], id_name: str, service return [p for p in properties if p not in ["geometry", service_id]] def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) -> Union[str, None]: + """ + Formats date or datetime input(s) for use with an API, handling single values or ranges, and converting to ISO 8601 or date-only formats as needed. + Parameters + ---------- + datetime_input : Union[str, List[str]] + A single date/datetime string or a list of one or two date/datetime strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative periods (e.g., "P7D"). + date : bool, optional + If True, returns only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ"). + Returns + ------- + Union[str, None] + - If input is a single value, returns the formatted date/datetime string or None if parsing fails. + - If input is a list of two values, returns a date/datetime range string separated by "/" (e.g., "YYYY-MM-DD/YYYY-MM-DD" or "YYYY-MM-DDTHH:MM:SSZ/YYYY-MM-DDTHH:MM:SSZ"). + - Returns None if input is empty, all NA, or cannot be parsed. + Raises + ------ + ValueError + If `datetime_input` contains more than two values. + Notes + ----- + - Handles blank or NA values by returning None. + - Supports relative period strings (e.g., "P7D") and passes them through unchanged. + - Converts datetimes to UTC and formats as ISO 8601 with 'Z' suffix when `date` is False. + - For date ranges, replaces "nan" with ".." in the output. + """ # Get timezone local_timezone = datetime.now().astimezone().tzinfo @@ -174,6 +199,14 @@ def _cql2_param(parameter: Dict[str, List[str]]): return {"property": property_name, "parameter": parameters} def _default_headers(): + """ + Generate default HTTP headers for API requests. + + Returns: + dict: A dictionary containing default headers including 'Accept-Encoding', + 'Accept', 'User-Agent', and 'lang'. If the environment variable 'API_USGS_PAT' + is set, its value is included as the 'X-Api-Key' header. + """ headers = { "Accept-Encoding": "compress, gzip", "Accept": "application/json", @@ -186,6 +219,20 @@ def _default_headers(): return headers def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): + """ + Sends an HTTP GET request to the specified OGC endpoint and request type, returning the JSON response. + + Args: + endpoint (str): The OGC collection endpoint to query. Defaults to "daily". + req_type (str): The type of request to make. Must be either "queryables" or "schema". Defaults to "queryables". + + Returns: + dict: The JSON response from the OGC endpoint. + + Raises: + AssertionError: If req_type is not "queryables" or "schema". 
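The token handling documented for _default_headers above can be exercised like this (the token value is hypothetical):

```python
import os

os.environ["API_USGS_PAT"] = "my-api-token"  # hypothetical token

# _default_headers() now includes the key, roughly:
# {"Accept-Encoding": "compress, gzip", "Accept": "application/json",
#  "User-Agent": "python-dataretrieval/1.0", "lang": "en-US",
#  "X-Api-Key": "my-api-token"}
```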
+ httpx.HTTPStatusError: If the HTTP request returns an unsuccessful status code. + """ assert req_type in ["queryables", "schema"] url = f"{_base_url()}collections/{endpoint}/{req_type}" resp = httpx.get(url, headers=_default_headers()) From 1b29d6aecf4c30fa399da0a341b56f09a20decc1 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 08:59:33 -0500 Subject: [PATCH 07/28] adjust how response is handled and edit walk pages, fix API limit print --- dataretrieval/waterdata_helpers.py | 73 +++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 841e1ed..331194a 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -240,6 +240,17 @@ def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): return resp.json() def _error_body(resp: httpx.Response): + """ + Extracts and returns an error message from an HTTP response object based on its status code. + + Args: + resp (httpx.Response): The HTTP response object to extract the error message from. + + Returns: + str: The extracted error message. For status code 429, returns the 'message' field from the JSON error object. + For status code 403, returns a predefined message indicating possible reasons for denial. + For other status codes, returns the raw response text. + """ if resp.status_code == 429: return resp.json().get('error', {}).get('message') elif resp.status_code == 403: @@ -255,11 +266,35 @@ def _construct_api_requests( skipGeometry: bool = False, **kwargs ): + """ + Constructs an HTTP request object for the specified water data API service. + Depending on the input parameters, the function determines whether to use a GET or POST request, + formats parameters appropriately, and sets required headers. + Args: + service (str): The name of the API service to query (e.g., "daily"). + properties (Optional[List[str]], optional): List of property names to include in the request. + bbox (Optional[List[float]], optional): Bounding box coordinates as a list of floats. + limit (Optional[int], optional): Maximum number of results to return per request. + max_results (Optional[int], optional): Maximum number of results allowed by the API. + skipGeometry (bool, optional): Whether to exclude geometry from the response. + **kwargs: Additional query parameters, including date/time filters and other API-specific options. + Returns: + httpx.Request: The constructed HTTP request object ready to be sent. + Raises: + ValueError: If `limit` is greater than `max_results`. + Notes: + - Date/time parameters are automatically formatted to ISO8601. + - If multiple values are provided for non-single parameters, a POST request is constructed. + - The function sets appropriate headers for GET and POST requests. + """ baseURL = _setup_api(service) + # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} params = {k: v for k, v in kwargs.items() if k in single_params} + # Set skipGeometry parameter params["skipGeometry"] = skipGeometry - # Limit logic + # If limit is none and max_results is not none, then set limit to max results. Otherwise, + # if max_results is none, set it to 10000 (the API max). 
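The limit rule in the comment above works out to these cases (a sketch, not exhaustive):

```python
# limit=None, max_results=500  -> params["limit"] = 500    (one capped request)
# limit=1000, max_results=None -> params["limit"] = 1000   (page size; all pages fetched)
# limit=None, max_results=None -> params["limit"] = 10000  (the API maximum)
# limit=2000, max_results=100  -> ValueError               (limit may not exceed max_results)
```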
params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") @@ -293,13 +328,10 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" - resp = httpx.post(baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) else: - resp = httpx.get(baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) - print(resp.url) - if resp.status_code != 200: - raise Exception(_error_body(resp)) - return resp.json() + req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + return req def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: if return_list.empty: @@ -341,7 +373,9 @@ def _next_req_url(resp: httpx.Response, req_url: str) -> Optional[str]: print("Remaining requests this hour:", header_info.get("x-ratelimit-remaining", "")) for link in body.get("links", []): if link.get("rel") == "next": - return link.get("href") + next_url = link.get("href") + print(f"Next URL: {next_url}") + return next_url return None def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: @@ -354,17 +388,23 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: df = df.drop(columns=[col]) return df -def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: - print(f"Requesting:\n{req_url}") +def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: + print(f"Requesting:\n{req.url}") + + # Get first response from client + # using GET or POST call client = client or httpx.Client() + resp = client.send(req) + if resp.status_code != 200: raise Exception(_error_body(resp)) + if max_results is None or pd.isna(max_results): dfs = [] - curr_url = req_url + curr_url = _next_req_url(resp, req.url) failures = [] while curr_url: try: - resp = client.get(curr_url) - resp.raise_for_status() + resp = client.get(curr_url, headers=_default_headers()) + if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) dfs.append(df1) curr_url = _next_req_url(resp, curr_url) @@ -375,7 +415,6 @@ def _walk_pages(req_url: str, max_results: Optional[int], client: Optional[httpx print(f"There were {len(failures)} failed requests.") return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() else: - resp = client.get(req_url) resp.raise_for_status() return _get_resp_data(resp) @@ -388,14 +427,14 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) convertType = args.pop("convertType", False) args = {k: v for k, v in args.items() if v is not None} - req_url = _construct_api_requests(**args) - return_list = _walk_pages(req_url, max_results) + req = _construct_api_requests(**args) + return_list = _walk_pages(req, max_results) return_list = _deal_with_empty(return_list, properties, service) if convertType: return_list = _cleanup_cols(return_list, service=service) return_list = _rejigger_cols(return_list, properties, output_id) # 
Metadata - return_list.attrs.update(request=req_url, queryTime=pd.Timestamp.now()) + return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) return return_list From 3289982351dc95be62e4d194468f7cbbb9361e8b Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 09:09:25 -0500 Subject: [PATCH 08/28] add documentation --- dataretrieval/waterdata_helpers.py | 72 +++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 331194a..17a8030 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -334,6 +334,21 @@ def _construct_api_requests( return req def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: + """ + Handles empty DataFrame results by returning a DataFrame with appropriate columns. + + If `return_list` is empty, determines the column names to use: + - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service. + - Otherwise, uses the provided `properties` list as column names. + + Args: + return_list (pd.DataFrame): The DataFrame to check for emptiness. + properties (Optional[List[str]]): List of property names to use as columns, or None. + service (str): The service endpoint to query for schema properties if needed. + + Returns: + pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. + """ if return_list.empty: if not properties or all(pd.isna(properties)): schema = _check_OGC_requests(endpoint=service, req_type="schema") @@ -342,6 +357,23 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], return return_list def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: + """ + Rearranges and renames columns in a DataFrame based on provided properties and output identifier. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame whose columns are to be rearranged or renamed. + properties : Optional[List[str]] + A list of column names to possibly rename. If None or contains only NaN, the function will rename 'id' to output_id. + output_id : str + The name to which the 'id' column should be renamed if applicable. + + Returns + ------- + pd.DataFrame + The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. + """ if properties and not all(pd.isna(properties)): if "id" not in properties: if output_id in properties: @@ -355,6 +387,27 @@ def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: return df.rename(columns={"id": output_id}) def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: + """ + Cleans and standardizes columns in a pandas DataFrame for water data endpoints. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing water data. + service : str, optional + The type of water data service (default is "daily"). + + Returns + ------- + pd.DataFrame + The cleaned DataFrame with standardized columns. + + Notes + ----- + - If the 'qualifier' column exists, lists are joined into comma-separated strings. + - If the 'time' column exists and service is "daily", it is converted to date objects. + - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. 
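A before/after sketch of the conversions listed in the Notes above, for the "daily" service (hypothetical values):

```python
# before _cleanup_cols: time="2024-01-01" (str), value="1.2" (str), qualifier=["e", "A"]
# after  _cleanup_cols: time=datetime.date(2024, 1, 1), value=1.2 (float),
#                       qualifier="e, A"  (list joined into a comma-separated string)
```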
+ """ if "qualifier" in df.columns: df["qualifier"] = df["qualifier"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x) if "time" in df.columns and service == "daily": @@ -364,7 +417,24 @@ def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: df[col] = pd.to_numeric(df[col], errors="coerce") return df -def _next_req_url(resp: httpx.Response, req_url: str) -> Optional[str]: +def _next_req_url(resp: httpx.Response) -> Optional[str]: + """ + Extracts the URL for the next page of results from an HTTP response from a water data endpoint. + + Parameters: + resp (httpx.Response): The HTTP response object containing JSON data and headers. + + Returns: + Optional[str]: The URL for the next page of results if available, otherwise None. + + Side Effects: + If the environment variable "API_USGS_PAT" is set, prints the remaining requests for the current hour. + Prints the next URL if found. + + Notes: + - Expects the response JSON to contain a "links" list with objects having "rel" and "href" keys. + - Checks for the "next" relation in the "links" to determine the next URL. + """ body = resp.json() if not body.get("numberReturned"): return None From 867d7283f550b11743e063f74c8b2cad43a39a80 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 13:30:10 -0500 Subject: [PATCH 09/28] add more documentation, correct waterdata module --- dataretrieval/waterdata.py | 4 +- dataretrieval/waterdata_helpers.py | 90 ++++++++++++++++++++++++++---- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index cb0ec59..a237695 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -104,7 +104,7 @@ def get_daily( } args["convertType"] = False - return waterdata_helpers._get_ogc_data(args, output_id, service) + return waterdata_helpers.get_ogc_data(args, output_id, service) # def get_monitoring_locations(): # service = "monitoring-locations" @@ -117,7 +117,7 @@ def get_daily( # } # args["convertType"] = False -# return _get_ogc_data(args, output_id, service) +# return waterdata_helpers.get_ogc_data(args, output_id, service) # def get_ts_meta(): diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 17a8030..d73ba3c 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -404,12 +404,9 @@ def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: Notes ----- - - If the 'qualifier' column exists, lists are joined into comma-separated strings. - If the 'time' column exists and service is "daily", it is converted to date objects. - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. """ - if "qualifier" in df.columns: - df["qualifier"] = df["qualifier"].apply(lambda x: ", ".join(x) if isinstance(x, list) else x) if "time" in df.columns and service == "daily": df["time"] = pd.to_datetime(df["time"]).dt.date for col in ["value", "contributing_drainage_area"]: @@ -449,16 +446,58 @@ def _next_req_url(resp: httpx.Response) -> Optional[str]: return None def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: + """ + Extracts and normalizes data from an httpx.Response object containing GeoJSON features. + + Parameters: + resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key. + + Returns: + pd.DataFrame: A pandas DataFrame containing the normalized feature properties. + Returns an empty DataFrame if no features are returned. 
+ + Notes: + - Drops columns "type", "geometry", and "AsGeoJSON(geometry)" if present. + - Flattens nested properties and removes the "properties_" prefix from column names. + """ body = resp.json() if not body.get("numberReturned"): return pd.DataFrame() - df = pd.DataFrame(body.get("features", [])) - for col in ["geometry", "AsGeoJSON(geometry)"]: - if col in df.columns: - df = df.drop(columns=[col]) + df = pd.json_normalize( + resp.json()["features"], + sep="_") + df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") + df.columns = [col.replace("properties_", "") for col in df.columns] return df def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: + """ + Iterates through paginated API responses and aggregates the results into a single DataFrame. + + Parameters + ---------- + req : httpx.Request + The initial HTTP request to send. + max_results : Optional[int] + The maximum number of results to retrieve. If None or NaN, retrieves all available pages. + client : Optional[httpx.Client], default None + An optional HTTP client to use for requests. If not provided, a new client is created. + + Returns + ------- + pd.DataFrame + A DataFrame containing the aggregated results from all pages. + + Raises + ------ + Exception + If a request fails or returns a non-200 status code. + + Notes + ----- + - If `max_results` is None or NaN, the function will continue to request subsequent pages until no more pages are available. + - Failed requests are tracked and reported, but do not halt the entire process unless the initial request fails. + """ print(f"Requesting:\n{req.url}") # Get first response from client @@ -469,7 +508,7 @@ if max_results is None or pd.isna(max_results): dfs = [] - curr_url = _next_req_url(resp, req.url) + curr_url = _next_req_url(resp) failures = [] while curr_url: try: @@ -477,7 +516,7 @@ if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) dfs.append(df1) - curr_url = _next_req_url(resp, curr_url) + curr_url = _next_req_url(resp) except Exception: failures.append(curr_url) curr_url = None @@ -489,21 +528,50 @@ return _get_resp_data(resp) def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: - args = args.copy() # Don't mutate input + """ + Retrieves OGC (Open Geospatial Consortium) data from a specified water data endpoint and returns it as a pandas DataFrame. + + This function prepares request arguments, constructs API requests, handles pagination, processes the results, + and formats the output DataFrame according to the specified parameters. + + Args: + args (Dict[str, Any]): Dictionary of request arguments for the OGC service. + output_id (str): The name of the output identifier to use in the request. + service (str): The water data collection to query (e.g., "daily", "monitoring-locations"). + + Returns: + pd.DataFrame: A DataFrame containing the retrieved and processed OGC data, with metadata attributes + including the request URL and query timestamp. + + Notes: + - The function does not mutate the input `args` dictionary. + - Handles optional arguments such as `max_results` and `convertType`. + - Applies column cleanup and reordering based on service and properties.
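When a filter carries multiple values, the request helpers switch to a POST with a CQL2-style JSON body; based on _cql2_param and the json={"params": ...} call above, that body is shaped like this (hypothetical values; the exact shape the API accepts is still being worked out in these commits):

```python
body = {
    "params": [
        {"property": "parameter_code", "parameter": ["00060", "00065"]},
    ]
}
# sent with the header: Content-Type: application/query-cql-json
```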
+ - Metadata is attached to the DataFrame via the `.attrs` attribute. + """ + args = args.copy() + # Add service as an argument args["service"] = service + # Pull out a max results input if exists max_results = args.pop("max_results", None) + # Switch the input id to "id" if needed args = _switch_arg_id(args, id_name=output_id, service=service) properties = args.get("properties") + # Switch properties id to "id" if needed args["properties"] = _switch_properties_id(properties, id_name=output_id, service=service) convertType = args.pop("convertType", False) + # Create fresh dictionary of args without any None values args = {k: v for k, v in args.items() if v is not None} + # Build API request req = _construct_api_requests(**args) + # Run API request and iterate through pages if needed return_list = _walk_pages(req, max_results) + # Manage some aspects of the returned dataset return_list = _deal_with_empty(return_list, properties, service) if convertType: return_list = _cleanup_cols(return_list, service=service) return_list = _rejigger_cols(return_list, properties, output_id) - # Metadata + # Add metadata return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) return return_list From 44213b58378cb24796d28f4c3ff028be96253400 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 14:24:57 -0500 Subject: [PATCH 10/28] allow post and get calls in recursive walk pages, fix typo where first page not downloading, start to add more function outlines --- dataretrieval/waterdata.py | 59 +++++++++++++++++++++++------- dataretrieval/waterdata_helpers.py | 15 ++++++-- 2 files changed, 57 insertions(+), 17 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index a237695..10887af 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -106,24 +106,57 @@ def get_daily( return waterdata_helpers.get_ogc_data(args, output_id, service) -# def get_monitoring_locations(): -# service = "monitoring-locations" -# output_id = "monitoring_location_id" +def get_monitoring_locations() -> pd.DataFrame: + service = "monitoring-locations" + output_id = "monitoring_location_id" -# # Build argument dictionary, omitting None values -# args = { -# k: v for k, v in locals().items() -# if k not in {"service", "output_id"} and v is not None -# } -# args["convertType"] = False + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) + +def get_ts_meta() -> pd.DataFrame: + service = "time-series-metadata" + output_id = "time_series_id" + + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) -# return waterdata_helpers.get_ogc_data(args, output_id, service) +def get_latest_continuous() -> pd.DataFrame: + service = "latest-continuous" + output_id = "latest_continuous_id" -# def get_ts_meta(): + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) -# def get_latest_continuous(): +def get_field_measurements() -> pd.DataFrame: + service = "field-measurements" + 
output_id = "field_measurement_id" -# def get_field_measurements(): + # Build argument dictionary, omitting None values + args = { + k: v for k, v in locals().items() + if k not in {"service", "output_id"} and v is not None + } + args["convertType"] = False + + return waterdata_helpers.get_ogc_data(args, output_id, service) def get_codes(code_service: _CODE_SERVICES) -> DataFrame: """Return codes from a Samples code service. diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index d73ba3c..632405f 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -506,23 +506,30 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional resp = client.send(req) if resp.status_code != 200: raise Exception(_error_body(resp)) + # Grab some aspects of the original request: headers and the + # request type (GET or POST) + method = req.method.upper() + headers = req.headers + content = req.content if method == "POST" else None + if max_results is None or pd.isna(max_results): - dfs = [] + dfs = _get_resp_data(resp) curr_url = _next_req_url(resp) failures = [] while curr_url: try: - resp = client.get(curr_url, headers=_default_headers()) + resp = client.request(method, curr_url, headers=headers, content=content if method == "POST" else None) if resp.status_code != 200: raise Exception(_error_body(resp)) df1 = _get_resp_data(resp) - dfs.append(df1) + dfs = pd.concat([dfs, df1], ignore_index=True) + #dfs.append(df1) curr_url = _next_req_url(resp) except Exception: failures.append(curr_url) curr_url = None if failures: print(f"There were {len(failures)} failed requests.") - return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame() + return dfs else: resp.raise_for_status() return _get_resp_data(resp) From 4affa2f41048802e9f1fd72eaa535658ec136719 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 15:35:02 -0500 Subject: [PATCH 11/28] add in all possible arguments --- dataretrieval/waterdata.py | 121 +++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 5 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 10887af..a97830a 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -93,7 +93,7 @@ def get_daily( max_results: Optional[int] = None, convertType: bool = True ) -> pd.DataFrame: - + service = "daily" output_id = "daily_id" @@ -106,7 +106,55 @@ def get_daily( return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_monitoring_locations() -> pd.DataFrame: +def get_monitoring_locations( + monitoring_location_id: Optional[List[str]] = None, + agency_code: Optional[List[str]] = None, + agency_name: Optional[List[str]] = None, + monitoring_location_number: Optional[List[str]] = None, + monitoring_location_name: Optional[List[str]] = None, + district_code: Optional[List[str]] = None, + country_code: Optional[List[str]] = None, + country_name: Optional[List[str]] = None, + state_code: Optional[List[str]] = None, + state_name: Optional[List[str]] = None, + county_code: Optional[List[str]] = None, + county_name: Optional[List[str]] = None, + minor_civil_division_code: Optional[List[str]] = None, + site_type_code: Optional[List[str]] = None, + site_type: Optional[List[str]] = None, + hydrologic_unit_code: Optional[List[str]] = None, + basin_code: Optional[List[str]] = None, + altitude: Optional[List[str]] = None, + altitude_accuracy: Optional[List[str]] = None, + altitude_method_code: Optional[List[str]] = 
None, + altitude_method_name: Optional[List[str]] = None, + vertical_datum: Optional[List[str]] = None, + vertical_datum_name: Optional[List[str]] = None, + horizontal_positional_accuracy_code: Optional[List[str]] = None, + horizontal_positional_accuracy: Optional[List[str]] = None, + horizontal_position_method_code: Optional[List[str]] = None, + horizontal_position_method_name: Optional[List[str]] = None, + original_horizontal_datum: Optional[List[str]] = None, + original_horizontal_datum_name: Optional[List[str]] = None, + drainage_area: Optional[List[str]] = None, + contributing_drainage_area: Optional[List[str]] = None, + time_zone_abbreviation: Optional[List[str]] = None, + uses_daylight_savings: Optional[List[str]] = None, + construction_date: Optional[List[str]] = None, + aquifer_code: Optional[List[str]] = None, + national_aquifer_code: Optional[List[str]] = None, + aquifer_type_code: Optional[List[str]] = None, + well_constructed_depth: Optional[List[str]] = None, + hole_constructed_depth: Optional[List[str]] = None, + depth_source_code: Optional[List[str]] = None, + properties: Optional[List[str]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: service = "monitoring-locations" output_id = "monitoring_location_id" @@ -119,7 +167,32 @@ def get_monitoring_locations() -> pd.DataFrame: return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_ts_meta() -> pd.DataFrame: +def get_timeseries_metadata( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + parameter_name: Optional[Union[str, List[str]]] = None, + properties: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + last_modified: Optional[Union[str, List[str]]] = None, + begin: Optional[Union[str, List[str]]] = None, + end: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + computation_period_identifier: Optional[Union[str, List[str]]] = None, + computation_identifier: Optional[Union[str, List[str]]] = None, + thresholds: Optional[int] = None, + sublocation_identifier: Optional[Union[str, List[str]]] = None, + primary: Optional[Union[str, List[str]]] = None, + parent_time_series_id: Optional[Union[str, List[str]]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + web_description: Optional[Union[str, List[str]]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True +) -> pd.DataFrame: + service = "time-series-metadata" output_id = "time_series_id" @@ -132,7 +205,25 @@ def get_ts_meta() -> pd.DataFrame: return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_latest_continuous() -> pd.DataFrame: +def get_latest_continuous( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + statistic_id: Optional[Union[str, List[str]]] = None, + properties: Optional[Union[str, List[str]]] = None, + time_series_id: Optional[Union[str, List[str]]] = None, + latest_continuous_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: 
Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[int] = None, + last_modified: Optional[Union[str, List[str]]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: service = "latest-continuous" output_id = "latest_continuous_id" @@ -145,7 +236,27 @@ def get_latest_continuous() -> pd.DataFrame: return waterdata_helpers.get_ogc_data(args, output_id, service) -def get_field_measurements() -> pd.DataFrame: +def get_field_measurements( + monitoring_location_id: Optional[Union[str, List[str]]] = None, + parameter_code: Optional[Union[str, List[str]]] = None, + observing_procedure_code: Optional[Union[str, List[str]]] = None, + properties: Optional[List[str]] = None, + field_visit_id: Optional[Union[str, List[str]]] = None, + approval_status: Optional[Union[str, List[str]]] = None, + unit_of_measure: Optional[Union[str, List[str]]] = None, + qualifier: Optional[Union[str, List[str]]] = None, + value: Optional[Union[str, List[str]]] = None, + last_modified: Optional[Union[str, List[str]]] = None, + observing_procedure: Optional[Union[str, List[str]]] = None, + vertical_datum: Optional[Union[str, List[str]]] = None, + measuring_agency: Optional[Union[str, List[str]]] = None, + skipGeometry: Optional[bool] = None, + time: Optional[Union[str, List[str]]] = None, + bbox: Optional[List[float]] = None, + limit: Optional[int] = None, + max_results: Optional[int] = None, + convertType: bool = True + ) -> pd.DataFrame: service = "field-measurements" output_id = "field_measurement_id" From 21691d0b8657a48c05d369c6aecb1096723eaca5 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 19 Sep 2025 16:57:14 -0500 Subject: [PATCH 12/28] trying to get cql2 query correct, will keep at it --- dataretrieval/waterdata_helpers.py | 187 ++++++++++++++++------------- 1 file changed, 101 insertions(+), 86 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 632405f..ab6e4bf 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -6,6 +6,7 @@ import pytz import pandas as pd import numpy as np +import json from datetime import datetime from zoneinfo import ZoneInfo import re @@ -193,10 +194,23 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) def _explode_post(ls: Dict[str, Any]): return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} -def _cql2_param(parameter: Dict[str, List[str]]): - property_name = next(iter(parameter)) - parameters = [str(x) for x in parameter[property_name]] - return {"property": property_name, "parameter": parameters} +def _cql2_param(args): + filters = [] + for key, values in args.items(): + filters.append({ + "op": "in", + "args": [ + {"property": key}, + values + ] + }) + + query = { + "op": "and", + "args": filters + } + + return json.dumps(query, indent=4) def _default_headers(): """ @@ -328,92 +342,12 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" - req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + #req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) + req = 
httpx.Request(method="POST", url=baseURL, headers=headers, data=_cql2_param(post_params), params=params) else: req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) return req -def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: - """ - Handles empty DataFrame results by returning a DataFrame with appropriate columns. - - If `return_list` is empty, determines the column names to use: - - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service. - - Otherwise, uses the provided `properties` list as column names. - - Args: - return_list (pd.DataFrame): The DataFrame to check for emptiness. - properties (Optional[List[str]]): List of property names to use as columns, or None. - service (str): The service endpoint to query for schema properties if needed. - - Returns: - pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. - """ - if return_list.empty: - if not properties or all(pd.isna(properties)): - schema = _check_OGC_requests(endpoint=service, req_type="schema") - properties = list(schema.get("properties", {}).keys()) - return pd.DataFrame(columns=properties) - return return_list - -def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: - """ - Rearranges and renames columns in a DataFrame based on provided properties and output identifier. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame whose columns are to be rearranged or renamed. - properties : Optional[List[str]] - A list of column names to possibly rename. If None or contains only NaN, the function will rename 'id' to output_id. - output_id : str - The name to which the 'id' column should be renamed if applicable. - - Returns - ------- - pd.DataFrame - The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. - """ - if properties and not all(pd.isna(properties)): - if "id" not in properties: - if output_id in properties: - df = df.rename(columns={"id": output_id}) - else: - plural = output_id.replace("_id", "s_id") - if plural in properties: - df = df.rename(columns={"id": plural}) - return df.loc[:, [col for col in properties if col in df.columns]] - else: - return df.rename(columns={"id": output_id}) - -def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: - """ - Cleans and standardizes columns in a pandas DataFrame for water data endpoints. - - Parameters - ---------- - df : pd.DataFrame - The input DataFrame containing water data. - service : str, optional - The type of water data service (default is "daily"). - - Returns - ------- - pd.DataFrame - The cleaned DataFrame with standardized columns. - - Notes - ----- - - If the 'time' column exists and service is "daily", it is converted to date objects. - - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. - """ - if "time" in df.columns and service == "daily": - df["time"] = pd.to_datetime(df["time"]).dt.date - for col in ["value", "contributing_drainage_area"]: - if col in df.columns: - df[col] = pd.to_numeric(df[col], errors="coerce") - return df - def _next_req_url(resp: httpx.Response) -> Optional[str]: """ Extracts the URL for the next page of results from an HTTP response from a water data endpoint. 
@@ -534,6 +468,87 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional resp.raise_for_status() return _get_resp_data(resp) +def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: + """ + Handles empty DataFrame results by returning a DataFrame with appropriate columns. + + If `return_list` is empty, determines the column names to use: + - If `properties` is not provided or contains only NaN values, retrieves the schema properties from the specified service. + - Otherwise, uses the provided `properties` list as column names. + + Args: + return_list (pd.DataFrame): The DataFrame to check for emptiness. + properties (Optional[List[str]]): List of property names to use as columns, or None. + service (str): The service endpoint to query for schema properties if needed. + + Returns: + pd.DataFrame: The original DataFrame if not empty, otherwise an empty DataFrame with the appropriate columns. + """ + if return_list.empty: + if not properties or all(pd.isna(properties)): + schema = _check_OGC_requests(endpoint=service, req_type="schema") + properties = list(schema.get("properties", {}).keys()) + return pd.DataFrame(columns=properties) + return return_list + +def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: + """ + Rearranges and renames columns in a DataFrame based on provided properties and output identifier. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame whose columns are to be rearranged or renamed. + properties : Optional[List[str]] + A list of column names to possibly rename. If None or contains only NaN, the function will rename 'id' to output_id. + output_id : str + The name to which the 'id' column should be renamed if applicable. + + Returns + ------- + pd.DataFrame + The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. + """ + if properties and not all(pd.isna(properties)): + if "id" not in properties: + if output_id in properties: + df = df.rename(columns={"id": output_id}) + else: + plural = output_id.replace("_id", "s_id") + if plural in properties: + df = df.rename(columns={"id": plural}) + return df.loc[:, [col for col in properties if col in df.columns]] + else: + return df.rename(columns={"id": output_id}) + +def _cleanup_cols(df: pd.DataFrame, service: str = "daily") -> pd.DataFrame: + """ + Cleans and standardizes columns in a pandas DataFrame for water data endpoints. + + Parameters + ---------- + df : pd.DataFrame + The input DataFrame containing water data. + service : str, optional + The type of water data service (default is "daily"). + + Returns + ------- + pd.DataFrame + The cleaned DataFrame with standardized columns. + + Notes + ----- + - If the 'time' column exists and service is "daily", it is converted to date objects. + - The 'value' and 'contributing_drainage_area' columns are coerced to numeric types. + """ + if "time" in df.columns and service == "daily": + df["time"] = pd.to_datetime(df["time"]).dt.date + for col in ["value", "contributing_drainage_area"]: + if col in df.columns: + df[col] = pd.to_numeric(df[col], errors="coerce") + return df + def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataFrame: """ Retrieves OGC (Open Geospatial Consortium) data from a specified water data endpoint and returns it as a pandas DataFrame. 
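Patches 12 and 13 move multi-value filters out of the URL and into a CQL2 JSON document POSTed with the `application/query-cql-json` content type. Below is a minimal sketch of the filter body that `_cql2_param` produces, mirroring the helper as it stands after patch 13; the site and parameter values are illustrative only:

```python
import json

def _cql2_param(args):
    # One "in" clause per property, combined with a single "and",
    # matching the CQL2 JSON structure used in waterdata_helpers.
    filters = [
        {"op": "in", "args": [{"property": key}, list(values)]}
        for key, values in args.items()
    ]
    return json.dumps({"op": "and", "args": filters}, indent=4)

# Multi-value filters that _construct_api_requests routes into the POST
# body; single-value parameters stay in the URL query string.
post_params = {
    "monitoring_location_id": ["USGS-05114000", "USGS-09423350"],
    "parameter_code": ["00060", "00010"],
}

print(_cql2_param(post_params))
# The body is sent with Content-Type: application/query-cql-json, e.g.
#   httpx.Request("POST", url, headers=headers,
#                 content=_cql2_param(post_params), params=params)
```

Each property becomes one `in` clause, and the clauses are ANDed together, which is why only parameters carrying more than one value need the POST path at all.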
From 4c2a3eef75a282b38b9febeb6bfd035bc9e67492 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Sep 2025 12:59:14 -0500 Subject: [PATCH 13/28] correct cql2 queries --- dataretrieval/waterdata_helpers.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index ab6e4bf..166df13 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -191,9 +191,6 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) else: raise ValueError("datetime_input should only include 1-2 values") -def _explode_post(ls: Dict[str, Any]): - return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} - def _cql2_param(args): filters = [] for key, values in args.items(): @@ -313,12 +310,11 @@ def _construct_api_requests( if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") - # Create post calls for any input parameters that are not in the single_params list - # and have more than one element associated with the list or tuple. - post_params = _explode_post({ - k: v for k, v in kwargs.items() - if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 - }) + # Identify which parameters should be included in the POST content body + post_params = { + k: v for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + } # Indicate if function needs to perform POST conversion POST = bool(post_params) @@ -343,7 +339,7 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" #req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) - req = httpx.Request(method="POST", url=baseURL, headers=headers, data=_cql2_param(post_params), params=params) + req = httpx.Request(method="POST", url=baseURL, headers=headers, content=_cql2_param(post_params), params=params) else: req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) return req @@ -616,4 +612,7 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # url = f"{_base_url()}openapi?f=json" # resp = httpx.get(url, headers=_default_headers()) # resp.raise_for_status() -# return resp.json() \ No newline at end of file +# return resp.json() + +# def _explode_post(ls: Dict[str, Any]): +# return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} \ No newline at end of file From 14f283025198aa4efd60214f07b8b02201f2e729 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Mon, 22 Sep 2025 13:18:24 -0500 Subject: [PATCH 14/28] simplify syntax, remove unneeded dependencies --- dataretrieval/waterdata_helpers.py | 31 ++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 166df13..164fdfc 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -3,9 +3,7 @@ import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime -import pytz import pandas as pd -import numpy as np import json from datetime import datetime from zoneinfo import ZoneInfo @@ -301,7 +299,23 @@ def 
_construct_api_requests( baseURL = _setup_api(service) # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} - params = {k: v for k, v in kwargs.items() if k in single_params} + # params = {k: v for k, v in kwargs.items() if k in single_params} + # # Set skipGeometry parameter + # params["skipGeometry"] = skipGeometry + # # If limit is none and max_results is not none, then set limit to max results. Otherwise, + # # if max_results is none, set it to 10000 (the API max). + # params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + # if max_results is not None and limit is not None and limit > max_results: + # raise ValueError("limit cannot be greater than max_result") + + # Identify which parameters should be included in the POST content body + post_params = { + k: v for k, v in kwargs.items() + if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 + } + + # Everything else goes into the params dictionary for the URL + params = {k: v for k, v in kwargs.items() if k not in post_params} # Set skipGeometry parameter params["skipGeometry"] = skipGeometry # If limit is none and max_results is not none, then set limit to max results. Otherwise, @@ -309,12 +323,6 @@ def _construct_api_requests( params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") - - # Identify which parameters should be included in the POST content body - post_params = { - k: v for k, v in kwargs.items() - if k not in single_params and isinstance(v, (list, tuple)) and len(v) > 1 - } # Indicate if function needs to perform POST conversion POST = bool(post_params) @@ -325,7 +333,7 @@ def _construct_api_requests( if i in params: dates = service == "daily" and i != "last_modified" params[i] = _format_api_dates(params[i], date=dates) - kwargs[i] = _format_api_dates(kwargs[i], date=dates) + #kwargs[i] = _format_api_dates(kwargs[i], date=dates) # String together bbox elements from a list to a comma-separated string, # and string together properties if provided @@ -338,10 +346,9 @@ def _construct_api_requests( if POST: headers["Content-Type"] = "application/query-cql-json" - #req = httpx.Request(method="POST", url=baseURL, headers=headers, json={"params": list(post_params.values())}, params=params) req = httpx.Request(method="POST", url=baseURL, headers=headers, content=_cql2_param(post_params), params=params) else: - req = httpx.Request(method="GET", url=baseURL, headers=headers, params={**params, **{k: v for k, v in kwargs.items() if k not in single_params}}) + req = httpx.Request(method="GET", url=baseURL, headers=headers, params=params) return req def _next_req_url(resp: httpx.Response) -> Optional[str]: From d25f854a77ee20dd13d572f9b4dc92c274268136 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Wed, 24 Sep 2025 15:26:36 -0500 Subject: [PATCH 15/28] start adding function documentation --- dataretrieval/waterdata.py | 598 ++++++++++++++++++++++++++++++++++++- 1 file changed, 596 insertions(+), 2 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index a97830a..4ff056a 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -93,7 +93,142 @@ def get_daily( max_results: Optional[int] = None, convertType: bool = True ) -> pd.DataFrame: + """Daily data provide one data value to represent water 
conditions for the day.
+ Throughout much of the history of the USGS, the primary water data available was
+ daily data collected manually at the monitoring location once each day. With
+ improved availability of computer storage and automated transmission of data, the
+ daily data published today are generally a statistical summary or metric of the
+ continuous data collected each day, such as the daily mean, minimum, or maximum
+ value. Daily data are automatically calculated from the continuous data of the same
+ parameter code and are described by parameter code and a statistic code. These data
+ have also been referred to as “daily values” or “DV”.
+
+ Parameters
+ ----------
+ monitoring_location_id : string or list of strings, optional
+ A unique identifier representing a single monitoring location. This
+ corresponds to the id field in the monitoring-locations endpoint.
+ Monitoring location IDs are created by combining the agency code of
+ the agency responsible for the monitoring location (e.g. USGS) with
+ the ID number of the monitoring location (e.g. 02238500), separated
+ by a hyphen (e.g. USGS-02238500).
+ parameter_code : string or list of strings, optional
+ Parameter codes are 5-digit codes used to identify the constituent
+ measured and the units of measure. A complete list of parameter
+ codes and associated groupings can be found at
+ https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+ statistic_id : string or list of strings, optional
+ A code corresponding to the statistic an observation represents.
+ Example codes include 00001 (max), 00002 (min), and 00003 (mean).
+ A complete list of codes and their descriptions can be found at
+ https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
+ properties : string or list of strings, optional
+ A list of requested columns to be returned from the query.
+ Available options are: geometry, id, time_series_id,
+ monitoring_location_id, parameter_code, statistic_id, time, value,
+ unit_of_measure, approval_status, qualifier, last_modified.
+ time_series_id : string or list of strings, optional
+ A unique identifier representing a single time series. This
+ corresponds to the id field in the time-series-metadata endpoint.
+ daily_id : string or list of strings, optional
+ A universally unique identifier (UUID) representing a single
+ version of a record. It is not stable over time. Every time the
+ record is refreshed in our database (which may happen as part of
+ normal operations and does not imply any change to the data itself)
+ a new ID will be generated. To uniquely identify a single observation
+ over time, compare the time and time_series_id fields; each time series
+ will only have a single observation at a given time.
+ approval_status : string or list of strings, optional
+ Some of the data that you have obtained from this U.S. Geological
+ Survey database may not have received Director's approval. Any such
+ data values are qualified as provisional and are subject to revision.
+ Provisional data are released on the condition that neither the USGS
+ nor the United States Government may be held liable for any damages
+ resulting from its use. This field reflects the approval status of
+ each record, and is either "Approved", meaning processing review has
+ been completed and the data is approved for publication, or
+ "Provisional", meaning the data is still subject to revision. For more
+ information about provisional data, go to
+ https://waterdata.usgs.gov/provisional-data-statement/.
+ unit_of_measure : string or list of strings, optional
+ A human-readable description of the units of measurement associated
+ with an observation.
+ qualifier : string or list of strings, optional
+ This field indicates any qualifiers associated with an observation, for
+ instance if a sensor may have been impacted by ice or if values were
+ estimated.
+ value : string or list of strings, optional
+ The value of the observation. Values are transmitted as strings in
+ the JSON response format in order to preserve precision.
+ last_modified : string, optional
+ The last time a record was refreshed in our database. This may happen
+ due to regular operational processes and does not necessarily indicate
+ anything about the measurement has changed. You can query this field
+ using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+ duration objects. Intervals may be bounded or half-bounded (double-dots
+ at start or end). Examples:
+ - A date-time: "2018-02-12T23:20:50Z"
+ - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+ - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+ - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+ Only features that have a last_modified that intersects the value of datetime are selected.
+ skipGeometry : boolean, optional
+ This option can be used to skip response geometries for each feature. The returned
+ object will be a data frame with no spatial information.
+ time : string, optional
+ The date an observation represents. You can query this field using date-times
+ or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
+ Intervals may be bounded or half-bounded (double-dots at start or end).
+ Examples:
+ - A date-time: "2018-02-12T23:20:50Z"
+ - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+ - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+ - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+ Only features that have a time that intersects the value of datetime are selected. If
+ a feature has multiple temporal properties, it is the decision of the server whether
+ only a single temporal property is used to determine the extent or all relevant temporal properties.
+ bbox : list of numbers, optional
+ Only features that have a geometry that intersects the bounding box are selected.
+ The bounding box is provided as four or six numbers, depending on whether the
+ coordinate reference system includes a vertical axis (height or depth). Coordinates
+ are assumed to be in CRS 4326. The expected format is a list structured:
+ [xmin, ymin, xmax, ymax], that is, [Western-most longitude,
+ Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+ limit : numeric, optional
+ The optional limit parameter is used to control the subset of the selected features
+ that should be returned in each page. The maximum allowable limit is 10000. It may
+ be beneficial to set this number lower if your internet connection is spotty. The
+ default (None) will set the limit to the maximum allowable limit for the service.
+ max_results : numeric, optional
+ The optional maximum number of rows to return. If both are provided, limit
+ cannot be greater than max_results.
+ convertType : boolean, optional
+ If True, the function will convert the time column to dates and
+ coerce value columns to numeric.
+
+ Returns
+ -------
+ df : ``pandas.DataFrame``
+ Formatted data returned from the API query.
+
+ Examples
+ --------
+ .. code::
+
+ >>> # Get daily flow data from a single site
+ >>> # over a yearlong period
+ >>> df = dataretrieval.waterdata.get_daily(
+ ... monitoring_location_id = "USGS-02238500",
+ ... parameter_code = "00060",
+ ... time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
+ ... )
+
+ >>> # Get approved daily data for specific sites
+ >>> df = dataretrieval.waterdata.get_daily(
+ ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
+ ... approval_status = "Approved",
+ ... time = "2024-01-01/.."
+ ... )
+ """
 service = "daily"
 output_id = "daily_id"

@@ -154,7 +289,232 @@ def get_monitoring_locations(
 limit: Optional[int] = None,
 max_results: Optional[int] = None,
 convertType: bool = True
- ) -> pd.DataFrame:
+ ) -> pd.DataFrame:
+ """Location information is basic information about the monitoring location
+ including the name, identifier, agency responsible for data collection, and
+ the date the location was established. It also includes information about
+ the type of location, such as stream, lake, or groundwater, and geographic
+ information about the location, such as state, county, latitude and longitude,
+ and hydrologic unit code (HUC).
+
+ Parameters
+ ----------
+ monitoring_location_id : string or list of strings, optional
+ A unique identifier representing a single monitoring location. This
+ corresponds to the id field in the monitoring-locations endpoint.
+ Monitoring location IDs are created by combining the agency code of
+ the agency responsible for the monitoring location (e.g. USGS) with
+ the ID number of the monitoring location (e.g. 02238500), separated
+ by a hyphen (e.g. USGS-02238500).
+ agency_code : string or list of strings, optional
+ The agency that is reporting the data. Agency codes are fixed values
+ assigned by the National Water Information System (NWIS). A list of
+ agency codes is available at this link.
+ agency_name : string or list of strings, optional
+ The name of the agency that is reporting the data.
+ monitoring_location_number : string or list of strings, optional
+ Each monitoring location in the USGS data base has a unique 8- to
+ 15-digit identification number. Monitoring location numbers are
+ assigned based on this logic.
+ monitoring_location_name : string or list of strings, optional
+ This is the official name of the monitoring location in the database.
+ For well information this can be a district-assigned local number.
+ district_code : string or list of strings, optional
+ The Water Science Centers (WSCs) across the United States use the FIPS
+ state code as the district code. In some cases, monitoring locations and
+ samples may be managed by a water science center that is adjacent to the
+ state in which the monitoring location actually resides. For example, a
+ monitoring location may have a district code of 30, which translates to
+ Montana, but the state code could be 56 for Wyoming because that is where
+ the monitoring location actually is located.
+ country_code : string or list of strings, optional
+ The code for the country in which the monitoring location is located.
+ country_name : string or list of strings, optional
+ The name of the country in which the monitoring location is located.
+ state_code : string or list of strings, optional
+ State code. A two-digit ANSI code (formerly FIPS code) as defined by
+ the American National Standards Institute, to define States and
+ equivalents. A three-digit ANSI code is used to define counties and
+ county equivalents. A lookup table is available. The only countries with
+ political subdivisions other than the US are Mexico and Canada. The Mexican
+ states have US state codes ranging from 81-86 and Canadian provinces have
+ state codes ranging from 90-98.
+ state_name : string or list of strings, optional
+ The name of the state or state equivalent in which the monitoring location
+ is located.
+ county_code : string or list of strings, optional
+ The code for the county or county equivalent (parish, borough, etc.) in which
+ the monitoring location is located. A list of codes is available.
+ county_name : string or list of strings, optional
+ The name of the county or county equivalent (parish, borough, etc.) in which
+ the monitoring location is located. A list of codes is available.
+ minor_civil_division_code : string or list of strings, optional
+ Codes for primary governmental or administrative divisions of the county or
+ county equivalent in which the monitoring location is located.
+ site_type_code : string or list of strings, optional
+ A code describing the hydrologic setting of the monitoring location. A list of
+ codes is available.
+ Example: "US:15:001" (United States: Hawaii, Hawaii County)
+ site_type : string or list of strings, optional
+ A description of the hydrologic setting of the monitoring location. A list of
+ codes is available.
+ hydrologic_unit_code : string or list of strings, optional
+ The United States is divided and sub-divided into successively smaller
+ hydrologic units which are classified into four levels: regions,
+ sub-regions, accounting units, and cataloging units. The hydrologic units
+ are arranged within each other, from the smallest (cataloging units) to the
+ largest (regions). Each hydrologic unit is identified by a unique hydrologic
+ unit code (HUC) consisting of two to eight digits based on the four levels
+ of classification in the hydrologic unit system.
+ basin_code : string or list of strings, optional
+ The Basin Code or "drainage basin code" is a two-digit code that further
+ subdivides the 8-digit hydrologic-unit code. The drainage basin code is
+ defined by the USGS State Office where the monitoring location is located.
+ altitude : string or list of strings, optional
+ Altitude of the monitoring location referenced to the specified Vertical
+ Datum.
+ altitude_accuracy : string or list of strings, optional
+ Accuracy of the altitude, in feet. An accuracy of +/- 0.1 foot would be
+ entered as “.1”. Many altitudes are interpolated from the contours on
+ topographic maps; accuracies determined in this way are generally entered
+ as one-half of the contour interval.
+ altitude_method_code : string or list of strings, optional
+ Codes representing the method used to measure altitude. A list of codes is
+ available.
+ altitude_method_name : string or list of strings, optional
+ The name of the method used to measure altitude. A list of codes is
+ available.
+ vertical_datum : string or list of strings, optional
+ The datum used to determine altitude and vertical position at the
+ monitoring location. A list of codes is available.
+ vertical_datum_name : string or list of strings, optional
+ The datum used to determine altitude and vertical position at the
+ monitoring location. A list of codes is available.
+ horizontal_positional_accuracy_code : string or list of strings, optional + Indicates the accuracy of the latitude longitude values. A list of codes + is available. + horizontal_positional_accuracy : string or list of strings, optional + Indicates the accuracy of the latitude longitude values. A list of codes + is available. + horizontal_position_method_code : string or list of strings, optional + Indicates the method used to determine latitude longitude values. A + list of codes is available. + horizontal_position_method_name : string or list of strings, optional + Indicates the method used to determine latitude longitude values. A + list of codes is available. + original_horizontal_datum : string or list of strings, optional + Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System + 1984. This field indicates the original datum used to determine + coordinates before they were converted. A list of codes is available. + original_horizontal_datum_name : string or list of strings, optional + Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System + 1984. This field indicates the original datum used to determine coordinates + before they were converted. A list of codes is available. + drainage_area : string or list of strings, optional + The area enclosed by a topographic divide from which direct surface runoff + from precipitation normally drains by gravity into the stream above that + point. + contributing_drainage_area : string or list of strings, optional + The contributing drainage area of a lake, stream, wetland, or estuary + monitoring location, in square miles. This item should be present only if + the contributing area is different from the total drainage area. This + situation can occur when part of the drainage area consists of very porous + soil or depressions that either allow all runoff to enter the groundwater + or traps the water in ponds so that rainfall does not contribute to runoff. + A transbasin diversion can also affect the total drainage area. + time_zone_abbreviation : string or list of strings, optional + A short code describing the time zone used by a monitoring location. + uses_daylight_savings : string or list of strings, optional + A flag indicating whether or not a monitoring location uses daylight savings. + construction_date : string or list of strings, optional + Date the well was completed. + aquifer_code : string or list of strings, optional + Local aquifers in the USGS water resources data base are identified by a + geohydrologic unit code (a three-digit number related to the age of the + formation, followed by a 4 or 5 character abbreviation for the geologic unit + or aquifer name). Additional information is available at this link. + national_aquifer_code : string or list of strings, optional + National aquifers are the principal aquifers or aquifer systems in the United + States, defined as regionally extensive aquifers or aquifer systems that have + the potential to be used as a source of potable water. Not all groundwater + monitoring locations can be associated with a National Aquifer. Such + monitoring locations will not be retrieved using this search criteria. A list + of National aquifer codes and names is available. + aquifer_type_code : string or list of strings, optional + Groundwater occurs in aquifers under two different conditions. Where water + only partly fills an aquifer, the upper surface is free to rise and decline. + These aquifers are referred to as unconfined (or water-table) aquifers. 
Where
+ water completely fills an aquifer that is overlain by a confining bed, the
+ aquifer is referred to as a confined (or artesian) aquifer. When a confined
+ aquifer is penetrated by a well, the water level in the well will rise above
+ the top of the aquifer (but not necessarily above land surface). Additional
+ information is available at this link.
+ well_constructed_depth : string or list of strings, optional
+ The depth of the finished well, in feet below land surface datum. Note: Not
+ all groundwater monitoring locations have information on Well Depth. Such
+ monitoring locations will not be retrieved using this search criterion.
+ hole_constructed_depth : string or list of strings, optional
+ The total depth to which the hole is drilled, in feet below land surface datum.
+ Note: Not all groundwater monitoring locations have information on Hole Depth.
+ Such monitoring locations will not be retrieved using this search criterion.
+ depth_source_code : string or list of strings, optional
+ A code indicating the source of water-level data. A list of codes is available.
+ properties : string or list of strings, optional
+ A list of requested columns to be returned from the query. Available options
+ are: geometry, id, agency_code, agency_name, monitoring_location_number,
+ monitoring_location_name, district_code, country_code, country_name, state_code,
+ state_name, county_code, county_name, minor_civil_division_code, site_type_code,
+ site_type, hydrologic_unit_code, basin_code, altitude, altitude_accuracy,
+ altitude_method_code, altitude_method_name, vertical_datum, vertical_datum_name,
+ horizontal_positional_accuracy_code, horizontal_positional_accuracy,
+ horizontal_position_method_code, horizontal_position_method_name,
+ original_horizontal_datum, original_horizontal_datum_name, drainage_area,
+ contributing_drainage_area, time_zone_abbreviation, uses_daylight_savings,
+ construction_date, aquifer_code, national_aquifer_code, aquifer_type_code,
+ well_constructed_depth, hole_constructed_depth, depth_source_code.
+ bbox : list of numbers, optional
+ Only features that have a geometry that intersects the bounding box are selected.
+ The bounding box is provided as four or six numbers, depending on whether the
+ coordinate reference system includes a vertical axis (height or depth). Coordinates
+ are assumed to be in CRS 4326. The expected format is a list structured:
+ [xmin, ymin, xmax, ymax], that is, [Western-most longitude,
+ Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+ limit : numeric, optional
+ The optional limit parameter is used to control the subset of the selected features
+ that should be returned in each page. The maximum allowable limit is 10000. It may
+ be beneficial to set this number lower if your internet connection is spotty. The
+ default (None) will set the limit to the maximum allowable limit for the service.
+ max_results : numeric, optional
+ The optional maximum number of rows to return. If both are provided, limit
+ cannot be greater than max_results.
+ skipGeometry : boolean, optional
+ This option can be used to skip response geometries for each feature. The returned
+ object will be a data frame with no spatial information.
+
+ Returns
+ -------
+ df : ``pandas.DataFrame``
+ Formatted data returned from the API query.
+
+ Examples
+ --------
+ .. 
code::
+
+ >>> # Get monitoring locations within a bounding box
+ >>> # and leave out geometry
+ >>> df = dataretrieval.waterdata.get_monitoring_locations(
+ ... bbox=[-90.2,42.6,-88.7,43.2],
+ ... skipGeometry=True
+ ... )
+
+ >>> # Get monitoring location info for specific sites
+ >>> # and only specific properties
+ >>> df = dataretrieval.waterdata.get_monitoring_locations(
+ ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
+ ... properties = ["monitoring_location_id",
+ ... "state_name",
+ ... "country_name"])
+ """
 service = "monitoring-locations"
 output_id = "monitoring_location_id"

@@ -167,7 +527,7 @@

 return waterdata_helpers.get_ogc_data(args, output_id, service)

-def get_timeseries_metadata(
+def get_time_series_metadata(
 monitoring_location_id: Optional[Union[str, List[str]]] = None,
 parameter_code: Optional[Union[str, List[str]]] = None,
 parameter_name: Optional[Union[str, List[str]]] = None,
@@ -192,6 +552,110 @@
 max_results: Optional[int] = None,
 convertType: bool = True
 ) -> pd.DataFrame:
+ """Daily data and continuous measurements are grouped into time series,
+ which represent a collection of observations of a single parameter,
+ potentially aggregated using a standard statistic, at a single monitoring
+ location. This endpoint provides metadata about those time series,
+ including their operational thresholds, units of measurement, and when
+ the earliest and most recent observations in a time series occurred.
+
+ Parameters
+ ----------
+ monitoring_location_id : string or list of strings, optional
+ A unique identifier representing a single monitoring location. This
+ corresponds to the id field in the monitoring-locations endpoint.
+ Monitoring location IDs are created by combining the agency code of
+ the agency responsible for the monitoring location (e.g. USGS) with
+ the ID number of the monitoring location (e.g. 02238500), separated
+ by a hyphen (e.g. USGS-02238500).
+ parameter_code : string or list of strings, optional
+ Parameter codes are 5-digit codes used to identify the constituent
+ measured and the units of measure. A complete list of parameter
+ codes and associated groupings can be found at
+ https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+ parameter_name : string or list of strings, optional
+ The name of the parameter associated with the parameter code.
+ properties : string or list of strings, optional
+ A list of requested columns to be returned from the query.
+ Available options are: geometry, id, unit_of_measure, parameter_name,
+ parameter_code, statistic_id, last_modified, begin, end,
+ computation_period_identifier, computation_identifier, thresholds,
+ sublocation_identifier, primary, monitoring_location_id,
+ web_description, parent_time_series_id.
+ statistic_id : string or list of strings, optional
+ A code corresponding to the statistic an observation represents.
+ Example codes include 00001 (max), 00002 (min), and 00003 (mean).
+ A complete list of codes and their descriptions can be found at
+ https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
+ last_modified : string, optional
+ The last time a record was refreshed in our database. This may happen
+ due to regular operational processes and does not necessarily indicate
+ anything about the measurement has changed. You can query this field
+ using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+ duration objects. Intervals may be bounded or half-bounded (double-dots
+ at start or end). Examples:
+ - A date-time: "2018-02-12T23:20:50Z"
+ - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+ - Half-bounded intervals: "2018-02-12T00:00:00Z/.." 
or "../2018-03-18T12:31:12Z" + - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours + Only features that have a last_modified that intersects the value of datetime are selected. + begin : + end : + unit_of_measure : string or list of strings, optional + A human-readable description of the units of measurement associated + with an observation. + computation_period_identifier : + computation_identifier : + thresholds : + sublocation_identifier : + primary : + parent_time_series_id : + time_series_id : string or list of strings, optional + A unique identifier representing a single time series. This + corresponds to the id field in the time-series-metadata endpoint. + web_description : + skipGeometry : boolean, optional + This option can be used to skip response geometries for each feature. The returning + object will be a data frame with no spatial information. + bbox : list of numbers, optional + Only features that have a geometry that intersects the bounding box are selected. + The bounding box is provided as four or six numbers, depending on whether the + coordinate reference system includes a vertical axis (height or depth). Coordinates + are assumed to be in crs 4326. The expected format is a numeric vector structured: + c(xmin,ymin,xmax,ymax). Another way to think of it is c(Western-most longitude, + Southern-most latitude, Eastern-most longitude, Northern-most longitude). + limit : numeric, optional + The optional limit parameter is used to control the subset of the selected features + that should be returned in each page. The maximum allowable limit is 10000. It may + be beneficial to set this number lower if your internet connection is spotty. The + default (NA) will set the limit to the maximum allowable limit for the service. + max_results : numeric, optional + The optional maximum number of rows to return. This value must be less than the + requested limit. + convertType : boolean, optional + If True, the function will convert the data to dates and qualifier to string vector + + Returns + ------- + df : ``pandas.DataFrame`` + Formatted data returned from the API query. + + Examples + -------- + .. code:: + + >>> # Get daily flow data from a single site + >>> # over a yearlong period + >>> df = dataretrieval.waterdata.get_daily( + ... monitoring_location_id = "USGS-02238500", + ... parameter_code = "00060", + ... time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z" + ... ) + + >>> # Get monitoring location info for specific sites + >>> # and only specific properties + >>> df = dataretrieval.waterdata.get_daily( + ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], + ... approval_status = "Approved", + ... time = "2024-01-01/.." + """ service = "time-series-metadata" output_id = "time_series_id" @@ -224,6 +688,136 @@ def get_latest_continuous( max_results: Optional[int] = None, convertType: bool = True ) -> pd.DataFrame: + """This endpoint provides the most recent observation for each time series + of continuous data. Continuous data are collected via automated sensors + installed at a monitoring location. They are collected at a high frequencyand often at a fixed 15-minute interval. Depending on the specific monitoring location, the data may be transmitted automatically via telemetry and be available on WDFN within minutes of collection, while other times the delivery of data may be delayed if the monitoring location does not have the capacity to automatically transmit data. 
Continuous data are described by parameter name and parameter code. These
+ data might also be referred to as "instantaneous values" or "IV".
+
+ Parameters
+ ----------
+ monitoring_location_id : string or list of strings, optional
+ A unique identifier representing a single monitoring location. This
+ corresponds to the id field in the monitoring-locations endpoint.
+ Monitoring location IDs are created by combining the agency code of
+ the agency responsible for the monitoring location (e.g. USGS) with
+ the ID number of the monitoring location (e.g. 02238500), separated
+ by a hyphen (e.g. USGS-02238500).
+ parameter_code : string or list of strings, optional
+ Parameter codes are 5-digit codes used to identify the constituent
+ measured and the units of measure. A complete list of parameter
+ codes and associated groupings can be found at
+ https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+ statistic_id : string or list of strings, optional
+ A code corresponding to the statistic an observation represents.
+ Example codes include 00001 (max), 00002 (min), and 00003 (mean).
+ A complete list of codes and their descriptions can be found at
+ https://help.waterdata.usgs.gov/code/stat_cd_nm_query?stat_nm_cd=%25&fmt=html.
+ properties : string or list of strings, optional
+ A list of requested columns to be returned from the query.
+ Available options are: geometry, id, time_series_id,
+ monitoring_location_id, parameter_code, statistic_id, time, value,
+ unit_of_measure, approval_status, qualifier, last_modified.
+ time_series_id : string or list of strings, optional
+ A unique identifier representing a single time series. This
+ corresponds to the id field in the time-series-metadata endpoint.
+ latest_continuous_id : string or list of strings, optional
+ A universally unique identifier (UUID) representing a single
+ version of a record. It is not stable over time. Every time the
+ record is refreshed in our database (which may happen as part of
+ normal operations and does not imply any change to the data itself)
+ a new ID will be generated. To uniquely identify a single observation
+ over time, compare the time and time_series_id fields; each time series
+ will only have a single observation at a given time.
+ approval_status : string or list of strings, optional
+ Some of the data that you have obtained from this U.S. Geological
+ Survey database may not have received Director's approval. Any such
+ data values are qualified as provisional and are subject to revision.
+ Provisional data are released on the condition that neither the USGS
+ nor the United States Government may be held liable for any damages
+ resulting from its use. This field reflects the approval status of
+ each record, and is either "Approved", meaning processing review has
+ been completed and the data is approved for publication, or
+ "Provisional", meaning the data is still subject to revision. For more
+ information about provisional data, go to
+ https://waterdata.usgs.gov/provisional-data-statement/.
+ unit_of_measure : string or list of strings, optional
+ A human-readable description of the units of measurement associated
+ with an observation.
+ qualifier : string or list of strings, optional
+ This field indicates any qualifiers associated with an observation, for
+ instance if a sensor may have been impacted by ice or if values were
+ estimated.
+ value : string or list of strings, optional
+ The value of the observation. 
Values are transmitted as strings in
+ the JSON response format in order to preserve precision.
+ last_modified : string, optional
+ The last time a record was refreshed in our database. This may happen
+ due to regular operational processes and does not necessarily indicate
+ anything about the measurement has changed. You can query this field
+ using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+ duration objects. Intervals may be bounded or half-bounded (double-dots
+ at start or end). Examples:
+ - A date-time: "2018-02-12T23:20:50Z"
+ - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+ - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+ - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+ Only features that have a last_modified that intersects the value of datetime are selected.
+ skipGeometry : boolean, optional
+ This option can be used to skip response geometries for each feature. The returned
+ object will be a data frame with no spatial information.
+ time : string, optional
+ The date an observation represents. You can query this field using date-times
+ or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
+ Intervals may be bounded or half-bounded (double-dots at start or end).
+ Examples:
+ - A date-time: "2018-02-12T23:20:50Z"
+ - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+ - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+ - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+ Only features that have a time that intersects the value of datetime are selected. If
+ a feature has multiple temporal properties, it is the decision of the server whether
+ only a single temporal property is used to determine the extent or all relevant temporal properties.
+ bbox : list of numbers, optional
+ Only features that have a geometry that intersects the bounding box are selected.
+ The bounding box is provided as four or six numbers, depending on whether the
+ coordinate reference system includes a vertical axis (height or depth). Coordinates
+ are assumed to be in CRS 4326. The expected format is a list structured:
+ [xmin, ymin, xmax, ymax], that is, [Western-most longitude,
+ Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+ limit : numeric, optional
+ The optional limit parameter is used to control the subset of the selected features
+ that should be returned in each page. The maximum allowable limit is 10000. It may
+ be beneficial to set this number lower if your internet connection is spotty. The
+ default (None) will set the limit to the maximum allowable limit for the service.
+ max_results : numeric, optional
+ The optional maximum number of rows to return. If both are provided, limit
+ cannot be greater than max_results.
+ convertType : boolean, optional
+ If True, the function will coerce the value column to numeric.
+
+ Returns
+ -------
+ df : ``pandas.DataFrame``
+ Formatted data returned from the API query.
+
+ Examples
+ --------
+ .. code::
+
+ >>> # Get the latest continuous flow value from a single site
+ >>> df = dataretrieval.waterdata.get_latest_continuous(
+ ... monitoring_location_id = "USGS-02238500",
+ ... parameter_code = "00060"
+ ... 
) + + >>> # Get monitoring location info for specific sites + >>> # and only specific properties + >>> df = dataretrieval.waterdata.get_daily( + ... monitoring_location_id = ["USGS-05114000", "USGS-09423350"], + ... approval_status = "Approved", + ... time = "2024-01-01/.." + """ service = "latest-continuous" output_id = "latest_continuous_id" From 7fe486af62fac41344b709b35ddd2fc467df6a32 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Wed, 24 Sep 2025 20:10:10 -0500 Subject: [PATCH 16/28] add link urls --- dataretrieval/waterdata.py | 288 +++++++++++++++++++++++++++++++------ 1 file changed, 242 insertions(+), 46 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 4ff056a..3c2335a 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -309,13 +309,14 @@ def get_monitoring_locations( agency_code : string or list of strings, optional The agency that is reporting the data. Agency codes are fixed values assigned by the National Water Information System (NWIS). A list of - agency codes is available at this link. + agency codes is available at + [this link](https://help.waterdata.usgs.gov/code/agency_cd_query?fmt=html). agency_name : string or list of strings, optional The name of the agency that is reporting the data. monitoring_location_number : string or list of strings, optional Each monitoring location in the USGS data base has a unique 8- to 15-digit identification number. Monitoring location numbers are - assigned based on this logic. + assigned based on [this logic](https://help.waterdata.usgs.gov/faq/sites/do-station-numbers-have-any-particular-meaning). monitoring_location_name : string or list of strings, optional This is the official name of the monitoring location in the database. For well information this can be a district-assigned local number. @@ -335,7 +336,8 @@ def get_monitoring_locations( State code. A two-digit ANSI code (formerly FIPS code) as defined by the American National Standards Institute, to define States and equivalents. A three-digit ANSI code is used to define counties and - county equivalents. A lookup table is available. The only countries with + county equivalents. A [lookup table](https://www.census.gov/library/reference/code-lists/ansi.html#states) + is available. The only countries with political subdivisions other than the US are Mexico and Canada. The Mexican states have US state codes ranging from 81-86 and Canadian provinces have state codes ranging from 90-98. @@ -344,20 +346,22 @@ def get_monitoring_locations( is located. county_code : string or list of strings, optional The code for the county or county equivalent (parish, borough, etc.) in which - the monitoring location is located. A list of codes is available. + the monitoring location is located. A [list of codes](https://help.waterdata.usgs.gov/code/county_query?fmt=html) + is available. county_name : string or list of strings, optional The name of the county or county equivalent (parish, borough, etc.) in which - the monitoring location is located. A list of codes is available. + the monitoring location is located. A [list of codes](https://help.waterdata.usgs.gov/code/county_query?fmt=html) + is available. minor_civil_division_code : string or list of strings, optional Codes for primary governmental or administrative divisions of the county or county equivalent in which the monitoring location is located. site_type_code : string or list of strings, optional - A code describing the hydrologic setting of the monitoring location. 
A list of
-        codes is available.
+        A code describing the hydrologic setting of the monitoring location. A [list of
+        codes](https://help.waterdata.usgs.gov/code/site_tp_query?fmt=html) is available.
         Example: "US:15:001" (United States: Hawaii, Hawaii County)
     site_type : string or list of strings, optional
-        A description of the hydrologic setting of the monitoring location. A list of
-        codes is available.
+        A description of the hydrologic setting of the monitoring location. A [list of
+        codes](https://help.waterdata.usgs.gov/code/site_tp_query?fmt=html) is available.
     hydrologic_unit_code : string or list of strings, optional
         The United States is divided and sub-divided into successively smaller
         hydrologic units which are classified into four levels: regions,
@@ -379,37 +383,44 @@ def get_monitoring_locations(
         topographic maps; accuracies determined in this way are generally
         entered as one-half of the contour interval.
     altitude_method_code : string or list of strings, optional
-        Codes representing the method used to measure altitude. A list of codes is
-        available.
+        Codes representing the method used to measure altitude. A [list of codes](https://help.waterdata.usgs.gov/code/alt_meth_cd_query?fmt=html)
+        is available.
     altitude_method_name : float, optional
-        The name of the the method used to measure altitude. A list of codes is
+        The name of the method used to measure altitude. A [list of codes](https://help.waterdata.usgs.gov/code/alt_meth_cd_query?fmt=html)
+        is available.
     vertical_datum : float, optional
         The datum used to determine altitude and vertical position at the
-        monitoring location. A list of codes is available.
+        monitoring location. A [list of codes](https://help.waterdata.usgs.gov/code/alt_datum_cd_query?fmt=html)
+        is available.
     vertical_datum_name : float, optional
         The datum used to determine altitude and vertical position at the
-        monitoring location. A list of codes is available.
+        monitoring location. A [list of codes](https://help.waterdata.usgs.gov/code/alt_datum_cd_query?fmt=html)
+        is available.
     horizontal_positional_accuracy_code : string or list of strings, optional
-        Indicates the accuracy of the latitude longitude values. A list of codes
+        Indicates the accuracy of the latitude longitude values. A [list of codes](https://help.waterdata.usgs.gov/code/coord_acy_cd_query?fmt=html)
         is available.
     horizontal_positional_accuracy : string or list of strings, optional
-        Indicates the accuracy of the latitude longitude values. A list of codes
+        Indicates the accuracy of the latitude longitude values. A [list of codes](https://help.waterdata.usgs.gov/code/coord_acy_cd_query?fmt=html)
        is available.
     horizontal_position_method_code : string or list of strings, optional
         Indicates the method used to determine latitude longitude values. A
-        list of codes is available.
+        [list of codes](https://help.waterdata.usgs.gov/code/coord_meth_cd_query?fmt=html)
+        is available.
     horizontal_position_method_name : string or list of strings, optional
         Indicates the method used to determine latitude longitude values. A
-        list of codes is available.
+        [list of codes](https://help.waterdata.usgs.gov/code/coord_meth_cd_query?fmt=html)
+        is available.
     original_horizontal_datum : string or list of strings, optional
         Coordinates are published in EPSG:4326 / WGS84 / World Geodetic
         System 1984. This field indicates the original datum used to determine
-        coordinates before they were converted.
A [list of codes](https://help.waterdata.usgs.gov/code/coord_datum_cd_query?fmt=html) + is available. original_horizontal_datum_name : string or list of strings, optional Coordinates are published in EPSG:4326 / WGS84 / World Geodetic System 1984. This field indicates the original datum used to determine coordinates - before they were converted. A list of codes is available. + before they were converted. A [list of codes](https://help.waterdata.usgs.gov/code/coord_datum_cd_query?fmt=html) + is available. drainage_area : string or list of strings, optional The area enclosed by a topographic divide from which direct surface runoff from precipitation normally drains by gravity into the stream above that @@ -432,14 +443,15 @@ def get_monitoring_locations( Local aquifers in the USGS water resources data base are identified by a geohydrologic unit code (a three-digit number related to the age of the formation, followed by a 4 or 5 character abbreviation for the geologic unit - or aquifer name). Additional information is available at this link. + or aquifer name). Additional information is available [at this link](https://help.waterdata.usgs.gov/faq/groundwater/local-aquifer-description). national_aquifer_code : string or list of strings, optional National aquifers are the principal aquifers or aquifer systems in the United States, defined as regionally extensive aquifers or aquifer systems that have the potential to be used as a source of potable water. Not all groundwater monitoring locations can be associated with a National Aquifer. Such - monitoring locations will not be retrieved using this search criteria. A list - of National aquifer codes and names is available. + monitoring locations will not be retrieved using this search criteria. A [list + of National aquifer codes and names](https://help.waterdata.usgs.gov/code/nat_aqfr_query?fmt=html) + is available. aquifer_type_code : string or list of strings, optional Groundwater occurs in aquifers under two different conditions. Where water only partly fills an aquifer, the upper surface is free to rise and decline. @@ -448,7 +460,7 @@ def get_monitoring_locations( aquifer is referred to as a confined (or artesian) aquifer. When a confined aquifer is penetrated by a well, the water level in the well will rise above the top of the aquifer (but not necessarily above land surface). Additional - information is available at this link. + information is available [at this link](https://help.waterdata.usgs.gov/faq/groundwater/local-aquifer-description). well_constructed_depth : string or list of strings, optional The depth of the finished well, in feet below land surface datum. Note: Not all groundwater monitoring locations have information on Well Depth. Such @@ -458,7 +470,8 @@ def get_monitoring_locations( Note: Not all groundwater monitoring locations have information on Hole Depth. Such monitoring locations will not be retrieved using this search criteria. depth_source_code : string or list of strings, optional - A code indicating the source of water-level data. A list of codes is available. + A code indicating the source of water-level data. A [list of codes](https://help.waterdata.usgs.gov/code/water_level_src_cd_query?fmt=html) + is available. properties : string or list of strings, optional A vector of requested columns to be returned from the query. Available options are: geometry, id, agency_code, agency_name, monitoring_location_number, @@ -573,7 +586,8 @@ def get_time_series_metadata( measured and the units of measure. 
A complete list of parameter
         codes and associated groupings can be found at
         https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
-    parameter_name :
+    parameter_name : string or list of strings, optional
+        A human-understandable name corresponding to parameter_code.
     properties : string or list of strings, optional
         A vector of requested columns to be returned from the query.
         Available options are: geometry, id, time_series_id,
@@ -596,21 +610,67 @@ def get_time_series_metadata(
         - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
         - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
         Only features that have a last_modified that intersects the value of datetime are selected.
-    begin :
-    end :
+    begin : string or list of strings, optional
+        The datetime of the earliest observation in the time series. Together with end,
+        this field represents the period of record of a time series. Note that some time
+        series may have large gaps in their collection record. This field is currently
+        in the local time of the monitoring location. We intend to update this in version
+        v0 to use UTC with a time zone. You can query this field using date-times or
+        intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals
+        may be bounded or half-bounded (double-dots at start or end). Examples:
+
+        - A date-time: "2018-02-12T23:20:50Z"
+
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+
+        Only features that have a begin that intersects the value of datetime are selected.
+    end : string or list of strings, optional
+        The datetime of the most recent observation in the time series. Data returned by
+        this endpoint updates at most once per day, and potentially less frequently than
+        that; as such, there may be more recent observations within a time series
+        than the time series end value reflects. Together with begin, this field
+        represents the period of record of a time series. It is additionally used to
+        determine whether a time series is "active". We intend to update this in
+        version v0 to use UTC with a time zone. You can query this field using date-times
+        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects. Intervals
+        may be bounded or half-bounded (double-dots at start or end). Examples:
+
+        - A date-time: "2018-02-12T23:20:50Z"
+
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+
+        Only features that have an end that intersects the value of datetime are selected.
     unit_of_measure : string or list of strings, optional
         A human-readable description of the units of measurement associated
         with an observation.
-    computation_period_identifier :
-    computation_identifier :
-    thresholds :
-    sublocation_identifier :
-    primary :
-    parent_time_series_id :
+    computation_period_identifier : string or list of strings, optional
+        Indicates the period of data used for any statistical computations.
+    computation_identifier : string or list of strings, optional
+        Indicates whether the data from this time series represent a specific statistical
+        computation.
+    thresholds : numeric or list of numbers, optional
+        Thresholds represent known numeric limits for a time series, for example the
+        historic maximum value for a parameter or a level below which a sensor is
+        non-operative. These thresholds are sometimes used to automatically determine if
+        an observation is erroneous due to sensor error, and therefore shouldn't be included
+        in the time series.
+    sublocation_identifier : string or list of strings, optional
+    primary : string or list of strings, optional
+    parent_time_series_id : string or list of strings, optional
     time_series_id : string or list of strings, optional
         A unique identifier representing a single time series. This
         corresponds to the id field in the time-series-metadata endpoint.
-    web_description :
+    web_description : string or list of strings, optional
+        A description of what this time series represents, as used by WDFN and other USGS
+        data dissemination products.
     skipGeometry : boolean, optional
         This option can be used to skip response geometries for each feature. The returning
         object will be a data frame with no spatial information.
@@ -643,7 +703,7 @@ def get_time_series_metadata(

     >>> # Get daily flow data from a single site
     >>> # over a yearlong period
-    >>> df = dataretrieval.waterdata.get_daily(
+    >>> df = dataretrieval.waterdata.get_time_series_metadata(
     ...     monitoring_location_id = "USGS-02238500",
     ...     parameter_code = "00060",
     ...     time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
@@ -651,12 +711,10 @@ def get_time_series_metadata(

     >>> # Get monitoring location info for specific sites
     >>> # and only specific properties
-    >>> df = dataretrieval.waterdata.get_daily(
+    >>> df = dataretrieval.waterdata.get_time_series_metadata(
     ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
-    ...     approval_status = "Approved",
     ...     time = "2024-01-01/.."
     """
-
     service = "time-series-metadata"
     output_id = "time_series_id"
@@ -690,7 +748,14 @@ def get_latest_continuous(
 ) -> pd.DataFrame:
     """This endpoint provides the most recent observation for each time series
     of continuous data. Continuous data are collected via automated sensors
-    installed at a monitoring location. They are collected at a high frequencyand often at a fixed 15-minute interval. Depending on the specific monitoring location, the data may be transmitted automatically via telemetry and be available on WDFN within minutes of collection, while other times the delivery of data may be delayed if the monitoring location does not have the capacity to automatically transmit data. Continuous data are described by parameter name and parameter code. These data might also be referred to as "instantaneous values" or "IV"
+    installed at a monitoring location. They are collected at a high frequency
+    and often at a fixed 15-minute interval. Depending on the specific monitoring
+    location, the data may be transmitted automatically via telemetry and be
+    available on WDFN within minutes of collection, while other times the delivery
+    of data may be delayed if the monitoring location does not have the capacity to
+    automatically transmit data. Continuous data are described by parameter name
+    and parameter code. These data might also be referred to as "instantaneous
+    values" or "IV".

     Parameters
     ----------
@@ -805,18 +870,16 @@ def get_latest_continuous(

     >>> # Get daily flow data from a single site
     >>> # over a yearlong period
-    >>> df = dataretrieval.waterdata.get_daily(
+    >>> df = dataretrieval.waterdata.get_latest_continuous(
     ...
monitoring_location_id = "USGS-02238500",
-    ...     parameter_code = "00060",
-    ...     time = "2021-01-01T00:00:00Z/2022-01-01T00:00:00Z"
+    ...     parameter_code = "00060"
     ...     )

     >>> # Get monitoring location info for specific sites
     >>> # and only specific properties
-    >>> df = dataretrieval.waterdata.get_daily(
-    ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"],
-    ...     approval_status = "Approved",
-    ...     time = "2024-01-01/.."
+    >>> df = dataretrieval.waterdata.get_latest_continuous(
+    ...     monitoring_location_id = ["USGS-05114000", "USGS-09423350"]
+    ...     )
     """
     service = "latest-continuous"
     output_id = "latest_continuous_id"
@@ -851,6 +914,139 @@ def get_field_measurements(
     max_results: Optional[int] = None,
     convertType: bool = True
     ) -> pd.DataFrame:
+    """Field measurements are physically measured values collected during
+    a visit to the monitoring location. Field measurements consist of
+    measurements of gage height and discharge, and readings of groundwater
+    levels, and are primarily used as calibration readings for the automated
+    sensors collecting continuous data. They are collected at a low frequency,
+    and delivery of the data in WDFN may be delayed due to data processing
+    time.
+
+    Parameters
+    ----------
+    monitoring_location_id : string or list of strings, optional
+        A unique identifier representing a single monitoring location. This
+        corresponds to the id field in the monitoring-locations endpoint.
+        Monitoring location IDs are created by combining the agency code of
+        the agency responsible for the monitoring location (e.g. USGS) with
+        the ID number of the monitoring location (e.g. 02238500), separated
+        by a hyphen (e.g. USGS-02238500).
+    parameter_code : string or list of strings, optional
+        Parameter codes are 5-digit codes used to identify the constituent
+        measured and the units of measure. A complete list of parameter
+        codes and associated groupings can be found at
+        https://help.waterdata.usgs.gov/codes-and-parameters/parameters.
+    observing_procedure_code : string or list of strings, optional
+        A short code corresponding to the observing procedure for the field
+        measurement.
+    properties : string or list of strings, optional
+        A vector of requested columns to be returned from the query.
+        Available options are: geometry, id, time_series_id,
+        monitoring_location_id, parameter_code, statistic_id, time, value,
+        unit_of_measure, approval_status, qualifier, last_modified
+    field_visit_id : string or list of strings, optional
+        A universally unique identifier (UUID) for the field visit.
+        Multiple measurements may be made during a single field visit.
+    approval_status : string or list of strings, optional
+        Some of the data that you have obtained from this U.S. Geological
+        Survey database may not have received Director's approval. Any such
+        data values are qualified as provisional and are subject to revision.
+        Provisional data are released on the condition that neither the USGS
+        nor the United States Government may be held liable for any damages
+        resulting from its use. This field reflects the approval status of
+        each record, and is either "Approved", meaning processing review has
+        been completed and the data is approved for publication, or
+        "Provisional", meaning the data is subject to revision. For more
+        information about provisional data, go to
+        https://waterdata.usgs.gov/provisional-data-statement/.
+    unit_of_measure : string or list of strings, optional
+        A human-readable description of the units of measurement associated
+        with an observation.
+    qualifier : string or list of strings, optional
+        This field indicates any qualifiers associated with an observation, for
+        instance if a sensor may have been impacted by ice or if values were
+        estimated.
+    value : string or list of strings, optional
+        The value of the observation. Values are transmitted as strings in
+        the JSON response format in order to preserve precision.
+    last_modified : string, optional
+        The last time a record was refreshed in our database. This may happen
+        due to regular operational processes and does not necessarily indicate
+        anything about the measurement has changed. You can query this field
+        using date-times or intervals, adhering to RFC 3339, or using ISO 8601
+        duration objects. Intervals may be bounded or half-bounded (double-dots
+        at start or end). Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a last_modified that intersects the value of datetime are selected.
+    observing_procedure : string or list of strings, optional
+        Water measurement or water-quality observing procedure descriptions.
+    vertical_datum : string or list of strings, optional
+        The datum used to determine altitude and vertical position at the monitoring location.
+        A list of codes is available.
+    measuring_agency : string or list of strings, optional
+        The agency performing the measurement.
+    skipGeometry : boolean, optional
+        This option can be used to skip response geometries for each feature. The returned
+        object will be a data frame with no spatial information.
+    time : string, optional
+        The date an observation represents. You can query this field using date-times
+        or intervals, adhering to RFC 3339, or using ISO 8601 duration objects.
+        Intervals may be bounded or half-bounded (double-dots at start or end).
+        Examples:
+        - A date-time: "2018-02-12T23:20:50Z"
+        - A bounded interval: "2018-02-12T00:00:00Z/2018-03-18T12:31:12Z"
+        - Half-bounded intervals: "2018-02-12T00:00:00Z/.." or "../2018-03-18T12:31:12Z"
+        - Duration objects: "P1M" for data from the past month or "PT36H" for the last 36 hours
+        Only features that have a time that intersects the value of datetime are selected. If
+        a feature has multiple temporal properties, it is the decision of the server whether
+        only a single temporal property is used to determine the extent or all relevant temporal properties.
+    bbox : list of numbers, optional
+        Only features that have a geometry that intersects the bounding box are selected.
+        The bounding box is provided as four or six numbers, depending on whether the
+        coordinate reference system includes a vertical axis (height or depth). Coordinates
+        are assumed to be in crs 4326. The expected format is a list structured:
+        [xmin, ymin, xmax, ymax]. Another way to think of it is [Western-most longitude,
+        Southern-most latitude, Eastern-most longitude, Northern-most latitude].
+    limit : numeric, optional
+        The optional limit parameter is used to control the subset of the selected features
+        that should be returned in each page. The maximum allowable limit is 10000. It may
+        be beneficial to set this number lower if your internet connection is spotty. The
+        default (None) will set the limit to the maximum allowable limit for the service.
+    max_results : numeric, optional
+        The optional maximum number of rows to return. This value cannot be less than the
+        requested limit.
+    convertType : boolean, optional
+        If True, the function will convert the data to dates and the qualifier
+        to a string vector.
+
+    Returns
+    -------
+    df : ``pandas.DataFrame``
+        Formatted data returned from the API query.
+
+    Examples
+    --------
+    .. code::
+
+        >>> # Get depth-to-water-level field measurements
+        >>> # from a single site
+        >>> df = dataretrieval.waterdata.get_field_measurements(
+        ...     monitoring_location_id = "USGS-375907091432201",
+        ...     parameter_code = "72019",
+        ...     skipGeometry = True
+        ...     )
+
+        >>> # Get field measurements from multiple sites
+        >>> # over the past 20 years
+        >>> df = dataretrieval.waterdata.get_field_measurements(
+        ...     monitoring_location_id = ["USGS-451605097071701",
+        ...                               "USGS-263819081585801"],
+        ...     parameter_code = ["62611", "72019"],
+        ...     time = "P20Y"
+        ...     )
+    """
     service = "field-measurements"
     output_id = "field_measurement_id"

From fad9ce0d0c063ce89fb64eca6e429138c09484d7 Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Wed, 24 Sep 2025 20:10:30 -0500
Subject: [PATCH 17/28] fix date formatting function

---
 dataretrieval/waterdata_helpers.py | 48 +++++++++++++-----------------
 1 file changed, 20 insertions(+), 28 deletions(-)

diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py
index 164fdfc..c0714bd 100644
--- a/dataretrieval/waterdata_helpers.py
+++ b/dataretrieval/waterdata_helpers.py
@@ -122,7 +122,7 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False)
     datetime_input : Union[str, List[str]]
         A single date/datetime string or a list of one or two date/datetime strings. Accepts formats like "%Y-%m-%d %H:%M:%S", ISO 8601, or relative periods (e.g., "P7D").
     date : bool, optional
-        If True, returns only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
+        If True, uses only the date portion ("YYYY-MM-DD"). If False (default), returns full datetime in UTC ISO 8601 format ("YYYY-MM-DDTHH:MM:SSZ").
     Returns
     -------
     Union[str, None]
@@ -154,38 +154,31 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False)
     # Replace all blanks with "nan"
     datetime_input = ["nan" if x == "" else x for x in datetime_input]

-    # If the list is of length 1, first look for things like "P7D" or dates
-    # already formatted in ISO08601. Otherwise, try to coerce to datetime
-    if len(datetime_input) == 1:
-        dt = datetime_input[0]
-        if re.search(r"P", dt, re.IGNORECASE) or "/" in dt:
-            return dt
+    if len(datetime_input) <=2:
+        # If the list is of length 1, first look for things like "P7D" or dates
+        # already formatted in ISO08601. Otherwise, try to coerce to datetime
+        if len(datetime_input) == 1 and (re.search(r"P", datetime_input[0], re.IGNORECASE) or "/" in datetime_input[0]):
+            return datetime_input[0]
+        # Otherwise, use list comprehension to parse dates
         else:
             try:
                 # Parse to naive datetime
-                parsed_dt = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
-                # If the service only accepts dates for this input, not datetimes (e.g. "daily"),
-                # return just the date, otherwise, return the datetime in UTC format.
- if date: - return parsed_dt.strftime("%Y-%m-%d") - else: - dt_local = parsed_dt.replace(tzinfo=local_timezone) - # Convert to UTC and format as ISO 8601 with 'Z' - return dt_local.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") + parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] except Exception: - return None - # If the list is of length 2, parse the dates and if necessary, combine them together into - # the date range format accepted by the API - elif len(datetime_input) == 2: - try: - parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input] + # Parse to date only + try: + parsed_dates = [datetime.strptime(dt, "%Y-%m-%d") for dt in datetime_input] + except Exception: + return None + # If the service only accepts dates for this input, not datetimes (e.g. "daily"), + # return just the dates separated by a "/", otherwise, return the datetime in UTC + # format. if date: - formatted = "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) + return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates) else: - formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_dates) - return formatted.replace("nan", "..") - except Exception: - return None + parsed_locals = [dt.replace(tzinfo=local_timezone) for dt in parsed_dates] + formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_locals) + return formatted.replace("nan", "..") else: raise ValueError("datetime_input should only include 1-2 values") @@ -333,7 +326,6 @@ def _construct_api_requests( if i in params: dates = service == "daily" and i != "last_modified" params[i] = _format_api_dates(params[i], date=dates) - #kwargs[i] = _format_api_dates(kwargs[i], date=dates) # String together bbox elements from a list to a comma-separated string, # and string together properties if provided From a33d201bb99230e4606af1bd9142ba06def6ec60 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 13:02:53 -0500 Subject: [PATCH 18/28] make waterdata outputs geopandas if geometry included --- dataretrieval/waterdata_helpers.py | 37 +++++++++++++++++------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index c0714bd..3a35d40 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -1,10 +1,10 @@ import httpx import os -import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime import pandas as pd import json +import geopandas as gpd from datetime import datetime from zoneinfo import ZoneInfo import re @@ -243,7 +243,7 @@ def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): def _error_body(resp: httpx.Response): """ - Extracts and returns an error message from an HTTP response object based on its status code. + Provide more informative error messages based on the response status. Args: resp (httpx.Response): The HTTP response object to extract the error message from. @@ -270,8 +270,10 @@ def _construct_api_requests( ): """ Constructs an HTTP request object for the specified water data API service. - Depending on the input parameters, the function determines whether to use a GET or POST request, - formats parameters appropriately, and sets required headers. 
+    Depending on the input parameters (whether there are lists of multiple argument values),
+    the function determines whether to use a GET or POST request, formats parameters
+    appropriately, and sets required headers.
+
     Args:
         service (str): The name of the API service to query (e.g., "daily").
         properties (Optional[List[str]], optional): List of property names to include in the request.
@@ -382,21 +384,25 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame:
         resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key.

     Returns:
-        pd.DataFrame: A pandas DataFrame containing the normalized feature properties.
-            Returns an empty DataFrame if no features are returned.
-
-    Notes:
-        - Drops columns "type", "geometry", and "AsGeoJSON(geometry)" if present.
-        - Flattens nested properties and removes the "properties_" prefix from column names.
+        gpd.GeoDataFrame or pd.DataFrame: A geopandas GeoDataFrame if geometry is included, or a
+        pandas DataFrame containing the feature properties and each row's service-specific id.
+        Returns an empty pandas DataFrame if no features are returned.
     """
     body = resp.json()
     if not body.get("numberReturned"):
         return pd.DataFrame()

-    df = pd.json_normalize(
-        resp.json()["features"],
-        sep="_")
-    df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore")
-    df.columns = [col.replace("properties_", "") for col in df.columns]
+    #df = pd.json_normalize(
+    #    resp.json()["features"],
+    #    sep="_")
+    #df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore")
+    #df.columns = [col.replace("properties_", "") for col in df.columns]
+
+    df = gpd.GeoDataFrame.from_features(body["features"])
+    df["id"] = pd.json_normalize(body["features"])["id"].values
+
+    if df["geometry"].isnull().all():
+        df = pd.DataFrame(df.drop(columns="geometry"))
+
     return df

 def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame:
@@ -451,7 +457,6 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional
             if resp.status_code != 200: raise Exception(_error_body(resp))
             df1 = _get_resp_data(resp)
             dfs = pd.concat([dfs, df1], ignore_index=True)
-            #dfs.append(df1)
             curr_url = _next_req_url(resp)
         except Exception:
             failures.append(curr_url)

From bd82c4900a5f44145691643e1c4a156dd0181f3e Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Thu, 25 Sep 2025 13:25:33 -0500
Subject: [PATCH 19/28] make gpd an optional dependency and change returns
 accordingly

---
 dataretrieval/waterdata_helpers.py | 31 +++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py
index 3a35d40..0c84ea5 100644
--- a/dataretrieval/waterdata_helpers.py
+++ b/dataretrieval/waterdata_helpers.py
@@ -1,13 +1,21 @@
 import httpx
 import os
+import warnings
 from typing import List, Dict, Any, Optional, Union
 from datetime import datetime
 import pandas as pd
 import json
-import geopandas as gpd
 from datetime import datetime
 from zoneinfo import ZoneInfo
 import re
+try:
+    import geopandas as gpd
+    gpd = True
+except ImportError:
+    warnings.warn("Geopandas is not installed.
Data frames containing geometry will be returned as pandas DataFrames.", ImportWarning) + gpd = False + + BASE_API = "https://api.waterdata.usgs.gov/ogcapi/" API_VERSION = "v0" @@ -388,18 +396,27 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: pandas DataFrame containing the feature properties and each row's service-specific id. Returns an empty pandas DataFrame if no features are returned. """ + # Check if it's an empty response body = resp.json() if not body.get("numberReturned"): return pd.DataFrame() - #df = pd.json_normalize( - # resp.json()["features"], - # sep="_") - #df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") - #df.columns = [col.replace("properties_", "") for col in df.columns] + # If geopandas not installed, return a pandas dataframe + if not gpd: + df = pd.json_normalize( + body["features"], + sep="_") + df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore") + df.columns = [col.replace("properties_", "") for col in df.columns] + return df + + # Organize json into geodataframe and make sure id column comes along. df = gpd.GeoDataFrame.from_features(body["features"]) df["id"] = pd.json_normalize(body["features"])["id"].values + df = df[["id"] + [col for col in df.columns if col != "id"]] + # If no geometry present, then return pandas dataframe. A geodataframe + # is not needed. if df["geometry"].isnull().all(): df = pd.DataFrame(df.drop(columns="geometry")) @@ -506,7 +523,7 @@ def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: Returns ------- - pd.DataFrame + pd.DataFrame or gpd.GeoDataFrame The DataFrame with columns rearranged and/or renamed according to the specified properties and output_id. """ if properties and not all(pd.isna(properties)): From 06b0e69e90fb9f179382e0b32c62bcb3d0c891cc Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 13:46:50 -0500 Subject: [PATCH 20/28] incorporate geopandas boolean into function arguments and ensure user knows when they will receive a pandas df --- dataretrieval/waterdata_helpers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 0c84ea5..19937af 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -10,10 +10,9 @@ import re try: import geopandas as gpd - gpd = True + geopd = True except ImportError: - warnings.warn("Geopandas is not installed. Data frames containing geometry will be returned as pandas DataFrames.", ImportWarning) - gpd = False + geopd = False @@ -384,7 +383,7 @@ def _next_req_url(resp: httpx.Response) -> Optional[str]: return next_url return None -def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: +def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: """ Extracts and normalizes data from an httpx.Response object containing GeoJSON features. 
@@ -402,7 +401,7 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: return pd.DataFrame() # If geopandas not installed, return a pandas dataframe - if not gpd: + if not geopd: df = pd.json_normalize( body["features"], sep="_") @@ -422,7 +421,7 @@ def _get_resp_data(resp: httpx.Response) -> pd.DataFrame: return df -def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: +def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], client: Optional[httpx.Client] = None) -> pd.DataFrame: """ Iterates through paginated API responses and aggregates the results into a single DataFrame. @@ -452,6 +451,9 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional """ print(f"Requesting:\n{req.url}") + if not geopd: + print("Geopandas is not installed. Data frames containing geometry will be returned as pandas DataFrames.") + # Get first response from client # using GET or POST call client = client or httpx.Client() @@ -465,14 +467,14 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional content = req.content if method == "POST" else None if max_results is None or pd.isna(max_results): - dfs = _get_resp_data(resp) + dfs = _get_resp_data(resp, geopd=geopd) curr_url = _next_req_url(resp) failures = [] while curr_url: try: resp = client.request(method, curr_url, headers=headers, content=content if method == "POST" else None) if resp.status_code != 200: raise Exception(_error_body(resp)) - df1 = _get_resp_data(resp) + df1 = _get_resp_data(resp, geopd=geopd) dfs = pd.concat([dfs, df1], ignore_index=True) curr_url = _next_req_url(resp) except Exception: @@ -483,7 +485,7 @@ def _walk_pages(req: httpx.Request, max_results: Optional[int], client: Optional return dfs else: resp.raise_for_status() - return _get_resp_data(resp) + return _get_resp_data(resp, geopd=geopd) def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], service: str) -> pd.DataFrame: """ @@ -604,7 +606,7 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # Build API request req = _construct_api_requests(**args) # Run API request and iterate through pages if needed - return_list = _walk_pages(req, max_results) + return_list = _walk_pages(geopd=geopd, req=req, max_results=max_results) # Manage some aspects of the returned dataset return_list = _deal_with_empty(return_list, properties, service) if convertType: From 253da79846cbf13beb43c51e71c6e1c0642b6fe8 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 14:07:26 -0500 Subject: [PATCH 21/28] clean up some documentation and comments --- dataretrieval/waterdata.py | 10 +++++----- dataretrieval/waterdata_helpers.py | 14 +++++++++----- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 3c2335a..04f3615 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -207,7 +207,7 @@ def get_daily( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples @@ -506,7 +506,7 @@ def get_monitoring_locations( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. 
Examples @@ -694,7 +694,7 @@ def get_time_series_metadata( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples @@ -861,7 +861,7 @@ def get_latest_continuous( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples @@ -1023,7 +1023,7 @@ def get_field_measurements( Returns ------- - df : ``pandas.DataFrame`` + df : ``pandas.DataFrame`` or ``geopandas.GeoDataFrame`` Formatted data returned from the API query. Examples diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 19937af..ef8e235 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -512,7 +512,7 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: """ - Rearranges and renames columns in a DataFrame based on provided properties and output identifier. + Rearranges and renames columns in a DataFrame based on provided properties and service's output id. Parameters ---------- @@ -530,8 +530,14 @@ def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: """ if properties and not all(pd.isna(properties)): if "id" not in properties: + # If user refers to service-specific output id in properties, + # then rename the "id" column to the output_id (id column is + # automatically included). if output_id in properties: df = df.rename(columns={"id": output_id}) + # If output id is not in properties, but user requests the plural + # of the output_id (e.g. "monitoring_locations_id"), then rename + # "id" to plural. This is pretty niche. else: plural = output_id.replace("_id", "s_id") if plural in properties: @@ -581,8 +587,8 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF service (str): The OGC service type (e.g., "wfs", "wms"). Returns: - pd.DataFrame: A DataFrame containing the retrieved and processed OGC data, with metadata attributes - including the request URL and query timestamp. + pd.DataFrame or gpd.GeoDataFrame: A DataFrame containing the retrieved and processed OGC data, + with metadata attributes including the request URL and query timestamp. Notes: - The function does not mutate the input `args` dictionary. 
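
For orientation, the `get_ogc_data` docstring above corresponds to roughly the following pipeline of helpers introduced earlier in this series. This is a hedged sketch assembled from the helper signatures in these patches, not the verbatim function body; `geopd` is the module-level geopandas flag, and the `get_ogc_data_sketch` name is only illustrative:

```python
# Hedged sketch: approximate flow of get_ogc_data(), pieced together from the
# helpers shown in this patch series. The real body may differ in detail.
from dataretrieval.waterdata_helpers import (
    _construct_api_requests,   # builds the httpx GET/POST request
    _walk_pages,               # follows "next" links, concatenating pages
    _deal_with_empty,          # returns a schema-based empty frame if needed
    _rejigger_cols,            # renames "id" to the service-specific output id
    geopd,                     # True when geopandas is importable
)

def get_ogc_data_sketch(args, output_id, service):
    args = dict(args)  # avoid mutating the caller's dict (per the Notes above)
    properties = args.get("properties")
    max_results = args.pop("max_results", None)
    args.pop("convertType", True)  # type-conversion step omitted in this sketch

    req = _construct_api_requests(service=service, **args)
    df = _walk_pages(geopd=geopd, req=req, max_results=max_results)
    df = _deal_with_empty(df, properties, service)
    return _rejigger_cols(df, properties, output_id)
```
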
@@ -637,5 +643,3 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF # resp.raise_for_status() # return resp.json() -# def _explode_post(ls: Dict[str, Any]): -# return {k: _cql2_param({k: v if isinstance(v, list) else [v]}) for k, v in ls.items() if v is not None} \ No newline at end of file From f5cca0777e63a753724aa34c4df745b31fcc29ed Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 14:14:30 -0500 Subject: [PATCH 22/28] add optional dependency to pyproject.toml --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index a276f11..e55dc81 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,10 @@ nldi = [ 'geopandas>=0.10' ] +waterdata = [ + 'geopandas>=0.10', +] + [project.urls] homepage = "https://github.com/DOI-USGS/dataretrieval-python" documentation = "https://doi-usgs.github.io/dataretrieval-python/" From 5c546e7e3baca5f9713d45a83ddc97ccf1763c0e Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Thu, 25 Sep 2025 16:58:06 -0500 Subject: [PATCH 23/28] set convertType to default or user specification --- dataretrieval/waterdata.py | 5 ----- dataretrieval/waterdata_helpers.py | 8 ++------ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index 04f3615..c2d7031 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -237,7 +237,6 @@ def get_daily( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -536,7 +535,6 @@ def get_monitoring_locations( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -723,7 +721,6 @@ def get_time_series_metadata( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -889,7 +886,6 @@ def get_latest_continuous( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) @@ -1055,7 +1051,6 @@ def get_field_measurements( k: v for k, v in locals().items() if k not in {"service", "output_id"} and v is not None } - args["convertType"] = False return waterdata_helpers.get_ogc_data(args, output_id, service) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index ef8e235..c535afd 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -1,11 +1,9 @@ import httpx import os -import warnings from typing import List, Dict, Any, Optional, Union from datetime import datetime import pandas as pd import json -from datetime import datetime from zoneinfo import ZoneInfo import re try: @@ -158,9 +156,6 @@ def _format_api_dates(datetime_input: Union[str, List[str]], date: bool = False) if all(pd.isna(dt) or dt == "" or dt == None for dt in datetime_input): return None - # Replace all blanks with "nan" - datetime_input = ["nan" if x == "" else x for x in datetime_input] - if len(datetime_input) <=2: # If the list is of length 1, first look for things like "P7D" or dates # already formatted in ISO08601. 
Otherwise, try to coerce to datetime
         if len(datetime_input) == 1 and (re.search(r"P", datetime_input[0], re.IGNORECASE) or "/" in datetime_input[0]):
             return datetime_input[0]
         # Otherwise, use list comprehension to parse dates
         else:
             try:
                 # Parse to naive datetime
                 parsed_dates = [datetime.strptime(dt, "%Y-%m-%d %H:%M:%S") for dt in datetime_input]
             except Exception:
                 # Parse to date only
                 try:
                     parsed_dates = [datetime.strptime(dt, "%Y-%m-%d") for dt in datetime_input]
                 except Exception:
                     return None
             # If the service only accepts dates for this input, not datetimes (e.g. "daily"),
             # return just the dates separated by a "/", otherwise, return the datetime in UTC
             # format.
             if date:
                 return "/".join(dt.strftime("%Y-%m-%d") for dt in parsed_dates)
             else:
                 parsed_locals = [dt.replace(tzinfo=local_timezone) for dt in parsed_dates]
                 formatted = "/".join(dt.astimezone(ZoneInfo("UTC")).strftime("%Y-%m-%dT%H:%M:%SZ") for dt in parsed_locals)
-                return formatted.replace("nan", "..")
+                return formatted
     else:
         raise ValueError("datetime_input should only include 1-2 values")
@@ -407,6 +402,7 @@ def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame:
             sep="_")
         df = df.drop(columns=["type", "geometry", "AsGeoJSON(geometry)"], errors="ignore")
         df.columns = [col.replace("properties_", "") for col in df.columns]
+        df.rename(columns={"geometry_coordinates": "geometry"}, inplace=True)
         return df

     # Organize json into geodataframe and make sure id column comes along.

From e9221ac68831722b8cf15858090fcbbcbdcfdf52 Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Thu, 25 Sep 2025 16:58:27 -0500
Subject: [PATCH 24/28] start unit tests on new functions

---
 tests/waterdata_test.py | 90 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 90 insertions(+)

diff --git a/tests/waterdata_test.py b/tests/waterdata_test.py
index 50eefdc..d0e7a49 100755
--- a/tests/waterdata_test.py
+++ b/tests/waterdata_test.py
@@ -6,6 +6,11 @@
 from dataretrieval.waterdata import (
     _check_profiles,
     get_samples,
+    get_daily,
+    get_monitoring_locations,
+    get_latest_continuous,
+    get_field_measurements,
+    get_time_series_metadata,
     _SERVICES,
     _PROFILES
 )
@@ -105,3 +110,88 @@ def test_samples_organizations():
     )
     assert len(df) == 1
     assert df.size == 3
+
+def test_get_daily():
+    df = get_daily(
+        monitoring_location_id="USGS-05427718",
+        parameter_code="00060",
+        time="2025-01-01/.."
+    )
+    assert "daily_id" in df.columns
+    assert "geometry" in df.columns
+    assert df.shape[1] == 12
+    assert df.parameter_code.unique().tolist() == ["00060"]
+    assert df.monitoring_location_id.unique().tolist() == ["USGS-05427718"]
+    assert df["time"].apply(lambda x: isinstance(x, datetime.date)).all()
+    assert df["value"].dtype == "float64"
+
+def test_get_daily_properties():
+    df = get_daily(
+        monitoring_location_id="USGS-05427718",
+        parameter_code="00060",
+        time="2025-01-01/..",
+        properties=["daily_id", "monitoring_location_id", "parameter_code", "time", "value", "geometry"]
+    )
+    assert "daily_id" in df.columns
+    assert "geometry" in df.columns
+    assert df.shape[1] == 6
+    assert (df["time"] >= datetime.date(2025, 1, 1)).all()
+
+def test_get_daily_no_geometry():
+    df = get_daily(
+        monitoring_location_id="USGS-05427718",
+        parameter_code="00060",
+        time="2025-01-01/..",
+        skipGeometry=True
+    )
+    assert "geometry" not in df.columns
+    assert df.shape[1] == 11
+    assert isinstance(df, DataFrame)
+
+def test_get_monitoring_locations():
+    df = get_monitoring_locations(
+        state_name="Connecticut",
+        site_type_code="GW"
+    )
+    assert df.site_type_code.unique().tolist() == ["GW"]
+
+def test_get_monitoring_locations_hucs():
+    df = get_monitoring_locations(
+        hydrologic_unit_code=["010802050102", "010802050103"]
+    )
+    assert set(df.hydrologic_unit_code.unique().tolist()) == {"010802050102", "010802050103"}
+
+def test_get_latest_continuous():
+    df = get_latest_continuous(
+        monitoring_location_id=["USGS-05427718", "USGS-05427719"],
+        parameter_code=["00060", "00065"]
+    )
+    assert df.shape[0] <= 4
+    assert df.statistic_id.unique().tolist() == ["00011"]
+    try:
+        datetime.datetime.strptime(df['time'].iloc[0], "%Y-%m-%dT%H:%M:%S+00:00")
+        out = True
+    except Exception:
out = False
+    assert out
+
+def test_get_field_measurements():
+    df = get_field_measurements(
+        monitoring_location_id="USGS-05427718",
+        unit_of_measure="ft^3/s",
+        time="2025-01-01/2025-10-01",
+        skipGeometry=True
+    )
+    assert "field_measurement_id" in df.columns
+    assert "geometry" not in df.columns
+    assert df.unit_of_measure.unique().tolist() == ["ft^3/s"]
+
+def test_get_time_series_metadata():
+    df = get_time_series_metadata(
+        bbox=[-89.840355,42.853411,-88.818626,43.422598],
+        parameter_code=["00060", "00065", "72019"],
+        skipGeometry=True
+    )
+    assert set(df['parameter_name'].unique().tolist()) == {"Gage height", "Water level, depth LSD", "Discharge"}
+
+

From b1436db5e7ea5ac355b26eeaa7e38fd7b73effe Mon Sep 17 00:00:00 2001
From: Elise Hinman
Date: Fri, 26 Sep 2025 10:19:42 -0500
Subject: [PATCH 25/28] update README and add a NEWS markdown in which to
 place past updates

---
 NEWS.md   |  7 +++++++
 README.md | 25 ++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)
 create mode 100644 NEWS.md

diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 0000000..a071d49
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,7 @@
+**10/01/2025:** `dataretrieval` is pleased to offer a new module, `waterdata`, which gives users access to USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information.
+
+**09/03/2024:** The groundwater levels service has switched endpoints, and `dataretrieval` was updated accordingly in [`v1.0.10`](https://github.com/DOI-USGS/dataretrieval-python/releases/tag/v1.0.10). Older versions using the discontinued endpoint will return 503 errors for `nwis.get_gwlevels` or the `service='gwlevels'` argument. Visit [Water Data For the Nation](https://waterdata.usgs.gov/blog/wdfn-waterservices-2024/) for more information.
+
+**03/01/2024:** USGS data availability and format have changed on Water Quality Portal (WQP). Since March 2024, data obtained from WQP legacy profiles will not include new USGS data or recent updates to existing data. All USGS data (up to and beyond March 2024) are available using the new WQP beta services. You can access the beta services by setting `legacy=False` in the functions in the `wqp` module.
+
+To view the status of changes in data availability and code functionality, visit: https://doi-usgs.github.io/dataRetrieval/articles/Status.html
\ No newline at end of file
diff --git a/README.md b/README.md
index f8c14a3..7464121 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,19 @@
 ![Conda Version](https://img.shields.io/conda/v/conda-forge/dataretrieval)
 ![Downloads](https://static.pepy.tech/badge/dataretrieval)

-:warning: USGS data availability and format have changed on Water Quality Portal (WQP). Since March 2024, data obtained from WQP legacy profiles will not include new USGS data or recent updates to existing data. All USGS data (up to and beyond March 2024) are available using the new WQP beta services. You can access the beta services by setting `legacy=False` in the functions in the `wqp` module.
+## Latest Announcements
+
+:mega: **10/01/2025:** `dataretrieval` is pleased to offer a new, *in-development* module, `waterdata`, which gives users access to USGS's modernized [Water Data APIs](https://api.waterdata.usgs.gov/). The Water Data API endpoints include daily values, instantaneous values, field measurements (modernized groundwater levels service), time series metadata, and discrete water quality data from the Samples database. Though there will be a period of overlap, the functions within `waterdata` will eventually replace the `nwis` module, which currently provides access to the legacy [NWIS Water Services](https://waterservices.usgs.gov/). More example workflows and functions coming soon. Check `help(waterdata)` for more information.
+
+**Important:** Users of the Water Data APIs are strongly encouraged to obtain an API key, which gives users higher rate limits and thus greater access to USGS data. [Register for an API key](https://api.waterdata.usgs.gov/signup/) and then place that API key in your Python environment as an environment variable named "API_USGS_PAT". One option is to set the variable as follows:
+
+```python
+import os
+os.environ["API_USGS_PAT"] = "your_api_key_here"
+```
+Note that you may need to restart your Python session for the environment variable to be recognized.
+
+Check out the [NEWS](NEWS.md) file for all updates and announcements, or track updates to the package via the GitHub releases.

 ## What is dataretrieval?
 `dataretrieval` was created to simplify the process of loading hydrologic data into the Python environment.
@@ -20,8 +28,7 @@
 Environmental Protection Agency (EPA), U.S. Department of Agriculture (USDA),
 and USGS. Direct USGS data is obtained from a service called the National
 Water Information System (NWIS).

-Note that the python version is not a direct port of the original: it attempts to reproduce the functionality of the R package,
-though its organization and interface often differ.
+Note that the Python version is not a direct port of the original: it attempts to reproduce the functionality of the R package, though its organization and interface often differ.

 If there's a hydrologic or environmental data portal that you'd like dataretrieval to work with, raise it as an [issue](https://github.com/USGS-python/dataretrieval/issues).
@@ -53,7 +60,7 @@
 Water quality data are available from:
 - [Samples](https://waterdata.usgs.gov/download-samples/#dataProfile=site) - Discrete USGS water quality data only
 - [Water Quality Portal](https://www.waterqualitydata.us/) - Discrete water quality data from USGS and EPA. Older data are available in the legacy WQX version 2 format; all data are available in the beta WQX3.0 format.

-To access the full functionality available from NWIS web services, nwis.get record appends any additional kwargs into the REST request.
For example, this function call: +To access the full functionality available from NWIS web services, `nwis.get_record()` appends any additional kwargs into the REST request. For example, this function call: ```python nwis.get_record(sites='03339000', service='dv', start='2017-12-31', parameterCd='00060') ``` @@ -67,8 +74,6 @@ For example nwis.get_record(sites='05404147',service='iv', start='2021-01-01', end='2021-3-01', access='3') ``` -More services and documentation to come! - ## Quick start dataretrieval can be installed using pip: @@ -99,13 +104,11 @@ For more details, see the file [CONTRIBUTING.md](CONTRIBUTING.md). ## Need help? -The Water Mission Area of the USGS supports the development and maintenance of `dataretrieval`. Any questions can be directed to the Computational Tools team at -comptools@usgs.gov. +The Water Mission Area of the USGS supports the development and maintenance of `dataretrieval`. Any questions can be directed to the Computational Tools team at comptools@usgs.gov. Resources are available primarily for maintenance and responding to user questions. Priorities on the development of new features are determined by the `dataretrieval` development team. - ## Acknowledgments This material is partially based upon work supported by the National Science Foundation (NSF) under award 1931297. Any opinions, findings, conclusions, or recommendations expressed in this material are those of the authors and do not necessarily reflect the views of the NSF. From dc24658e0e8a292fe8296c62015871b5ad374563 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 10:57:17 -0500 Subject: [PATCH 26/28] make a few small changes to names and documentation --- dataretrieval/waterdata_helpers.py | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index c535afd..0b5ad14 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -21,7 +21,7 @@ _cached_base_url = None def _base_url(): """ - Returns the base URL for the USGS Water Data OGC API. + Returns the base URL for the USGS Water Data APIs. Uses a cached value to avoid repeated string formatting. If the cached value is not set, it constructs the base URL using the BASE_API and API_VERSION constants. @@ -222,7 +222,7 @@ def _default_headers(): headers["X-Api-Key"] = token return headers -def _check_OGC_requests(endpoint: str = "daily", req_type: str = "queryables"): +def _check_ogc_requests(endpoint: str = "daily", req_type: str = "queryables"): """ Sends an HTTP GET request to the specified OGC endpoint and request type, returning the JSON response. @@ -281,7 +281,7 @@ def _construct_api_requests( properties (Optional[List[str]], optional): List of property names to include in the request. bbox (Optional[List[float]], optional): Bounding box coordinates as a list of floats. limit (Optional[int], optional): Maximum number of results to return per request. - max_results (Optional[int], optional): Maximum number of results allowed by the API. + max_results (Optional[int], optional): Maximum number of rows to return. skipGeometry (bool, optional): Whether to exclude geometry from the response. **kwargs: Additional query parameters, including date/time filters and other API-specific options. 
Returns: @@ -296,14 +296,6 @@ def _construct_api_requests( baseURL = _setup_api(service) # Single parameters can only have one value single_params = {"datetime", "last_modified", "begin", "end", "time"} - # params = {k: v for k, v in kwargs.items() if k in single_params} - # # Set skipGeometry parameter - # params["skipGeometry"] = skipGeometry - # # If limit is none and max_results is not none, then set limit to max results. Otherwise, - # # if max_results is none, set it to 10000 (the API max). - # params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 - # if max_results is not None and limit is not None and limit > max_results: - # raise ValueError("limit cannot be greater than max_result") # Identify which parameters should be included in the POST content body post_params = { @@ -384,6 +376,7 @@ def _get_resp_data(resp: httpx.Response, geopd: bool) -> pd.DataFrame: Parameters: resp (httpx.Response): The HTTP response object expected to contain a JSON body with a "features" key. + geopd (bool): Indicates whether geopandas is installed and should be used to handle geometries. Returns: gpd.GeoDataFrame or pd.DataFrame: A geopandas GeoDataFrame if geometry is included, or a @@ -423,10 +416,12 @@ def _walk_pages(geopd: bool, req: httpx.Request, max_results: Optional[int], cli Parameters ---------- + geopd : bool + Indicates whether geopandas is installed and should be used for handling geometries. req : httpx.Request The initial HTTP request to send. max_results : Optional[int] - The maximum number of results to retrieve. If None or NaN, retrieves all available pages. + Maximum number of rows to return. If None or NaN, retrieves all available pages. client : Optional[httpx.Client], default None An optional HTTP client to use for requests. If not provided, a new client is created. @@ -501,12 +496,12 @@ def _deal_with_empty(return_list: pd.DataFrame, properties: Optional[List[str]], """ if return_list.empty: if not properties or all(pd.isna(properties)): - schema = _check_OGC_requests(endpoint=service, req_type="schema") + schema = _check_ogc_requests(endpoint=service, req_type="schema") properties = list(schema.get("properties", {}).keys()) return pd.DataFrame(columns=properties) return return_list -def _rejigger_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: +def _arrange_cols(df: pd.DataFrame, properties: Optional[List[str]], output_id: str) -> pd.DataFrame: """ Rearranges and renames columns in a DataFrame based on provided properties and service's output id. 
@@ -613,7 +608,7 @@ def get_ogc_data(args: Dict[str, Any], output_id: str, service: str) -> pd.DataF return_list = _deal_with_empty(return_list, properties, service) if convertType: return_list = _cleanup_cols(return_list, service=service) - return_list = _rejigger_cols(return_list, properties, output_id) + return_list = _arrange_cols(return_list, properties, output_id) # Add metadata return_list.attrs.update(request=req.url, queryTime=pd.Timestamp.now()) return return_list From 89b960ca822a7dec3a601ee25791216c346f7534 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 11:14:20 -0500 Subject: [PATCH 27/28] define max_results when it is an input --- dataretrieval/waterdata_helpers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dataretrieval/waterdata_helpers.py b/dataretrieval/waterdata_helpers.py index 0b5ad14..70e9530 100644 --- a/dataretrieval/waterdata_helpers.py +++ b/dataretrieval/waterdata_helpers.py @@ -310,6 +310,10 @@ def _construct_api_requests( # If limit is none and max_results is not none, then set limit to max results. Otherwise, # if max_results is none, set it to 10000 (the API max). params["limit"] = max_results if limit is None and max_results is not None else limit or 10000 + # Add max results as a parameter if it is not None + if max_results is not None: + params["max_results"] = max_results + if max_results is not None and limit is not None and limit > max_results: raise ValueError("limit cannot be greater than max_result") From 1237777bcf0e036be065bd45e845ede92294a4c8 Mon Sep 17 00:00:00 2001 From: Elise Hinman Date: Fri, 26 Sep 2025 13:27:42 -0500 Subject: [PATCH 28/28] comment out code that wasn't doing the correct thing with max_results --- dataretrieval/waterdata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata.py b/dataretrieval/waterdata.py index c2d7031..7c503b2 100644 --- a/dataretrieval/waterdata.py +++ b/dataretrieval/waterdata.py @@ -683,7 +683,7 @@ def get_time_series_metadata( The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The - default (NA) will set the limit to the maximum allowable limit for the service. + default (None) will set the limit to the maximum allowable limit for the service. max_results : numeric, optional The optional maximum number of rows to return. This value must be less than the requested limit. @@ -849,7 +849,7 @@ def get_latest_continuous( The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The - default (NA) will set the limit to the maximum allowable limit for the service. + default (None) will set the limit to the maximum allowable limit for the service. max_results : numeric, optional The optional maximum number of rows to return. This value must be less than the requested limit. @@ -1010,7 +1010,7 @@ def get_field_measurements( The optional limit parameter is used to control the subset of the selected features that should be returned in each page. The maximum allowable limit is 10000. It may be beneficial to set this number lower if your internet connection is spotty. The - default (NA) will set the limit to the maximum allowable limit for the service. 
+ default (None) will set the limit to the maximum allowable limit for the service. max_results : numeric, optional The optional maximum number of rows to return. This value must be less than the requested limit.
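A note on the `limit`/`max_results` pattern documented in the docstrings above: `limit` controls the page size (capped at 10000 by the service), `max_results` caps the total number of rows returned across pages, and the check added in PATCH 27 raises a `ValueError` when `limit` exceeds `max_results`. Below is a minimal sketch of a call exercising both, assuming the in-development `waterdata` module from this patch series; the site, time range, and row caps are illustrative placeholders, not part of the patches.

```python
# Minimal sketch, assuming the in-development dataretrieval.waterdata
# module from this patch series; site, time range, and row caps are
# illustrative placeholders.
from dataretrieval import waterdata

# limit = rows per page (service max 10000); max_results = total row cap.
# Per the check added in PATCH 27, limit may not exceed max_results.
df = waterdata.get_field_measurements(
    monitoring_location_id="USGS-05427718",
    time="2025-01-01/2025-10-01",
    skipGeometry=True,
    limit=1000,
    max_results=1000,
)

# get_ogc_data attaches the request URL and query time as DataFrame attrs.
print(len(df), df.attrs.get("queryTime"))
```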