diff --git a/packaging/src/docker/thirdparties/gravitino/README.md b/packaging/src/docker/thirdparties/gravitino/README.md new file mode 100644 index 000000000000..d97c5725eeb4 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/README.md @@ -0,0 +1,242 @@ + + +# Hive + Gravitino + Keycloak: Docker-Compose Setup + +This package contains a docker-compose-based setup integrating Apache Hive, Gravitino Iceberg REST server, and Keycloak for OAuth2 authentication. It allows Hive to use an Iceberg REST catalog secured via Keycloak. + +## Table of Contents +- Architecture Overview +- Prerequisites +- Quickstart +- Configuration + - Keycloak + - Gravitino + - Hive +- Networking Notes + +## Architecture Overview +This diagram illustrates the key docker-compose components and their interactions in this setup: + +``` + oAuth2 (REST API) + +-------------------------------------------------------------------+ + | | + | v ++--------+----------+ +-------------------+ +-----------------+ +| | RESTCatalog | | oauth2 | | +| Hive | (REST API) | Gravitino | (REST API) | Keycloak | +| (HiveServer2) +-------------->| Iceberg REST +----------->| OAuth2 Auth | +| | | Server | | Server | ++--------+----------+ +---------+---------+ +-----------------+ + | | + data | metadata files | + files +------------------------------------+ + | + v ++-------------------+ +-------------------+ +| | creates dir | | +| /warehouse |<--------------+ init | +| (Docker volume) | sets | container | +| | permissions | | ++-------------------+ +-------------------+ +``` + +- Hive: + - Runs HiveServer2, connects to Gravitino via Iceberg REST catalog. + - Writes Iceberg data files to the shared warehouse volume. +- Gravitino: + - Exposes REST API for Iceberg catalog. + - Writes Iceberg metadata files to shared warehouse volume (.metadata.json). + - Doesn't support serving as an OAuth2 provider, so this example uses an external OAuth2 provider (Keycloak). 
+- Keycloak: + - OAuth2 server providing authentication and token issuance for Hive/Gravitino. +- /warehouse: + - Shared Docker volume for Iceberg table data and metadata. +- Init container: + - Creates shared /warehouse folder and sets filesystem permissions as a one time initialization step. + +## Prerequisites +- Docker & Docker Compose +- Java (for local Hive beeline client) +- ```$HIVE_HOME``` environment variable pointing to Hive installation (for connecting to Beeline) + +## Quickstart + +### STEP 1: Export the Hive version +```shell +export HIVE_VERSION=4.2.0 +``` + +### STEP 2: Start services +```shell +docker-compose up -d +``` + +### STEP 3: Connect to beeline +```shell +"${HIVE_HOME}/bin/beeline" -u "jdbc:hive2://localhost:10001/default" -n hive -p hive +``` + +### STEP 4: Stop services: +```shell +docker-compose down -v +``` + +### Configuration + +#### Keycloak + +- Realm: hive +- Client: iceberg-client + - Secret: iceberg-client-secret + - Protocol: OpenID Connect + - Audience: hive-iceberg +- Imported via `realm-export.json` in Keycloak container. +- Port: 8080 + +#### Gravitino + +- HTTP port: 9001 +- Catalog backend: JDBC H2 (/tmp/gravitino_h2_db) +- Warehouse: /warehouse (shared with Hive) +- Iceberg REST Catalog Backend config: + ``` + # Backend type for the catalog. Here we use JDBC (H2 database) as the metadata store. + gravitino.iceberg-rest.catalog-backend = jdbc + + # JDBC connection URI for the H2 database storing catalog metadata. + gravitino.iceberg-rest.uri = jdbc:h2:file:/tmp/gravitino_h2_db;AUTO_SERVER=TRUE + + # JDBC driver class used to connect to the metadata database. + gravitino.iceberg-rest.jdbc-driver = org.h2.Driver + + # Database username for connecting to the metadata store. + gravitino.iceberg-rest.jdbc-user = sa + + # Database password for connecting to the metadata store (empty here). + gravitino.iceberg-rest.jdbc-password = "" + + # Whether to initialize the catalog schema on startup. 
+ gravitino.iceberg-rest.jdbc-initialize = true + + # --- Warehouse Location (shared folder) --- + + # Path to the Iceberg warehouse directory shared with Hive. + gravitino.iceberg-rest.warehouse = file:///warehouse + ``` +- OAuth2 config pointing to Keycloak: + ``` + # Enables OAuth2 as the authentication mechanism for Gravitino. + gravitino.authenticators = oauth + + # URL of the Keycloak realm to request tokens from. + gravitino.authenticator.oauth.serverUri = http://keycloak:8080/realms/hive + + # Path to the OAuth2 token endpoint on Keycloak. + gravitino.authenticator.oauth.tokenPath = /protocol/openid-connect/token + + # OAuth2 scopes requested when obtaining a token. Includes "openid" and the custom "catalog" scope. + gravitino.authenticator.oauth.scope = openid catalog + + # OAuth2 client ID registered in Keycloak. + gravitino.authenticator.oauth.clientId = iceberg-client + + # OAuth2 client secret associated with the client ID. + gravitino.authenticator.oauth.clientSecret = iceberg-client-secret + + # Java class used to validate incoming JWT tokens using the JWKS endpoint. + gravitino.authenticator.oauth.tokenValidatorClass = org.apache.gravitino.server.authentication.JwksTokenValidator + + # URL to fetch JSON Web Key Set (JWKS) for verifying token signatures. + gravitino.authenticator.oauth.jwksUri = http://keycloak:8080/realms/hive/protocol/openid-connect/certs + + # Identifier for the OAuth2 provider configuration in Gravitino. + gravitino.authenticator.oauth.provider = default + + # JWT claim field(s) to extract as the principal/username (here, 'sub' claim). + gravitino.authenticator.oauth.principalFields = sub + + # Acceptable clock skew (in seconds) when validating token expiration times. + gravitino.authenticator.oauth.allowSkewSecs = 60 + + # Expected audience claim in the token to ensure it is intended for this service. 
+ gravitino.authenticator.oauth.serviceAudience = hive-iceberg + ``` + +#### Hive + +- Uses ```HiveRESTCatalogClient``` for connecting to Iceberg REST catalog (Gravitino). +- Catalog configuration in ```hive-site.xml```: + ``` + + metastore.catalog.default + ice01 + Sets the default Iceberg catalog for Hive. Here, "ice01" is used. + + + + metastore.client.impl + org.apache.iceberg.hive.client.HiveRESTCatalogClient + Specifies the client implementation to use for accessing Iceberg via REST. + + + + iceberg.catalog.ice01.uri + http://gravitino:9001/iceberg + URI of the Iceberg REST server (Gravitino). Hive will send catalog requests here. + + + + iceberg.catalog.ice01.type + rest + Defines the catalog type as "rest", indicating it uses a REST API backend. + + + + + + iceberg.catalog.ice01.rest.auth.type + oauth2 + Configures Hive to use OAuth2 for authenticating requests to the REST catalog. + + + + iceberg.catalog.ice01.oauth2-server-uri + http://keycloak:8080/realms/hive/protocol/openid-connect/token + URL of the Keycloak OAuth2 token endpoint used to request access tokens. + + + + iceberg.catalog.ice01.credential + iceberg-client:iceberg-client-secret + Client credentials (ID and secret) used to authenticate with Keycloak. + + ``` +- HiveServer2 port: 10000 (mapped to 10001 in Docker Compose) + +## Networking Notes + +- All containers share a custom bridge network ```hive-net```. +- Services communicate via container names: hive, gravitino, keycloak. +- Ports mapped for host access: + - Keycloak → 8080 + - Gravitino → 9001 + - HiveServer2 → 10001 + diff --git a/packaging/src/docker/thirdparties/gravitino/common/init.sh b/packaging/src/docker/thirdparties/gravitino/common/init.sh new file mode 100755 index 000000000000..0f1b24d07c80 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/common/init.sh @@ -0,0 +1,30 @@ +#!/bin/sh -x + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +apk add --no-cache acl || exit 1 # setfacl comes from this package; nothing below works without it + +mkdir -p /tmp/hive/jars +mkdir -p "${WAREHOUSE:?WAREHOUSE must be set}" # abort instead of running mkdir/chmod with an empty path +chmod 777 "$WAREHOUSE" + +# Give the hive user id full rwx access to all existing files and directories under $WAREHOUSE +setfacl -R -m "u:${HIVE_USER_ID:?HIVE_USER_ID must be set}:rwx" "$WAREHOUSE" + +# Ensure all new files/directories created inside $WAREHOUSE automatically grant rwx access to hive user id +setfacl -d -m "u:$HIVE_USER_ID:rwx" "$WAREHOUSE" diff --git a/packaging/src/docker/thirdparties/gravitino/docker-compose.yml b/packaging/src/docker/thirdparties/gravitino/docker-compose.yml new file mode 100644 index 000000000000..694934e7a847 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/docker-compose.yml @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +version: "3.9" + +name: hive-gravitino-rest-catalog-integration + +services: + keycloak: + image: quay.io/keycloak/keycloak:25.0.1 + container_name: keycloak + environment: + KEYCLOAK_ADMIN: admin + KEYCLOAK_ADMIN_PASSWORD: admin + volumes: + - ./keycloak/realm-export.json:/opt/keycloak/data/import/realm-export.json + ports: + - "8080:8080" + networks: + - hive-net + command: [ + "start-dev", + "--import-realm", + "--health-enabled=true" + ] + healthcheck: + test: "exec 3<>/dev/tcp/localhost/9000 && \ + echo -e 'GET /health/ready HTTP/1.1\\r\\nHost: localhost\\r\\nConnection: close\\r\\n\\r\\n' >&3 && \ + cat <&3 | grep -q '200 OK'" + interval: 5s + timeout: 2s + retries: 15 + + gravitino: + image: apache/gravitino-iceberg-rest:1.0.0 + container_name: gravitino + environment: + JAVA_OPTS: "-Dlog4j2.formatMsgNoLookups=true" + volumes: + - ./gravitino:/tmp/gravitino + - warehouse:/warehouse + ports: + - "9001:9001" + networks: + - hive-net + entrypoint: /bin/bash /tmp/gravitino/init.sh + healthcheck: + test: [ "CMD", "/tmp/gravitino/healthcheck.sh" ] + interval: 5s + timeout: 60s + retries: 5 + start_period: 20s + + hive: + image: apache/hive:${HIVE_VERSION} + container_name: hive + depends_on: + keycloak: + condition: service_healthy + gravitino: + condition: service_healthy + environment: + SERVICE_NAME: hiveserver2 + volumes: + - ./hive/hive-site.xml:/opt/hive/conf/hive-site.xml + - warehouse:/warehouse + ports: + - "10001:10000" + networks: + - hive-net + entrypoint: '/bin/sh -c "/opt/hive/bin/schematool -dbType derby -initOrUpgradeSchema && 
/entrypoint.sh"' + + init: + image: alpine/curl + container_name: init + user: "0:0" # run as root + environment: + - WAREHOUSE=/warehouse + - HIVE_USER_ID=1000 + volumes: + - ./common/:/common + - warehouse:/warehouse + entrypoint: '/bin/sh -c /common/init.sh' + +networks: + hive-net: + driver: bridge + +volumes: + warehouse: diff --git a/packaging/src/docker/thirdparties/gravitino/gravitino/gravitino-iceberg-rest-server.conf b/packaging/src/docker/thirdparties/gravitino/gravitino/gravitino-iceberg-rest-server.conf new file mode 100644 index 000000000000..f0c4bb72f22a --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/gravitino/gravitino-iceberg-rest-server.conf @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# --- HTTP Server --- +gravitino.iceberg-rest.httpPort = 9001 + +# --- Iceberg REST Catalog Backend (JDBC/H2) --- +gravitino.iceberg-rest.catalog-backend = jdbc +gravitino.iceberg-rest.uri = jdbc:h2:file:/tmp/gravitino_h2_db;AUTO_SERVER=TRUE +gravitino.iceberg-rest.jdbc-driver = org.h2.Driver +gravitino.iceberg-rest.jdbc-user = sa +gravitino.iceberg-rest.jdbc-password = "" +gravitino.iceberg-rest.jdbc-initialize = true + +# --- Warehouse Location (shared folder) --- +gravitino.iceberg-rest.warehouse = file:///warehouse + +# --- OAuth2 Authentication --- +gravitino.authenticators = oauth + +gravitino.authenticator.oauth.serverUri = http://keycloak:8080/realms/hive +gravitino.authenticator.oauth.tokenPath = /protocol/openid-connect/token +gravitino.authenticator.oauth.scope = openid catalog +gravitino.authenticator.oauth.clientId = iceberg-client +gravitino.authenticator.oauth.clientSecret = iceberg-client-secret + +gravitino.authenticator.oauth.tokenValidatorClass = org.apache.gravitino.server.authentication.JwksTokenValidator +gravitino.authenticator.oauth.jwksUri = http://keycloak:8080/realms/hive/protocol/openid-connect/certs +gravitino.authenticator.oauth.provider = default +gravitino.authenticator.oauth.principalFields = sub +gravitino.authenticator.oauth.allowSkewSecs = 60 +gravitino.authenticator.oauth.serviceAudience = hive-iceberg + +# --- Logging --- +gravitino.logging.level = INFO + diff --git a/packaging/src/docker/thirdparties/gravitino/gravitino/healthcheck.sh b/packaging/src/docker/thirdparties/gravitino/gravitino/healthcheck.sh new file mode 100755 index 000000000000..8f6b6cffc35f --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/gravitino/healthcheck.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +set -ex + +max_attempts=3 +attempt=0 +success=false + +while [ "$attempt" -lt "$max_attempts" ]; do + response=$(curl -X GET -H "Content-Type: application/json" http://gravitino:9001/iceberg/api/version || true) # a refused connection must not trip set -e; fall through and retry + + if echo "$response" | grep -q "HTTP ERROR 401 The provided credentials did not support" + then + success=true # the unauthenticated probe being rejected with 401 is treated as "server is up" + break + else + echo "Attempt $((attempt + 1)) failed..." + sleep 1 + fi + + attempt=$((attempt + 1)) # not ((attempt++)): that returns status 1 when attempt is 0 and aborts the script under set -e +done + +if [ "$success" = true ]; then + exit 0 +else + exit 1 +fi \ No newline at end of file diff --git a/packaging/src/docker/thirdparties/gravitino/gravitino/init.sh b/packaging/src/docker/thirdparties/gravitino/gravitino/init.sh new file mode 100755 index 000000000000..ecb05e0459e8 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/gravitino/init.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Copy Gravitino start script (abort on failure: the server cannot start without it) +cp /tmp/gravitino/start-iceberg-rest-server.sh /root/gravitino-iceberg-rest-server/bin/start-iceberg-rest-server.sh || exit 1 + +# Copy Gravitino config file (abort on failure rather than starting with the image's default config) +cp /tmp/gravitino/gravitino-iceberg-rest-server.conf /root/gravitino-iceberg-rest-server/conf/gravitino-iceberg-rest-server.conf || exit 1 + +# Download H2 Driver to Gravitino libs folder; -f makes curl fail on HTTP errors instead of saving the error page as the jar +mkdir -p /root/gravitino-iceberg-rest-server/libs || exit 1 +curl -fL -o /root/gravitino-iceberg-rest-server/libs/h2-2.2.220.jar https://repo1.maven.org/maven2/com/h2database/h2/2.2.220/h2-2.2.220.jar || exit 1 + +/bin/bash /root/gravitino-iceberg-rest-server/bin/start-iceberg-rest-server.sh \ No newline at end of file diff --git a/packaging/src/docker/thirdparties/gravitino/gravitino/start-iceberg-rest-server.sh b/packaging/src/docker/thirdparties/gravitino/gravitino/start-iceberg-rest-server.sh new file mode 100755 index 000000000000..9b487a102931 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/gravitino/start-iceberg-rest-server.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +set -ex +bin_dir="$(dirname "${BASH_SOURCE-$0}")" +iceberg_rest_server_dir="$(cd "${bin_dir}/../" >/dev/null && pwd)" # && so a failed cd is not masked by pwd printing the caller's directory + +cd "${iceberg_rest_server_dir}" # quoted so a path containing spaces or glob characters is not split + +JAVA_OPTS+=" -XX:-UseContainerSupport" +export JAVA_OPTS + +./bin/gravitino-iceberg-rest-server.sh start \ No newline at end of file diff --git a/packaging/src/docker/thirdparties/gravitino/hive/hive-site.xml b/packaging/src/docker/thirdparties/gravitino/hive/hive-site.xml new file mode 100644 index 000000000000..59e07cbe6434 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/hive/hive-site.xml @@ -0,0 +1,123 @@ + + + + + + + + hive.jar.directory + file:/tmp/hive/jars + + + + hive.user.install.directory + file:/tmp/hive/user_install + + + + tez.local.mode + true + + + + tez.runtime.optimize.local.fetch + true + + + + tez.am.mode.session + true + + + + hive.scheduled.queries.executor.enabled + false + + + + hive.materializedview.rebuild.incremental + false + + + + hive.metastore.transactional.event.listeners + + + + + hive.notification.event.poll.interval + 0 + + + + hive.stats.autogather + false + + + + hive.stats.fetch.column.stats + false + + + + hive.stats.estimate + false + + + + + + + + metastore.catalog.default + ice01 + + + + metastore.client.impl + org.apache.iceberg.hive.client.HiveRESTCatalogClient + + + + iceberg.catalog.ice01.uri + http://gravitino:9001/iceberg + + + + iceberg.catalog.ice01.type + rest + + + + + + iceberg.catalog.ice01.rest.auth.type + oauth2 + + + + iceberg.catalog.ice01.oauth2-server-uri + http://keycloak:8080/realms/hive/protocol/openid-connect/token + + + + 
iceberg.catalog.ice01.credential + iceberg-client:iceberg-client-secret + + + diff --git a/packaging/src/docker/thirdparties/gravitino/keycloak/realm-export.json b/packaging/src/docker/thirdparties/gravitino/keycloak/realm-export.json new file mode 100644 index 000000000000..26c34b52e216 --- /dev/null +++ b/packaging/src/docker/thirdparties/gravitino/keycloak/realm-export.json @@ -0,0 +1,43 @@ +{ + "realm": "hive", + "enabled": true, + "clients": [ + { + "clientId": "iceberg-client", + "secret": "iceberg-client-secret", + "enabled": true, + "redirectUris": ["*"], + "serviceAccountsEnabled": true, + "protocol": "openid-connect", + "publicClient": false, + "directAccessGrantsEnabled": false, + "standardFlowEnabled": false, + "defaultClientScopes": ["catalog"], + "optionalClientScopes": [], + "protocolMappers": [ + { + "name": "audience", + "protocol": "openid-connect", + "protocolMapper": "oidc-audience-mapper", + "consentRequired": false, + "config": { + "included.client.audience": "hive-iceberg", + "id.token.claim": "false", + "access.token.claim": "true" + } + } + ], + "attributes": { + "access.token.lifespan": "3600" + } + } + ], + "clientScopes": [ + { + "name": "catalog", + "protocol": "openid-connect", + "attributes": {}, + "protocolMappers": [] + } + ] +} diff --git a/packaging/src/docker/thirdparties/polaris/README.md b/packaging/src/docker/thirdparties/polaris/README.md new file mode 100644 index 000000000000..f3305ac19907 --- /dev/null +++ b/packaging/src/docker/thirdparties/polaris/README.md @@ -0,0 +1,184 @@ + + +# Hive + Polaris: Docker-Compose Setup + +This package contains a docker-compose-based setup integrating Apache Hive and Polaris. +It allows Hive to use an Iceberg REST catalog secured with oauth2 provided by Polaris. 
+ +## Table of Contents +- Architecture Overview +- Prerequisites +- Quickstart +- Configuration + - Polaris + - Hive +- Networking Notes + +## Architecture Overview +This diagram illustrates the key docker-compose components and their interactions in this setup: +``` ++-------------------+ +-------------------+ +| | RESTCatalog | | +| Hive | (REST API) | Polaris |<-------+ +| (HiveServer2) +-------------->| Server | | +| | oAuth2 | | | ++--------+----------+ (REST API) +---------+---------+ | creates: + | | | catalog, + data | metadata files | | principal, + files +------------------------------------+ | roles, + | | grants (REST API) + v | ++-------------------+ +-------------------+ | +| | creates dir | | | +| /warehouse |<--------------+ Polaris-init +--------+ +| (Docker volume) | syncs | container | +| | permissions | | ++-------------------+ +-------------------+ +``` + +- Hive: + - Runs HiveServer2, connects to Polaris via Iceberg REST catalog. + - Write Iceberg data files to shared warehouse volume. +- Polaris: + - Exposes REST API for Iceberg catalog and provides oauth2 for authentication. + - Supports serving as oauth2 provider, so this example doesn't need an external OAuth2 component. + - Writes Iceberg metadata files to shared warehouse volume (.metadata.json). +- /warehouse: + - Shared Docker volume for Iceberg table data and metadata. +- Polaris-init + - Bootstraps Polaris for Hive-Iceberg. + - Creates and configures Polaris resources via REST API. + - Continuously synchronizes filesystem permissions for the shared /warehouse/* folders. + - required because Polaris and Hive run as different users in their respective containers. 
+ +## Prerequisites +- Docker & Docker Compose +- Java (for local Hive beeline client) +- ```$HIVE_HOME``` environment variable pointing to Hive installation (for connecting to Beeline) + +## Quickstart + +### STEP 1: Export the Hive version +```shell +export HIVE_VERSION=4.2.0-SNAPSHOT +``` + +### STEP 2: Start services +```shell +docker-compose up -d +``` + +### STEP 3: Connect to beeline +```shell +"${HIVE_HOME}/bin/beeline" -u "jdbc:hive2://localhost:10001/default" -n hive -p hive +``` + +### STEP 4: Stop services: +```shell +docker-compose down -v +``` + +## Configuration + +### Polaris + +- HTTP port: 8181 +- Warehouse: /warehouse (shared with Hive) +- Key Polaris configs (defined via env variables in docker-compose.yml): + ``` + # A realm provides logical isolation for different Polaris environments. + polaris.realm-context.realms: POLARIS + + # Initial bootstrap credentials for the Polaris server. + # The format is: <realm>,<client-id>,<client-secret> + POLARIS_BOOTSTRAP_CREDENTIALS: POLARIS,iceberg-client,iceberg-client-secret + ``` + +### Hive + +- Uses ```HiveRESTCatalogClient``` for connecting to Iceberg REST catalog (Polaris). +- Catalog configuration in ```hive-site.xml```: + ``` + + metastore.catalog.default + ice01 + Sets the default Iceberg catalog for Hive. Here, "ice01" is used. + + + + metastore.client.impl + org.apache.iceberg.hive.client.HiveRESTCatalogClient + Specifies the client implementation to use for accessing Iceberg via REST. + + + + iceberg.catalog.ice01.uri + http://polaris:8181/api/catalog + URI of the Iceberg REST server (Polaris). Hive will send catalog requests here. + + + + iceberg.catalog.ice01.type + rest + Defines the catalog type as "rest", indicating it uses a REST API backend. + + + + hive.metastore.warehouse.dir + file:///warehouse + Defines the warehouse location, required for Polaris + + + + + + iceberg.catalog.ice01.rest.auth.type + oauth2 + Configures Hive to use OAuth2 for authenticating requests to the REST catalog. 
+ + + + iceberg.catalog.ice01.oauth2-server-uri + http://polaris:8181/api/catalog/v1/oauth/tokens + URL of the Polaris OAuth2 token endpoint used to request access tokens. + + + + iceberg.catalog.ice01.credential + iceberg-client:iceberg-client-secret + Client credentials (ID and secret) used to authenticate with Polaris. + + + + iceberg.catalog.ice01.scope + PRINCIPAL_ROLE:ALL + OAuth2 scope tied to the principal role defined in Polaris. + + ``` +- HiveServer2 port: 10000 (mapped to 10001 in Docker Compose) + +## Networking Notes + +- All containers share a custom bridge network ```hive-net```. +- Services communicate via container names: hive and polaris. +- Ports mapped for host access: + - Polaris → 8181 + - HiveServer2 → 10001 + diff --git a/packaging/src/docker/thirdparties/polaris/docker-compose.yml b/packaging/src/docker/thirdparties/polaris/docker-compose.yml new file mode 100644 index 000000000000..f6979e57f67a --- /dev/null +++ b/packaging/src/docker/thirdparties/polaris/docker-compose.yml @@ -0,0 +1,94 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +version: "3.9" + +name: hive-polaris-rest-catalog-integration + +services: + + polaris: + image: apache/polaris:latest + container_name: polaris + ports: + # API port + - "8181:8181" + # Management port (metrics and health checks) + - "8182:8182" + environment: + polaris.realm-context.realms: POLARIS + quarkus.otel.sdk.disabled: "true" + POLARIS_BOOTSTRAP_CREDENTIALS: POLARIS,iceberg-client,iceberg-client-secret + polaris.features."ALLOW_INSECURE_STORAGE_TYPES": "true" + polaris.features."SUPPORTED_CATALOG_STORAGE_TYPES": "[\"FILE\"]" + polaris.readiness.ignore-severe-issues: "true" + healthcheck: + test: ["CMD", "curl", "http://localhost:8182/q/health"] + interval: 2s + timeout: 10s + retries: 10 + start_period: 10s + volumes: + - warehouse:/warehouse + networks: + - hive-net + + polaris-init: + image: alpine/curl + container_name: polaris-init + user: "0:0" # run as root + depends_on: + polaris: + condition: service_healthy + environment: + - CLIENT_ID=iceberg-client + - CLIENT_SECRET=iceberg-client-secret + - WAREHOUSE=/warehouse + - REALM=POLARIS + - POLARIS_USER_ID=10000 + - HIVE_USER_ID=1000 + volumes: + - ./polaris/:/polaris + - warehouse:/warehouse + entrypoint: '/bin/sh -c "chmod +x /polaris/init.sh && /polaris/init.sh"' + networks: + - hive-net + + hive: + image: apache/hive:${HIVE_VERSION} + container_name: hive + depends_on: + polaris: + condition: service_healthy + environment: + SERVICE_NAME: hiveserver2 + volumes: + - ./hive/hive-site.xml:/opt/hive/conf/hive-site.xml + - warehouse:/warehouse + ports: + - "10001:10000" + networks: + - hive-net + entrypoint: '/bin/sh -c "mkdir -p /tmp/hive/jars && \ + /opt/hive/bin/schematool -dbType derby -initOrUpgradeSchema && sh /entrypoint.sh"' + +networks: + hive-net: + driver: bridge + +volumes: + warehouse: + realm-data: diff --git a/packaging/src/docker/thirdparties/polaris/hive/hive-site.xml b/packaging/src/docker/thirdparties/polaris/hive/hive-site.xml new file mode 100644 index 000000000000..df952937757c 
--- /dev/null +++ b/packaging/src/docker/thirdparties/polaris/hive/hive-site.xml @@ -0,0 +1,138 @@ + + + + + + + + hive.jar.directory + file:/tmp/hive/jars + + + + hive.user.install.directory + file:/tmp/hive/user_install + + + + tez.local.mode + true + + + + tez.runtime.optimize.local.fetch + true + + + + tez.am.mode.session + true + + + + hive.scheduled.queries.executor.enabled + false + + + + hive.materializedview.rebuild.incremental + false + + + + hive.metastore.transactional.event.listeners + + + + + hive.notification.event.poll.interval + 0 + + + + hive.stats.autogather + false + + + + hive.stats.fetch.column.stats + false + + + + hive.stats.estimate + false + + + + + + + + metastore.catalog.default + ice01 + + + + metastore.client.impl + org.apache.iceberg.hive.client.HiveRESTCatalogClient + + + + iceberg.catalog.ice01.uri + http://polaris:8181/api/catalog + + + + iceberg.catalog.ice01.type + rest + + + + iceberg.catalog.ice01.warehouse + ice01 + + + + hive.metastore.warehouse.dir + file:///warehouse + + + + + + iceberg.catalog.ice01.rest.auth.type + oauth2 + + + + iceberg.catalog.ice01.oauth2-server-uri + http://polaris:8181/api/catalog/v1/oauth/tokens + + + + iceberg.catalog.ice01.credential + iceberg-client:iceberg-client-secret + + + + iceberg.catalog.ice01.scope + PRINCIPAL_ROLE:ALL + + + diff --git a/packaging/src/docker/thirdparties/polaris/polaris/init.sh b/packaging/src/docker/thirdparties/polaris/polaris/init.sh new file mode 100755 index 000000000000..72201f278c9e --- /dev/null +++ b/packaging/src/docker/thirdparties/polaris/polaris/init.sh @@ -0,0 +1,184 @@ +#!/bin/sh -x + +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +set -e + +apk add --no-cache jq +apk add --no-cache acl + +#-------------------------------------------------------------------------------- +# OBTAIN TOKEN +#-------------------------------------------------------------------------------- + +source /polaris/obtain-token.sh + +echo +echo "Obtained access token: ${TOKEN}" + +#-------------------------------------------------------------------------------- +# CREATE CATALOG +#-------------------------------------------------------------------------------- + +echo +echo Creating a catalog named ice01 in realm $REALM... 
# Fail fast when required settings are missing. All three are expanded
# further down (catalog location, mkdir/chmod, setfacl); checking them here
# stops the script before anything has been created in Polaris.
: "${WAREHOUSE:?WAREHOUSE must be set to the warehouse directory}"
: "${HIVE_USER_ID:?HIVE_USER_ID must be set}"
: "${POLARIS_USER_ID:?POLARIS_USER_ID must be set}"

POLARIS_MANAGEMENT_URL="http://polaris:8181/api/management/v1"

# polaris_api METHOD PATH PAYLOAD
#
# Prints PAYLOAD, then sends it as JSON to the Polaris management API.
#   METHOD   HTTP method (POST or PUT)
#   PATH     path below ${POLARIS_MANAGEMENT_URL}, starting with "/"
#   PAYLOAD  JSON request body
# Reads the globals TOKEN (bearer token) and REALM (Polaris-Realm header).
# Returns curl's exit status. curl is run without --fail, so an HTTP error
# status returned by Polaris does not stop the script; only transport-level
# failures do (via "set -e").
polaris_api() {
  printf '%s\n' "$3"
  curl -s -X "$1" -H "Authorization: Bearer ${TOKEN}" \
    -H 'Accept: application/json' \
    -H 'Content-Type: application/json' \
    -H "Polaris-Realm: ${REALM}" \
    "${POLARIS_MANAGEMENT_URL}$2" \
    -d "$3" -v
}

STORAGE_TYPE="FILE"
STORAGE_LOCATION="file://${WAREHOUSE}"

# Build the request with jq so that the location is always correctly escaped
# inside the JSON document, whatever characters WAREHOUSE contains.
PAYLOAD=$(jq -c -n \
  --arg storage_type "${STORAGE_TYPE}" \
  --arg location "${STORAGE_LOCATION}" \
  '{
    "catalog": {
      "name": "ice01",
      "type": "INTERNAL",
      "readOnly": false,
      "properties": {
        "default-base-location": $location
      },
      "storageConfigInfo": {
        "storageType": $storage_type,
        "allowedLocations": [$location]
      }
    }
  }')

polaris_api POST /catalogs "${PAYLOAD}"

#--------------------------------------------------------------------------------
# CREATE PRINCIPAL
#--------------------------------------------------------------------------------

echo
echo "Creating a principal named ice01_principal in realm ${REALM}..."

polaris_api POST /principals \
  '{"principal": {"name": "ice01_principal"}, "credentialRotationRequired": false}'

#--------------------------------------------------------------------------------
# CREATE PRINCIPAL ROLE
#--------------------------------------------------------------------------------

polaris_api POST /principal-roles \
  '{"principalRole": {"name": "ice01_principal_role"}}'

#--------------------------------------------------------------------------------
# CATALOG ROLE
#--------------------------------------------------------------------------------

polaris_api POST /catalogs/ice01/catalog-roles \
  '{"catalogRole": {"name": "ice01_catalog_role"}}'

#--------------------------------------------------------------------------------
# GRANT THE PRINCIPAL THE PRINCIPAL ROLE
#--------------------------------------------------------------------------------

polaris_api PUT /principals/ice01_principal/principal-roles \
  '{"principalRole": {"name": "ice01_principal_role"}}'

#--------------------------------------------------------------------------------
# GRANT THE CATALOG_MANAGE_CONTENT PRIVILEGE TO THE CATALOG ROLE
#--------------------------------------------------------------------------------

# NOTE(review): this request adds a privilege grant to ice01_catalog_role.
# No request in this script attaches ice01_catalog_role to
# ice01_principal_role; confirm whether that assignment is needed.
polaris_api PUT /catalogs/ice01/catalog-roles/ice01_catalog_role/grants \
  '{"type": "catalog", "privilege": "CATALOG_MANAGE_CONTENT"}'

#--------------------------------------------------------------------------------
# Create warehouse directory and set permissions
#--------------------------------------------------------------------------------

mkdir -p "${WAREHOUSE}"
chmod 777 "${WAREHOUSE}"

#--------------------------------------------------------------------------------
# Start ACL sync on the warehouse folder for hive and polaris users
#--------------------------------------------------------------------------------

ACL_SPEC="u:${HIVE_USER_ID}:rwx,u:${POLARIS_USER_ID}:rwx"

# Optional pause, in seconds, between two sync passes. The default of 0 keeps
# the original back-to-back behaviour.
ACL_SYNC_INTERVAL="${ACL_SYNC_INTERVAL:-0}"

# Runs one sync pass over the warehouse; returns non-zero if either setfacl
# call fails.
sync_warehouse_acls() {
  # Existing files ACLs
  setfacl -R -m "${ACL_SPEC}" "${WAREHOUSE}" || return 1

  # Default ACLs for new files
  setfacl -R -d -m "${ACL_SPEC}" "${WAREHOUSE}" || return 1
}

# The first pass is strict: if ACLs cannot be applied at all, "set -e" stops
# the script here instead of looping on a permanent error.
sync_warehouse_acls

while true
do
  # Later passes tolerate failures (presumably transient ones, such as a file
  # being removed while setfacl walks the tree) so that a single failed pass
  # does not end the sync for good.
  sync_warehouse_acls \
    || echo "warning: ACL sync pass on ${WAREHOUSE} failed; retrying" >&2

  if [ "${ACL_SYNC_INTERVAL}" != "0" ]; then
    sleep "${ACL_SYNC_INTERVAL}"
  fi
done
diff --git a/packaging/src/docker/thirdparties/polaris/polaris/obtain-token.sh b/packaging/src/docker/thirdparties/polaris/polaris/obtain-token.sh
new file mode 100755
index 000000000000..aba4faf4e070
--- /dev/null
+++ b/packaging/src/docker/thirdparties/polaris/polaris/obtain-token.sh
@@ -0,0 +1,37 @@
#!/bin/sh -x

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#

# Requests an access token from the Polaris token endpoint using the
# client-credentials grant and exports it as TOKEN.
# Reads CLIENT_ID, CLIENT_SECRET and REALM from the environment.
# Exits with status 1 when no token could be obtained.

set -e

# Install jq only when it is not already available.
command -v jq >/dev/null 2>&1 || apk add --no-cache jq

# "// empty" makes jq print nothing, rather than the literal string "null",
# when the response carries no access_token. A failing pipeline (for example
# jq rejecting a non-JSON response) is mapped to an empty token so that the
# check below reports it.
TOKEN=$(curl -s http://polaris:8181/api/catalog/v1/oauth/tokens \
  --user "${CLIENT_ID}:${CLIENT_SECRET}" \
  -H "Polaris-Realm: ${REALM}" \
  -d grant_type=client_credentials \
  -d scope=PRINCIPAL_ROLE:ALL | jq -r '.access_token // empty') || TOKEN=""

if [ -z "${TOKEN}" ]; then
  echo "Failed to obtain access token." >&2
  exit 1
fi

export TOKEN