diff --git a/docker/Dockerfile b/docker/Dockerfile index cddc136..b397d8f 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -34,7 +34,8 @@ RUN apt-get update && \ sudo \ curl \ mysql-client=${MYSQL_CLIENT_VERSION} && \ - apt-get clean -y + apt-get clean -y && \ + rm -rf /var/lib/apt/lists/* /var/cache/apt/archives WORKDIR /opt @@ -45,8 +46,24 @@ RUN if echo $METASTORE_VERSION | grep -E '^3\.' > /dev/null; then \ fi # download and install hadoop and fix the (>= ubuntu jammy) distribution executable bug -RUN curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \ - sed -i 's|if \[\[ ! -x "\$JAVA" \]\]; then|if [ \$("$JAVA" -version) ]; then|' ${HADOOP_HOME}/libexec/hadoop-functions.sh +RUN set -eux; \ + curl -L https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz | tar zxf - && \ + sed -i 's|if \[\[ ! -x "\$JAVA" \]\]; then|if [ \$("$JAVA" -version) ]; then|' ${HADOOP_HOME}/libexec/hadoop-functions.sh && \ + echo "Cleaning up unnecessary Hadoop components for Hive Metastore..." && \ + rm -rf ${HADOOP_HOME}/share/hadoop/yarn/* && \ + rm -rf ${HADOOP_HOME}/share/hadoop/mapreduce/* && \ + rm -rf ${HADOOP_HOME}/share/hadoop/client/* && \ + find ${HADOOP_HOME}/share/hadoop -type d \( \ + -name "jdiff" -o \ + -name "test" -o \ + -name "examples" \ + \) -exec rm -rf {} + && \ + rm -rf ${HADOOP_HOME}/share/doc && \ + rm -rf ${HADOOP_HOME}/share/hadoop/hdfs/webapps && \ + find ${HADOOP_HOME}/share/hadoop -type d -name "webapps" -exec rm -rf {} + && \ + echo "Verify what remains" && \ + du -sh ${HADOOP_HOME}/share/hadoop/* || true && \ + echo "Hadoop cleanup completed." 
RUN rm -f ${HIVE_HOME}/lib/postgresql-*.jar && \ curl -sL https://jdbc.postgresql.org/download/postgresql-${JDBC_VERSION}.jar -o /opt/apache-hive-metastore-${METASTORE_VERSION}-bin/lib/postgresql-${JDBC_VERSION}.jar @@ -71,55 +88,43 @@ RUN curl -sLO https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_ja # Download and install GCS connector and Google Cloud Storage dependencies RUN mkdir -p ${HIVE_HOME}/lib/ && \ - curl -sLO "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/${GCS_CONNECTOR_VERSION}/gcs-connector-${GCS_CONNECTOR_VERSION}-sources.jar" && \ mv gcs-connector-${GCS_CONNECTOR_VERSION}-sources.jar ${HIVE_HOME}/lib/ && \ - curl -sLO "https://repo1.maven.org/maven2/com/google/cloud/google-cloud-storage/${GOOGLE_CLOUD_STORAGE_VERSION}/google-cloud-storage-${GOOGLE_CLOUD_STORAGE_VERSION}.jar" && \ mv google-cloud-storage-${GOOGLE_CLOUD_STORAGE_VERSION}.jar ${HIVE_HOME}/lib/ && \ - curl -sLO "https://repo1.maven.org/maven2/com/google/auth/google-auth-library-oauth2-http/${GOOGLE_AUTH_LIBRARY_VERSION}/google-auth-library-oauth2-http-${GOOGLE_AUTH_LIBRARY_VERSION}.jar" && \ mv google-auth-library-oauth2-http-${GOOGLE_AUTH_LIBRARY_VERSION}.jar ${HIVE_HOME}/lib/ && \ - curl -sLO "https://repo1.maven.org/maven2/com/google/http-client/google-http-client-jackson2/${GOOGLE_HTTP_CLIENT_VERSION}/google-http-client-jackson2-${GOOGLE_HTTP_CLIENT_VERSION}.jar" && \ mv google-http-client-jackson2-${GOOGLE_HTTP_CLIENT_VERSION}.jar ${HIVE_HOME}/lib/ && \ - curl -sLO "https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar" && \ mv gcs-connector-latest-hadoop3.jar ${HIVE_HOME}/lib/ && \ - chmod 644 ${HIVE_HOME}/lib/*.jar && \ - export GCS_CONNECTOR_JAR_HIVE="${HIVE_HOME}/lib/gcs-connector-${GCS_CONNECTOR_VERSION}-sources.jar" && \ - export GCS_CONNECTOR_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/gcs-connector-${GCS_CONNECTOR_VERSION}-sources.jar" &&\ - + export 
GCS_CONNECTOR_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/gcs-connector-${GCS_CONNECTOR_VERSION}-sources.jar" && \ export GCS_CLIENT_HTTP_JAR_HIVE="${HIVE_HOME}/lib/google-http-client-jackson2-${GOOGLE_HTTP_CLIENT_VERSION}.jar" && \ - export GCS_CLIENT_HTTP_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/google-http-client-jackson2-${GOOGLE_HTTP_CLIENT_VERSION}.jar" &&\ - + export GCS_CLIENT_HTTP_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/google-http-client-jackson2-${GOOGLE_HTTP_CLIENT_VERSION}.jar" && \ export GCS_CLOUD_STORAGE_JAR_HIVE="${HIVE_HOME}/lib/google-cloud-storage-${GOOGLE_CLOUD_STORAGE_VERSION}.jar" && \ - export GCS_CLOUD_STORAGE_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/google-cloud-storage-${GOOGLE_CLOUD_STORAGE_VERSION}.jar" &&\ - + export GCS_CLOUD_STORAGE_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/google-cloud-storage-${GOOGLE_CLOUD_STORAGE_VERSION}.jar" && \ export GCS_OAUTH_JAR_HIVE="${HIVE_HOME}/lib/google-auth-library-oauth2-http-${GOOGLE_AUTH_LIBRARY_VERSION}.jar" && \ - export GCS_OAUTH_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/google-auth-library-oauth2-http-${GOOGLE_AUTH_LIBRARY_VERSION}.jar" &&\ - + export GCS_OAUTH_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/google-auth-library-oauth2-http-${GOOGLE_AUTH_LIBRARY_VERSION}.jar" && \ export GCS_HADOOP_CONNECTOR_JAR_HIVE="${HIVE_HOME}/lib/gcs-connector-latest-hadoop3.jar" && \ - export GCS_HADOOP_CONNECTOR_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/gcs-connector-latest-hadoop3.jar" &&\ - - cp "${GCS_CONNECTOR_JAR_HIVE}" "${GCS_CONNECTOR_JAR_HADOOP}" &&\ - cp "${GCS_CLIENT_HTTP_JAR_HIVE}" "${GCS_CLIENT_HTTP_JAR_HADOOP}" &&\ - cp "${GCS_CLOUD_STORAGE_JAR_HIVE}" "${GCS_CLOUD_STORAGE_JAR_HADOOP}" &&\ - cp "${GCS_OAUTH_JAR_HIVE}" "${GCS_OAUTH_JAR_HADOOP}" &&\ - cp "${GCS_HADOOP_CONNECTOR_JAR_HIVE}" "${GCS_HADOOP_CONNECTOR_JAR_HADOOP}" &&\ - - chown ubuntu:ubuntu "${GCS_CONNECTOR_JAR_HADOOP}" &&\ - chown ubuntu:ubuntu 
"${GCS_CLIENT_HTTP_JAR_HADOOP}" &&\ - chown ubuntu:ubuntu "${GCS_CLOUD_STORAGE_JAR_HADOOP}" &&\ - chown ubuntu:ubuntu "${GCS_OAUTH_JAR_HADOOP}" &&\ - chown ubuntu:ubuntu "${GCS_HADOOP_CONNECTOR_JAR_HADOOP}" &&\ - + export GCS_HADOOP_CONNECTOR_JAR_HADOOP="${HADOOP_HOME}/share/hadoop/common/lib/gcs-connector-latest-hadoop3.jar" && \ + cp "${GCS_CONNECTOR_JAR_HIVE}" "${GCS_CONNECTOR_JAR_HADOOP}" && \ + cp "${GCS_CLIENT_HTTP_JAR_HIVE}" "${GCS_CLIENT_HTTP_JAR_HADOOP}" && \ + cp "${GCS_CLOUD_STORAGE_JAR_HIVE}" "${GCS_CLOUD_STORAGE_JAR_HADOOP}" && \ + cp "${GCS_OAUTH_JAR_HIVE}" "${GCS_OAUTH_JAR_HADOOP}" && \ + cp "${GCS_HADOOP_CONNECTOR_JAR_HIVE}" "${GCS_HADOOP_CONNECTOR_JAR_HADOOP}" && \ + chown ubuntu:ubuntu "${GCS_CONNECTOR_JAR_HADOOP}" && \ + chown ubuntu:ubuntu "${GCS_CLIENT_HTTP_JAR_HADOOP}" && \ + chown ubuntu:ubuntu "${GCS_CLOUD_STORAGE_JAR_HADOOP}" && \ + chown ubuntu:ubuntu "${GCS_OAUTH_JAR_HADOOP}" && \ + chown ubuntu:ubuntu "${GCS_HADOOP_CONNECTOR_JAR_HADOOP}" && \ export GCS_CONNECTOR_JAR="${GCS_CONNECTOR_JAR_HADOOP}" -# Ensure GCS connector is in the classpath +# Ensure both GCS and AWS connectors are on the Hadoop classpath RUN echo "export HADOOP_CLASSPATH=\${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/common/lib/gcs-connector-${GCS_CONNECTOR_VERSION}-sources.jar" \ + >> ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh && \ + echo "export HADOOP_CLASSPATH=\${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/aws-java-sdk-bundle-*.jar:${HADOOP_HOME}/share/hadoop/tools/lib/hadoop-aws-${HADOOP_VERSION}.jar" \ >> ${HADOOP_HOME}/etc/hadoop/hadoop-env.sh RUN sed -i '/<\/configuration>/i \ diff --git a/docker/metastore.sh b/docker/metastore.sh index 6276b16..a52a5ba 100755 --- a/docker/metastore.sh +++ b/docker/metastore.sh @@ -293,7 +293,7 @@ if [ "$MODE" = "init" ]; then if [ $? 
-ne 0 ]; then echo "Will initialize the DB"; ${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType ${DB_DRIVER_NAME} -userName ${HIVEMS_USER} -passWord ${HIVEMS_PASSWORD} -url "jdbc:mysql://${DB_HOST}:${DB_PORT}/${HIVEMS_DB}?createDatabaseIfNotExist=true&connectTimeout=1000"; fi else psql --host=${DB_HOST} --port=${DB_PORT} -U ${HIVEMS_USER} -d ${HIVEMS_DB} -c 'SELECT "DB_ID" FROM "DBS"' >/dev/null 2>&1; - if [ $? -ne 0 ]; then echo "Will initialize the DB"; ${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType ${DB_DRIVER_NAME}; fi + if [ $? -ne 0 ]; then echo "Will initialize the DB"; ${BASEDIR}/apache-hive-metastore-${METASTORE_VERSION}-bin/bin/schematool -initSchema -dbType postgres; fi fi echo "DATABASE SCHEMA SHOULD BE OK NOW!!" exit 0 diff --git a/helm/hive-metastore/Chart.yaml b/helm/hive-metastore/Chart.yaml index 73a55ed..5284b3b 100644 --- a/helm/hive-metastore/Chart.yaml +++ b/helm/hive-metastore/Chart.yaml @@ -1,7 +1,7 @@ # File generated by Makefile apiVersion: v2 name: hive-metastore -version: 3.1.3-1.3.0 +version: 3.1.3-1.3.1 sources: - https://github.com/okdp/hive-metastore appVersion: 3.1.3 diff --git a/helm/hive-metastore/README.md b/helm/hive-metastore/README.md index 51fd62c..fd5d369 100644 --- a/helm/hive-metastore/README.md +++ b/helm/hive-metastore/README.md @@ -35,13 +35,13 @@ This chart bootstraps a [Hive Metastore](https://cwiki.apache.org/confluence/dis To install the chart with the release name `my-release`: ```shell -$ helm install my-release oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.0 +$ helm install my-release oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.1 ``` This will create a release of `my-release` in the default namespace. 
To install in a different namespace: ```shell -$ helm install my-release oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.0 \ +$ helm install my-release oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.1 \ --namespace hive-metastore ``` @@ -62,7 +62,7 @@ The command removes all the Kubernetes components associated with the chart and To download the chart locally, use the following command: ```shell -$ helm pull oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.0 +$ helm pull oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.1 ``` ## Values @@ -76,7 +76,7 @@ $ helm pull oci://quay.io/okdp/charts/hive-metastore --version 3.1.3-1.3.0 - affinity + affinity object @@ -88,7 +88,7 @@ object Affinity for the hive-metastore pod. - autoscaling + autoscaling object @@ -142,7 +142,7 @@ string - commonAnnotations + commonAnnotations object @@ -154,7 +154,7 @@ object Annotations to be added to all resources. - containerSecurityContext.allowPrivilegeEscalation + containerSecurityContext.allowPrivilegeEscalation bool @@ -166,7 +166,7 @@ false - containerSecurityContext.capabilities.drop[0] + containerSecurityContext.capabilities.drop[0] string @@ -178,7 +178,7 @@ string - containerSecurityContext.readOnlyRootFilesystem + containerSecurityContext.readOnlyRootFilesystem bool @@ -211,7 +211,7 @@ string "postgresql" - Hive metastore driver name. + Hive metastore driver name. One of `` `postgresql` or `mysql` db.driverRef @@ -286,7 +286,7 @@ null Hive metastore database existing kubernetes secret name. - deploymentName + deploymentName string @@ -298,7 +298,7 @@ null Will default to {{ include "metastore.fullname" . }} - exposure + exposure object @@ -330,7 +330,19 @@ object - fullNameOverride + extraEnvRaw + +list + +
+
+[]
+
+
+ Extra environment variables in RAW format that will be passed into pods + + + fullNameOverride string @@ -414,7 +426,7 @@ string - hpaName + hpaName string @@ -426,7 +438,7 @@ null Will default to {{ include "metastore.fullname" . }} - image.pullPolicy + image.pullPolicy string @@ -438,7 +450,7 @@ string Image pull policy. - image.pullSecrets + image.pullSecrets list @@ -454,7 +466,7 @@ list Image pullSecrets for private registries. - image.repository + image.repository string @@ -466,7 +478,7 @@ string Docker image registry. - image.tag + image.tag string @@ -478,7 +490,7 @@ string Image tag. - initJob + initJob object @@ -492,7 +504,7 @@ object Hive metastore database initialization job - jobName + jobName string @@ -504,7 +516,7 @@ null Will default to {{ include "metastore.fullname" . }} - logLevel + logLevel string @@ -516,7 +528,7 @@ string Log4j2 log level. One of `` `debug`, `info`, `warn`, `error`, `fatal`, `trace` - nameOverride + nameOverride string @@ -528,7 +540,7 @@ null Allow chart name overriding. - networkPolicies.allowedNamespace + networkPolicies.allowedNamespace list @@ -540,7 +552,7 @@ list - networkPolicies.allowedNamespaceLabels + networkPolicies.allowedNamespaceLabels object @@ -552,7 +564,7 @@ object - networkPolicies.enabled + networkPolicies.enabled bool @@ -564,7 +576,7 @@ true - networkPolicyName + networkPolicyName string @@ -576,7 +588,7 @@ null Will default to {{ include "metastore.fullname" . }} - nodeSelector + nodeSelector object @@ -588,7 +600,7 @@ object Node selector for the hive-metastore pod. - podAnnotations + podAnnotations object @@ -600,7 +612,7 @@ object Annotations to be added to the pod. - podSecurityContext + podSecurityContext object @@ -617,7 +629,7 @@ object Security profile for the hive-metastore pod. - replicaCount + replicaCount int @@ -629,7 +641,7 @@ int Desired number of hive-metastore pods to run. Set 'replicaCount' to 0 or leave it unused when autoscaling is enabled. 
- resources + resources object @@ -749,7 +761,7 @@ null S3 warehouse directory/bucket name. - serviceAccount + serviceAccount object @@ -765,7 +777,7 @@ object Specifies whether a service account should be created. - serviceName + serviceName string @@ -777,7 +789,7 @@ null Will default to {{ include "metastore.fullname" . }} - servicePort + servicePort int @@ -789,7 +801,7 @@ int Hive metastore service port. - tolerations + tolerations list diff --git a/helm/hive-metastore/templates/deployment.yaml b/helm/hive-metastore/templates/deployment.yaml index 4c7fba3..9fe74ac 100644 --- a/helm/hive-metastore/templates/deployment.yaml +++ b/helm/hive-metastore/templates/deployment.yaml @@ -118,13 +118,18 @@ spec: value: "{{ .Values.db.port | default "5432" }}" - name: HIVEMS_DB value: {{ .Values.db.databaseName }} + {{- if .Values.db.user.name }} - name: HIVEMS_USER - value: {{ .Values.db.user.name }} + value: {{ .Values.db.user.name | quote }} + {{- end }} - name: HIVEMS_PASSWORD valueFrom: secretKeyRef: name: {{ .Values.db.user.password.secretName }} key: {{ .Values.db.user.password.propertyName }} + {{- if .Values.extraEnvRaw }} + {{- toYaml .Values.extraEnvRaw | nindent 12 }} + {{- end }} {{- if and (eq .Values.cloud_storage "gcs") (eq .Values.gcs.workloadIdentity.enabled false) }} volumeMounts: - name: gcs-key diff --git a/helm/hive-metastore/templates/job.yaml b/helm/hive-metastore/templates/job.yaml index cff85d4..fbec0f1 100644 --- a/helm/hive-metastore/templates/job.yaml +++ b/helm/hive-metastore/templates/job.yaml @@ -50,21 +50,48 @@ spec: value: "{{ .Values.db.port | default "5432" }}" - name: HIVEMS_DB value: {{ .Values.db.databaseName }} + {{- if .Values.db.user.name }} - name: HIVEMS_USER - value: {{ .Values.db.user.name }} + value: {{ .Values.db.user.name | quote }} + {{- end }} - name: HIVEMS_PASSWORD valueFrom: secretKeyRef: name: {{ .Values.db.user.password.secretName }} key: {{ .Values.db.user.password.propertyName }} + {{- if .Values.extraEnvRaw }} + 
{{- toYaml .Values.extraEnvRaw | nindent 12 }} + {{- end }} command: ["/bin/sh","-c"] args: - | - TABLE_EXISTS=$(mysql -h $DB_HOST -u$HIVEMS_USER -p$HIVEMS_PASSWORD -D $HIVEMS_DB -sN -e "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '${HIVEMS_DB}' AND table_name = 'METASTORE_DB_PROPERTIES';") - echo "TABLE_EXISTS: $TABLE_EXISTS" - if [ "$TABLE_EXISTS" != "" ] && [ "$TABLE_EXISTS" -eq 0 ]; then - /metastore.sh init - fi + set -e + echo "Using driver: {{ .Values.db.driverName }}" + {{- if eq .Values.db.driverName "mysql" }} + echo "MySQL — running initialization check..." + TABLE_EXISTS=$(mysql -h $DB_HOST -u$HIVEMS_USER -p$HIVEMS_PASSWORD -D $HIVEMS_DB -sN -e \ + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = '${HIVEMS_DB}' AND table_name = 'METASTORE_DB_PROPERTIES';") + if [ -n "$TABLE_EXISTS" ] && [ "$TABLE_EXISTS" -eq 0 ]; then + echo "Initializing Hive Metastore schema for MySQL..." + /metastore.sh init + else + echo "Hive Metastore schema already exists, skipping initialization." + fi + {{- else if eq .Values.db.driverName "postgresql" }} + echo "PostgreSQL — running initialization check..." + TABLE_EXISTS=$(PGPASSWORD=$HIVEMS_PASSWORD psql -h $DB_HOST -U $HIVEMS_USER -d $HIVEMS_DB -tAc \ + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'metastore_db_properties';") + if [ -n "$TABLE_EXISTS" ] && [ "$TABLE_EXISTS" -eq 0 ]; then + echo "Initializing Hive Metastore schema for PostgreSQL..." + /metastore.sh init + else + echo "Hive Metastore schema already exists, skipping initialization." 
+ fi + {{- else }} + echo "❌ Unsupported database driver: {{ .Values.db.driverName }}" + echo "Supported drivers are: mysql, postgresql" + exit 1 + {{- end }} restartPolicy: Never {{- with .Values.podSecurityContext }} securityContext: diff --git a/helm/hive-metastore/values.yaml b/helm/hive-metastore/values.yaml index 25a5cf0..379e201 100644 --- a/helm/hive-metastore/values.yaml +++ b/helm/hive-metastore/values.yaml @@ -2,7 +2,7 @@ db: # -- Hive metastore driver reference. driverRef: org.postgresql.Driver - # -- Hive metastore driver name. + # -- Hive metastore driver name. One of `` `postgresql` or `mysql` driverName: postgresql # -- Hive metastore database host. host: @@ -61,6 +61,15 @@ aws: # -- S3 IAM role ARN for hive-metastore access to S3. s3AssumeRoleArn: +# -- Extra environment variables in RAW format that will be passed into pods +extraEnvRaw: [] +# Load DB username from other secret +# - name: HIVEMS_USER +# valueFrom: +# secretKeyRef: +# name: my-database-secret +# key: username + # As the metastore does not provide authentication/authorization mechanism, # everybody will be able to access and modify all metastore data. # So, we need to restrict access only from allowed namespace