opea-project · krish918 · Oct 9, 2024 · Oct 16, 2024 · Oct 16, 2024 · Oct 17, 2024
diff --git a/.github/workflows/_helm-e2e.yaml b/.github/workflows/_helm-e2e.yaml
@@ -65,7 +65,7 @@ jobs:
           echo "CHART_NAME=$CHART_NAME" >> $GITHUB_ENV
           echo "RELEASE_NAME=${CHART_NAME}$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
           echo "NAMESPACE=${CHART_NAME}-$(date +%Y%m%d%H%M%S)" >> $GITHUB_ENV
-          echo "ROLLOUT_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
+          echo "ROLLOUT_TIMEOUT_SECONDS=1200s" >> $GITHUB_ENV
           echo "TEST_TIMEOUT_SECONDS=600s" >> $GITHUB_ENV
           echo "KUBECTL_TIMEOUT_SECONDS=60s" >> $GITHUB_ENV
           echo "should_cleanup=false" >> $GITHUB_ENV

@@ -17,6 +17,10 @@ dependencies:
     version: 1.0.0
     repository: file://../redis-vector-db
     condition: redis-vector-db.enabled
+  - name: vdms-vector-db
+    version: 1.0.0
+    repository: file://../vdms-vector-db
+    condition: vdms-vector-db.enabled
   - name: milvus
     version: 4.2.12
     repository: https://zilliztech.github.io/milvus-helm/

@@ -1,14 +1,18 @@
-# data-prep
+# Data-Prep Microservice
 
-Helm chart for deploying data-prep microservice.
+Helm chart for deploying data-prep microservice. Data-Prep is consumed by several reference applications present in [GenAIExamples](https://github.com/opea-project/GenAIExamples/tree/main).
 
-data-prep will use redis and tei service, please specify the endpoints.
+There are two versions of the Data-Prep microservice. The first is a unimodal version based on redis-vector-db and TEI; it performs data preparation for textual data. The alternative is a multimodal version, based on the `vdms-values.yaml` file, which performs data preparation for visual data input. Follow along to select and install the version that suits your use case.
 
-## (Option1): Installing the chart separately
+Data-Prep uses redis-vector-db and tei. The multimodal version uses vdms-vector-db service. Endpoints for these dependencies should be set properly before installing the chart.
+
+## Install the chart for data preparation using Redis Vector DB
+
+### (Option1): Installing the chart separately
 
 First, you need to install the tei and redis-vector-db chart, please refer to the [tei](../tei/README.md) and [redis-vector-db](../redis-vector-db/README.md) for more information.
 
-After you've deployted the tei and redis-vector-db chart successfully, please run `kubectl get svc` to get the service endpoint and URL respectively, i.e. `http://tei`, `redis://redis-vector-db:6379`.
+After you've deployed the tei and redis-vector-db chart successfully, please run `kubectl get svc` to get the service endpoint and URL respectively, i.e. `http://tei`, `redis://redis-vector-db:6379`.
 
 To install data-prep chart, run the following:
 
@@ -20,7 +24,7 @@ helm dependency update
 helm install data-prep . --set REDIS_URL=${REDIS_URL} --set TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT}
 ```
 
-## (Option2): Installing the chart with dependencies automatically
+### (Option2): Installing the chart with dependencies automatically
 
 ```console
 cd GenAIInfra/helm-charts/common/data-prep
@@ -29,6 +33,52 @@ helm install data-prep . --set redis-vector-db.enabled=true --set tei.enabled=tr
 
 ```
 
+## Install the chart for multimodal data preparation using VDMS Vector DB
+
+### (Option1): Installing the chart separately
+
+First, you need to install the `vdms-vector-db` chart. Please refer to the [vdms-vector-db](../vdms-vector-db/README.md) for more information.
+
+After you've deployed the `vdms-vector-db` chart successfully, please run `kubectl get svc` to get the service host and port respectively, for example: `http://vdms-vector-db:8001`.
+
+Next, run the following commands to install the data-prep chart:
+
+```bash
+cd GenAIInfra/helm-charts/common/data-prep
+
+# Use the host and port received in previous step as VDMS_HOST and VDMS_PORT.
+export VDMS_HOST="vdms-vector-db"
+export VDMS_PORT="8001"
+export INDEX_NAME="mega-videoqna"
+export HFTOKEN=<your huggingface token>
+# Set a directory to cache embedding models
+export CACHEDIR="/home/$USER/.cache"
+
+# Export the proxy variables. Assign empty string if no proxy setup required.
+export https_proxy="your_https_proxy"
+export http_proxy="your_http_proxy"
+
+helm dependency update
+helm install data-prep . -f ./variant_videoqna-values.yaml --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set indexName=${INDEX_NAME} --set global.cacheUseHostPath=${CACHEDIR} --set vdmsHost=${VDMS_HOST} --set vdmsPort=${VDMS_PORT} --set global.https_proxy=${https_proxy} --set global.http_proxy=${http_proxy}
+```
+
+### (Option2): Installing the chart with dependencies automatically
+
+```bash
+cd GenAIInfra/helm-charts/common/data-prep
+export INDEX_NAME="mega-videoqna"
+export HFTOKEN=<your huggingface token>
+# Set a directory to cache embedding models
+export CACHEDIR="/home/$USER/.cache"
+
+# Export the proxy variables. Assign empty string if no proxy setup required.
+export https_proxy="your_https_proxy"
+export http_proxy="your_http_proxy"
+
+helm dependency update
+helm install data-prep . -f ./variant_videoqna-values.yaml --set vdms-vector-db.enabled=true --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set indexName=${INDEX_NAME} --set global.cacheUseHostPath=${CACHEDIR} --set global.https_proxy=${https_proxy} --set global.http_proxy=${http_proxy}
+```
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -37,21 +87,40 @@ Then run the command `kubectl port-forward svc/data-prep 6007:6007` to expose th
 
 Open another terminal and run the following command to verify the service if working:
 
-```console
+### 1. For Data-prep service using redis-vector-db:
+
+```bash
+
 curl http://localhost:6007/v1/dataprep  \
     -X POST \
     -H "Content-Type: multipart/form-data" \
     -F "files=@./README.md"
 ```
 
+### 2. For multimodal data prep service using vdms-vector-db:
+
+```bash
+# 1) Download a sample video in current dir:
+curl -svLO "https://github.com/opea-project/GenAIExamples/raw/refs/heads/main/VideoQnA/docker_compose/intel/cpu/xeon/data/op_1_0320241830.mp4"
+
+# 2) Verify using above video
+curl -X POST http://localhost:6007/v1/dataprep \
+      -H "Content-Type: multipart/form-data" \
+      -F "files=@./op_1_0320241830.mp4"
+```
+
 ## Values
 
-| Key                    | Type   | Default                 | Description |
-| ---------------------- | ------ | ----------------------- | ----------- |
-| image.repository       | string | `"opea/dataprep-redis"` |             |
-| service.port           | string | `"6007"`                |             |
-| REDIS_URL              | string | `""`                    |             |
-| TEI_EMBEDDING_ENDPOINT | string | `""`                    |             |
+| Key                          | Type   | Default                           | Description |
+| ---------------------------- | ------ | --------------------------------- | ----------- |
+| image.repository             | string | `"opea/dataprep-redis"`           |             |
+| service.port                 | string | `"6007"`                          |             |
+| REDIS_URL                    | string | `""`                              |             |
+| TEI_EMBEDDING_ENDPOINT       | string | `""`                              |             |
+| vdms-values:image.repository | string | `"opea/dataprep-multimodal-vdms"` |             |
+| vdms-values:vdmsHost         | string | `""`                              |             |
+| vdms-values:vdmsPort         | string | `"8001"`                          |             |
+| vdms-values:indexName        | string | `"mega-videoqna"`                 |             |
 
 ## Milvus support
 

@@ -0,0 +1,21 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+vdms-vector-db:
+  enabled: true  # turns on the vdms-vector-db dependency (Chart.yaml condition: vdms-vector-db.enabled)
+
+image:
+  repository: opea/dataprep-multimodal-vdms
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+  tag: "latest"
+
+indexName: "mega-videoqna"  # rendered into the ConfigMap as INDEX_NAME
+vdmsHost: ""  # empty: the ConfigMap falls back to "<release name>-vdms-vector-db"
+vdmsPort: "8001"  # rendered into the ConfigMap as VDMS_PORT
+entryCommand: ["/bin/sh"]  # becomes the container `command` in the Deployment
+extraArgs: ["-c", "sleep 15 && python ingest_videos.py"]  # becomes the container `args`; NOTE(review): the 15 s sleep presumably waits for VDMS to start - confirm
+
+# Set cacheUseHostPath to a host directory for caching encoding/embedding models and other related data
+global:
+  cacheUseHostPath: ""  # empty: the Deployment mounts no cache volume
@@ -8,6 +8,14 @@ metadata:
   labels:
     {{- include "data-prep.labels" . | nindent 4 }}
 data:
+  {{- if .Values.vdmsHost }}
+  VDMS_HOST: {{ .Values.vdmsHost | quote }}  # explicit host supplied through the vdmsHost value
+  {{- else }}
+  VDMS_HOST: "{{ .Release.Name }}-vdms-vector-db"  # fallback: release-prefixed vdms-vector-db name; emitted whenever vdmsHost is empty
+  {{- end }}
+  {{- if .Values.vdmsPort }}
+  VDMS_PORT: {{ .Values.vdmsPort | quote }}  # only emitted when vdmsPort is set; quoted so it stays a string
+  {{- end }}
   {{- if .Values.MOSEC_EMBEDDING_ENDPOINT }}
   MOSEC_EMBEDDING_ENDPOINT: {{ .Values.MOSEC_EMBEDDING_ENDPOINT | quote}}
   MOSEC_EMBEDDING_MODEL: {{ .Values.MOSEC_EMBEDDING_MODEL | quote}}
@@ -26,7 +34,7 @@ data:
   {{- else }}
   REDIS_URL: "redis://{{ .Release.Name }}-redis-vector-db:6379"
   {{- end }}
-  INDEX_NAME: {{ .Values.INDEX_NAME | quote }}
+  INDEX_NAME: {{ .Values.indexName | quote }}
   KEY_INDEX_NAME: {{ .Values.KEY_INDEX_NAME | quote }}
   SEARCH_BATCH_SIZE: {{ .Values.SEARCH_BATCH_SIZE | quote }}
   {{- if .Values.MILVUS_HOST }}
@@ -47,7 +55,7 @@ data:
   http_proxy: {{ .Values.global.http_proxy | quote }}
   https_proxy: {{ .Values.global.https_proxy | quote }}
   {{- if and (not .Values.REDIS_URL) (and (not .Values.TEI_EMBEDDING_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy)) }}
-  no_proxy: "{{ .Release.Name }}-tei,{{ .Release.Name }}-redis-vector-db,{{ .Values.global.no_proxy }}"
+  no_proxy: "{{ .Release.Name }}-tei,{{ .Release.Name }}-redis-vector-db,{{ .Release.Name }}-vdms-vector-db,{{ .Values.global.no_proxy }}"
   {{- else }}
   no_proxy: {{ .Values.global.no_proxy | quote }}
   {{- end }}

@@ -41,11 +41,28 @@ spec:
             {{- toYaml .Values.securityContext | nindent 12 }}
           image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
           imagePullPolicy: {{ .Values.image.pullPolicy }}
+          {{- if .Values.entryCommand }}
+          command: {{ toJson .Values.entryCommand }}  # optional entrypoint override; toJson keeps each list element a separate quoted string
+          {{- end }}
+          {{- if .Values.extraArgs }}
+          args:  # optional container arguments, one quoted item per extraArgs entry
+            {{- range .Values.extraArgs }}
+            - {{ . | quote }}
+            {{- end }}
+          {{- end }}
           ports:
             - name: data-prep
-              containerPort: {{ .Values.port }}
+              containerPort: {{ .Values.service.containerPort }}
               protocol: TCP
           volumeMounts:
+            {{- if .Values.global.cacheUseHostPath }}
+            - mountPath: /home/user/.cache/clip
+              name: cache-volume
+              subPath: clip
+            - mountPath: /home/user/.cache/huggingface/hub
+              name: cache-volume
+              subPath: huggingface/hub
+            {{- end }}
             - mountPath: /tmp
               name: tmp
           {{- if .Values.livenessProbe }}
@@ -63,6 +80,12 @@ spec:
           resources:
             {{- toYaml .Values.resources | nindent 12 }}
       volumes:
+        {{- if .Values.global.cacheUseHostPath }}
+        - name: cache-volume
+          hostPath:
+            path: {{ .Values.global.cacheUseHostPath }}
+            type: Directory
+        {{- end }}
         - name: tmp
           emptyDir: {}
       {{- with .Values.nodeSelector }}

@@ -11,7 +11,7 @@ spec:
   type: {{ .Values.service.type }}
   ports:
     - port: {{ .Values.service.port }}
-      targetPort: {{ .Values.port }}
+      targetPort: {{ .Values.service.containerPort }}
       protocol: TCP
       name: data-prep
   selector:

@@ -15,6 +15,21 @@ spec:
     - name: curl
       image: python:3.10.14
       command: ['bash', '-c']
+      {{- if contains "dataprep-multimodal-vdms" .Values.image.repository }}
+      args:
+        - |
+          https_proxy={{ .Values.global.https_proxy }} curl -svLO "https://github.com/opea-project/GenAIExamples/raw/refs/heads/main/VideoQnA/docker_compose/intel/cpu/xeon/data/op_1_0320241830.mp4"
+          max_retry=5
+          for ((i=1; i<=max_retry; i++)); do
+            curl http://{{ include "data-prep.fullname" . }}:{{ .Values.service.port }}/v1/dataprep -sS --fail-with-body \
+            -X POST \
+            -H "Content-Type: multipart/form-data" \
+            -F "files=@./op_1_0320241830.mp4" && break;
+            curlcode=$?
+            if [[ $curlcode -eq 7 ]]; then sleep 10; else echo "curl failed with code $curlcode"; exit 1; fi;
+          done;
+          if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi
+      {{- else }}
       args:
         - |
           echo "test file" > /tmp/file1.txt;
@@ -32,4 +47,5 @@ spec:
           -H "Content-Type: application/json" \
           -d '{"file_path": "file1.txt"}';
           if [ $i -gt $max_retry ]; then echo "test failed with maximum retry"; exit 1; fi
+      {{- end }}
   restartPolicy: Never
@@ -11,6 +11,8 @@ milvus:
   enabled: false
 redis-vector-db:
   enabled: false
+vdms-vector-db:
+  enabled: false
 
 replicaCount: 1
 
@@ -44,6 +46,7 @@ port: 6007
 service:
   type: ClusterIP
   port: 6007
+  containerPort: 6007
 
 resources: {}
   # We usually recommend not to specify default resources and to leave this as a conscious
@@ -96,7 +99,7 @@ LOCAL_EMBEDDING_MODEL: ""
 
 # redis DB service URL, e.g. redis://<service-name>:<port>
 REDIS_URL: ""
-INDEX_NAME: "rag-redis"
+indexName: "rag-redis"
 KEY_INDEX_NAME: "file-keys"
 SEARCH_BATCH_SIZE: 10
 

@@ -0,0 +1,18 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+image:
+  repository: opea/dataprep-multimodal-vdms
+  pullPolicy: IfNotPresent
+  # Overrides the image tag whose default is the chart appVersion.
+  tag: "latest"
+
+indexName: "mega-videoqna"  # rendered into the ConfigMap as INDEX_NAME
+vdmsHost: ""  # empty: the ConfigMap falls back to "<release name>-vdms-vector-db"
+vdmsPort: "8001"  # rendered into the ConfigMap as VDMS_PORT
+entryCommand: ["/bin/sh"]  # becomes the container `command` in the Deployment
+extraArgs: ["-c", "sleep 15 && python ingest_videos.py"]  # becomes the container `args`; NOTE(review): the 15 s sleep presumably waits for VDMS to start - confirm
+
+# Set cacheUseHostPath to a host directory for caching encoding/embedding models and other related data
+global:
+  cacheUseHostPath: ""  # empty: the Deployment mounts no cache volume