From f54ebaf8f34e97a04c49ed38421d3ede0e416ba5 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Tue, 30 Sep 2025 16:03:42 +0100 Subject: [PATCH 01/20] Update cli_steps.py Signed-off-by: Sajid Alam --- kedro-docker/features/steps/cli_steps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 9b3f86503..1b260049c 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -1,4 +1,4 @@ -"""Behave step definitions for the cli_scenarios feature.""" +"""Behave step definitions for the cli_scenarios feature. test""" import re import sys From 109e7bc2d7471e3a4c09cd4c6defae0c17418b9c Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 2 Oct 2025 15:10:59 +0100 Subject: [PATCH 02/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- .../kedro_docker/template/Dockerfile.spark | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index f0911e8c8..75f5c1b75 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -1,9 +1,23 @@ ARG BASE_IMAGE=python:3.9-slim FROM $BASE_IMAGE as runtime-environment -# install JVM +# Install JDK and required libraries for Arrow RUN apt-get update && mkdir -p /usr/share/man/man1 && \ - apt-get install -y procps default-jre-headless && rm -rf /var/lib/apt/lists/* + apt-get install -y \ + procps \ + openjdk-17-jdk-headless \ + libnss3 \ + && rm -rf /var/lib/apt/lists/* + +# Set JVM options to allow Unsafe operations +ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ + --add-opens=java.base/java.nio=ALL-UNNAMED \ + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ + --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED" + +# Set Spark configuration to handle Arrow properly +ENV SPARK_SUBMIT_OPTS="-Dio.netty.tryReflectionSetAccessible=true" +ENV ARROW_PRE_0_15_IPC_FORMAT=1 # update pip and install uv RUN python -m pip install -U "pip>=21.2" From 74e9844b35adde0663a0a4a231c096039bb3f1d2 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 2 Oct 2025 15:25:14 +0100 Subject: [PATCH 03/20] attempt fix Signed-off-by: Sajid Alam --- kedro-docker/features/steps/cli_steps.py | 2 +- kedro-docker/kedro_docker/template/Dockerfile.spark | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 1b260049c..9b3f86503 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -1,4 +1,4 @@ -"""Behave step definitions for the cli_scenarios feature. test""" +"""Behave step definitions for the cli_scenarios feature.""" import re import sys diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 75f5c1b75..bc0c6c822 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -1,13 +1,10 @@ ARG BASE_IMAGE=python:3.9-slim FROM $BASE_IMAGE as runtime-environment -# Install JDK and required libraries for Arrow +# Install JVM RUN apt-get update && mkdir -p /usr/share/man/man1 && \ - apt-get install -y \ - procps \ - openjdk-17-jdk-headless \ - libnss3 \ - && rm -rf /var/lib/apt/lists/* + apt-get install -y procps default-jre-headless && \ + rm -rf /var/lib/apt/lists/* # Set JVM options to allow Unsafe operations ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ From 271114489e63c83ab43239ebb702769777b184c8 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 2 Oct 2025 15:41:09 +0100 Subject: [PATCH 04/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index bc0c6c822..533668440 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -1,18 +1,17 @@ ARG BASE_IMAGE=python:3.9-slim FROM $BASE_IMAGE as runtime-environment -# Install JVM +# install JVM RUN apt-get update && mkdir -p /usr/share/man/man1 && \ - apt-get install -y procps default-jre-headless && \ - rm -rf /var/lib/apt/lists/* + apt-get install -y procps default-jre-headless && rm -rf /var/lib/apt/lists/* -# Set JVM options to allow Unsafe operations +# set JVM options to allow Unsafe operations ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ --add-opens=java.base/java.nio=ALL-UNNAMED \ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED" -# Set Spark configuration to handle Arrow properly +# set Spark configuration to handle Arrow properly ENV SPARK_SUBMIT_OPTS="-Dio.netty.tryReflectionSetAccessible=true" ENV ARROW_PRE_0_15_IPC_FORMAT=1 From a185fbaf1d8166d6233d495a9a3ec22baadf5276 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 09:03:22 +0100 Subject: [PATCH 05/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 4 ---- 1 file changed, 4 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 533668440..b4381f844 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -11,10 +11,6 @@ ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED" -# set Spark configuration to handle Arrow properly -ENV SPARK_SUBMIT_OPTS="-Dio.netty.tryReflectionSetAccessible=true" -ENV ARROW_PRE_0_15_IPC_FORMAT=1 - # update pip and install uv RUN python -m pip install -U "pip>=21.2" RUN pip install uv From 472779b3283845f399b42904bef2e87d75971882 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 09:28:45 +0100 Subject: [PATCH 06/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index b4381f844..345b988a6 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -11,6 +11,9 @@ ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED" +# set Spark configuration to handle Arrow properly +ENV SPARK_SUBMIT_OPTS="-Dio.netty.tryReflectionSetAccessible=true" + # update pip and install uv RUN python -m pip install -U "pip>=21.2" RUN pip install uv From 55a9a565b38221147d58e9c386b3f80e52e5c59d Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 10:18:59 +0100 Subject: [PATCH 07/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 1 + 1 file changed, 1 insertion(+) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 345b988a6..533668440 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -13,6 +13,7 @@ ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ # set Spark configuration to handle Arrow properly ENV SPARK_SUBMIT_OPTS="-Dio.netty.tryReflectionSetAccessible=true" +ENV ARROW_PRE_0_15_IPC_FORMAT=1 # update pip and install uv RUN python -m pip install -U "pip>=21.2" From cbac114d6acc5607018cbd52354b3a4de64a9d0a Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 11:19:45 +0100 Subject: [PATCH 08/20] check pyspark version debug Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 533668440..ff6eb88f4 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -23,6 +23,9 @@ RUN pip install uv COPY requirements.txt /tmp/requirements.txt RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt +# Debug: Check PySpark version +RUN python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" + # add kedro user ARG KEDRO_UID=999 ARG KEDRO_GID=0 From 20255f7c8c28eb16174a27d0ff7ff586c04339c5 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 11:39:56 +0100 Subject: [PATCH 09/20] debug Signed-off-by: Sajid Alam --- kedro-docker/features/steps/cli_steps.py | 5 +++++ kedro-docker/kedro_docker/template/Dockerfile.spark | 3 --- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 9b3f86503..1bb24a231 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -186,6 +186,11 @@ def exec_kedro_command(context, command): """Execute Kedro command and check the status.""" make_cmd = [context.kedro] + command.split() + # Debug: Check Spark version if this is a spark-related command + debug_cmd = [context.python, "-c", "import pyspark; print(f'PySpark version: {pyspark.__version__}')"] + res = run(debug_cmd, env=context.env, cwd=str(context.root_project_dir)) + print(f"DEBUG: {res.stdout}") + res = run(make_cmd, env=context.env, cwd=str(context.root_project_dir)) if res.returncode != OK_EXIT_CODE: diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index ff6eb88f4..533668440 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -23,9 +23,6 @@ RUN pip install uv COPY requirements.txt /tmp/requirements.txt RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt -# Debug: Check PySpark version -RUN python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" - # add kedro user ARG KEDRO_UID=999 ARG KEDRO_GID=0 From 27404576a0b5c6fbb5d267dd3e9e498d64a93100 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 12:03:40 +0100 Subject: [PATCH 10/20] more debug Signed-off-by: Sajid Alam --- kedro-docker/features/docker.feature | 14 ++++++++++++++ .../kedro_docker/template/Dockerfile.spark | 3 +++ 2 files changed, 17 insertions(+) diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature index 2d54de7f3..629f58901 100644 --- a/kedro-docker/features/docker.feature +++ b/kedro-docker/features/docker.feature @@ -120,3 +120,17 @@ Feature: Docker commands in new projects When I execute the kedro command "docker dive" Then I should get an error exit code And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally." + +Scenario: Execute docker build and run using spark Dockerfile + Given I have prepared a config file + And I run a non-interactive kedro new using spaceflights-pyspark starter + And I have installed the project dependencies + And I have removed old docker image of test project + When I execute the kedro command "docker build --with-spark" + Then I should get a successful exit code + And A new docker image for test project should be created + # Force output by checking for a specific version string + When I execute the kedro command "docker cmd python -c 'import pyspark; print(pyspark.__version__); assert False, f\"DEBUG: PySpark {pyspark.__version__}\"'" + When I execute the kedro command "docker run" + Then I should get a successful exit code + And I should get a message including "Pipeline execution completed" diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 533668440..ff6eb88f4 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -23,6 +23,9 @@ RUN pip install uv COPY requirements.txt /tmp/requirements.txt RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt +# Debug: Check PySpark version +RUN python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" + # add kedro user ARG KEDRO_UID=999 ARG KEDRO_GID=0 From e7d86ff5f15c9a117ac3f2b784899055fe58caf8 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 12:15:13 +0100 Subject: [PATCH 11/20] attempt 2 Signed-off-by: Sajid Alam --- kedro-docker/features/docker.feature | 14 -------------- kedro-docker/features/docker_with_spark.feature | 5 ++++- kedro-docker/features/steps/cli_steps.py | 5 ----- 3 files changed, 4 insertions(+), 20 deletions(-) diff --git a/kedro-docker/features/docker.feature b/kedro-docker/features/docker.feature index 629f58901..2d54de7f3 100644 --- a/kedro-docker/features/docker.feature +++ b/kedro-docker/features/docker.feature @@ -120,17 +120,3 @@ Feature: Docker commands in new projects When I execute the kedro command "docker dive" Then I should get an error exit code And Standard output should contain a message including "Error: Unable to find image `project-dummy` locally." - -Scenario: Execute docker build and run using spark Dockerfile - Given I have prepared a config file - And I run a non-interactive kedro new using spaceflights-pyspark starter - And I have installed the project dependencies - And I have removed old docker image of test project - When I execute the kedro command "docker build --with-spark" - Then I should get a successful exit code - And A new docker image for test project should be created - # Force output by checking for a specific version string - When I execute the kedro command "docker cmd python -c 'import pyspark; print(pyspark.__version__); assert False, f\"DEBUG: PySpark {pyspark.__version__}\"'" - When I execute the kedro command "docker run" - Then I should get a successful exit code - And I should get a message including "Pipeline execution completed" diff --git a/kedro-docker/features/docker_with_spark.feature b/kedro-docker/features/docker_with_spark.feature index 972b758a3..a38b16710 100644 --- a/kedro-docker/features/docker_with_spark.feature +++ b/kedro-docker/features/docker_with_spark.feature @@ -18,6 +18,9 @@ Feature: Docker commands in new Spark projects When I execute the kedro command "docker build --with-spark" Then I should get a successful exit code And A new docker image for test project should be created + # DEBUG: Check PySpark version + When I execute the kedro command "docker cmd python -c 'import pyspark; print(\"PySpark version:\", pyspark.__version__)'" + Then I should get a message including "PySpark version:" When I execute the kedro command "docker run" Then I should get a successful exit code - And I should get a message including "Pipeline execution completed" + And I should get a message including "Pipeline execution completed" \ No newline at end of file diff --git a/kedro-docker/features/steps/cli_steps.py b/kedro-docker/features/steps/cli_steps.py index 1bb24a231..9b3f86503 100644 --- a/kedro-docker/features/steps/cli_steps.py +++ b/kedro-docker/features/steps/cli_steps.py @@ -186,11 +186,6 @@ def exec_kedro_command(context, command): """Execute Kedro command and check the status.""" make_cmd = [context.kedro] + command.split() - # Debug: Check Spark version if this is a spark-related command - debug_cmd = [context.python, "-c", "import pyspark; print(f'PySpark version: {pyspark.__version__}')"] - res = run(debug_cmd, env=context.env, cwd=str(context.root_project_dir)) - print(f"DEBUG: {res.stdout}") - res = run(make_cmd, env=context.env, cwd=str(context.root_project_dir)) if res.returncode != OK_EXIT_CODE: From beef9fba8a86eb8f6cccba0f92f3004994057573 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 12:48:39 +0100 Subject: [PATCH 12/20] try java 11 Signed-off-by: Sajid Alam --- .../features/docker_with_spark.feature | 5 +---- .../kedro_docker/template/Dockerfile.spark | 18 +++++++++--------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/kedro-docker/features/docker_with_spark.feature b/kedro-docker/features/docker_with_spark.feature index a38b16710..972b758a3 100644 --- a/kedro-docker/features/docker_with_spark.feature +++ b/kedro-docker/features/docker_with_spark.feature @@ -18,9 +18,6 @@ Feature: Docker commands in new Spark projects When I execute the kedro command "docker build --with-spark" Then I should get a successful exit code And A new docker image for test project should be created - # DEBUG: Check PySpark version - When I execute the kedro command "docker cmd python -c 'import pyspark; print(\"PySpark version:\", pyspark.__version__)'" - Then I should get a message including "PySpark version:" When I execute the kedro command "docker run" Then I should get a successful exit code - And I should get a message including "Pipeline execution completed" \ No newline at end of file + And I should get a message including "Pipeline execution completed" diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index ff6eb88f4..9595a717a 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -3,17 +3,17 @@ FROM $BASE_IMAGE as runtime-environment # install JVM RUN apt-get update && mkdir -p /usr/share/man/man1 && \ - apt-get install -y procps default-jre-headless && rm -rf /var/lib/apt/lists/* + apt-get install -y procps openjdk-11-jre-headless && rm -rf /var/lib/apt/lists/* -# set JVM options to allow Unsafe operations -ENV JAVA_OPTS="-Dio.netty.tryReflectionSetAccessible=true \ - --add-opens=java.base/java.nio=ALL-UNNAMED \ - --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ - --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED" +# set JAVA_HOME +ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 -# set Spark configuration to handle Arrow properly -ENV SPARK_SUBMIT_OPTS="-Dio.netty.tryReflectionSetAccessible=true" -ENV ARROW_PRE_0_15_IPC_FORMAT=1 +# configure JVM for Arrow +ENV JAVA_TOOL_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED \ + --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ + --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \ + --add-opens=java.base/sun.misc=ALL-UNNAMED \ + -Dio.netty.tryReflectionSetAccessible=true" # update pip and install uv RUN python -m pip install -U "pip>=21.2" From 240b05705f51c7c3b2d0cbb86957e7c0df0625b5 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 12:58:42 +0100 Subject: [PATCH 13/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- .../kedro_docker/template/Dockerfile.spark | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 9595a717a..5f41aeaa4 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -3,16 +3,23 @@ FROM $BASE_IMAGE as runtime-environment # install JVM RUN apt-get update && mkdir -p /usr/share/man/man1 && \ - apt-get install -y procps openjdk-11-jre-headless && rm -rf /var/lib/apt/lists/* - -# set JAVA_HOME -ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 + apt-get install -y procps default-jre-headless && rm -rf /var/lib/apt/lists/* # configure JVM for Arrow -ENV JAVA_TOOL_OPTIONS="--add-opens=java.base/java.nio=ALL-UNNAMED \ +ENV JAVA_TOOL_OPTIONS="--add-opens=java.base/java.lang=ALL-UNNAMED \ + --add-opens=java.base/java.lang.invoke=ALL-UNNAMED \ + --add-opens=java.base/java.lang.reflect=ALL-UNNAMED \ + --add-opens=java.base/java.io=ALL-UNNAMED \ + --add-opens=java.base/java.net=ALL-UNNAMED \ + --add-opens=java.base/java.nio=ALL-UNNAMED \ + --add-opens=java.base/java.util=ALL-UNNAMED \ + --add-opens=java.base/java.util.concurrent=ALL-UNNAMED \ + --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED \ --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ + --add-opens=java.base/sun.nio.cs=ALL-UNNAMED \ + --add-opens=java.base/sun.security.action=ALL-UNNAMED \ + --add-opens=java.base/sun.util.calendar=ALL-UNNAMED \ --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \ - --add-opens=java.base/sun.misc=ALL-UNNAMED \ -Dio.netty.tryReflectionSetAccessible=true" # update pip and install uv From 6c0922a718c3e07e34ba702b128f98478202293b Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Fri, 3 Oct 2025 14:41:53 +0100 Subject: [PATCH 14/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- .../kedro_docker/template/Dockerfile.spark | 21 +++---------------- 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 5f41aeaa4..48a36fa17 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -5,30 +5,15 @@ FROM $BASE_IMAGE as runtime-environment RUN apt-get update && mkdir -p /usr/share/man/man1 && \ apt-get install -y procps default-jre-headless && rm -rf /var/lib/apt/lists/* -# configure JVM for Arrow -ENV JAVA_TOOL_OPTIONS="--add-opens=java.base/java.lang=ALL-UNNAMED \ - --add-opens=java.base/java.lang.invoke=ALL-UNNAMED \ - --add-opens=java.base/java.lang.reflect=ALL-UNNAMED \ - --add-opens=java.base/java.io=ALL-UNNAMED \ - --add-opens=java.base/java.net=ALL-UNNAMED \ - --add-opens=java.base/java.nio=ALL-UNNAMED \ - --add-opens=java.base/java.util=ALL-UNNAMED \ - --add-opens=java.base/java.util.concurrent=ALL-UNNAMED \ - --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED \ - --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \ - --add-opens=java.base/sun.nio.cs=ALL-UNNAMED \ - --add-opens=java.base/sun.security.action=ALL-UNNAMED \ - --add-opens=java.base/sun.util.calendar=ALL-UNNAMED \ - --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \ - -Dio.netty.tryReflectionSetAccessible=true" - # update pip and install uv RUN python -m pip install -U "pip>=21.2" RUN pip install uv # install project requirements COPY requirements.txt /tmp/requirements.txt -RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt +RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && \ + uv pip install --system --no-cache-dir "pyarrow>=6.0,<12.0" && \ + rm -f /tmp/requirements.txt # Debug: Check PySpark version RUN python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" From 555d973515b34a89ab16432d52266df212e18546 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 6 Oct 2025 10:27:30 +0100 Subject: [PATCH 15/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 48a36fa17..16aa28f2e 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -11,12 +11,10 @@ RUN pip install uv # install project requirements COPY requirements.txt /tmp/requirements.txt -RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && \ - uv pip install --system --no-cache-dir "pyarrow>=6.0,<12.0" && \ - rm -f /tmp/requirements.txt +RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt -# Debug: Check PySpark version -RUN python -c "import pyspark; print(f'PySpark version: {pyspark.__version__}')" +RUN python -c "import kedro, sys; print(sys.version); print('kedro', kedro.__version__)" +RUN python -c "import pyspark, pyarrow; print('pyspark', pyspark.__version__); print('pyarrow', pyarrow.__version__)" # add kedro user ARG KEDRO_UID=999 From 389320e90522e6bc277ceda98f698786e27ccdca Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 6 Oct 2025 10:32:16 +0100 Subject: [PATCH 16/20] Update e2e-tests.yml Signed-off-by: Sajid Alam --- .github/workflows/e2e-tests.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 8a36e5db6..086350797 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -40,7 +40,9 @@ jobs: - name: pip freeze run: uv pip freeze --system - name: Run end to end tests - # Custom shell to run kedro-docker e2e-tests because -it flag for `docker run` - # isn't supported on Github Actions. See https://github.com/actions/runner/issues/241 + env: + DOCKER_BUILDKIT: "1" + BUILDKIT_PROGRESS: plain shell: 'script -q -e -c "bash {0}"' run: make plugin=${{ inputs.plugin }} e2e-tests + From 0dbb3ae24b13f3050bb546be87ecdf4469661410 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 6 Oct 2025 10:55:04 +0100 Subject: [PATCH 17/20] Update kedro-docker.yml Signed-off-by: Sajid Alam --- .github/workflows/kedro-docker.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 16ffcbafe..2345f640a 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -40,7 +40,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] + python-version: [ "3.9", "3.10", "3.11", "3.12" ] uses: ./.github/workflows/e2e-tests.yml with: plugin: kedro-docker From 17770bbd57da8674f3478e18abc248f1050a41f2 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 6 Oct 2025 11:10:10 +0100 Subject: [PATCH 18/20] use specific versions Signed-off-by: Sajid Alam --- .github/workflows/kedro-docker.yml | 2 +- kedro-docker/kedro_docker/template/Dockerfile.spark | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/kedro-docker.yml b/.github/workflows/kedro-docker.yml index 2345f640a..16ffcbafe 100644 --- a/.github/workflows/kedro-docker.yml +++ b/.github/workflows/kedro-docker.yml @@ -40,7 +40,7 @@ jobs: strategy: matrix: os: [ ubuntu-latest ] - python-version: [ "3.9", "3.10", "3.11", "3.12" ] + python-version: [ "3.9", "3.10", "3.11", "3.12", "3.13" ] uses: ./.github/workflows/e2e-tests.yml with: plugin: kedro-docker diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 16aa28f2e..95290753d 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -13,8 +13,10 @@ RUN pip install uv COPY requirements.txt /tmp/requirements.txt RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt -RUN python -c "import kedro, sys; print(sys.version); print('kedro', kedro.__version__)" -RUN python -c "import pyspark, pyarrow; print('pyspark', pyspark.__version__); print('pyarrow', pyarrow.__version__)" +ARG PYSPARK_VERSION=3.5.1 +ARG PYARROW_VERSION=12.0.1 +RUN uv pip install --system "pyspark==${PYSPARK_VERSION}" "pyarrow==${PYARROW_VERSION}" + # add kedro user ARG KEDRO_UID=999 From 7b764edbe6db26951205bb900cf77a5c9a07bcc8 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 6 Oct 2025 11:25:43 +0100 Subject: [PATCH 19/20] Update Dockerfile.spark Signed-off-by: Sajid Alam --- kedro-docker/kedro_docker/template/Dockerfile.spark | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index 95290753d..d135fa3c5 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -14,10 +14,9 @@ COPY requirements.txt /tmp/requirements.txt RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt ARG PYSPARK_VERSION=3.5.1 -ARG PYARROW_VERSION=12.0.1 +ARG PYARROW_VERSION=14.0.2 RUN uv pip install --system "pyspark==${PYSPARK_VERSION}" "pyarrow==${PYARROW_VERSION}" - # add kedro user ARG KEDRO_UID=999 ARG KEDRO_GID=0 From 4655426b468b6a1aae017f5df60d6c636b0e2ae2 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 6 Oct 2025 11:57:35 +0100 Subject: [PATCH 20/20] revert Signed-off-by: Sajid Alam --- .github/workflows/e2e-tests.yml | 6 ++---- kedro-docker/kedro_docker/template/Dockerfile.spark | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 086350797..8a36e5db6 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -40,9 +40,7 @@ jobs: - name: pip freeze run: uv pip freeze --system - name: Run end to end tests - env: - DOCKER_BUILDKIT: "1" - BUILDKIT_PROGRESS: plain + # Custom shell to run kedro-docker e2e-tests because -it flag for `docker run` + # isn't supported on Github Actions. See https://github.com/actions/runner/issues/241 shell: 'script -q -e -c "bash {0}"' run: make plugin=${{ inputs.plugin }} e2e-tests - diff --git a/kedro-docker/kedro_docker/template/Dockerfile.spark b/kedro-docker/kedro_docker/template/Dockerfile.spark index d135fa3c5..e67266e3b 100644 --- a/kedro-docker/kedro_docker/template/Dockerfile.spark +++ b/kedro-docker/kedro_docker/template/Dockerfile.spark @@ -14,7 +14,7 @@ COPY requirements.txt /tmp/requirements.txt RUN uv pip install --system --no-cache-dir -r /tmp/requirements.txt && rm -f /tmp/requirements.txt ARG PYSPARK_VERSION=3.5.1 -ARG PYARROW_VERSION=14.0.2 +ARG PYARROW_VERSION=16.1.0 RUN uv pip install --system "pyspark==${PYSPARK_VERSION}" "pyarrow==${PYARROW_VERSION}" # add kedro user