From 277367549dcc0d884221bb5eebe4752e8bd3a38d Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 09:50:11 +0800 Subject: [PATCH 01/11] add spark 3.4.0 --- .github/workflows/ci.yml | 24 ++++++++++++++++++++++++ templates/vars.yml | 2 +- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 92141b5..49b818f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,6 +111,30 @@ jobs: scala: "2.13" with_hive: "true" with_pyspark: "true" + - spark: "3.4.0" + java: "8" + hadoop: "3.3.2" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.0" + java: "8" + hadoop: "3.3.2" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.0" + java: "11" + hadoop: "3.3.2" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.0" + java: "11" + hadoop: "3.3.2" + scala: "2.13" + with_hive: "true" + with_pyspark: "true" runs-on: ubuntu-20.04 env: IMAGE_NAME: "spark-k8s" diff --git a/templates/vars.yml b/templates/vars.yml index 1c8e674..14ce7da 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -11,7 +11,7 @@ versions: hadoop: ['3.3.1'] scala: ['2.12', '2.13'] -- spark: ['3.3.0', '3.3.1'] +- spark: ['3.3.0', '3.3.1', '3.4.0'] java: ['8', '11'] hadoop: ['3.3.2'] scala: ['2.12', '2.13'] From fffcd9ebc6d75ef2b758db9da3d590dd48189786 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 10:28:47 +0800 Subject: [PATCH 02/11] upgrade required hadoop and scala --- .github/workflows/ci.yml | 20 ++++---------------- templates/vars.yml | 7 ++++++- 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 49b818f..89eca0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -113,26 +113,14 @@ jobs: with_pyspark: "true" - spark: "3.4.0" java: "8" - hadoop: "3.3.2" - scala: "2.12" - with_hive: "true" - with_pyspark: "true" - - spark: "3.4.0" - java: "8" - hadoop: "3.3.2" - scala: "2.13" + hadoop: "3.3.4" + scala: "2.14" with_hive: "true" with_pyspark: "true" - spark: "3.4.0" java: "11" - hadoop: "3.3.2" - scala: "2.12" - with_hive: "true" - with_pyspark: "true" - - spark: "3.4.0" - java: "11" - hadoop: "3.3.2" - scala: "2.13" + hadoop: "3.3.4" + scala: "2.14" with_hive: "true" with_pyspark: "true" runs-on: ubuntu-20.04 diff --git a/templates/vars.yml b/templates/vars.yml index 14ce7da..7d7e36a 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -11,7 +11,12 @@ versions: hadoop: ['3.3.1'] scala: ['2.12', '2.13'] -- spark: ['3.3.0', '3.3.1', '3.4.0'] +- spark: ['3.3.0', '3.3.1'] java: ['8', '11'] hadoop: ['3.3.2'] scala: ['2.12', '2.13'] + +- spark: ['3.4.0'] + java: ['8', '11'] + hadoop: ['3.3.4'] + scala: ['2.14'] From 20fac38a644e720e9c0d96e9d3359c3468ffd8e5 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 10:30:51 +0800 Subject: [PATCH 03/11] change scala version --- templates/vars.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/vars.yml b/templates/vars.yml index 7d7e36a..192023c 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -19,4 +19,4 @@ versions: - spark: ['3.4.0'] java: ['8', '11'] hadoop: ['3.3.4'] - scala: ['2.14'] + scala: ['2.12', '2.13'] \ No newline at end of file From 99bccee08c5ac0040f284d8ed448649f98da1860 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 10:32:32 +0800 Subject: [PATCH 04/11] update ci yml --- .github/workflows/ci.yml | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89eca0c..4c81d0c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -114,13 +114,25 @@ jobs: - spark: "3.4.0" java: "8" hadoop: "3.3.4" - scala: "2.14" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.0" + java: "8" + hadoop: "3.3.4" + scala: "2.13" with_hive: "true" with_pyspark: "true" - spark: "3.4.0" java: "11" hadoop: "3.3.4" - scala: "2.14" + scala: "2.12" + with_hive: "true" + with_pyspark: "true" + - spark: "3.4.0" + java: "11" + hadoop: "3.3.4" + scala: "2.13" with_hive: "true" with_pyspark: "true" runs-on: ubuntu-20.04 From 013039bcc696efff746e6a3c304029702139c909 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 13:24:50 +0800 Subject: [PATCH 05/11] add image variant condition in make-distr --- make-distribution.sh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/make-distribution.sh b/make-distribution.sh index d7cd5c6..2e8df19 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -73,17 +73,17 @@ fi SPARK_LABEL="${SPARK_VERSION}" TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}" -# ./bin/docker-image-tool.sh \ -# -b java_image_tag=${JAVA_VERSION}-jre-slim-buster \ -# -r "${IMAGE_NAME}" \ -# -t "${TAG_NAME}" \ -# -f "${DOCKERFILE_BASE}" \ -# -p "${DOCKERFILE_PY}" \ -# -R "${DOCKERFILE_R}" \ -# build +if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # >=3.4 + # From Spark v3.4.0 onwards, openjdk is not the prefered base image source as it i + # deprecated and taken over by eclipse-temurin. slim-buster variants are not available + # on eclipse-temurin at the moment. + IMAGE_VARIANT="jre-slim-buster" +else + IMAGE_VARIANT="jre" +fi ./bin/docker-image-tool.sh \ - -b java_image_tag=${JAVA_VERSION}-jre-slim-buster \ + -b java_image_tag=${JAVA_VERSION}-${IMAGE_VARIANT} \ -r "${IMAGE_NAME}" \ -t "${TAG_NAME}" \ -f "${DOCKERFILE_BASE}" \ From b5516228ad91bfbaf446f3fee5b4ff65719cb34f Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 14:04:42 +0800 Subject: [PATCH 06/11] fix image variant condition --- make-distribution.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/make-distribution.sh b/make-distribution.sh index 2e8df19..68336ab 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -77,9 +77,9 @@ if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # > # From Spark v3.4.0 onwards, openjdk is not the prefered base image source as it i # deprecated and taken over by eclipse-temurin. slim-buster variants are not available # on eclipse-temurin at the moment. - IMAGE_VARIANT="jre-slim-buster" -else IMAGE_VARIANT="jre" +else + IMAGE_VARIANT="jre-slim-buster" fi ./bin/docker-image-tool.sh \ From 562d56e91892971818c5196f772b77013629e228 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 15:16:14 +0800 Subject: [PATCH 07/11] changelogs and readme --- CHANGELOG.md | 3 ++- README.md | 16 +++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 235ef00..64b3e9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,9 @@ ## v3 - (Temporarily drop support for R due to keyserver issues) -- Only supports for for 3.1.3, 3.2.2, 3.3.0 (dropped 2.4.8). +- Only supports for for 3.1.3, 3.2.2, 3.3.0, 3.4.0 (dropped 2.4.8). - Supports both Java 8 and 11 for Spark 3 builds. +- Add Ubuntu-based image since the migration to eclipse-temurin for jre image source. ## v2 diff --git a/README.md b/README.md index de0891f..31d760d 100644 --- a/README.md +++ b/README.md @@ -12,16 +12,22 @@ Debian: - `3.3.0` - `3.2.2` - `3.1.3` +- `3.4.0` ## Note (R builds are temporarily suspended due to keyserver issues at current time.) -All the build images here are Debian based as the official Spark repository now -uses `openjdk:-jdk-slim-buster` as the base image for Kubernetes build. -Because currently the official Dockerfiles do not pin the Debian distribution, -they are incorrectly using the latest Debian `bullseye`, which does not have -support for Python 2, and its Python 3.9 do not work well with PySpark. +Build image for Spark 3.4.0 is Ubuntu based because openjdk is deprecated and +going forward the official Spark repository uses `eclipse-temurin:-jre` +where slim variants of jre images are not available at the moment. + +All the build images with Spark before v3.4.0 are Debian based as the official +Spark repository now uses `openjdk:-jre-slim-buster` as the base image +for Kubernetes build. Because currently the official Dockerfiles do not pin +the Debian distribution, they are incorrectly using the latest Debian `bullseye`, +which does not have support for Python 2, and its Python 3.9 do not work well +with PySpark. Hence some Dockerfile overrides are in-place to make sure that Spark 2 builds can still work. From ffe6529ec8115219be22d31c6b6fcfafbc5956d4 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 15:16:25 +0800 Subject: [PATCH 08/11] reorder and add back commented out R --- make-distribution.sh | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/make-distribution.sh b/make-distribution.sh index 68336ab..1f4e2e2 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -67,12 +67,6 @@ else DOCKERFILE_PY="./resource-managers/kubernetes/docker/src/main/dockerfiles/spark/bindings/python/Dockerfile" fi -# Temporarily remove R build due to keyserver issue -# DOCKERFILE_R="./resource-managers/kubernetes/docker/src/main/dockerfiles/R/Dockerfile" - -SPARK_LABEL="${SPARK_VERSION}" -TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}" - if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # >=3.4 # From Spark v3.4.0 onwards, openjdk is not the prefered base image source as it i # deprecated and taken over by eclipse-temurin. slim-buster variants are not available @@ -82,6 +76,21 @@ else IMAGE_VARIANT="jre-slim-buster" fi +# Temporarily remove R build due to keyserver issue +# DOCKERFILE_R="./resource-managers/kubernetes/docker/src/main/dockerfiles/R/Dockerfile" + +SPARK_LABEL="${SPARK_VERSION}" +TAG_NAME="${SELF_VERSION}_${SPARK_LABEL}_hadoop-${HADOOP_VERSION}_scala-${SCALA_VERSION}_java-${JAVA_VERSION}" + +# ./bin/docker-image-tool.sh \ +# -b java_image_tag=${JAVA_VERSION}-jre-slim-buster \ +# -r "${IMAGE_NAME}" \ +# -t "${TAG_NAME}" \ +# -f "${DOCKERFILE_BASE}" \ +# -p "${DOCKERFILE_PY}" \ +# -R "${DOCKERFILE_R}" \ +# build + ./bin/docker-image-tool.sh \ -b java_image_tag=${JAVA_VERSION}-${IMAGE_VARIANT} \ -r "${IMAGE_NAME}" \ From f8849c786dabbe5e9b243c3b10725cc7a1951568 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 15:24:36 +0800 Subject: [PATCH 09/11] update to 3.4.1 --- .github/workflows/ci.yml | 8 ++++---- CHANGELOG.md | 2 +- README.md | 4 ++-- templates/vars.yml | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c81d0c..02dfb3d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,25 +111,25 @@ jobs: scala: "2.13" with_hive: "true" with_pyspark: "true" - - spark: "3.4.0" + - spark: "3.4.1" java: "8" hadoop: "3.3.4" scala: "2.12" with_hive: "true" with_pyspark: "true" - - spark: "3.4.0" + - spark: "3.4.1" java: "8" hadoop: "3.3.4" scala: "2.13" with_hive: "true" with_pyspark: "true" - - spark: "3.4.0" + - spark: "3.4.1" java: "11" hadoop: "3.3.4" scala: "2.12" with_hive: "true" with_pyspark: "true" - - spark: "3.4.0" + - spark: "3.4.1" java: "11" hadoop: "3.3.4" scala: "2.13" diff --git a/CHANGELOG.md b/CHANGELOG.md index 64b3e9b..2ff7de0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## v3 - (Temporarily drop support for R due to keyserver issues) -- Only supports for for 3.1.3, 3.2.2, 3.3.0, 3.4.0 (dropped 2.4.8). +- Only supports for for 3.1.3, 3.2.2, 3.3.0, 3.4.1 (dropped 2.4.8). - Supports both Java 8 and 11 for Spark 3 builds. - Add Ubuntu-based image since the migration to eclipse-temurin for jre image source. diff --git a/README.md b/README.md index 31d760d..e3c9202 100644 --- a/README.md +++ b/README.md @@ -12,13 +12,13 @@ Debian: - `3.3.0` - `3.2.2` - `3.1.3` -- `3.4.0` +- `3.4.1` ## Note (R builds are temporarily suspended due to keyserver issues at current time.) -Build image for Spark 3.4.0 is Ubuntu based because openjdk is deprecated and +Build image for Spark 3.4.1 is Ubuntu based because openjdk is deprecated and going forward the official Spark repository uses `eclipse-temurin:-jre` where slim variants of jre images are not available at the moment. diff --git a/templates/vars.yml b/templates/vars.yml index 192023c..32720f7 100644 --- a/templates/vars.yml +++ b/templates/vars.yml @@ -16,7 +16,7 @@ versions: hadoop: ['3.3.2'] scala: ['2.12', '2.13'] -- spark: ['3.4.0'] +- spark: ['3.4.1'] java: ['8', '11'] hadoop: ['3.3.4'] scala: ['2.12', '2.13'] \ No newline at end of file From 32607985a2ee6866b2759d042071680c1fd713e2 Mon Sep 17 00:00:00 2001 From: Tingweiftw Date: Tue, 5 Sep 2023 15:50:33 +0800 Subject: [PATCH 10/11] use alpine instead --- README.md | 3 ++- make-distribution.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e3c9202..e89280a 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,8 @@ Debian: Build image for Spark 3.4.1 is Ubuntu based because openjdk is deprecated and going forward the official Spark repository uses `eclipse-temurin:-jre` -where slim variants of jre images are not available at the moment. +where slim variants of jre images are not available at the moment. As such +the alpine version will be used instead. All the build images with Spark before v3.4.0 are Debian based as the official Spark repository now uses `openjdk:-jre-slim-buster` as the base image diff --git a/make-distribution.sh b/make-distribution.sh index 1f4e2e2..e066fc0 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -71,7 +71,7 @@ if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # > # From Spark v3.4.0 onwards, openjdk is not the prefered base image source as it i # deprecated and taken over by eclipse-temurin. slim-buster variants are not available # on eclipse-temurin at the moment. - IMAGE_VARIANT="jre" + IMAGE_VARIANT="jre-alpine" else IMAGE_VARIANT="jre-slim-buster" fi From 963844f9db84c1996b4030d5ec49004d2da4d37b Mon Sep 17 00:00:00 2001 From: tingwei Date: Tue, 5 Sep 2023 16:31:49 +0800 Subject: [PATCH 11/11] Revert "use alpine instead" This reverts commit 32607985a2ee6866b2759d042071680c1fd713e2. --- README.md | 3 +-- make-distribution.sh | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e89280a..e3c9202 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,7 @@ Debian: Build image for Spark 3.4.1 is Ubuntu based because openjdk is deprecated and going forward the official Spark repository uses `eclipse-temurin:-jre` -where slim variants of jre images are not available at the moment. As such -the alpine version will be used instead. +where slim variants of jre images are not available at the moment. All the build images with Spark before v3.4.0 are Debian based as the official Spark repository now uses `openjdk:-jre-slim-buster` as the base image diff --git a/make-distribution.sh b/make-distribution.sh index e066fc0..1f4e2e2 100755 --- a/make-distribution.sh +++ b/make-distribution.sh @@ -71,7 +71,7 @@ if [[ ${SPARK_MAJOR_VERSION} -eq 3 && ${SPARK_MINOR_VERSION} -ge 4 ]]; then # > # From Spark v3.4.0 onwards, openjdk is not the prefered base image source as it i # deprecated and taken over by eclipse-temurin. slim-buster variants are not available # on eclipse-temurin at the moment. - IMAGE_VARIANT="jre-alpine" + IMAGE_VARIANT="jre" else IMAGE_VARIANT="jre-slim-buster" fi