From d9519580627ae80d2c59a37f48ea57550169dd26 Mon Sep 17 00:00:00 2001 From: timl Date: Wed, 11 Sep 2024 13:47:17 +0800 Subject: [PATCH 1/2] Install cuDF-py against python 3.10 on Databricks Fix on Databricks runtime for: https://github.com/NVIDIA/spark-rapids/issues/11394 Enable the udf_cudf_test test case for Databricks-13.3. Rapids 24.10+ drops conda packages for python 3.9 and below. ref: https://docs.rapids.ai/notices/rsn0040/ Install cuDF-py packages against python 3.10 and above on Databricks runtime to run UDF cuDF tests, because Conda is not installed by default on DB-13.3. Signed-off-by: timl --- .../src/main/python/udf_cudf_test.py | 5 -- jenkins/databricks/cudf_udf_test.sh | 19 +++--- jenkins/databricks/init_cudf_udf.sh | 60 ++++++++++++------- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/integration_tests/src/main/python/udf_cudf_test.py b/integration_tests/src/main/python/udf_cudf_test.py index 6d94a5da206..59069820d29 100644 --- a/integration_tests/src/main/python/udf_cudf_test.py +++ b/integration_tests/src/main/python/udf_cudf_test.py @@ -41,11 +41,6 @@ from marks import cudf_udf -if is_databricks_runtime() and is_spark_340_or_later(): - # Databricks 13.3 does not use separate reader/writer threads for Python UDFs - # which can lead to hangs. Skipping these tests until the Python UDF handling is updated. - pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493") - _conf = { 'spark.rapids.sql.exec.AggregateInPandasExec': 'true', 'spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec': 'true', diff --git a/jenkins/databricks/cudf_udf_test.sh b/jenkins/databricks/cudf_udf_test.sh index 685df1db482..1153acb68ec 100644 --- a/jenkins/databricks/cudf_udf_test.sh +++ b/jenkins/databricks/cudf_udf_test.sh @@ -16,7 +16,7 @@ # # This script sets the environment to run cudf_udf tests of RAPIDS Accelerator for Apache Spark on DB. 
-# cudf conda packages need to be installed in advance, please refer to +# cudf python packages need to be installed in advance, please refer to # './jenkins/databricks/init_cudf_udf.sh' to install. # All the environments can be overwritten by shell variables: # LOCAL_JAR_PATH: Location of the RAPIDS jars @@ -26,23 +26,20 @@ # - Running tests on Databricks: # `./jenkins/databricks/cudf-udf-test.sh` # To add support of a new runtime: -# 1. Check if any more dependencies need to be added to the apt/conda install commands. +# 1. Check if any more dependencies need to be added to the apt/conda/pip install commands. # 2. If you had to go beyond the above steps to support the new runtime, then update the # instructions accordingly. set -ex -# Try to use "cudf-udf" conda environment for the python cudf-udf tests. -CONDA_HOME=${CONDA_HOME:-"/databricks/conda"} -if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then - echo "Error not found cudf conda packages! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install." +# Try to use "cudf-udf" conda/pip environment for the python cudf-udf tests. +CUDF_PY_ENV=${CUDF_PY_ENV:-$(echo /databricks/*/envs/cudf-udf)} +if [ ! -d "${CUDF_PY_ENV}" ]; then + echo "Error not found cudf-py packages! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install." exit -1 fi -export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH -export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python # Set the path of python site-packages. -# Get Python version (major.minor). 
i.e., python3.8 for DB10.4 and python3.9 for DB11.3 -PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))') -PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages" +PYTHON_SITE_PACKAGES=$(echo -n ${CUDF_PY_ENV}/*/lib/site-packages) +[ -d "${CUDF_PY_ENV}/bin" ] && export PATH=${CUDF_PY_ENV}/bin:$PATH SOURCE_PATH="/home/ubuntu/spark-rapids" [[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index aed4e2a4c25..e875bdce0fd 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -28,30 +28,16 @@ export PATH=/databricks/conda/bin:$PATH # Set Python for the running instance export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"} PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("{}.{}".format(sys.version_info.major, sys.version_info.minor))') -# Rapids 23.06+ drops python 3.8 conda packages. ref: https://docs.rapids.ai/notices/rsn0029/ -if [[ "$(printf '%s\n' "3.9" "${PYTHON_VERSION}" | sort -V | head -n1)" = "3.9" ]]; then +# Rapids 24.10+ drops python 3.9 or below conda packages. ref: https://docs.rapids.ai/notices/rsn0040/ +if [[ "$(printf '%s\n' "3.10" "${PYTHON_VERSION}" | sort -V | head -n1)" == "3.10" ]]; then # To fix "'lsb_release -a' returned non-zero". 
ref: https://github.com/pypa/pip/issues/4924 [[ -n "$(which lsb_release)" ]] && mv $(which lsb_release) $(which lsb_release)"-bak" else - echo "Rapids 23.06+ drops python 3.8 or below versions of conda packages" + echo "Rapids 24.10+ drops python 3.9 or below versions of conda packages" exit -1 fi -base=$(conda info --base) -# Create and activate 'cudf-udf' conda env for cudf-udf tests -sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs -conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \ - source activate && \ - conda activate cudf-udf - -# Use mamba to install cudf-udf packages to speed up conda resolve time -conda install -y -c conda-forge mamba python=$PYTHON_VERSION -# Do not error out "This operation will remove conda without replacing it with another version of conda." for now -${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true - REQUIRED_PACKAGES=( - cuda-version=$CUDA_VER - cudf=$CUDF_VER findspark pandas pyarrow @@ -61,9 +47,41 @@ REQUIRED_PACKAGES=( requests sre_yield ) +if base=$(conda info --base); then + # Create and activate 'cudf-udf' conda env for cudf-udf tests + sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs + conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \ + source activate && \ + conda activate cudf-udf + + # Use mamba to install cudf-udf packages to speed up conda resolve time + conda install -y -c conda-forge mamba python=$PYTHON_VERSION + # Do not error out "This operation will remove conda without replacing it with another version of conda." 
for now + ${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true + + REQUIRED_PACKAGES=( + cuda-version=$CUDA_VER + cudf=$CUDF_VER + ${REQUIRED_PACKAGES[@]} + ) -${base}/envs/cudf-udf/bin/mamba install -y \ - -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \ - "${REQUIRED_PACKAGES[@]}" + ${base}/envs/cudf-udf/bin/mamba install -y \ + -c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \ + "${REQUIRED_PACKAGES[@]}" -source deactivate && conda deactivate + source deactivate && conda deactivate +else + # pip install cudf-py, refer to: https://docs.rapids.ai/install#selector + # The prefix /databricks/python-bootstrap/ for PYTHON_SITE_PACKAGES is mandatory for Databricks init scripts + PYTHON_SITE_PACKAGES="/databricks/python-bootstrap/envs/cudf-udf/$PYTHON_VERSION/lib/site-packages" + pip install --target=${PYTHON_SITE_PACKAGES} \ + --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \ + "cudf-cu11>=${CUDF_VER}.0a0,<=${CUDF_VER}" + + REQUIRED_PACKAGES=( + ${REQUIRED_PACKAGES[@]} + scipy + numexpr + ) + pip install --target=${PYTHON_SITE_PACKAGES} ${REQUIRED_PACKAGES[@]} +fi From 2e37cfdf6f7d3c01333193c873e9801638cbf9fc Mon Sep 17 00:00:00 2001 From: timl Date: Thu, 19 Sep 2024 10:54:46 +0800 Subject: [PATCH 2/2] Check if 'conda' exists to make the if/else expression more readable Signed-off-by: timl --- jenkins/databricks/init_cudf_udf.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/jenkins/databricks/init_cudf_udf.sh b/jenkins/databricks/init_cudf_udf.sh index e875bdce0fd..deaf127cd5a 100755 --- a/jenkins/databricks/init_cudf_udf.sh +++ b/jenkins/databricks/init_cudf_udf.sh @@ -47,7 +47,8 @@ REQUIRED_PACKAGES=( requests sre_yield ) -if base=$(conda info --base); then +if command -v conda >/dev/null 2>&1; then + base=$(conda info --base) # Create and activate 'cudf-udf' conda env for cudf-udf tests sudo chmod a+w ${base}/envs && conda config --add envs_dirs 
${base}/envs conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \