Skip to content

Commit

Permalink
Install cuDF-py against python 3.10 on Databricks
Browse files Browse the repository at this point in the history
Fix for Databricks runtime issue: #11394

Enable the udf_cudf_test test case for Databricks-13.3

RAPIDS 24.10+ drops conda packages for Python 3.9 and below. ref: https://docs.rapids.ai/notices/rsn0040/

Install cuDF-py packages against Python 3.10 and above on the Databricks runtime to run the cuDF UDF tests, because Conda is not installed by default on DB-13.3.

Signed-off-by: timl <[email protected]>
  • Loading branch information
NvTimLiu committed Sep 18, 2024
1 parent 2589976 commit d951958
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 37 deletions.
5 changes: 0 additions & 5 deletions integration_tests/src/main/python/udf_cudf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,6 @@
from marks import cudf_udf


if is_databricks_runtime() and is_spark_340_or_later():
# Databricks 13.3 does not use separate reader/writer threads for Python UDFs
# which can lead to hangs. Skipping these tests until the Python UDF handling is updated.
pytestmark = pytest.mark.skip(reason="https://github.com/NVIDIA/spark-rapids/issues/9493")

_conf = {
'spark.rapids.sql.exec.AggregateInPandasExec': 'true',
'spark.rapids.sql.exec.FlatMapCoGroupsInPandasExec': 'true',
Expand Down
19 changes: 8 additions & 11 deletions jenkins/databricks/cudf_udf_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#

# This script sets the environment to run cudf_udf tests of RAPIDS Accelerator for Apache Spark on DB.
# cudf conda packages need to be installed in advance, please refer to
# cudf python packages need to be installed in advance, please refer to
# './jenkins/databricks/init_cudf_udf.sh' to install.
# All the environments can be overwritten by shell variables:
# LOCAL_JAR_PATH: Location of the RAPIDS jars
Expand All @@ -26,23 +26,20 @@
# - Running tests on Databricks:
# `./jenkins/databricks/cudf-udf-test.sh`
# To add support of a new runtime:
# 1. Check if any more dependencies need to be added to the apt/conda install commands.
# 1. Check if any more dependencies need to be added to the apt/conda/pip install commands.
# 2. If you had to go beyond the above steps to support the new runtime, then update the
# instructions accordingly.
set -ex

# Try to use "cudf-udf" conda environment for the python cudf-udf tests.
CONDA_HOME=${CONDA_HOME:-"/databricks/conda"}
if [ ! -d "${CONDA_HOME}/envs/cudf-udf" ]; then
echo "Error not found cudf conda packages! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install."
# Try to use "cudf-udf" conda/pip environment for the python cudf-udf tests.
CUDF_PY_ENV=${CUDF_PY_ENV:-$(echo /databricks/*/envs/cudf-udf)}
if [ ! -d "${CUDF_PY_ENV}" ]; then
echo "Error not found cudf-py packages! Please refer to './jenkins/databricks/init_cudf_udf.sh' to install."
exit -1
fi
export PATH=${CONDA_HOME}/envs/cudf-udf/bin:$PATH
export PYSPARK_PYTHON=${CONDA_HOME}/envs/cudf-udf/bin/python
# Set the path of python site-packages.
# Get Python version (major.minor). i.e., python3.8 for DB10.4 and python3.9 for DB11.3
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("python{}.{}".format(sys.version_info.major, sys.version_info.minor))')
PYTHON_SITE_PACKAGES="${CONDA_HOME}/envs/cudf-udf/lib/${PYTHON_VERSION}/site-packages"
PYTHON_SITE_PACKAGES=$(echo -n ${CUDF_PY_ENV}/*/lib/site-packages)
[ -d "${CUDF_PY_ENV}/bin" ] && export PATH=${CUDF_PY_ENV}/bin:$PATH

SOURCE_PATH="/home/ubuntu/spark-rapids"
[[ -d "$LOCAL_JAR_PATH" ]] && cd $LOCAL_JAR_PATH || cd $SOURCE_PATH
Expand Down
60 changes: 39 additions & 21 deletions jenkins/databricks/init_cudf_udf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,30 +28,16 @@ export PATH=/databricks/conda/bin:$PATH
# Set Python for the running instance
export PYSPARK_PYTHON=${PYSPARK_PYTHON:-"$(which python)"}
PYTHON_VERSION=$(${PYSPARK_PYTHON} -c 'import sys; print("{}.{}".format(sys.version_info.major, sys.version_info.minor))')
# Rapids 23.06+ drops python 3.8 conda packages. ref: https://docs.rapids.ai/notices/rsn0029/
if [[ "$(printf '%s\n' "3.9" "${PYTHON_VERSION}" | sort -V | head -n1)" = "3.9" ]]; then
# Rapids 24.10+ drops python 3.9 or below conda packages. ref: https://docs.rapids.ai/notices/rsn0040/
if [[ "$(printf '%s\n' "3.10" "${PYTHON_VERSION}" | sort -V | head -n1)" == "3.10" ]]; then
# To fix "'lsb_release -a' returned non-zero". ref: https://github.com/pypa/pip/issues/4924
[[ -n "$(which lsb_release)" ]] && mv $(which lsb_release) $(which lsb_release)"-bak"
else
echo "Rapids 23.06+ drops python 3.8 or below versions of conda packages"
echo "Rapids 24.10+ drops python 3.9 or below versions of conda packages"
exit -1
fi

base=$(conda info --base)
# Create and activate 'cudf-udf' conda env for cudf-udf tests
sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs
conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \
source activate && \
conda activate cudf-udf

# Use mamba to install cudf-udf packages to speed up conda resolve time
conda install -y -c conda-forge mamba python=$PYTHON_VERSION
# Do not error out "This operation will remove conda without replacing it with another version of conda." for now
${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true

REQUIRED_PACKAGES=(
cuda-version=$CUDA_VER
cudf=$CUDF_VER
findspark
pandas
pyarrow
Expand All @@ -61,9 +47,41 @@ REQUIRED_PACKAGES=(
requests
sre_yield
)
if base=$(conda info --base); then
# Create and activate 'cudf-udf' conda env for cudf-udf tests
sudo chmod a+w ${base}/envs && conda config --add envs_dirs ${base}/envs
conda create -y -n cudf-udf -c conda-forge python=$PYTHON_VERSION mamba && \
source activate && \
conda activate cudf-udf

# Use mamba to install cudf-udf packages to speed up conda resolve time
conda install -y -c conda-forge mamba python=$PYTHON_VERSION
# Do not error out "This operation will remove conda without replacing it with another version of conda." for now
${base}/envs/cudf-udf/bin/mamba remove -y c-ares zstd libprotobuf pandas || true

REQUIRED_PACKAGES=(
cuda-version=$CUDA_VER
cudf=$CUDF_VER
${REQUIRED_PACKAGES[@]}
)

${base}/envs/cudf-udf/bin/mamba install -y \
-c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \
"${REQUIRED_PACKAGES[@]}"
${base}/envs/cudf-udf/bin/mamba install -y \
-c rapidsai -c rapidsai-nightly -c nvidia -c conda-forge -c defaults \
"${REQUIRED_PACKAGES[@]}"

source deactivate && conda deactivate
source deactivate && conda deactivate
else
# pip install cudf-py, refer to: https://docs.rapids.ai/install#selector
# The prefix /databricks/python-bootstrap/ for PYTHON_SITE_PACKAGES is mandatory for Databricks init scripts
PYTHON_SITE_PACKAGES="/databricks/python-bootstrap/envs/cudf-udf/$PYTHON_VERSION/lib/site-packages"
pip install --target=${PYTHON_SITE_PACKAGES} \
--extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple \
"cudf-cu11>=${CUDF_VER}.0a0,<=${CUDF_VER}"

REQUIRED_PACKAGES=(
${REQUIRED_PACKAGES[@]}
scipy
numexpr
)
pip install --target=${PYTHON_SITE_PACKAGES} ${REQUIRED_PACKAGES[@]}
fi

0 comments on commit d951958

Please sign in to comment.