Forward-merge branch-24.08 to branch-24.10 #16813

Merged 15 commits on Sep 20, 2024
12 changes: 12 additions & 0 deletions .github/workflows/pr.yaml
@@ -30,6 +30,7 @@ jobs:
- wheel-tests-cudf
- wheel-build-cudf-polars
- wheel-tests-cudf-polars
- cudf-polars-polars-tests
- wheel-build-dask-cudf
- wheel-tests-dask-cudf
- devcontainer
@@ -244,6 +245,17 @@ jobs:
# This always runs, but only fails if this PR touches code in
# pylibcudf or cudf_polars
script: "ci/test_wheel_cudf_polars.sh"
cudf-polars-polars-tests:
needs: wheel-build-cudf-polars
secrets: inherit
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
# This selects "ARCH=amd64 + the latest supported Python + CUDA".
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
build_type: pull-request
# This always runs, but only fails if this PR touches code in
# pylibcudf or cudf_polars
script: "ci/test_cudf_polars_polars_tests.sh"
wheel-build-dask-cudf:
needs: wheel-build-cudf
secrets: inherit
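
The matrix_filter value in the cudf-polars-polars-tests job above is a jq program that the inline comment only summarizes. Purely as an illustration, here is a Python sketch of the same selection applied to a hypothetical matrix; the entries below are invented, and only the ARCH, CUDA_VER, and PY_VER keys referenced by the filter are assumed:

import json

# Hypothetical CI matrix entries (illustrative values only).
matrix = [
    {"ARCH": "amd64", "CUDA_VER": "11.8.0", "PY_VER": "3.10"},
    {"ARCH": "amd64", "CUDA_VER": "11.8.0", "PY_VER": "3.11"},
    {"ARCH": "amd64", "CUDA_VER": "12.5.1", "PY_VER": "3.11"},
    {"ARCH": "arm64", "CUDA_VER": "12.5.1", "PY_VER": "3.11"},
]

def ver(s):
    # Mirror jq's split(".") | map(tonumber): "12.5.1" -> (12, 5, 1).
    return tuple(int(part) for part in s.split("."))

# map(select(.ARCH == "amd64"))
amd64 = [e for e in matrix if e["ARCH"] == "amd64"]

# group_by CUDA major version, then max_by([PY_VER, CUDA_VER]) within each group:
# keep one entry per CUDA major series, preferring the newest Python, then CUDA.
selected = []
for major in sorted({ver(e["CUDA_VER"])[0] for e in amd64}):
    group = [e for e in amd64 if ver(e["CUDA_VER"])[0] == major]
    selected.append(max(group, key=lambda e: (ver(e["PY_VER"]), ver(e["CUDA_VER"]))))

print(json.dumps(selected, indent=2))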
27 changes: 27 additions & 0 deletions ci/run_cudf_polars_polars_tests.sh
@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -euo pipefail

# Support invoking run_cudf_polars_polars_tests.sh outside the script directory.
# Assumption: polars has been cloned in the root of the repo.
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../polars/

DESELECTED_TESTS=(
"tests/unit/test_polars_import.py::test_polars_import" # relies on a polars built in place
"tests/unit/streaming/test_streaming_sort.py::test_streaming_sort[True]" # relies on polars built in debug mode
"tests/unit/test_cpu_check.py::test_check_cpu_flags_skipped_no_flags" # Mock library error
"tests/docs/test_user_guide.py" # No dot binary in CI image
)

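# Expand the array into one "--deselect <test-id>" argument per entry; the
# resulting string is intentionally left unquoted when passed to pytest below
# so word splitting produces separate arguments.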
DESELECTED_TESTS=$(printf -- " --deselect %s" "${DESELECTED_TESTS[@]}")
python -m pytest \
--import-mode=importlib \
--cache-clear \
-m "" \
-p cudf_polars.testing.plugin \
-v \
--tb=short \
${DESELECTED_TESTS} \
"$@" \
py-polars/tests
69 changes: 69 additions & 0 deletions ci/test_cudf_polars_polars_tests.sh
@@ -0,0 +1,69 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.

set -eou pipefail

# We will only fail these tests if the PR touches code in pylibcudf
# or cudf_polars itself.
# Note: the three dots mean we diff against the merge-base of upstream
# and HEAD, so this asks "does _this branch_ touch files in
# cudf_polars/pylibcudf?" rather than "are there changes between
# upstream and this branch which touch cudf_polars/pylibcudf?"
# TODO: is the target branch exposed anywhere in an environment variable?
if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist

# Download the pylibcudf built in the previous step
RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep

rapids-logger "Install pylibcudf"
python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl

rapids-logger "Install cudf_polars"
python -m pip install $(echo ./dist/cudf_polars*.whl)

# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
TAG="py-1.7.0"
Review comment on lines +36 to +37 (Contributor; msarahan marked this conversation as resolved):

Suggested change:
# TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
TAG="py-1.7.0"
TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')

Reply (Contributor, Author):
I think we're waiting to unpin this until a follow-up PR. This PR just needs to forward-merge the current state from 24.08 and we'll unpin/fix the associated problems on branch-24.10.

rapids-logger "Clone polars to ${TAG}"
git clone https://github.com/pola-rs/polars.git --branch ${TAG} --depth 1

# Install requirements for running polars tests
rapids-logger "Install polars test requirements"
python -m pip install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt

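# Capture a failing exit code via the ERR trap instead of aborting: 'set +e'
# below disables errexit while the polars suite runs, and HAS_CHANGES decides
# at the end whether that failure should fail the CI job.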
function set_exitcode()
{
EXITCODE=$?
}
EXITCODE=0
trap set_exitcode ERR
set +e

rapids-logger "Run polars tests"
./ci/run_cudf_polars_polars_tests.sh

trap ERR
set -e

if [ ${EXITCODE} != 0 ]; then
rapids-logger "Running polars test suite FAILED: exitcode ${EXITCODE}"
else
rapids-logger "Running polars test suite PASSED"
fi

if [ ${HAS_CHANGES} == 1 ]; then
exit ${EXITCODE}
else
exit 0
fi
7 changes: 7 additions & 0 deletions ci/test_wheel_cudf_polars.sh
@@ -13,10 +13,14 @@ set -eou pipefail
if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ];
then
HAS_CHANGES=1
rapids-logger "PR has changes in cudf-polars/pylibcudf, test fails treated as failure"
else
HAS_CHANGES=0
rapids-logger "PR does not have changes in cudf-polars/pylibcudf, test fails NOT treated as failure"
fi

rapids-logger "Download wheels"

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist

@@ -43,6 +47,9 @@ python -m pip install \
"$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
"$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"

rapids-logger "Pin to 1.7.0 Temporarily"
Review comment (Contributor):
this can be removed now

Reply (Contributor, Author):
I would do those changes in a follow-up PR where we unpin and also change the xfail/xpass status of that one test we discussed.

python -m pip install polars==1.7.0

rapids-logger "Run cudf_polars tests"

function set_exitcode()
2 changes: 1 addition & 1 deletion dependencies.yaml
@@ -650,7 +650,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- polars>=1.0,<1.3
- polars>=1.6
run_dask_cudf:
common:
- output_types: [conda, requirements, pyproject]
Binary file added docs/cudf/source/_static/pds_benchmark_polars.png
41 changes: 41 additions & 0 deletions docs/cudf/source/cudf_polars/index.rst
@@ -0,0 +1,41 @@
cuDF-based GPU backend for Polars [Open Beta]
=============================================

cuDF supports an in-memory, GPU-accelerated execution engine for Python users of the Polars Lazy API.
The engine supports most of the core expressions and data types as well as a growing set of more advanced dataframe manipulations
and data file formats. When using the GPU engine, Polars will convert expressions into an optimized query plan and determine
whether the plan is supported on the GPU. If it is not, the execution will transparently fall back to the standard Polars engine
and run on the CPU.
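
For example, a query written against the Lazy API only needs to request the GPU engine when it is collected. The snippet below is a minimal sketch (assuming ``polars`` and the cuDF-based GPU engine are installed in a CUDA-capable environment); the data and column names are invented for illustration:

.. code-block:: python

    import polars as pl

    lf = pl.LazyFrame({"key": [1, 1, 2], "value": [10.0, 20.0, 30.0]})

    # Request the GPU engine at collect time; if part of the plan is not
    # supported, execution transparently falls back to the CPU engine.
    result = lf.group_by("key").agg(pl.col("value").sum()).collect(engine="gpu")
    print(result)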

Benchmark
---------
We reproduced the `Polars Decision Support (PDS) <https://github.com/pola-rs/polars-benchmark>`__ benchmark to compare Polars GPU engine with the default CPU settings across several dataset sizes. Here are the results:

.. figure:: ../_static/pds_benchmark_polars.png
:width: 600px



You can see up to 13x speedup using the GPU backend on the compute-heavy PDS queries involving complex aggregation and join operations. Below are the speedups for the top performing queries:


.. figure:: ../_static/compute_heavy_queries_polars.png
:width: 1000px

:emphasis:`PDS-H benchmark | GPU: NVIDIA H100 PCIe | CPU: Intel Xeon W9-3495X (Sapphire Rapids) | Storage: Local NVMe`

You can reproduce the results by visiting the `Polars Decision Support (PDS) GitHub repository <https://github.com/pola-rs/polars-benchmark>`__.

Learn More
----------

The GPU backend for Polars is now available in Open Beta and the engine is undergoing rapid development. To learn more, visit the `GPU Support page <https://docs.pola.rs/user-guide/gpu-support/>`__ on the Polars website.

Launch on Google Colab
----------------------

.. figure:: ../_static/colab.png
:width: 200px
:target: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb

Take the cuDF backend for Polars for a test-drive in a free GPU-enabled notebook environment using your Google account by `launching on Colab <https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/accelerated_data_processing_examples/polars_gpu_engine_demo.ipynb>`__.
1 change: 1 addition & 0 deletions docs/cudf/source/index.rst
@@ -29,5 +29,6 @@ other operations.

user_guide/index
cudf_pandas/index
cudf_polars/index
libcudf_docs/index
developer_guide/index
@@ -14,3 +14,4 @@ strings
repeat
replace
slice
strip
@@ -0,0 +1,6 @@
=====
strip
=====

.. automodule:: pylibcudf.strings.strip
:members:
42 changes: 5 additions & 37 deletions python/cudf/cudf/_lib/datetime.pyx
@@ -17,6 +17,8 @@ from pylibcudf.libcudf.types cimport size_type
from cudf._lib.column cimport Column
from cudf._lib.scalar cimport DeviceScalar

import pylibcudf as plc


@acquire_spill_lock()
def add_months(Column col, Column months):
@@ -38,43 +40,9 @@

@acquire_spill_lock()
def extract_datetime_component(Column col, object field):

cdef unique_ptr[column] c_result
cdef column_view col_view = col.view()

with nogil:
if field == "year":
c_result = move(libcudf_datetime.extract_year(col_view))
elif field == "month":
c_result = move(libcudf_datetime.extract_month(col_view))
elif field == "day":
c_result = move(libcudf_datetime.extract_day(col_view))
elif field == "weekday":
c_result = move(libcudf_datetime.extract_weekday(col_view))
elif field == "hour":
c_result = move(libcudf_datetime.extract_hour(col_view))
elif field == "minute":
c_result = move(libcudf_datetime.extract_minute(col_view))
elif field == "second":
c_result = move(libcudf_datetime.extract_second(col_view))
elif field == "millisecond":
c_result = move(
libcudf_datetime.extract_millisecond_fraction(col_view)
)
elif field == "microsecond":
c_result = move(
libcudf_datetime.extract_microsecond_fraction(col_view)
)
elif field == "nanosecond":
c_result = move(
libcudf_datetime.extract_nanosecond_fraction(col_view)
)
elif field == "day_of_year":
c_result = move(libcudf_datetime.day_of_year(col_view))
else:
raise ValueError(f"Invalid datetime field: '{field}'")

result = Column.from_unique_ptr(move(c_result))
result = Column.from_pylibcudf(
plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field)
)

if field == "weekday":
# Pandas counts Monday-Sunday as 0-6
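
For context on the change above: the removed Cython branches dispatched on a field-name string to one libcudf function per component, while the new code hands that dispatch to pylibcudf in a single call. The helper below is an illustrative plain-Python sketch only; the field names and error message mirror the removed branches, the call signature mirrors the new line in the diff, and the helper name itself is hypothetical:

import pylibcudf as plc

# Field names handled by the removed if/elif chain.
_DATETIME_FIELDS = frozenset({
    "year", "month", "day", "weekday", "hour", "minute", "second",
    "millisecond", "microsecond", "nanosecond", "day_of_year",
})

def extract_component_sketch(plc_column, field):
    # `plc_column` is assumed to be a pylibcudf Column with a timestamp dtype.
    if field not in _DATETIME_FIELDS:
        raise ValueError(f"Invalid datetime field: '{field}'")
    # A single pylibcudf entry point replaces the former per-field dispatch.
    return plc.datetime.extract_datetime_component(plc_column, field)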