diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 8190b5d0297..315a389339a 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -31,6 +31,6 @@ ENV PYTHONDONTWRITEBYTECODE="1"
 ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
-ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
+ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs"
 ENV HISTFILE="/home/coder/.cache/._bash_history"
 ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index 2a195c6c81d..a0e193ff0bf 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -15,9 +15,31 @@
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {
+      "version": "12.5",
+      "installCompilers": false,
+      "installProfilers": true,
+      "installDevPackages": false,
+      "installcuDNN": false,
+      "installcuTensor": false,
+      "installNCCL": false,
+      "installCUDARuntime": false,
+      "installNVRTC": false,
+      "installOpenCL": false,
+      "installcuBLAS": false,
+      "installcuSPARSE": false,
+      "installcuFFT": false,
+      "installcuFile": false,
+      "installcuRAND": false,
+      "installcuSOLVER": false,
+      "installNPP": false,
+      "installnvJPEG": false,
+      "pruneStaticLibs": true
+    },
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.12": {}
   },
   "overrideFeatureInstallOrder": [
+    "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
   "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index af8d1289ea1..6f0e88fb245 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -62,3 +62,33 @@ jobs:
       UPDATE_ITEM: true
       UPDATE_LINKED_ISSUES: true
     secrets: inherit
+
+  process-branch-name:
+    if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
+    needs: get-project-id
+    runs-on: ubuntu-latest
+    outputs:
+      branch-name: ${{ steps.process-branch-name.outputs.branch-name }}
+    steps:
+      - name: Extract branch name
+        id: process-branch-name
+        run: |
+          branch=${{ github.event.pull_request.base.ref }}
+          release=${branch#branch-}
+          echo "branch-name=$release" >> "$GITHUB_OUTPUT"
+
+  update-release:
+    # This job sets the PR and its linked issues to the release they are targeting
+    uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.12
+    if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
+    needs: [get-project-id, process-branch-name]
+    with:
+      PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
+      SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgg52UQ"
+      SINGLE_SELECT_FIELD_NAME: "Release"
+      SINGLE_SELECT_OPTION_VALUE: "${{ needs.process-branch-name.outputs.branch-name }}"
+      ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
+      ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
+      UPDATE_ITEM: true
+      UPDATE_LINKED_ISSUES: true
+    secrets: inherit
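For reference, `${branch#branch-}` in the new `process-branch-name` job is plain POSIX parameter expansion; a standalone sketch of what it produces:

    branch="branch-24.12"          # example value of the PR's base ref
    release=${branch#branch-}      # "#" strips the shortest match of "branch-" from the front
    echo "$release"                # prints: 24.12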
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a22d3c5b9cc..1275aad757c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -47,11 +47,23 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
     with:
-      build_type: pull-request
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
       # Use the wheel container so we can skip conda solves and since our
       # primary static consumers (Spark) are not in conda anyway.
       container_image: "rapidsai/ci-wheel:latest"
       run_script: "ci/configure_cpp_static.sh"
+  clang-tidy:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.12
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      run_script: "ci/clang_tidy.sh"
   conda-python-cudf-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.12
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f861fb57916..0e86407de11 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,17 +16,6 @@ repos:
           ^cpp/cmake/thirdparty/patches/.*|
           ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
         )
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
-    hooks:
-      - id: isort
-        # Use the config file specific to each subproject so that each
-        # project can specify its own first/third-party packages.
-        args: ["--config-root=python/", "--resolve-all-configs"]
-        files: python/.*
-        exclude: |
-          (?x)^(^python/cudf_polars/.*)
-        types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.2
     hooks:
@@ -95,6 +84,16 @@ repos:
         entry: 'pytest\.xfail'
         language: pygrep
        types: [python]
+      - id: no-unseeded-default-rng
+        name: no-unseeded-default-rng
+        description: 'Enforce that no non-seeded default_rng is used and default_rng is used instead of np.random.seed'
+        entry: |
+          # Check for usage of default_rng without seeding
+          default_rng\(\)|
+          # Check for usage of np.random.seed
+          np.random.seed\(
+        language: pygrep
+        types: [python]
       - id: cmake-format
         name: cmake-format
         entry: ./cpp/scripts/run-cmake-format.sh cmake-format
@@ -140,6 +139,7 @@ repos:
     rev: v0.4.8
     hooks:
      - id: ruff
+        args: ["--fix"]
        files: python/.*$
      - id: ruff-format
        files: python/.*$
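A rough command-line approximation of what the new `no-unseeded-default-rng` pygrep hook flags (`some_file.py` is a placeholder; the real hook applies the pattern to all Python files):

    grep -nE 'default_rng\(\)|np\.random\.seed\(' some_file.py
    # matches e.g. `rng = np.random.default_rng()` and `np.random.seed(0)`,
    # but not a seeded `np.random.default_rng(42)`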
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f2a7c337675..7a75b2a95a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,299 @@
+# cudf 24.10.00 (9 Oct 2024)
+
+## 🚨 Breaking Changes
+
+- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi)
+- Add libcudf wrappers around current_device_resource functions. ([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism)
+- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson)
+- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2)
+- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr)
+- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr)
+- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr)
+- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711)
+- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke)
+- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke)
+- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt)
+- enable list to be forced as string in JSON reader. ([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann)
+- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke)
+- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke)
+- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke)
+- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke)
+- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+
+## 🐛 Bug Fixes
+
+- Add license to the pylibcudf wheel ([#16976](https://github.com/rapidsai/cudf/pull/16976)) [@raydouglass](https://github.com/raydouglass)
+- Parse newline as whitespace character while tokenizing JSONL inputs with non-newline delimiter ([#16950](https://github.com/rapidsai/cudf/pull/16950)) [@shrshi](https://github.com/shrshi)
+- Add dask-cudf workaround for missing `rename_axis` support in cudf ([#16899](https://github.com/rapidsai/cudf/pull/16899)) [@rjzamora](https://github.com/rjzamora)
+- Update oldest deps for `pyarrow` & `numpy` ([#16883](https://github.com/rapidsai/cudf/pull/16883)) [@galipremsagar](https://github.com/galipremsagar)
+- Update labeler for pylibcudf ([#16868](https://github.com/rapidsai/cudf/pull/16868)) [@vyasr](https://github.com/vyasr)
+- Revert "Refactor mixed_semi_join using cuco::static_set" ([#16855](https://github.com/rapidsai/cudf/pull/16855)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Fix metadata after implicit array conversion from Dask cuDF ([#16842](https://github.com/rapidsai/cudf/pull/16842)) [@rjzamora](https://github.com/rjzamora)
+- Add cudf.pandas dependencies.yaml to update-version.sh ([#16840](https://github.com/rapidsai/cudf/pull/16840)) [@raydouglass](https://github.com/raydouglass)
+- Use cupy 12.2.0 as oldest dependency pinning on CUDA 12 ARM ([#16808](https://github.com/rapidsai/cudf/pull/16808)) [@bdice](https://github.com/bdice)
+- Revert "Fix empty cluster handling in tdigest merge (#16675)" ([#16800](https://github.com/rapidsai/cudf/pull/16800)) [@jihoonson](https://github.com/jihoonson)
+- Intentionally leak thread_local CUDA resources to avoid crash (part 1) ([#16787](https://github.com/rapidsai/cudf/pull/16787)) [@kingcrimsontianyu](https://github.com/kingcrimsontianyu)
+- Fix `cov`/`corr` bug in dask-cudf ([#16786](https://github.com/rapidsai/cudf/pull/16786)) [@rjzamora](https://github.com/rjzamora)
+- Fix slice_strings wide strings logic with multi-byte characters ([#16777](https://github.com/rapidsai/cudf/pull/16777)) [@davidwendt](https://github.com/davidwendt)
+- Fix nvbench output for sha512 ([#16773](https://github.com/rapidsai/cudf/pull/16773)) [@davidwendt](https://github.com/davidwendt)
+- Allow read_csv(header=None) to return int column labels in `mode.pandas_compatible` ([#16769](https://github.com/rapidsai/cudf/pull/16769)) [@mroeschke](https://github.com/mroeschke)
+- Whitespace normalization of nested column coerced as string column in JSONL inputs ([#16759](https://github.com/rapidsai/cudf/pull/16759)) [@shrshi](https://github.com/shrshi)
+- Fix DataFrame.drop(columns=cudf.Series/Index, axis=1) ([#16712](https://github.com/rapidsai/cudf/pull/16712)) [@mroeschke](https://github.com/mroeschke)
+- Use merge base when calculating changed files ([#16709](https://github.com/rapidsai/cudf/pull/16709)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Ensure we pass the has_nulls tparam to mixed_join kernels ([#16708](https://github.com/rapidsai/cudf/pull/16708)) [@abellina](https://github.com/abellina)
+- Add boost-devel to Java CI Docker image ([#16707](https://github.com/rapidsai/cudf/pull/16707)) [@jlowe](https://github.com/jlowe)
+- [BUG] Add gpu node type to cudf-pandas 3rd-party integration nightly CI job ([#16704](https://github.com/rapidsai/cudf/pull/16704)) [@Matt711](https://github.com/Matt711)
+- Fix typo in column_factories.hpp comment from 'depth 1' to 'depth 2' ([#16700](https://github.com/rapidsai/cudf/pull/16700)) [@a-hirota](https://github.com/a-hirota)
+- Fix Series.to_frame(name=None) setting a None name ([#16698](https://github.com/rapidsai/cudf/pull/16698)) [@mroeschke](https://github.com/mroeschke)
+- Disable gtests/ERROR_TEST during compute-sanitizer memcheck test ([#16691](https://github.com/rapidsai/cudf/pull/16691)) [@davidwendt](https://github.com/davidwendt)
+- Enable batched multi-source reading of JSONL files with large records ([#16687](https://github.com/rapidsai/cudf/pull/16687)) [@shrshi](https://github.com/shrshi)
+- Handle `ordered` parameter in `CategoricalIndex.__repr__` ([#16683](https://github.com/rapidsai/cudf/pull/16683)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix loc/iloc.__setitem__[:, loc] with non cupy types ([#16677](https://github.com/rapidsai/cudf/pull/16677)) [@mroeschke](https://github.com/mroeschke)
+- Fix empty cluster handling in tdigest merge ([#16675](https://github.com/rapidsai/cudf/pull/16675)) [@jihoonson](https://github.com/jihoonson)
+- Fix `cudf::rank` not getting enough params ([#16666](https://github.com/rapidsai/cudf/pull/16666)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Fix slowdown in `CategoricalIndex.__repr__` ([#16665](https://github.com/rapidsai/cudf/pull/16665)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove java ColumnView.copyWithBooleanColumnAsValidity ([#16660](https://github.com/rapidsai/cudf/pull/16660)) [@revans2](https://github.com/revans2)
+- Fix slowdown in DataFrame repr in jupyter notebook ([#16656](https://github.com/rapidsai/cudf/pull/16656)) [@galipremsagar](https://github.com/galipremsagar)
+- Preserve Series name in duplicated method. ([#16655](https://github.com/rapidsai/cudf/pull/16655)) [@bdice](https://github.com/bdice)
+- Fix interval_range right child non-zero offset ([#16651](https://github.com/rapidsai/cudf/pull/16651)) [@mroeschke](https://github.com/mroeschke)
+- fix libcudf wheel publishing, make package-type explicit in wheel publishing ([#16650](https://github.com/rapidsai/cudf/pull/16650)) [@jameslamb](https://github.com/jameslamb)
+- Revert "Hide all gtest symbols in cudftestutil (#16546)" ([#16644](https://github.com/rapidsai/cudf/pull/16644)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix integer overflow in indexalator pointer logic ([#16643](https://github.com/rapidsai/cudf/pull/16643)) [@davidwendt](https://github.com/davidwendt)
+- Allow for binops between two differently sized DecimalDtypes ([#16638](https://github.com/rapidsai/cudf/pull/16638)) [@mroeschke](https://github.com/mroeschke)
+- Move pragma once in rolling/jit/operation.hpp. ([#16636](https://github.com/rapidsai/cudf/pull/16636)) [@bdice](https://github.com/bdice)
+- Fix overflow bug in low-memory JSON reader ([#16632](https://github.com/rapidsai/cudf/pull/16632)) [@shrshi](https://github.com/shrshi)
+- Add the missing `num_aggregations` axis for `groupby_max_cardinality` ([#16630](https://github.com/rapidsai/cudf/pull/16630)) [@PointKernel](https://github.com/PointKernel)
+- Fix strings::detail::copy_range when target contains nulls ([#16626](https://github.com/rapidsai/cudf/pull/16626)) [@davidwendt](https://github.com/davidwendt)
+- Fix function parameters with common dependency modified during their evaluation ([#16620](https://github.com/rapidsai/cudf/pull/16620)) [@ttnghia](https://github.com/ttnghia)
+- bug-fix: Don't enable the CUDA language if testing was requested when finding cudf ([#16615](https://github.com/rapidsai/cudf/pull/16615)) [@cryos](https://github.com/cryos)
+- bug-fix: cudf/io/json.hpp use after move ([#16609](https://github.com/rapidsai/cudf/pull/16609)) [@NicolasDenoyelle](https://github.com/NicolasDenoyelle)
+- Remove CUDA whole compilation ODR violations ([#16603](https://github.com/rapidsai/cudf/pull/16603)) [@robertmaynard](https://github.com/robertmaynard)
+- MAINT: Adapt to numpy hiding flagsobject away ([#16593](https://github.com/rapidsai/cudf/pull/16593)) [@seberg](https://github.com/seberg)
+- Revert "Make proxy NumPy arrays pass isinstance check in `cudf.pandas`" ([#16586](https://github.com/rapidsai/cudf/pull/16586)) [@Matt711](https://github.com/Matt711)
+- Switch python version to `3.10` in `cudf.pandas` pandas test scripts ([#16559](https://github.com/rapidsai/cudf/pull/16559)) [@galipremsagar](https://github.com/galipremsagar)
+- Hide all gtest symbols in cudftestutil ([#16546](https://github.com/rapidsai/cudf/pull/16546)) [@robertmaynard](https://github.com/robertmaynard)
+- Update the java code to properly deal with lists being returned as strings ([#16536](https://github.com/rapidsai/cudf/pull/16536)) [@revans2](https://github.com/revans2)
+- Register `read_parquet` and `read_csv` with dask-expr ([#16535](https://github.com/rapidsai/cudf/pull/16535)) [@rjzamora](https://github.com/rjzamora)
+- Change cudf::empty_like to not include offsets for empty strings columns ([#16529](https://github.com/rapidsai/cudf/pull/16529)) [@davidwendt](https://github.com/davidwendt)
+- Fix DataFrame reductions with median returning scalar instead of Series ([#16527](https://github.com/rapidsai/cudf/pull/16527)) [@mroeschke](https://github.com/mroeschke)
+- Allow DataFrame.sort_values(by=) to select an index level ([#16519](https://github.com/rapidsai/cudf/pull/16519)) [@mroeschke](https://github.com/mroeschke)
+- Fix `date_range(start, end, freq)` when end-start is divisible by freq ([#16516](https://github.com/rapidsai/cudf/pull/16516)) [@mroeschke](https://github.com/mroeschke)
+- Preserve array name in MultiIndex.from_arrays ([#16515](https://github.com/rapidsai/cudf/pull/16515)) [@mroeschke](https://github.com/mroeschke)
+- Disallow indexing by selecting duplicate labels ([#16514](https://github.com/rapidsai/cudf/pull/16514)) [@mroeschke](https://github.com/mroeschke)
+- Fix `.replace(Index, Index)` raising a TypeError ([#16513](https://github.com/rapidsai/cudf/pull/16513)) [@mroeschke](https://github.com/mroeschke)
+- Check index bounds in compact protocol reader. ([#16493](https://github.com/rapidsai/cudf/pull/16493)) [@bdice](https://github.com/bdice)
+- Fix build failures with GCC 13 ([#16488](https://github.com/rapidsai/cudf/pull/16488)) [@PointKernel](https://github.com/PointKernel)
+- Fix all-empty input column for strings split APIs ([#16466](https://github.com/rapidsai/cudf/pull/16466)) [@davidwendt](https://github.com/davidwendt)
+- Fix segmented-sort overlapped input/output indices ([#16463](https://github.com/rapidsai/cudf/pull/16463)) [@davidwendt](https://github.com/davidwendt)
+- Fix merge conflict for auto merge 16447 ([#16449](https://github.com/rapidsai/cudf/pull/16449)) [@davidwendt](https://github.com/davidwendt)
+
+## 📖 Documentation
+
+- Fix links in Dask cuDF documentation ([#16929](https://github.com/rapidsai/cudf/pull/16929)) [@rjzamora](https://github.com/rjzamora)
+- Improve aggregation documentation ([#16822](https://github.com/rapidsai/cudf/pull/16822)) [@PointKernel](https://github.com/PointKernel)
+- Add best practices page to Dask cuDF docs ([#16821](https://github.com/rapidsai/cudf/pull/16821)) [@rjzamora](https://github.com/rjzamora)
+- [DOC] Update Pylibcudf doc strings ([#16810](https://github.com/rapidsai/cudf/pull/16810)) [@Matt711](https://github.com/Matt711)
+- Recommending `miniforge` for conda install ([#16782](https://github.com/rapidsai/cudf/pull/16782)) [@mmccarty](https://github.com/mmccarty)
+- Add labeling pylibcudf doc pages ([#16779](https://github.com/rapidsai/cudf/pull/16779)) [@mroeschke](https://github.com/mroeschke)
+- Migrate dask-cudf README improvements to dask-cudf sphinx docs ([#16765](https://github.com/rapidsai/cudf/pull/16765)) [@rjzamora](https://github.com/rjzamora)
+- [DOC] Remove out of date section from cudf.pandas docs ([#16697](https://github.com/rapidsai/cudf/pull/16697)) [@Matt711](https://github.com/Matt711)
+- Add performance tips to cudf.pandas FAQ. ([#16693](https://github.com/rapidsai/cudf/pull/16693)) [@bdice](https://github.com/bdice)
+- Update documentation for Dask cuDF ([#16671](https://github.com/rapidsai/cudf/pull/16671)) [@rjzamora](https://github.com/rjzamora)
+- Add missing pylibcudf strings docs ([#16471](https://github.com/rapidsai/cudf/pull/16471)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- DOC: Refresh pylibcudf guide ([#15856](https://github.com/rapidsai/cudf/pull/15856)) [@lithomas1](https://github.com/lithomas1)
+
+## 🚀 New Features
+
+- Build `cudf-polars` with `build.sh` ([#16898](https://github.com/rapidsai/cudf/pull/16898)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add polars to "all" dependency list. ([#16875](https://github.com/rapidsai/cudf/pull/16875)) [@bdice](https://github.com/bdice)
+- nvCOMP GZIP integration ([#16770](https://github.com/rapidsai/cudf/pull/16770)) [@vuule](https://github.com/vuule)
+- [FEA] Add support for `cudf.NamedAgg` ([#16744](https://github.com/rapidsai/cudf/pull/16744)) [@Matt711](https://github.com/Matt711)
+- Add experimental `filesystem="arrow"` support in `dask_cudf.read_parquet` ([#16684](https://github.com/rapidsai/cudf/pull/16684)) [@rjzamora](https://github.com/rjzamora)
+- Relax Arrow pin ([#16681](https://github.com/rapidsai/cudf/pull/16681)) [@vyasr](https://github.com/vyasr)
+- Add libcudf wrappers around current_device_resource functions. ([#16679](https://github.com/rapidsai/cudf/pull/16679)) [@harrism](https://github.com/harrism)
+- Move NDS-H examples into benchmarks ([#16663](https://github.com/rapidsai/cudf/pull/16663)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- [FEA] Add third-party library integration testing of cudf.pandas to cudf ([#16645](https://github.com/rapidsai/cudf/pull/16645)) [@Matt711](https://github.com/Matt711)
+- Make isinstance check pass for proxy ndarrays ([#16601](https://github.com/rapidsai/cudf/pull/16601)) [@Matt711](https://github.com/Matt711)
+- [FEA] Add an environment variable to fail on fallback in `cudf.pandas` ([#16562](https://github.com/rapidsai/cudf/pull/16562)) [@Matt711](https://github.com/Matt711)
+- [FEA] Add support for `cudf.unique` ([#16554](https://github.com/rapidsai/cudf/pull/16554)) [@Matt711](https://github.com/Matt711)
+- [FEA] Support named aggregations in `df.groupby().agg()` ([#16528](https://github.com/rapidsai/cudf/pull/16528)) [@Matt711](https://github.com/Matt711)
+- Change IPv4 convert APIs to support UINT32 instead of INT64 ([#16489](https://github.com/rapidsai/cudf/pull/16489)) [@davidwendt](https://github.com/davidwendt)
+- enable list to be forced as string in JSON reader. ([#16472](https://github.com/rapidsai/cudf/pull/16472)) [@karthikeyann](https://github.com/karthikeyann)
+- Remove cuDF dependency from pylibcudf column from_device tests ([#16441](https://github.com/rapidsai/cudf/pull/16441)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Enable cudf.pandas REPL and -c command support ([#16428](https://github.com/rapidsai/cudf/pull/16428)) [@bdice](https://github.com/bdice)
+- Setup pylibcudf package ([#16299](https://github.com/rapidsai/cudf/pull/16299)) [@lithomas1](https://github.com/lithomas1)
+- Add a libcudf/thrust-based TPC-H derived datagen ([#16294](https://github.com/rapidsai/cudf/pull/16294)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Make proxy NumPy arrays pass isinstance check in `cudf.pandas` ([#16286](https://github.com/rapidsai/cudf/pull/16286)) [@Matt711](https://github.com/Matt711)
+- Add skiprows and nrows to parquet reader ([#16214](https://github.com/rapidsai/cudf/pull/16214)) [@lithomas1](https://github.com/lithomas1)
+- Upgrade to nvcomp 4.0.1 ([#16076](https://github.com/rapidsai/cudf/pull/16076)) [@vuule](https://github.com/vuule)
+- Migrate ORC reader to pylibcudf ([#16042](https://github.com/rapidsai/cudf/pull/16042)) [@lithomas1](https://github.com/lithomas1)
+- JSON reader validation of values ([#15968](https://github.com/rapidsai/cudf/pull/15968)) [@karthikeyann](https://github.com/karthikeyann)
+- Implement exposed null mask APIs in pylibcudf ([#15908](https://github.com/rapidsai/cudf/pull/15908)) [@charlesbluca](https://github.com/charlesbluca)
+- Word-based nvtext::minhash function ([#15368](https://github.com/rapidsai/cudf/pull/15368)) [@davidwendt](https://github.com/davidwendt)
+
+## 🛠️ Improvements
+
+- Make tests deterministic ([#16910](https://github.com/rapidsai/cudf/pull/16910)) [@galipremsagar](https://github.com/galipremsagar)
+- Update update-version.sh to use packaging lib ([#16891](https://github.com/rapidsai/cudf/pull/16891)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Pin polars for 24.10 and update polars test suite xfail list ([#16886](https://github.com/rapidsai/cudf/pull/16886)) [@wence-](https://github.com/wence-)
+- Add in support for setting delim when parsing JSON through java (#16867) ([#16880](https://github.com/rapidsai/cudf/pull/16880)) [@revans2](https://github.com/revans2)
+- Remove unnecessary flag from build.sh ([#16879](https://github.com/rapidsai/cudf/pull/16879)) [@vyasr](https://github.com/vyasr)
+- Ignore numba warning specific to ARM runners ([#16872](https://github.com/rapidsai/cudf/pull/16872)) [@galipremsagar](https://github.com/galipremsagar)
+- Display deltas for `cudf.pandas` test summary ([#16864](https://github.com/rapidsai/cudf/pull/16864)) [@galipremsagar](https://github.com/galipremsagar)
+- Switch to using native `traceback` ([#16851](https://github.com/rapidsai/cudf/pull/16851)) [@galipremsagar](https://github.com/galipremsagar)
+- JSON tree algorithm code reorg ([#16836](https://github.com/rapidsai/cudf/pull/16836)) [@karthikeyann](https://github.com/karthikeyann)
+- Add string.repeats API to pylibcudf ([#16834](https://github.com/rapidsai/cudf/pull/16834)) [@mroeschke](https://github.com/mroeschke)
+- Use CI workflow branch 'branch-24.10' again ([#16832](https://github.com/rapidsai/cudf/pull/16832)) [@jameslamb](https://github.com/jameslamb)
+- Rename the NDS-H benchmark binaries ([#16831](https://github.com/rapidsai/cudf/pull/16831)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Add string.findall APIs to pylibcudf ([#16825](https://github.com/rapidsai/cudf/pull/16825)) [@mroeschke](https://github.com/mroeschke)
+- Add string.extract APIs to pylibcudf ([#16823](https://github.com/rapidsai/cudf/pull/16823)) [@mroeschke](https://github.com/mroeschke)
+- use get-pr-info from nv-gha-runners ([#16819](https://github.com/rapidsai/cudf/pull/16819)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Add string.contains APIs to pylibcudf ([#16814](https://github.com/rapidsai/cudf/pull/16814)) [@mroeschke](https://github.com/mroeschke)
+- Forward-merge branch-24.08 to branch-24.10 ([#16813](https://github.com/rapidsai/cudf/pull/16813)) [@bdice](https://github.com/bdice)
+- Add io_type axis with default `PINNED_BUFFER` to nvbench PQ multithreaded reader ([#16809](https://github.com/rapidsai/cudf/pull/16809)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Update fmt (to 11.0.2) and spdlog (to 1.14.1). ([#16806](https://github.com/rapidsai/cudf/pull/16806)) [@jameslamb](https://github.com/jameslamb)
+- Add ability to set parquet row group max #rows and #bytes in java ([#16805](https://github.com/rapidsai/cudf/pull/16805)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Add in option for Java JSON APIs to do column pruning in CUDF ([#16796](https://github.com/rapidsai/cudf/pull/16796)) [@revans2](https://github.com/revans2)
+- Support drop_first in get_dummies ([#16795](https://github.com/rapidsai/cudf/pull/16795)) [@mroeschke](https://github.com/mroeschke)
+- Exposed stream-ordering to join API ([#16793](https://github.com/rapidsai/cudf/pull/16793)) [@lamarrr](https://github.com/lamarrr)
+- Add string.attributes APIs to pylibcudf ([#16785](https://github.com/rapidsai/cudf/pull/16785)) [@mroeschke](https://github.com/mroeschke)
+- Java: Make ColumnVector.fromViewWithContiguousAllocation public ([#16784](https://github.com/rapidsai/cudf/pull/16784)) [@jlowe](https://github.com/jlowe)
+- Add partitioning APIs to pylibcudf ([#16781](https://github.com/rapidsai/cudf/pull/16781)) [@mroeschke](https://github.com/mroeschke)
+- Optimization of tdigest merge aggregation. ([#16780](https://github.com/rapidsai/cudf/pull/16780)) [@nvdbaranec](https://github.com/nvdbaranec)
+- use libkvikio wheels in wheel builds ([#16778](https://github.com/rapidsai/cudf/pull/16778)) [@jameslamb](https://github.com/jameslamb)
+- Exposed stream-ordering to datetime API ([#16774](https://github.com/rapidsai/cudf/pull/16774)) [@lamarrr](https://github.com/lamarrr)
+- Add io/timezone APIs to pylibcudf ([#16771](https://github.com/rapidsai/cudf/pull/16771)) [@mroeschke](https://github.com/mroeschke)
+- Remove `MultiIndex._poplevel` inplace implementation. ([#16767](https://github.com/rapidsai/cudf/pull/16767)) [@mroeschke](https://github.com/mroeschke)
+- allow pandas patch version to float in cudf-pandas unit tests ([#16763](https://github.com/rapidsai/cudf/pull/16763)) [@jameslamb](https://github.com/jameslamb)
+- Simplify the nvCOMP adapter ([#16762](https://github.com/rapidsai/cudf/pull/16762)) [@vuule](https://github.com/vuule)
+- Add labeling APIs to pylibcudf ([#16761](https://github.com/rapidsai/cudf/pull/16761)) [@mroeschke](https://github.com/mroeschke)
+- Add transform APIs to pylibcudf ([#16760](https://github.com/rapidsai/cudf/pull/16760)) [@mroeschke](https://github.com/mroeschke)
+- Add a benchmark to study Parquet reader's performance for wide tables ([#16751](https://github.com/rapidsai/cudf/pull/16751)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Change the Parquet writer's `default_row_group_size_bytes` from 128MB to inf ([#16750](https://github.com/rapidsai/cudf/pull/16750)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Add transpose API to pylibcudf ([#16749](https://github.com/rapidsai/cudf/pull/16749)) [@mroeschke](https://github.com/mroeschke)
+- Add support for Python 3.12, update Kafka dependencies to 2.5.x ([#16745](https://github.com/rapidsai/cudf/pull/16745)) [@jameslamb](https://github.com/jameslamb)
+- Generate GPU vs CPU usage metrics per pytest file in pandas testsuite for `cudf.pandas` ([#16739](https://github.com/rapidsai/cudf/pull/16739)) [@galipremsagar](https://github.com/galipremsagar)
+- Refactor cudf pandas integration tests CI ([#16728](https://github.com/rapidsai/cudf/pull/16728)) [@Matt711](https://github.com/Matt711)
+- Remove ERROR_TEST gtest from libcudf ([#16722](https://github.com/rapidsai/cudf/pull/16722)) [@davidwendt](https://github.com/davidwendt)
+- Use Series._from_column more consistently to avoid validation ([#16716](https://github.com/rapidsai/cudf/pull/16716)) [@mroeschke](https://github.com/mroeschke)
+- remove some unnecessary libcudf nightly builds ([#16714](https://github.com/rapidsai/cudf/pull/16714)) [@jameslamb](https://github.com/jameslamb)
+- Remove xfail from torch-cudf.pandas integration test ([#16705](https://github.com/rapidsai/cudf/pull/16705)) [@Matt711](https://github.com/Matt711)
+- Add return type annotations to MultiIndex ([#16696](https://github.com/rapidsai/cudf/pull/16696)) [@mroeschke](https://github.com/mroeschke)
+- Add type annotations to Index classes, utilize _from_column more ([#16695](https://github.com/rapidsai/cudf/pull/16695)) [@mroeschke](https://github.com/mroeschke)
+- Have interval_range use IntervalIndex.from_breaks, remove column_empty_same_mask ([#16694](https://github.com/rapidsai/cudf/pull/16694)) [@mroeschke](https://github.com/mroeschke)
+- Increase timeouts for couple of tests ([#16692](https://github.com/rapidsai/cudf/pull/16692)) [@galipremsagar](https://github.com/galipremsagar)
+- Replace raw device_memory_resource pointer in pylibcudf Cython ([#16674](https://github.com/rapidsai/cudf/pull/16674)) [@harrism](https://github.com/harrism)
+- switch from typing.Callable to collections.abc.Callable ([#16670](https://github.com/rapidsai/cudf/pull/16670)) [@jameslamb](https://github.com/jameslamb)
+- Update rapidsai/pre-commit-hooks ([#16669](https://github.com/rapidsai/cudf/pull/16669)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Multi-file and Parquet-aware prefetching from remote storage ([#16657](https://github.com/rapidsai/cudf/pull/16657)) [@rjzamora](https://github.com/rjzamora)
+- Access Frame attributes instead of ColumnAccessor attributes when available ([#16652](https://github.com/rapidsai/cudf/pull/16652)) [@mroeschke](https://github.com/mroeschke)
+- Use non-mangled type names in nvbench output ([#16649](https://github.com/rapidsai/cudf/pull/16649)) [@davidwendt](https://github.com/davidwendt)
+- Add pylibcudf build dir in build.sh for `clean` ([#16648](https://github.com/rapidsai/cudf/pull/16648)) [@galipremsagar](https://github.com/galipremsagar)
+- Prune workflows based on changed files ([#16642](https://github.com/rapidsai/cudf/pull/16642)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remove arrow dependency ([#16640](https://github.com/rapidsai/cudf/pull/16640)) [@vyasr](https://github.com/vyasr)
+- Support reading multiple PQ sources with mismatching nullability for columns ([#16639](https://github.com/rapidsai/cudf/pull/16639)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Drop Python 3.9 support ([#16637](https://github.com/rapidsai/cudf/pull/16637)) [@jameslamb](https://github.com/jameslamb)
+- Support DecimalDtype meta in dask_cudf ([#16634](https://github.com/rapidsai/cudf/pull/16634)) [@mroeschke](https://github.com/mroeschke)
+- Add `num_multiprocessors` utility ([#16628](https://github.com/rapidsai/cudf/pull/16628)) [@PointKernel](https://github.com/PointKernel)
+- Annotate `ColumnAccessor._data` labels as `Hashable` ([#16623](https://github.com/rapidsai/cudf/pull/16623)) [@mroeschke](https://github.com/mroeschke)
+- Remove build_categorical_column in favor of CategoricalColumn constructor ([#16617](https://github.com/rapidsai/cudf/pull/16617)) [@mroeschke](https://github.com/mroeschke)
+- Move apply_boolean_mask benchmark to nvbench ([#16616](https://github.com/rapidsai/cudf/pull/16616)) [@davidwendt](https://github.com/davidwendt)
+- Revise `get_reader_filepath_or_buffer` to handle a list of data sources ([#16613](https://github.com/rapidsai/cudf/pull/16613)) [@rjzamora](https://github.com/rjzamora)
+- do not install cudf in cudf_polars wheel tests ([#16612](https://github.com/rapidsai/cudf/pull/16612)) [@jameslamb](https://github.com/jameslamb)
+- remove streamz git dependency, standardize build dependency names, consolidate some dependency lists ([#16611](https://github.com/rapidsai/cudf/pull/16611)) [@jameslamb](https://github.com/jameslamb)
+- Fix C++ and Cython io types ([#16610](https://github.com/rapidsai/cudf/pull/16610)) [@vyasr](https://github.com/vyasr)
+- Remove arrow_io_source ([#16607](https://github.com/rapidsai/cudf/pull/16607)) [@vyasr](https://github.com/vyasr)
+- Remove thrust::optional from expression evaluator ([#16604](https://github.com/rapidsai/cudf/pull/16604)) [@bdice](https://github.com/bdice)
+- Add stricter typing and validation to ColumnAccessor ([#16602](https://github.com/rapidsai/cudf/pull/16602)) [@mroeschke](https://github.com/mroeschke)
+- make more use of YAML anchors in dependencies.yaml ([#16597](https://github.com/rapidsai/cudf/pull/16597)) [@jameslamb](https://github.com/jameslamb)
+- Enable testing `cudf.pandas` unit tests for all minor versions of pandas ([#16595](https://github.com/rapidsai/cudf/pull/16595)) [@galipremsagar](https://github.com/galipremsagar)
+- Extend the Parquet writer's dictionary encoding benchmark. ([#16591](https://github.com/rapidsai/cudf/pull/16591)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove legacy Arrow interop APIs ([#16590](https://github.com/rapidsai/cudf/pull/16590)) [@vyasr](https://github.com/vyasr)
+- Remove NativeFile support from cudf Python ([#16589](https://github.com/rapidsai/cudf/pull/16589)) [@vyasr](https://github.com/vyasr)
+- Add build job for pylibcudf ([#16587](https://github.com/rapidsai/cudf/pull/16587)) [@vyasr](https://github.com/vyasr)
+- Add `public` qualifier for some member functions in Java class `Schema` ([#16583](https://github.com/rapidsai/cudf/pull/16583)) [@ttnghia](https://github.com/ttnghia)
+- Enable gtests previously disabled for compute-sanitizer bug ([#16581](https://github.com/rapidsai/cudf/pull/16581)) [@davidwendt](https://github.com/davidwendt)
+- [FEA] Add filesystem argument to `cudf.read_parquet` ([#16577](https://github.com/rapidsai/cudf/pull/16577)) [@rjzamora](https://github.com/rjzamora)
+- Ensure size is always passed to NumericalColumn ([#16576](https://github.com/rapidsai/cudf/pull/16576)) [@mroeschke](https://github.com/mroeschke)
+- standardize and consolidate wheel installations in testing scripts ([#16575](https://github.com/rapidsai/cudf/pull/16575)) [@jameslamb](https://github.com/jameslamb)
+- Performance improvement for strings::slice for wide strings ([#16574](https://github.com/rapidsai/cudf/pull/16574)) [@davidwendt](https://github.com/davidwendt)
+- Add `ToCudfBackend` expression to dask-cudf ([#16573](https://github.com/rapidsai/cudf/pull/16573)) [@rjzamora](https://github.com/rjzamora)
+- CI: Test against old versions of key dependencies ([#16570](https://github.com/rapidsai/cudf/pull/16570)) [@seberg](https://github.com/seberg)
+- Replace `NativeFile` dependency in dask-cudf Parquet reader ([#16569](https://github.com/rapidsai/cudf/pull/16569)) [@rjzamora](https://github.com/rjzamora)
+- Align public utility function signatures with pandas 2.x ([#16565](https://github.com/rapidsai/cudf/pull/16565)) [@mroeschke](https://github.com/mroeschke)
+- Move libcudf reduction google-benchmarks to nvbench ([#16564](https://github.com/rapidsai/cudf/pull/16564)) [@davidwendt](https://github.com/davidwendt)
+- Rework strings::slice benchmark to use nvbench ([#16563](https://github.com/rapidsai/cudf/pull/16563)) [@davidwendt](https://github.com/davidwendt)
+- Reenable arrow tests ([#16556](https://github.com/rapidsai/cudf/pull/16556)) [@vyasr](https://github.com/vyasr)
+- Clean up reshaping ops ([#16553](https://github.com/rapidsai/cudf/pull/16553)) [@mroeschke](https://github.com/mroeschke)
+- Disallow cudf.Index accepting column in favor of ._from_column ([#16549](https://github.com/rapidsai/cudf/pull/16549)) [@mroeschke](https://github.com/mroeschke)
+- Rewrite remaining Python Arrow interop conversions using the C Data Interface ([#16548](https://github.com/rapidsai/cudf/pull/16548)) [@vyasr](https://github.com/vyasr)
+- [REVIEW] JSON host tree algorithms ([#16545](https://github.com/rapidsai/cudf/pull/16545)) [@shrshi](https://github.com/shrshi)
+- Refactor dictionary encoding in PQ writer to migrate to the new `cuco::static_map` ([#16541](https://github.com/rapidsai/cudf/pull/16541)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove hardcoded versions from workflows. ([#16540](https://github.com/rapidsai/cudf/pull/16540)) [@bdice](https://github.com/bdice)
+- Ensure comparisons with pyints and integer series always succeed ([#16532](https://github.com/rapidsai/cudf/pull/16532)) [@seberg](https://github.com/seberg)
+- Remove unneeded output size parameter from internal count_matches utility ([#16531](https://github.com/rapidsai/cudf/pull/16531)) [@davidwendt](https://github.com/davidwendt)
+- Remove invalid column_view usage in string-scalar-to-column function ([#16530](https://github.com/rapidsai/cudf/pull/16530)) [@davidwendt](https://github.com/davidwendt)
+- Raise NotImplementedError for Series.rename that's not a scalar ([#16525](https://github.com/rapidsai/cudf/pull/16525)) [@mroeschke](https://github.com/mroeschke)
+- Remove deprecated public APIs from libcudf ([#16524](https://github.com/rapidsai/cudf/pull/16524)) [@davidwendt](https://github.com/davidwendt)
+- Return Interval object in pandas compat mode for IntervalIndex reductions ([#16523](https://github.com/rapidsai/cudf/pull/16523)) [@mroeschke](https://github.com/mroeschke)
+- Update json normalization to take device_buffer ([#16520](https://github.com/rapidsai/cudf/pull/16520)) [@karthikeyann](https://github.com/karthikeyann)
+- Rework cudf::io::text::byte_range_info class member functions ([#16518](https://github.com/rapidsai/cudf/pull/16518)) [@davidwendt](https://github.com/davidwendt)
+- Remove unneeded pair-iterator benchmark ([#16511](https://github.com/rapidsai/cudf/pull/16511)) [@davidwendt](https://github.com/davidwendt)
+- Update pre-commit hooks ([#16510](https://github.com/rapidsai/cudf/pull/16510)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Improve update-version.sh ([#16506](https://github.com/rapidsai/cudf/pull/16506)) [@bdice](https://github.com/bdice)
+- Use tool.scikit-build.cmake.version, set scikit-build-core minimum-version ([#16503](https://github.com/rapidsai/cudf/pull/16503)) [@jameslamb](https://github.com/jameslamb)
+- Pass batch size to JSON reader using environment variable ([#16502](https://github.com/rapidsai/cudf/pull/16502)) [@shrshi](https://github.com/shrshi)
+- Remove a deprecated multibyte_split API ([#16501](https://github.com/rapidsai/cudf/pull/16501)) [@davidwendt](https://github.com/davidwendt)
+- Add interop example for `arrow::StringViewArray` to `cudf::column` ([#16498](https://github.com/rapidsai/cudf/pull/16498)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Add keep option to distinct nvbench ([#16497](https://github.com/rapidsai/cudf/pull/16497)) [@bdice](https://github.com/bdice)
+- Use more idomatic cudf APIs in dask_cudf meta generation ([#16487](https://github.com/rapidsai/cudf/pull/16487)) [@mroeschke](https://github.com/mroeschke)
+- Fix typo in dispatch_row_equal. ([#16473](https://github.com/rapidsai/cudf/pull/16473)) [@bdice](https://github.com/bdice)
+- Use explicit construction of column subclass instead of `build_column` when type is known ([#16470](https://github.com/rapidsai/cudf/pull/16470)) [@mroeschke](https://github.com/mroeschke)
+- Move exception handler into pylibcudf from cudf ([#16468](https://github.com/rapidsai/cudf/pull/16468)) [@lithomas1](https://github.com/lithomas1)
+- Make StructColumn.__init__ strict ([#16467](https://github.com/rapidsai/cudf/pull/16467)) [@mroeschke](https://github.com/mroeschke)
+- Make ListColumn.__init__ strict ([#16465](https://github.com/rapidsai/cudf/pull/16465)) [@mroeschke](https://github.com/mroeschke)
+- Make Timedelta/DatetimeColumn.__init__ strict ([#16464](https://github.com/rapidsai/cudf/pull/16464)) [@mroeschke](https://github.com/mroeschke)
+- Make NumericalColumn.__init__ strict ([#16457](https://github.com/rapidsai/cudf/pull/16457)) [@mroeschke](https://github.com/mroeschke)
+- Make CategoricalColumn.__init__ strict ([#16456](https://github.com/rapidsai/cudf/pull/16456)) [@mroeschke](https://github.com/mroeschke)
+- Disallow cudf.Series to accept column in favor of `._from_column` ([#16454](https://github.com/rapidsai/cudf/pull/16454)) [@mroeschke](https://github.com/mroeschke)
+- Expose `stream` param in transform APIs ([#16452](https://github.com/rapidsai/cudf/pull/16452)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Add upper bound pin for polars ([#16442](https://github.com/rapidsai/cudf/pull/16442)) [@wence-](https://github.com/wence-)
+- Make (Indexed)Frame.__init__ require data (and index) ([#16430](https://github.com/rapidsai/cudf/pull/16430)) [@mroeschke](https://github.com/mroeschke)
+- Add Java APIs to copy column data to host asynchronously ([#16429](https://github.com/rapidsai/cudf/pull/16429)) [@jlowe](https://github.com/jlowe)
+- Update docs of the TPC-H derived examples ([#16423](https://github.com/rapidsai/cudf/pull/16423)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Use RMM adaptor constructors instead of factories. ([#16414](https://github.com/rapidsai/cudf/pull/16414)) [@bdice](https://github.com/bdice)
+- Align ewm APIs with pandas 2.x ([#16413](https://github.com/rapidsai/cudf/pull/16413)) [@mroeschke](https://github.com/mroeschke)
+- Remove checking for specific tests in memcheck script ([#16412](https://github.com/rapidsai/cudf/pull/16412)) [@davidwendt](https://github.com/davidwendt)
+- Add stream parameter to reshape APIs ([#16410](https://github.com/rapidsai/cudf/pull/16410)) [@davidwendt](https://github.com/davidwendt)
+- Align groupby APIs with pandas 2.x ([#16403](https://github.com/rapidsai/cudf/pull/16403)) [@mroeschke](https://github.com/mroeschke)
+- Align misc DataFrame and MultiIndex methods with pandas 2.x ([#16402](https://github.com/rapidsai/cudf/pull/16402)) [@mroeschke](https://github.com/mroeschke)
+- update some branch references in GitHub Actions configs ([#16397](https://github.com/rapidsai/cudf/pull/16397)) [@jameslamb](https://github.com/jameslamb)
+- Support reading matching projected and filter cols from Parquet files with otherwise mismatched schemas ([#16394](https://github.com/rapidsai/cudf/pull/16394)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Merge branch-24.08 into branch-24.10 ([#16393](https://github.com/rapidsai/cudf/pull/16393)) [@jameslamb](https://github.com/jameslamb)
+- Add query 10 to the TPC-H suite ([#16392](https://github.com/rapidsai/cudf/pull/16392)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Use `make_host_vector` instead of `make_std_vector` to facilitate pinned memory optimizations ([#16386](https://github.com/rapidsai/cudf/pull/16386)) [@vuule](https://github.com/vuule)
+- Fix some issues with deprecated / removed cccl facilities ([#16377](https://github.com/rapidsai/cudf/pull/16377)) [@miscco](https://github.com/miscco)
+- Align IntervalIndex APIs with pandas 2.x ([#16371](https://github.com/rapidsai/cudf/pull/16371)) [@mroeschke](https://github.com/mroeschke)
+- Align CategoricalIndex APIs with pandas 2.x ([#16369](https://github.com/rapidsai/cudf/pull/16369)) [@mroeschke](https://github.com/mroeschke)
+- Align TimedeltaIndex APIs with pandas 2.x ([#16368](https://github.com/rapidsai/cudf/pull/16368)) [@mroeschke](https://github.com/mroeschke)
+- Align DatetimeIndex APIs with pandas 2.x ([#16367](https://github.com/rapidsai/cudf/pull/16367)) [@mroeschke](https://github.com/mroeschke)
+- fix [tool.setuptools] reference in custreamz config ([#16365](https://github.com/rapidsai/cudf/pull/16365)) [@jameslamb](https://github.com/jameslamb)
+- Align Index APIs with pandas 2.x ([#16361](https://github.com/rapidsai/cudf/pull/16361)) [@mroeschke](https://github.com/mroeschke)
+- Rebuild for & Support NumPy 2 ([#16300](https://github.com/rapidsai/cudf/pull/16300)) [@jakirkham](https://github.com/jakirkham)
+- Add `stream` param to stream compaction APIs ([#16295](https://github.com/rapidsai/cudf/pull/16295)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Added batch memset to memset data and validity buffers in parquet reader ([#16281](https://github.com/rapidsai/cudf/pull/16281)) [@sdrp713](https://github.com/sdrp713)
+- Deduplicate decimal32/decimal64 to decimal128 conversion function ([#16236](https://github.com/rapidsai/cudf/pull/16236)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Refactor mixed_semi_join using cuco::static_set ([#16230](https://github.com/rapidsai/cudf/pull/16230)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Improve performance of hash_character_ngrams using warp-per-string kernel ([#16212](https://github.com/rapidsai/cudf/pull/16212)) [@davidwendt](https://github.com/davidwendt)
+- Add environment variable to log cudf.pandas fallback calls ([#16161](https://github.com/rapidsai/cudf/pull/16161)) [@mroeschke](https://github.com/mroeschke)
+- Add libcudf example with large strings ([#15983](https://github.com/rapidsai/cudf/pull/15983)) [@davidwendt](https://github.com/davidwendt)
+- JSON tree algorithms refactor I: CSR data structure for column tree ([#15979](https://github.com/rapidsai/cudf/pull/15979)) [@shrshi](https://github.com/shrshi)
+- Support multiple new-line characters in regex APIs ([#15961](https://github.com/rapidsai/cudf/pull/15961)) [@davidwendt](https://github.com/davidwendt)
+- adding wheel build for libcudf ([#15483](https://github.com/rapidsai/cudf/pull/15483)) [@msarahan](https://github.com/msarahan)
+- Replace usages of `thrust::optional` with `std::optional` ([#15091](https://github.com/rapidsai/cudf/pull/15091)) [@miscco](https://github.com/miscco)
+
 # cudf 24.08.00 (7 Aug 2024)
 
 ## 🚨 Breaking Changes
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f9cdde7c2b7..b55af21a300 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -293,8 +293,8 @@ In order to run doxygen as a linter on C++/CUDA code, run
 ./ci/checks/doxygen.sh
 ```
 
-Python code runs several linters including [Black](https://black.readthedocs.io/en/stable/),
-[isort](https://pycqa.github.io/isort/), and [flake8](https://flake8.pycqa.org/en/latest/).
+Python code runs several linters including [Ruff](https://docs.astral.sh/ruff/)
+with its various rules for Black-like formatting or Isort.
 
 cuDF also uses [codespell](https://github.com/codespell-project/codespell) to find spelling
 mistakes, and this check is run as a pre-commit hook. To apply the suggested spelling fixes,
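With isort dropped from pre-commit and the docs now pointing at Ruff, the equivalent local commands are roughly the following (assuming a standard Ruff install; the `ruff` hook above runs `ruff check` with the new `--fix` argument, and `ruff-format` handles the Black-style formatting):

    ruff check --fix python/    # lint + import sorting, with autofixes
    ruff format python/         # Black-like formatting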
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index c67d127e635..4290d013fe4 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -5,7 +5,6 @@ set -euo pipefail
 
 export RAPIDS_VERSION="$(rapids-version)"
 export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
-export RAPIDS_VERSION_NUMBER="$RAPIDS_VERSION_MAJOR_MINOR"
 
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
@@ -29,13 +28,16 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcudf pylibcudf cudf dask-cudf
+  "libcudf=${RAPIDS_VERSION}" \
+  "pylibcudf=${RAPIDS_VERSION}" \
+  "cudf=${RAPIDS_VERSION}" \
+  "dask-cudf=${RAPIDS_VERSION}"
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
-aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_NUMBER}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
+aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
 doxygen Doxyfile
 mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html"
 mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html"
@@ -55,4 +57,4 @@ mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 popd
 
-rapids-upload-docs
+RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 2e3f70ba767..823d7f62290 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -52,5 +52,10 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/custreamz
 
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
+  --no-test \
+  --channel "${CPP_CHANNEL}" \
+  --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
+  conda/recipes/cudf-polars
+
 rapids-upload-conda-to-s3 python
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index 8975381ceba..91bc071583e 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -5,11 +5,15 @@ set -euo pipefail
 
 package_dir="python/libcudf"
 
+export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
 ./ci/build_wheel.sh ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
 mkdir -p ${package_dir}/final_dist
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair \
+    --exclude libnvcomp.so.4 \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
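`auditwheel repair --exclude` leaves the named shared library out of the repaired wheel; given the new `USE_NVCOMP_RUNTIME_WHEEL=ON` flag, the intent is presumably that `libnvcomp.so.4` is supplied at runtime by a separate nvCOMP wheel instead of being vendored. A quick local sanity check (paths assumed from the script above):

    unzip -l python/libcudf/final_dist/libcudf_*.whl | grep libnvcomp \
      && echo "unexpected: libnvcomp bundled in wheel" \
      || echo "ok: libnvcomp excluded"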
diff --git a/ci/clang_tidy.sh b/ci/clang_tidy.sh
new file mode 100755
index 00000000000..4d5d3fc3136
--- /dev/null
+++ b/ci/clang_tidy.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+rapids-logger "Create clang-tidy conda environment"
+. /opt/conda/etc/profile.d/conda.sh
+
+ENV_YAML_DIR="$(mktemp -d)"
+
+rapids-dependency-file-generator \
+  --output conda \
+  --file-key clang_tidy \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
+
+rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n clang_tidy
+
+# Temporarily allow unbound variables for conda activation.
+set +u
+conda activate clang_tidy
+set -u
+
+RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
+
+source rapids-configure-sccache
+
+# Run the build via CMake, which will run clang-tidy when CUDF_CLANG_TIDY is enabled.
+cmake -S cpp -B cpp/build -DCMAKE_BUILD_TYPE=Release -DCUDF_CLANG_TIDY=ON -GNinja
+cmake --build cpp/build
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 870901d223b..95f36653c2c 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -93,6 +93,7 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md
 # .devcontainer files
 find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done
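The added `sed_runner` expression only rewrites the CUDA feature tag; its substitution can be checked in isolation (using 24.12 in place of `${NEXT_SHORT_TAG_PEP440}`):

    echo '"ghcr.io/rapidsai/devcontainers/features/cuda:24.10": {' \
      | sed 's@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:24.12@'
    # prints: "ghcr.io/rapidsai/devcontainers/features/cuda:24.12": {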
diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 0819eacf636..2439af5b644 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -23,7 +23,10 @@ compute-sanitizer --tool memcheck custom_optimized names.csv
 compute-sanitizer --tool memcheck custom_prealloc names.csv
 compute-sanitizer --tool memcheck custom_with_malloc names.csv
 
-compute-sanitizer --tool memcheck parquet_io
+compute-sanitizer --tool memcheck parquet_io example.parquet
 compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
 
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet
+compute-sanitizer --tool memcheck parquet_io_multithreaded example.parquet 4 DEVICE_BUFFER 2 2
+
 exit ${EXITCODE}
diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh
index f5a8de543f6..8cd78eb11c2 100755
--- a/ci/test_cpp_common.sh
+++ b/ci/test_cpp_common.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate C++ testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -31,7 +33,10 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf libcudf_kafka libcudf-tests libcudf-example
+  "libcudf=${RAPIDS_VERSION}" \
+  "libcudf_kafka=${RAPIDS_VERSION}" \
+  "libcudf-tests=${RAPIDS_VERSION}" \
+  "libcudf-example=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh
index 55399d0371a..f5bcdc62604 100755
--- a/ci/test_cudf_polars_polars_tests.sh
+++ b/ci/test_cudf_polars_polars_tests.sh
@@ -24,14 +24,17 @@ rapids-logger "Download wheels"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+# Download libcudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libcudf-dep
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./local-pylibcudf-dep
 
-rapids-logger "Install pylibcudf"
-python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
+rapids-logger "Install libcudf, pylibcudf and cudf_polars"
+python -m pip install \
+    -v \
+    "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./local-libcudf-dep/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+    "$(echo ./local-pylibcudf-dep/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
-rapids-logger "Install cudf_polars"
-python -m pip install $(echo ./dist/cudf_polars*.whl)
 
 TAG=$(python -c 'import polars; print(f"py-{polars.__version__}")')
 rapids-logger "Clone polars to ${TAG}"
diff --git a/ci/test_java.sh b/ci/test_java.sh
index 629ad11014a..7f1aa633afc 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate Java testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -30,7 +32,7 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf
+  "libcudf=${RAPIDS_VERSION}"
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index da9478ce25d..4197dc5617f 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -5,6 +5,8 @@ set -euo pipefail
 
 . /opt/conda/etc/profile.d/conda.sh
 
+RAPIDS_VERSION="$(rapids-version)"
+
 rapids-logger "Generate notebook testing dependencies"
 
 ENV_YAML_DIR="$(mktemp -d)"
@@ -30,7 +32,8 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  cudf libcudf
+  "cudf=${RAPIDS_VERSION}" \
+  "libcudf=${RAPIDS_VERSION}"
 
 NBTEST="$(realpath "$(dirname "$0")/utils/nbtest.sh")"
 pushd notebooks
/opt/conda/etc/profile.d/conda.sh +RAPIDS_VERSION="$(rapids-version)" + rapids-logger "Generate Python testing dependencies" ENV_YAML_DIR="$(mktemp -d)" @@ -38,4 +40,5 @@ rapids-print-env rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - cudf libcudf + "cudf=${RAPIDS_VERSION}" \ + "libcudf=${RAPIDS_VERSION}" diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh index 2386414b32e..9528549a562 100755 --- a/ci/test_python_cudf.sh +++ b/ci/test_python_cudf.sh @@ -9,7 +9,7 @@ source ./ci/test_python_common.sh test_python_cudf rapids-logger "Check GPU usage" nvidia-smi - +rapids-print-env EXITCODE=0 trap "EXITCODE=1" ERR set +e diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index 67c97ad29a5..db86721755d 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -7,10 +7,15 @@ cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ # Common setup steps shared by Python test jobs source ./ci/test_python_common.sh test_python_other +RAPIDS_VERSION="$(rapids-version)" + rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - dask-cudf cudf_kafka custreamz + "dask-cudf=${RAPIDS_VERSION}" \ + "cudf_kafka=${RAPIDS_VERSION}" \ + "custreamz=${RAPIDS_VERSION}" \ + "cudf-polars=${RAPIDS_VERSION}" rapids-logger "Check GPU usage" nvidia-smi @@ -33,7 +38,7 @@ rapids-logger "pytest dask_cudf (legacy)" DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ --numprocesses=8 \ - --dist=loadscope \ + --dist=worksteal \ . rapids-logger "pytest cudf_kafka" @@ -50,5 +55,19 @@ rapids-logger "pytest custreamz" --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/custreamz-coverage.xml" \ --cov-report=term +# Note that cudf-polars uses rmm.mr.CudaAsyncMemoryResource() which allocates +# half the available memory. This doesn't play well with multiple workers, so +# we keep --numprocesses=1 for now. This should be resolved by +# https://github.com/rapidsai/cudf/issues/16723. 
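To make the memory note above concrete: the CUDA async pool defaults to reserving half of the currently free device memory, so two unsuspecting workers cannot share one GPU. A minimal C++ sketch of sizing the pool explicitly instead (illustrative only, not part of this change; the 25% figure is an arbitrary assumption):

    // Illustrative sketch: size rmm's async pool explicitly instead of
    // accepting the half-of-free-memory default, so that several test
    // workers could share a single GPU.
    #include <rmm/cuda_device.hpp>
    #include <rmm/mr/device/cuda_async_memory_resource.hpp>
    #include <rmm/mr/device/per_device_resource.hpp>

    int main()
    {
      // Assumption for illustration: give each worker a quarter of free memory.
      auto const pool_size = rmm::percent_of_free_device_memory(25);
      rmm::mr::cuda_async_memory_resource mr{pool_size};
      rmm::mr::set_current_device_resource(&mr);  // subsequent device allocations use mr
      return 0;
    }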
+rapids-logger "pytest cudf-polars" +./ci/run_cudf_polars_pytests.sh \ + --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml" \ + --numprocesses=1 \ + --dist=worksteal \ + --cov-config=./pyproject.toml \ + --cov=cudf_polars \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-polars-coverage.xml" \ + --cov-report=term + rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 8b45d26c367..c3716c4759a 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -63,9 +63,9 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 354c1360e5a..38e131e79cb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,9 +61,9 @@ dependencies: - openpyxl - packaging - pandas -- pandas>=2.0,<2.2.3dev0 +- pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.8,<1.9 +- polars>=1.11,<1.12 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/build.sh b/conda/recipes/cudf-polars/build.sh new file mode 100644 index 00000000000..06e2f1bcb99 --- /dev/null +++ b/conda/recipes/cudf-polars/build.sh @@ -0,0 +1,4 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# This assumes the script is executed from the root of the repo directory +./build.sh cudf_polars diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml new file mode 100644 index 00000000000..edf92b930d9 --- /dev/null +++ b/conda/recipes/cudf-polars/meta.yaml @@ -0,0 +1,61 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} +{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} +{% set py_version = environ['CONDA_PY'] %} +{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} +{% set cuda_major = cuda_version.split('.')[0] %} +{% set date_string = environ['RAPIDS_DATE_STRING'] %} + +package: + name: cudf-polars + version: {{ version }} + +source: + path: ../../.. 
+ +build: + number: {{ GIT_DESCRIBE_NUMBER }} + string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} + script_env: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + - CMAKE_C_COMPILER_LAUNCHER + - CMAKE_CUDA_COMPILER_LAUNCHER + - CMAKE_CXX_COMPILER_LAUNCHER + - CMAKE_GENERATOR + - PARALLEL_LEVEL + - SCCACHE_BUCKET + - SCCACHE_IDLE_TIMEOUT + - SCCACHE_REGION + - SCCACHE_S3_KEY_PREFIX=cudf-polars-aarch64 # [aarch64] + - SCCACHE_S3_KEY_PREFIX=cudf-polars-linux64 # [linux64] + - SCCACHE_S3_USE_SSL + - SCCACHE_S3_NO_CREDENTIALS + +requirements: + host: + - python + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version ={{ cuda_version }} + run: + - python + - pylibcudf ={{ version }} + - polars >=1.11,<1.12 + - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} + +test: + requires: + - cuda-version ={{ cuda_version }} + imports: + - cudf_polars + + +about: + home: https://rapids.ai/ + license: Apache-2.0 + license_family: APACHE + license_file: LICENSE + summary: cudf-polars library diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 25e69b89789..2c254415318 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - numba-cuda >=0.0.13 - numpy >=1.23,<3.0a0 diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 7c1efa0176c..3d965f30986 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -77,7 +77,7 @@ requirements: run: - python - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.3dev0 + - pandas >=2.0,<2.2.4dev0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/.clang-tidy b/cpp/.clang-tidy index b791d846d1d..12120a5c6d1 100644 --- a/cpp/.clang-tidy +++ b/cpp/.clang-tidy @@ -1,18 +1,47 @@ --- +# Notes on disabled checks +# ------------------------ +# modernize-use-equals-default: +# auto-fix is broken (doesn't insert =default correctly) +# modernize-concat-nested-namespaces: +# auto-fix is broken (can delete code) +# modernize-use-trailing-return-type: +# Purely stylistic, no benefit to rewriting everything +# modernize-return-braced-init-list: +# Stylistically we prefer to see the return type at the return site. +# See https://github.com/rapidsai/cudf/pull/16956#pullrequestreview-2341891672 +# for more information. +# modernize-use-bool-literals: +# Our tests use int flags for validity masks extensively, and we prefer that. +# clang-analyzer-cplusplus.NewDeleteLeaks: +# This check has numerous bugs; see +# https://github.com/llvm/llvm-project/issues?q=is%3Aissue+is%3Aopen+newdeleteleaks +# We encounter at least +# https://github.com/llvm/llvm-project/issues/60896 +# https://github.com/llvm/llvm-project/issues/69602 +# clang-analyzer-optin.core.EnumCastOutOfRange: +# We use enums as flags in multiple cases, and this check makes ORing flags invalid. +# clang-analyzer-optin.cplusplus.UninitializedObject: +# There is an error in nanoarrow that none of the clang-tidy filters (i.e. +# header-filter and exclude-header-filter) are able to properly avoid. This +# merits further investigation.
+# +# We need to periodically verify that the checks disabled as broken above are still broken. Checks: 'modernize-*, -modernize-use-equals-default, -modernize-concat-nested-namespaces, -modernize-use-trailing-return-type, - -modernize-use-bool-literals' + -modernize-return-braced-init-list, + -modernize-use-bool-literals, + clang-analyzer-*, + -clang-analyzer-cplusplus.NewDeleteLeaks, + -clang-analyzer-optin.core.EnumCastOutOfRange, + -clang-analyzer-optin.cplusplus.UninitializedObject' - # -modernize-use-equals-default # auto-fix is broken (doesn't insert =default correctly) - # -modernize-concat-nested-namespaces # auto-fix is broken (can delete code) - # -modernize-use-trailing-return-type # just a preference - -WarningsAsErrors: '' -HeaderFilterRegex: '' -AnalyzeTemporaryDtors: false +WarningsAsErrors: '*' +HeaderFilterRegex: '.*cudf/cpp/(src|include|tests).*' +ExcludeHeaderFilterRegex: '.*(Message_generated.h|Schema_generated.h|brotli_dict.hpp|unbz2.hpp|cxxopts.hpp).*' FormatStyle: none CheckOptions: - key: modernize-loop-convert.MaxCopySize diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 136f43ee706..e4b9cbf8921 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,7 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_EXPORT_NVCOMP "Export NVCOMP as a dependency" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option( @@ -87,6 +88,7 @@ option( ${DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL} ) mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) +option(CUDF_CLANG_TIDY "Enable clang-tidy checking" OFF) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -143,6 +145,58 @@ if(NOT CUDF_GENERATED_INCLUDE_DIR) set(CUDF_GENERATED_INCLUDE_DIR ${CUDF_BINARY_DIR}) endif() +# ################################################################################################## +# * clang-tidy configuration ---------------------------------------------------------------------- +if(CUDF_CLANG_TIDY) + find_program( + CLANG_TIDY_EXE + NAMES "clang-tidy" + DOC "Path to clang-tidy executable" REQUIRED + ) + + execute_process( + COMMAND ${CLANG_TIDY_EXE} --version + OUTPUT_VARIABLE CLANG_TIDY_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "LLVM version ([0-9]+\\.[0-9]+)\\.[0-9]+" LLVM_VERSION_MATCH + "${CLANG_TIDY_OUTPUT}" + ) + # Discard the patch version and allow it to float. Empirically, results are mostly stable + # across patch versions, and some package managers skip patch versions entirely, so we don't + # want to pin to a patch version that the user cannot install. + set(LLVM_VERSION "${CMAKE_MATCH_1}") + set(expected_clang_tidy_version 19.1) + if(NOT expected_clang_tidy_version VERSION_EQUAL LLVM_VERSION) + message( + FATAL_ERROR + "clang-tidy version ${expected_clang_tidy_version} is required, but found ${LLVM_VERSION}" + ) + endif() +endif()
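To illustrate the EnumCastOutOfRange exclusion above: ORing two flag enumerators produces a value that names no enumerator, and casting that value back to the enum type is exactly what the check reports. A minimal sketch with hypothetical types (not cudf's):

    // Hypothetical flag enum: read | write yields 3, which names no
    // enumerator, so the cast back to io_flags trips
    // clang-analyzer-optin.core.EnumCastOutOfRange even though the
    // pattern is intentional.
    #include <cstdint>

    enum class io_flags : std::uint8_t { none = 0, read = 1, write = 2 };

    constexpr io_flags operator|(io_flags a, io_flags b)
    {
      return static_cast<io_flags>(static_cast<std::uint8_t>(a) |
                                   static_cast<std::uint8_t>(b));
    }

    constexpr auto rw = io_flags::read | io_flags::write;
    static_assert(static_cast<std::uint8_t>(rw) == 3, "flags combine");

+ +# Turn on the clang-tidy property for a target, excluding the files specified in SKIPPED_FILES.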
+function(enable_clang_tidy target) + set(_tidy_options) + set(_tidy_one_value) + set(_tidy_multi_value SKIPPED_FILES) + cmake_parse_arguments( + _TIDY "${_tidy_options}" "${_tidy_one_value}" "${_tidy_multi_value}" ${ARGN} + ) + + if(CUDF_CLANG_TIDY) + # clang will complain about unused link libraries on the compile line unless we specify + # -Qunused-arguments. + set_target_properties( + ${target} PROPERTIES CXX_CLANG_TIDY "${CLANG_TIDY_EXE};--extra-arg=-Qunused-arguments" + ) + foreach(file IN LISTS _TIDY_SKIPPED_FILES) + set_source_files_properties(${file} PROPERTIES SKIP_LINTING ON) + endforeach() + endif() +endfunction() + # ################################################################################################## # * conda environment ----------------------------------------------------------------------------- rapids_cmake_support_conda_env(conda_env MODIFY_PREFIX_PATH) @@ -314,7 +368,13 @@ add_library( src/filling/repeat.cu src/filling/sequence.cu src/groupby/groupby.cu + src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_single_pass_aggs.cu + src/groupby/hash/create_sparse_results_table.cu + src/groupby/hash/flatten_single_pass_aggs.cpp src/groupby/hash/groupby.cu + src/groupby/hash/hash_compound_agg_finalizer.cu + src/groupby/hash/sparse_to_dense_results.cu src/groupby/sort/aggregate.cpp src/groupby/sort/group_argmax.cu src/groupby/sort/group_argmin.cu @@ -712,6 +772,7 @@ target_compile_options( cudf PRIVATE "$<$:${CUDF_CXX_FLAGS}>" "$<$:${CUDF_CUDA_FLAGS}>" ) +enable_clang_tidy(cudf SKIPPED_FILES src/io/comp/cpu_unbz2.cpp src/io/comp/brotli_dict.cpp) if(CUDF_BUILD_STACKTRACE_DEBUG) # Remove any optimization level to avoid nvcc warning "incompatible redefinition for option @@ -861,15 +922,7 @@ if(CUDF_BUILD_TESTUTIL) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) - add_library( - cudftestutil SHARED - tests/io/metadata_utilities.cpp - tests/utilities/column_utilities.cu - tests/utilities/debug_utilities.cu - tests/utilities/random_seed.cpp - tests/utilities/table_utilities.cu - tests/utilities/tdigest_utilities.cu - ) + add_library(cudftestutil INTERFACE) set_target_properties( cudftestutil @@ -878,32 +931,56 @@ if(CUDF_BUILD_TESTUTIL) # set target compile options CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - CXX_VISIBILITY_PRESET hidden CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON - CUDA_VISIBILITY_PRESET hidden - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON ) target_compile_options( - cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" - "$:${CUDF_CUDA_FLAGS}>>" + cudftestutil INTERFACE "$:${CUDF_CXX_FLAGS}>>" + "$:${CUDF_CUDA_FLAGS}>>" ) target_link_libraries( - cudftestutil - PUBLIC Threads::Threads cudf cudftest_default_stream - PRIVATE GTest::gmock GTest::gtest $ + cudftestutil INTERFACE Threads::Threads cudf cudftest_default_stream + $ ) target_include_directories( - cudftestutil PUBLIC "$" - "$" + cudftestutil INTERFACE "$" + "$" ) rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudftestutil_impl INTERFACE) + add_library(cudf::cudftestutil_impl ALIAS cudftestutil_impl) + target_sources( + cudftestutil_impl + INTERFACE $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + $ + ) + target_link_libraries(cudftestutil_impl INTERFACE cudf::cudftestutil) + + install(FILES tests/io/metadata_utilities.cpp DESTINATION src/cudftestutil/io) + install( + FILES tests/utilities/column_utilities.cu + tests/utilities/debug_utilities.cu + 
tests/utilities/random_seed.cpp + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + DESTINATION src/cudftestutil/utilities + ) + endif() # * build cudf_identify_stream_usage -------------------------------------------------------------- @@ -1004,7 +1081,7 @@ install( set(_components_export_string) if(TARGET cudftestutil) install( - TARGETS cudftest_default_stream cudftestutil + TARGETS cudftest_default_stream cudftestutil cudftestutil_impl DESTINATION ${lib_dir} EXPORT cudf-testing-exports ) @@ -1044,14 +1121,15 @@ targets: This module offers an optional testing component which defines the following IMPORTED GLOBAL targets: - cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil - The main cudf testing library + cudf::cudftestutil_impl - C++ and CUDA sources to compile for definitions in cudf::cudftestutil ]=] ) rapids_export( INSTALL cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string ) @@ -1072,7 +1150,7 @@ endif() rapids_export( BUILD cudf EXPORT_SET cudf-exports ${_components_export_string} - GLOBAL_TARGETS cudf cudftestutil + GLOBAL_TARGETS cudf cudftestutil cudftestutil_impl NAMESPACE cudf:: DOCUMENTATION doc_string FINAL_CODE_BLOCK build_code_string diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 4113e38dcf4..2a4ac789046 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -25,7 +25,7 @@ target_compile_options( target_link_libraries( cudf_datagen PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf - cudftestutil nvtx3::nvtx3-cpp + cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -49,7 +49,7 @@ target_compile_options( target_link_libraries( ndsh_data_generator - PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp + PUBLIC cudf cudf::cudftestutil nvtx3::nvtx3-cpp PRIVATE $ ) @@ -65,14 +65,14 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT - "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + synchronization/synchronization.cpp io/cuio_common.cpp common/table_utilities.cpp + common/benchmark_utilities.cpp common/nvbench_utilities.cpp ) -target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) +target_link_libraries( + cudf_benchmark_common PRIVATE cudf_datagen $ GTest::gmock + GTest::gtest +) + add_custom_command( OUTPUT CUDF_BENCHMARKS COMMAND echo Running benchmarks @@ -99,7 +99,7 @@ function(ConfigureBench CMAKE_BENCH_NAME) ) target_link_libraries( ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen benchmark::benchmark_main - $ + cudf::cudftestutil_impl $ ) add_custom_command( OUTPUT CUDF_BENCHMARKS @@ -127,8 +127,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME) INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( - ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen - nvbench::nvbench $ + ${CMAKE_BENCH_NAME} + PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen nvbench::nvbench + $ cudf::cudftestutil_impl ) install( TARGETS ${CMAKE_BENCH_NAME} @@ -245,6 +246,7 @@ ConfigureNVBench( REDUCTION_NVBENCH reduction/anyall.cpp reduction/dictionary.cpp + reduction/histogram.cpp 
reduction/minmax.cpp reduction/rank.cpp reduction/reduce.cpp @@ -270,8 +272,13 @@ ConfigureBench( ) ConfigureNVBench( - GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp - groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp + GROUPBY_NVBENCH + groupby/group_histogram.cpp + groupby/group_max.cpp + groupby/group_max_multithreaded.cpp + groupby/group_nunique.cpp + groupby/group_rank.cpp + groupby/group_struct_keys.cpp ) # ################################################################################################## @@ -330,19 +337,19 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureBench(AST_BENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureBench(BINARYOP_BENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- -ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp) +ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp + text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## @@ -377,6 +384,7 @@ ConfigureNVBench( string/join_strings.cpp string/lengths.cpp string/like.cpp + string/make_strings_column.cu string/replace_re.cpp string/reverse.cpp string/slice.cpp @@ -392,11 +400,6 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp) ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp) -# ################################################################################################## -# * multi buffer memset benchmark -# ---------------------------------------------------------------------- -ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp) - # ################################################################################################## # * io benchmark --------------------------------------------------------------------- ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp index 65a44532cf1..f44f26e4d2c 100644 --- a/cpp/benchmarks/ast/transform.cpp +++ b/cpp/benchmarks/ast/transform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,14 +15,16 @@ */ #include -#include -#include #include #include +#include + #include +#include + #include #include #include @@ -35,13 +37,10 @@ enum class TreeType { }; template -class AST : public cudf::benchmark {}; - -template -static void BM_ast_transform(benchmark::State& state) +static void BM_ast_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size = static_cast(state.get_int64("table_size")); + auto const tree_levels = static_cast(state.get_int64("tree_levels")); // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -86,38 +85,22 @@ static void BM_ast_transform(benchmark::State& state) auto const& expression_tree_root = expressions.back(); - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf::compute_column(table, expression_tree_root); - } - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); -} + state.add_global_memory_reads(table_size * (tree_levels + 1)); -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(AST, name, key_type, tree_type, reuse_columns, nullable) \ - (::benchmark::State & st) \ + static void name(::nvbench::state& st) \ { \ - BM_ast_transform(st); \ + ::BM_ast_transform(st); \ } \ - BENCHMARK_REGISTER_F(AST, name) \ - ->Apply(CustomRanges) \ - ->Unit(benchmark::kMillisecond) \ - ->UseManualTime(); + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("tree_levels", {1, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) AST_TRANSFORM_BENCHMARK_DEFINE( ast_int32_imbalanced_unique, int32_t, TreeType::IMBALANCED_LEFT, false, false); diff --git a/cpp/benchmarks/binaryop/binaryop.cpp b/cpp/benchmarks/binaryop/binaryop.cpp index fa98d9e601a..7d267a88764 100644 --- a/cpp/benchmarks/binaryop/binaryop.cpp +++ b/cpp/benchmarks/binaryop/binaryop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
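The ast transform conversion above is representative of the Google Benchmark to nvbench migrations in this change: axis values replace state.range(), add_global_memory_reads replaces SetBytesProcessed, and the timed region moves into state.exec. A condensed sketch of that pattern (the benchmarked body is a placeholder, not libcudf code):

    // Condensed nvbench skeleton mirroring the conversions in this diff.
    #include <nvbench/nvbench.cuh>

    #include <cstdint>

    static void bench_pattern(nvbench::state& state)
    {
      auto const table_size = static_cast<std::size_t>(state.get_int64("table_size"));

      // Throughput is derived from the declared global-memory traffic.
      state.add_global_memory_reads<std::int32_t>(table_size);

      state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) {
        // placeholder for the libcudf call under test
      });
    }
    NVBENCH_BENCH(bench_pattern).add_int64_axis("table_size", {100'000, 1'000'000});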
@@ -15,15 +15,14 @@ */ #include -#include -#include #include #include #include +#include + #include -#include // This set of benchmarks is designed to be a comparison for the AST benchmarks @@ -33,13 +32,10 @@ enum class TreeType { }; template -class BINARYOP : public cudf::benchmark {}; - -template -static void BM_binaryop_transform(benchmark::State& state) +static void BM_binaryop_transform(nvbench::state& state) { - auto const table_size{static_cast(state.range(0))}; - auto const tree_levels{static_cast(state.range(1))}; + auto const table_size{static_cast(state.get_int64("table_size"))}; + auto const tree_levels{static_cast(state.get_int64("tree_levels"))}; // Create table data auto const n_cols = reuse_columns ? 1 : tree_levels + 1; @@ -47,9 +43,10 @@ static void BM_binaryop_transform(benchmark::State& state) cycle_dtypes({cudf::type_to_id()}, n_cols), row_count{table_size}); cudf::table_view table{*source_table}; - // Execute benchmark - for (auto _ : state) { - cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 + // Use the number of bytes read from global memory + state.add_global_memory_reads(table_size * (tree_levels + 1)); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch&) { // Execute tree that chains additions like (((a + b) + c) + d) auto const op = cudf::binary_operator::ADD; auto const result_data_type = cudf::data_type(cudf::type_to_id()); @@ -64,16 +61,18 @@ static void BM_binaryop_transform(benchmark::State& state) result = cudf::binary_operation(result->view(), col, op, result_data_type); }); } - } - - // Use the number of bytes read from global memory - state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * - (tree_levels + 1) * sizeof(key_type)); + }); } #define BINARYOP_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns) \ - BENCHMARK_TEMPLATE_DEFINE_F(BINARYOP, name, key_type, tree_type, reuse_columns) \ - (::benchmark::State & st) { BM_binaryop_transform(st); } + \ + static void name(::nvbench::state& st) \ + { \ + BM_binaryop_transform(st); \ + } \ + NVBENCH_BENCH(name) \ + .add_int64_axis("tree_levels", {1, 2, 5, 10}) \ + .add_int64_axis("table_size", {100'000, 1'000'000, 10'000'000, 100'000'000}) BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_int32_imbalanced_unique, int32_t, @@ -87,29 +86,3 @@ BINARYOP_TRANSFORM_BENCHMARK_DEFINE(binaryop_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false); - -static void CustomRanges(benchmark::internal::Benchmark* b) -{ - auto row_counts = std::vector{100'000, 1'000'000, 10'000'000, 100'000'000}; - auto operation_counts = std::vector{1, 2, 5, 10}; - for (auto const& row_count : row_counts) { - for (auto const& operation_count : operation_counts) { - b->Args({row_count, operation_count}); - } - } -} - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_int32_imbalanced_reuse) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); - -BENCHMARK_REGISTER_F(BINARYOP, binaryop_double_imbalanced_unique) - ->Apply(CustomRanges) - ->Unit(benchmark::kMillisecond) - ->UseManualTime(); diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp index 7086a61c7c5..bc0ff69bce9 100644 --- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp +++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp @@ -15,20 +15,18 @@ */ #include -#include -#include #include -class 
COMPILED_BINARYOP : public cudf::benchmark {}; +#include template -void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) +void BM_compiled_binaryop(nvbench::state& state, cudf::binary_operator binop) { - auto const column_size{static_cast(state.range(0))}; + auto const table_size = static_cast(state.get_int64("table_size")); auto const source_table = create_random_table( - {cudf::type_to_id(), cudf::type_to_id()}, row_count{column_size}); + {cudf::type_to_id(), cudf::type_to_id()}, row_count{table_size}); auto lhs = cudf::column_view(source_table->get_column(0)); auto rhs = cudf::column_view(source_table->get_column(1)); @@ -38,31 +36,26 @@ void BM_compiled_binaryop(benchmark::State& state, cudf::binary_operator binop) // Call once for hot cache. cudf::binary_operation(lhs, rhs, binop, output_dtype); - for (auto _ : state) { - cuda_event_timer timer(state, true); - cudf::binary_operation(lhs, rhs, binop, output_dtype); - } - // use number of bytes read and written to global memory - state.SetBytesProcessed(static_cast(state.iterations()) * column_size * - (sizeof(TypeLhs) + sizeof(TypeRhs) + sizeof(TypeOut))); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + state.add_global_memory_reads(table_size); + + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch&) { cudf::binary_operation(lhs, rhs, binop, output_dtype); }); } +#define BM_STRINGIFY(a) #a + // TODO tparam boolean for null. -#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ - BENCHMARK_DEFINE_F(COMPILED_BINARYOP, name) \ - (::benchmark::State & st) \ - { \ - BM_compiled_binaryop(st, cudf::binary_operator::bop); \ - } \ - BENCHMARK_REGISTER_F(COMPILED_BINARYOP, name) \ - ->Unit(benchmark::kMicrosecond) \ - ->UseManualTime() \ - ->Arg(10000) /* 10k */ \ - ->Arg(100000) /* 100k */ \ - ->Arg(1000000) /* 1M */ \ - ->Arg(10000000) /* 10M */ \ - ->Arg(100000000); /* 100M */ +#define BM_BINARYOP_BENCHMARK_DEFINE(name, lhs, rhs, bop, tout) \ + static void name(::nvbench::state& st) \ + { \ + ::BM_compiled_binaryop(st, ::cudf::binary_operator::bop); \ + } \ + NVBENCH_BENCH(name) \ + .set_name("compiled_binary_op_" BM_STRINGIFY(name)) \ + .add_int64_axis("table_size", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}) #define build_name(a, b, c, d) a##_##b##_##c##_##d diff --git a/cpp/benchmarks/groupby/group_histogram.cpp b/cpp/benchmarks/groupby/group_histogram.cpp new file mode 100644 index 00000000000..cd7f9f298af --- /dev/null +++ b/cpp/benchmarks/groupby/group_histogram.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include + +#include + +template +void groupby_histogram_helper(nvbench::state& state, + cudf::size_type num_rows, + cudf::size_type cardinality, + double null_probability) +{ + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto const values = [&] { + auto builder = data_profile_builder().cardinality(0).distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + if (null_probability > 0) { + builder.null_probability(null_probability); + } else { + builder.no_validity(); + } + return create_random_column( + cudf::type_to_id(), row_count{num_rows}, data_profile{builder}); + }(); + + // Vector of 1 request + std::vector requests(1); + requests.back().values = values->view(); + requests.back().aggregations.push_back( + cudf::make_histogram_aggregation()); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys->view()})); + auto const result = gb_obj.aggregate(requests); + }); + + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time, "rows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_groupby_histogram(nvbench::state& state, nvbench::type_list) +{ + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + groupby_histogram_helper(state, num_rows, cardinality, null_probability); +} + +NVBENCH_BENCH_TYPES(bench_groupby_histogram, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("groupby_histogram") + .add_float64_axis("null_probability", {0, 0.1, 0.9}) + .add_int64_axis("cardinality", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("num_rows", {100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}); diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp deleted file mode 100644 index 2905895a63b..00000000000 --- a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include - -#include -#include - -#include - -// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to -// run on most GPUs, but large enough to allow highest throughput -constexpr size_t data_size = 512 << 20; - -void parquet_read_common(cudf::size_type num_rows_to_read, - cudf::size_type num_cols_to_read, - cuio_source_sink_pair& source_sink, - nvbench::state& state) -{ - cudf::io::parquet_reader_options read_opts = - cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); - - auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.exec( - nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - try_drop_l3_cache(); - - timer.start(); - auto const result = cudf::io::read_parquet(read_opts); - timer.stop(); - - CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns"); - CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows"); - }); - - auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); - state.add_buffer_size( - mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); - state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size"); -} - -template -void bench_batched_memset(nvbench::state& state, nvbench::type_list>) -{ - auto const d_type = get_type_or_group(static_cast(DataType)); - auto const num_cols = static_cast(state.get_int64("num_cols")); - auto const cardinality = static_cast(state.get_int64("cardinality")); - auto const run_length = static_cast(state.get_int64("run_length")); - auto const source_type = retrieve_io_type_enum(state.get_string("io_type")); - auto const compression = cudf::io::compression_type::NONE; - cuio_source_sink_pair source_sink(source_type); - auto const tbl = - create_random_table(cycle_dtypes(d_type, num_cols), - table_size_bytes{data_size}, - data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); - auto const view = tbl->view(); - - cudf::io::parquet_writer_options write_opts = - cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) - .compression(compression); - cudf::io::write_parquet(write_opts); - auto const num_rows = view.num_rows(); - - parquet_read_common(num_rows, num_cols, source_sink, state); -} - -using d_type_list = nvbench::enum_type_list; - -NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("batched_memset") - .set_type_axes_names({"data_type"}) - .add_int64_axis("num_cols", {1000}) - .add_string_axis("io_type", {"DEVICE_BUFFER"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("run_length", {1, 32}); diff --git a/cpp/benchmarks/ndsh/q01.cpp b/cpp/benchmarks/ndsh/q01.cpp index ef709926ae9..485e8e5497c 100644 --- a/cpp/benchmarks/ndsh/q01.cpp +++ b/cpp/benchmarks/ndsh/q01.cpp @@ -104,7 +104,7 @@ } void run_ndsh_q1(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projections and filter predicate for `lineitem` table std::vector const lineitem_cols = {"l_returnflag", @@ -124,8 +124,8 @@ void run_ndsh_q1(nvbench::state& state, cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal); // Read out the 
`lineitem` table from parquet file - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Calculate the discount price and charge columns and append to lineitem table auto disc_price = @@ -170,7 +170,7 @@ void ndsh_q1(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q05.cpp b/cpp/benchmarks/ndsh/q05.cpp index 522bc4789c2..1c2d657913e 100644 --- a/cpp/benchmarks/ndsh/q05.cpp +++ b/cpp/benchmarks/ndsh/q05.cpp @@ -89,7 +89,7 @@ } void run_ndsh_q5(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -120,17 +120,17 @@ void run_ndsh_q5(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = - read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"}); + read_parquet(sources.at("customer").make_source_info(), {"c_custkey", "c_nationkey"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); - auto const lineitem = read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); + auto const lineitem = read_parquet(sources.at("lineitem").make_source_info(), {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); auto const nation = - read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_regionkey", "n_name"}); auto const region = - read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred)); + read_parquet(sources.at("region").make_source_info(), region_cols, std::move(region_pred)); // Perform the joins auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"}); @@ -165,7 +165,7 @@ void ndsh_q5(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources); diff --git a/cpp/benchmarks/ndsh/q06.cpp b/cpp/benchmarks/ndsh/q06.cpp index 04078547973..e1e56c3622e 100644 --- a/cpp/benchmarks/ndsh/q06.cpp +++ b/cpp/benchmarks/ndsh/q06.cpp @@ -64,7 +64,7 @@ } void run_ndsh_q6(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the `lineitem` table from parquet file std::vector const lineitem_cols = { @@ -83,8 +83,8 @@ void run_ndsh_q6(nvbench::state& state, cudf::ast::operation(cudf::ast::ast_operator::LESS, 
shipdate_ref, shipdate_upper_literal); auto const lineitem_pred = std::make_unique( cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b); - auto lineitem = - read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred)); + auto lineitem = read_parquet( + sources.at("lineitem").make_source_info(), lineitem_cols, std::move(lineitem_pred)); // Cast the discount and quantity columns to float32 and append to lineitem table auto discout_float = @@ -134,7 +134,7 @@ void ndsh_q6(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources(scale_factor, {"lineitem"}, sources); auto stream = cudf::get_default_stream(); diff --git a/cpp/benchmarks/ndsh/q09.cpp b/cpp/benchmarks/ndsh/q09.cpp index 59218ab8912..2e9a69d9ee2 100644 --- a/cpp/benchmarks/ndsh/q09.cpp +++ b/cpp/benchmarks/ndsh/q09.cpp @@ -112,20 +112,21 @@ } void run_ndsh_q9(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Read out the table from parquet files auto const lineitem = read_parquet( - sources["lineitem"].make_source_info(), + sources.at("lineitem").make_source_info(), {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"}); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_nationkey", "n_name"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"}); - auto const part = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"}); - auto const partsupp = read_parquet(sources["partsupp"].make_source_info(), + read_parquet(sources.at("orders").make_source_info(), {"o_orderkey", "o_orderdate"}); + auto const part = read_parquet(sources.at("part").make_source_info(), {"p_partkey", "p_name"}); + auto const partsupp = read_parquet(sources.at("partsupp").make_source_info(), {"ps_suppkey", "ps_partkey", "ps_supplycost"}); auto const supplier = - read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"}); + read_parquet(sources.at("supplier").make_source_info(), {"s_suppkey", "s_nationkey"}); // Generating the `profit` table // Filter the part table using `p_name like '%green%'` @@ -178,7 +179,7 @@ void ndsh_q9(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp index a520480020a..72edd15083d 100644 --- a/cpp/benchmarks/ndsh/q10.cpp +++ b/cpp/benchmarks/ndsh/q10.cpp @@ -94,7 +94,7 @@ } void run_ndsh_q10(nvbench::state& state, - std::unordered_map& sources) + std::unordered_map& sources) { // Define the column projection and filter predicate for the `orders` table std::vector const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"}; @@ -122,15 +122,16 @@ void run_ndsh_q10(nvbench::state& state, // Read out the tables from parquet files // while pushing down the column projections and filter predicates auto const customer = read_parquet( - sources["customer"].make_source_info(), + 
sources.at("customer").make_source_info(), {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"}); auto const orders = - read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred)); + read_parquet(sources.at("orders").make_source_info(), orders_cols, std::move(orders_pred)); auto const lineitem = - read_parquet(sources["lineitem"].make_source_info(), + read_parquet(sources.at("lineitem").make_source_info(), {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"}, std::move(lineitem_pred)); - auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"}); + auto const nation = + read_parquet(sources.at("nation").make_source_info(), {"n_name", "n_nationkey"}); // Perform the joins auto const join_a = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"}); @@ -163,7 +164,7 @@ void ndsh_q10(nvbench::state& state) { // Generate the required parquet files in device buffers double const scale_factor = state.get_float64("scale_factor"); - std::unordered_map sources; + std::unordered_map sources; generate_parquet_data_sources( scale_factor, {"customer", "orders", "lineitem", "nation"}, sources); diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 62116ddf661..9f9849860c9 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -17,6 +17,8 @@ #include "utilities.hpp" #include "common/ndsh_data_generator/ndsh_data_generator.hpp" +#include "common/table_utilities.hpp" +#include "cudf/detail/utilities/integer_utils.hpp" #include #include @@ -30,8 +32,15 @@ #include #include +#include +#include +#include + +#include #include #include +#include +#include namespace { @@ -85,6 +94,15 @@ std::vector const NATION_SCHEMA = { "n_nationkey", "n_name", "n_regionkey", "n_comment"}; std::vector const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"}; +std::unordered_map const> const SCHEMAS = { + {"orders", ORDERS_SCHEMA}, + {"lineitem", LINEITEM_SCHEMA}, + {"part", PART_SCHEMA}, + {"partsupp", PARTSUPP_SCHEMA}, + {"supplier", SUPPLIER_SCHEMA}, + {"customer", CUSTOMER_SCHEMA}, + {"nation", NATION_SCHEMA}, + {"region", REGION_SCHEMA}}; } // namespace cudf::table_view table_with_names::table() const { return tbl->view(); } @@ -337,7 +355,7 @@ int32_t days_since_epoch(int year, int month, int day) void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source) + cuio_source_sink_pair& source) { CUDF_FUNC_RANGE(); auto const stream = cudf::get_default_stream(); @@ -351,55 +369,124 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, metadata.schema_info = col_name_infos; auto const table_input_metadata = cudf::io::table_input_metadata{metadata}; - // Declare a host and device buffer - std::vector h_buffer; - + auto est_size = static_cast(estimate_size(table->view())); + constexpr auto PQ_MAX_TABLE_BYTES = 8ul << 30; // 8GB + // TODO: best to get this limit from percent_of_free_device_memory(50) of device memory resource. 
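The query changes above consistently replace sources[name] with sources.at(name). A small sketch of the motivation, using a stand-in mapped type (not the actual source/sink pair type):

    // Stand-in for a mapped type without a default constructor, such as a
    // source/sink pair tied to an open file.
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    struct sink {
      explicit sink(int fd) : fd{fd} {}
      int fd;
    };

    int main()
    {
      std::unordered_map<std::string, sink> sources;
      sources.emplace("lineitem", sink{3});

      // sources["lineitem"] would not compile here (operator[] must be able
      // to default-construct the mapped type), and for default-constructible
      // types it silently inserts an entry on a misspelled key.
      try {
        auto const& s = sources.at("lineitm");  // typo: throws std::out_of_range
        (void)s;
      } catch (std::out_of_range const&) {
        // the mistake surfaces immediately instead of corrupting the map
      }
      return 0;
    }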
+ if (est_size > PQ_MAX_TABLE_BYTES) { + auto builder = cudf::io::chunked_parquet_writer_options::builder(source.make_sink_info()); + builder.metadata(table_input_metadata); + auto const options = builder.build(); + auto num_splits = static_cast( + std::ceil(static_cast(est_size) / (PQ_MAX_TABLE_BYTES))); + std::vector splits(num_splits - 1); + auto num_rows = table->num_rows(); + auto num_row_per_chunk = cudf::util::div_rounding_up_safe(num_rows, num_splits); + std::generate_n(splits.begin(), splits.size(), [num_row_per_chunk, i = 0]() mutable { + return (i += num_row_per_chunk); + }); + std::vector split_tables = cudf::split(table->view(), splits, stream); + auto writer = cudf::io::parquet_chunked_writer(options, stream); + for (auto const& chunk_table : split_tables) { + writer.write(chunk_table); + } + writer.close(); + return; + } // Write parquet data to host buffer - auto builder = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view()); + auto builder = cudf::io::parquet_writer_options::builder(source.make_sink_info(), table->view()); builder.metadata(table_input_metadata); auto const options = builder.build(); - cudf::io::write_parquet(options); + cudf::io::write_parquet(options, stream); +} - // Copy host buffer to device buffer - source.d_buffer.resize(h_buffer.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value())); +inline auto make_managed_pool() +{ + return rmm::mr::make_owning_wrapper( + std::make_shared(), rmm::percent_of_free_device_memory(50)); } void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources) + std::unordered_map& sources) { CUDF_FUNC_RANGE(); - std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) { - sources[table_name] = parquet_device_buffer(); - }); - auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + // Set the memory resource to the managed pool + auto old_mr = cudf::get_current_device_resource(); + // if already managed pool or managed, don't create new one. + using managed_pool_mr_t = decltype(make_managed_pool()); + managed_pool_mr_t managed_pool_mr; + bool const is_managed = + dynamic_cast*>(old_mr) or + dynamic_cast(old_mr); + if (!is_managed) { + std::cout << "Creating managed pool just for data generation\n"; + managed_pool_mr = make_managed_pool(); + cudf::set_current_device_resource(managed_pool_mr.get()); + // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free + // memory. 
+ } - auto partsupp = cudf::datagen::generate_partsupp( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + std::unordered_set const requested_table_names = [&table_names]() { + if (table_names.empty()) { + return std::unordered_set{ + "orders", "lineitem", "part", "partsupp", "supplier", "customer", "nation", "region"}; + } + return std::unordered_set(table_names.begin(), table_names.end()); + }(); + std::for_each( + requested_table_names.begin(), requested_table_names.end(), [&](auto const& table_name) { + sources.emplace(table_name, cuio_source_sink_pair(io_type::HOST_BUFFER)); + }); + std::unordered_map> tables; + + if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) { + auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("orders")) { + write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders")); + orders = {}; + } + if (sources.count("part")) { + write_to_parquet_device_buffer(part, SCHEMAS.at("part"), sources.at("part")); + part = {}; + } + if (sources.count("lineitem")) { + write_to_parquet_device_buffer(lineitem, SCHEMAS.at("lineitem"), sources.at("lineitem")); + lineitem = {}; + } + } + + if (sources.count("partsupp")) { + auto partsupp = cudf::datagen::generate_partsupp( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp")); + } - auto supplier = cudf::datagen::generate_supplier( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("supplier")) { + auto supplier = cudf::datagen::generate_supplier( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier")); + } - auto customer = cudf::datagen::generate_customer( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + if (sources.count("customer")) { + auto customer = cudf::datagen::generate_customer( + scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer")); + } - auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("nation")) { + auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation")); + } - auto region = cudf::datagen::generate_region(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + if (sources.count("region")) { + auto region = cudf::datagen::generate_region(cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region")); + } - write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]); - write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]); - write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]); - write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]); - write_to_parquet_device_buffer(std::move(customer), 
CUSTOMER_SCHEMA, sources["customer"]); - write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]); - write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]); - write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]); + // Restore the original memory resource + if (!is_managed) { cudf::set_current_device_resource(old_mr); } } diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp index 762e43deccf..cae07f86a98 100644 --- a/cpp/benchmarks/ndsh/utilities.hpp +++ b/cpp/benchmarks/ndsh/utilities.hpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/cuio_common.hpp" + #include #include #include @@ -196,24 +198,15 @@ std::tm make_tm(int year, int month, int day); int32_t days_since_epoch(int year, int month, int day); /** - * @brief Struct representing a parquet device buffer - */ -struct parquet_device_buffer { - parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {}; - cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); } - rmm::device_uvector d_buffer; -}; - -/** - * @brief Write a `cudf::table` to a parquet device buffer + * @brief Write a `cudf::table` to a parquet cuio sink * * @param table The `cudf::table` to write * @param col_names The column names of the table - * @param parquet_device_buffer The parquet device buffer to write the table to + * @param source The source sink pair to write the table to */ void write_to_parquet_device_buffer(std::unique_ptr const& table, std::vector const& col_names, - parquet_device_buffer& source); + cuio_source_sink_pair& source); /** * @brief Generate NDS-H tables and write to parquet device buffers @@ -224,4 +217,4 @@ void write_to_parquet_device_buffer(std::unique_ptr const& table, */ void generate_parquet_data_sources(double scale_factor, std::vector const& table_names, - std::unordered_map& sources); + std::unordered_map& sources); diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..d0925de5c87 --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
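As context for the data-generation changes above: the managed pool is a pool suballocator layered over CUDA managed memory, so NDS-H table generation can oversubscribe the GPU. A self-contained sketch of that helper, with the template arguments spelled out per our reading of the rmm owning-wrapper API (an assumption, not taken verbatim from this change):

    // Sketch: pool resource over managed memory, sized to 50% of free
    // device memory, mirroring the make_managed_pool helper above.
    #include <rmm/cuda_device.hpp>
    #include <rmm/mr/device/managed_memory_resource.hpp>
    #include <rmm/mr/device/owning_wrapper.hpp>
    #include <rmm/mr/device/pool_memory_resource.hpp>

    #include <memory>

    inline auto make_managed_pool()
    {
      return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
        std::make_shared<rmm::mr::managed_memory_resource>(),
        rmm::percent_of_free_device_memory(50));
    }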
diff --git a/cpp/benchmarks/reduction/histogram.cpp b/cpp/benchmarks/reduction/histogram.cpp new file mode 100644 index 00000000000..d0925de5c87 --- /dev/null +++ b/cpp/benchmarks/reduction/histogram.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf/aggregation.hpp" +#include "cudf/detail/aggregation/aggregation.hpp" + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +template <typename type> +static void nvbench_reduction_histogram(nvbench::state& state, nvbench::type_list<type>) +{ + auto const dtype = cudf::type_to_id<type>(); + + auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality")); + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const null_probability = state.get_float64("null_probability"); + + if (cardinality > num_rows) { + state.skip("cardinality > num_rows"); + return; + } + + data_profile const profile = data_profile_builder() + .null_probability(null_probability) + .cardinality(cardinality) + .distribution(dtype, distribution_id::UNIFORM, 0, num_rows); + + auto const input = create_random_column(dtype, row_count{num_rows}, profile); + auto agg = cudf::make_histogram_aggregation<cudf::reduce_aggregation>(); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::reduce(*input, *agg, input->type(), stream_view); + }); + + state.add_element_count(input->size()); +} + +using data_type = nvbench::type_list<int32_t, int64_t>; + +NVBENCH_BENCH_TYPES(nvbench_reduction_histogram, NVBENCH_TYPE_AXES(data_type)) + .set_name("histogram") + .add_float64_axis("null_probability", {0.1}) + .add_int64_axis("cardinality", + {0, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000, 50'000'000}) + .add_int64_axis("num_rows", {10'000, 100'000, 1'000'000, 10'000'000, 100'000'000}); diff --git a/cpp/benchmarks/string/make_strings_column.cu b/cpp/benchmarks/string/make_strings_column.cu new file mode 100644 index 00000000000..e86824b9f40 --- /dev/null +++ b/cpp/benchmarks/string/make_strings_column.cu @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +namespace { + +constexpr int min_row_width = 0; +constexpr int max_row_width = 50; + +using string_index_pair = thrust::pair<char const*, cudf::size_type>; + +template <bool batch_construction> +std::vector<std::unique_ptr<cudf::column>> make_strings_columns( + std::vector<cudf::device_span<string_index_pair const>> const& input, + rmm::cuda_stream_view stream) +{ + if constexpr (batch_construction) { + return cudf::make_strings_column_batch(input, stream); + } else { + std::vector<std::unique_ptr<cudf::column>> output; + output.reserve(input.size()); + for (auto const& column_input : input) { + output.emplace_back(cudf::make_strings_column(column_input, stream)); + } + return output; + } +} + +} // namespace + +static void BM_make_strings_column_batch(nvbench::state& state) +{ + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const batch_size = static_cast<cudf::size_type>(state.get_int64("batch_size")); + auto const has_nulls = true; + + data_profile const table_profile = + data_profile_builder() + .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_row_width, max_row_width) + .null_probability(has_nulls ?
std::optional{0.1} : std::nullopt); + auto const data_table = create_random_table( + cycle_dtypes({cudf::type_id::STRING}, batch_size), row_count{num_rows}, table_profile); + + auto const stream = cudf::get_default_stream(); + auto input_data = std::vector<rmm::device_uvector<string_index_pair>>{}; + auto input = std::vector<cudf::device_span<string_index_pair const>>{}; + input_data.reserve(batch_size); + input.reserve(batch_size); + for (auto const& cv : data_table->view()) { + auto const d_data_ptr = cudf::column_device_view::create(cv, stream); + auto batch_input = rmm::device_uvector<string_index_pair>(cv.size(), stream); + thrust::tabulate(rmm::exec_policy(stream), + batch_input.begin(), + batch_input.end(), + [data_col = *d_data_ptr] __device__(auto const idx) { + if (data_col.is_null(idx)) { return string_index_pair{nullptr, 0}; } + auto const row = data_col.element<cudf::string_view>(idx); + return string_index_pair{row.data(), row.size_bytes()}; + }); + input_data.emplace_back(std::move(batch_input)); + input.emplace_back(input_data.back()); + } + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + [[maybe_unused]] auto const output = make_strings_columns<true>(input, stream); + }); +} + +NVBENCH_BENCH(BM_make_strings_column_batch) + .set_name("make_strings_column_batch") + .add_int64_axis("num_rows", {100'000, 500'000, 1'000'000, 2'000'000}) + .add_int64_axis("batch_size", {10, 20, 50, 100}); diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp index 8e48f8e9a05..43d57201b20 100644 --- a/cpp/benchmarks/text/ngrams.cpp +++ b/cpp/benchmarks/text/ngrams.cpp @@ -15,58 +15,45 @@ */ #include -#include -#include -#include #include #include #include -class TextNGrams : public cudf::benchmark {}; +#include -enum class ngrams_type { tokens, characters }; - -static void BM_ngrams(benchmark::State& state, ngrams_type nt) +static void bench_ngrams(nvbench::state& state) { - auto const n_rows = static_cast<cudf::size_type>(state.range(0)); - auto const max_str_length = static_cast<cudf::size_type>(state.range(1)); + auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows")); + auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width")); + auto const ngram_type = state.get_string("type"); + data_profile const profile = data_profile_builder().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); - auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); + cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); + auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile); cudf::strings_column_view input(column->view()); auto const separator = cudf::string_scalar("_"); - for (auto _ : state) { - cuda_event_timer raii(state, true); - switch (nt) { - case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break; - case ngrams_type::characters: nvtext::generate_character_ngrams(input); break; - } - } + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); - state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream())); -} + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads<nvbench::int8_t>(chars_size); + state.add_global_memory_writes<nvbench::int8_t>(chars_size * 2); -static void generate_bench_args(benchmark::internal::Benchmark* b) -{ - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 5; - int const max_rowlen = 40; - int const len_mult = 2; -
generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + if (ngram_type == "chars") { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_character_ngrams(input); + }); + } else { + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::generate_ngrams(input, 2, separator); + }); + } } -#define NVTEXT_BENCHMARK_DEFINE(name) \ - BENCHMARK_DEFINE_F(TextNGrams, name) \ - (::benchmark::State & st) { BM_ngrams(st, ngrams_type::name); } \ - BENCHMARK_REGISTER_F(TextNGrams, name) \ - ->Apply(generate_bench_args) \ - ->UseManualTime() \ - ->Unit(benchmark::kMillisecond); - -NVTEXT_BENCHMARK_DEFINE(tokens) -NVTEXT_BENCHMARK_DEFINE(characters) +NVBENCH_BENCH(bench_ngrams) + .set_name("ngrams") + .add_int64_axis("num_rows", {131072, 262144, 524288, 1048578}) + .add_int64_axis("row_width", {10, 20, 40, 100}) + .add_string_axis("type", {"chars", "tokens"}); diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 8df1b431095..d7d7fcca044 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -14,15 +14,17 @@ # This function finds nanoarrow and sets any additional necessary environment variables. function(find_and_configure_nanoarrow) + include(${rapids-cmake-dir}/cpm/package_override.cmake) + + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json") + # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb - GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) diff --git a/cpp/cmake/thirdparty/get_nvcomp.cmake b/cpp/cmake/thirdparty/get_nvcomp.cmake index 41bbf44abc8..33b1b45fb44 100644 --- a/cpp/cmake/thirdparty/get_nvcomp.cmake +++ b/cpp/cmake/thirdparty/get_nvcomp.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -16,11 +16,11 @@ function(find_and_configure_nvcomp) include(${rapids-cmake-dir}/cpm/nvcomp.cmake) - rapids_cpm_nvcomp( - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP} - ) + set(export_args) + if(CUDF_EXPORT_NVCOMP) + set(export_args BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endif() + rapids_cpm_nvcomp(${export_args} USE_PROPRIETARY_BINARY ${CUDF_USE_PROPRIETARY_NVCOMP}) # Per-thread default stream if(TARGET nvcomp AND CUDF_USE_PER_THREAD_DEFAULT_STREAM) diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff new file mode 100644 index 00000000000..e9a36fcb567 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff @@ -0,0 +1,38 @@ +diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h +index caa6be4..70ec8a2 100644 +--- a/src/nanoarrow/common/inline_buffer.h ++++ b/src/nanoarrow/common/inline_buffer.h +@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) { + } + + static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) { +- *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | ++ *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT + ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) | + ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) | + ((values[7] + 0x7f) & 0x80)); +@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l + // set bits within a single byte + const uint8_t only_byte_mask = + i_end % 8 == 0 ? 
first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask); +- bits[bytes_begin] &= only_byte_mask; ++ bits[bytes_begin] &= only_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask); + return; + } + + // set/clear trailing bits of first byte +- bits[bytes_begin] &= first_byte_mask; ++ bits[bytes_begin] &= first_byte_mask; // NOLINT + bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask); + + if (bytes_end - bytes_begin > 2) { +@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap, + n_remaining -= n_full_bytes * 8; + if (n_remaining > 0) { + // Zero out the last byte +- *out_cursor = 0x00; ++ *out_cursor = 0x00; // NOLINT + for (int i = 0; i < n_remaining; i++) { + ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]); + } diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json new file mode 100644 index 00000000000..d529787e7c8 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json @@ -0,0 +1,18 @@ + +{ + "packages" : { + "nanoarrow" : { + "version" : "0.6.0.dev", + "git_url" : "https://github.com/apache/arrow-nanoarrow.git", + "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb", + "git_shallow" : false, + "patches" : [ + { + "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff", + "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index fce8adb4c06..311539efbfc 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -370,7 +370,7 @@ any type that cudf supports. 
For example, a `list_scalar` representing a list of |Value type|Scalar class|Notes| |-|-|-| |fixed-width|`fixed_width_scalar`| `T` can be any fixed-width type| -|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int_64_t`, `float` or `double`| +|numeric|`numeric_scalar` | `T` can be `int8_t`, `int16_t`, `int32_t`, `int64_t`, `float` or `double`| |fixed-point|`fixed_point_scalar` | `T` can be `numeric::decimal32` or `numeric::decimal64`| |timestamp|`timestamp_scalar` | `T` can be `timestamp_D`, `timestamp_s`, etc.| |duration|`duration_scalar` | `T` can be `duration_D`, `duration_s`, etc.| diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md index 6d1c91a5752..6902b1948bd 100644 --- a/cpp/doxygen/regex.md +++ b/cpp/doxygen/regex.md @@ -8,6 +8,7 @@ This page specifies which regular expression (regex) features are currently supp - cudf::strings::extract() - cudf::strings::extract_all_record() - cudf::strings::findall() +- cudf::strings::find_re() - cudf::strings::replace_re() - cudf::strings::replace_with_backrefs() - cudf::strings::split_re() diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt index d8e9205ffd4..a7d0146b170 100644 --- a/cpp/examples/parquet_io/CMakeLists.txt +++ b/cpp/examples/parquet_io/CMakeLists.txt @@ -16,10 +16,23 @@ project( include(../fetch_dependencies.cmake) -# Configure your project here +add_library(parquet_io_utils OBJECT common_utils.cpp io_source.cpp) +target_compile_features(parquet_io_utils PRIVATE cxx_std_17) +target_link_libraries(parquet_io_utils PRIVATE cudf::cudf) + +# Build and install parquet_io add_executable(parquet_io parquet_io.cpp) -target_link_libraries(parquet_io PRIVATE cudf::cudf) +target_link_libraries(parquet_io PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils>) target_compile_features(parquet_io PRIVATE cxx_std_17) - install(TARGETS parquet_io DESTINATION bin/examples/libcudf) + +# Build and install parquet_io_multithreaded +add_executable(parquet_io_multithreaded parquet_io_multithreaded.cpp) +target_link_libraries( + parquet_io_multithreaded PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:parquet_io_utils> +) +target_compile_features(parquet_io_multithreaded PRIVATE cxx_std_17) +install(TARGETS parquet_io_multithreaded DESTINATION bin/examples/libcudf) + +# Install the example.parquet file install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf) diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/common_utils.cpp similarity index 50% rename from cpp/examples/parquet_io/parquet_io.hpp rename to cpp/examples/parquet_io/common_utils.cpp index e27cbec4fce..a79ca48af86 100644 --- a/cpp/examples/parquet_io/parquet_io.hpp +++ b/cpp/examples/parquet_io/common_utils.cpp @@ -14,30 +14,27 @@ * limitations under the License. */ -#pragma once +#include "common_utils.hpp" -#include +#include #include #include #include -#include -#include #include #include #include #include -#include -#include +#include #include /** - * @brief Create memory resource for libcudf functions + * @file common_utils.cpp + * @brief Definitions for common utilities for `parquet_io` examples * - * @param pool Whether to use a pool memory resource.
- * @return Memory resource instance */ + std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used) { auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>(); @@ -48,17 +45,11 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_ return cuda_mr; } -/** - * @brief Get encoding type from the keyword - * - * @param name encoding keyword name - * @return corresponding column encoding type - */ -[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name) +cudf::io::column_encoding get_encoding_type(std::string name) { using encoding_type = cudf::io::column_encoding; - static const std::unordered_map<std::string, encoding_type> map = { + static std::unordered_map<std::string, encoding_type> const map = { {"DEFAULT", encoding_type::USE_DEFAULT}, {"DICTIONARY", encoding_type::DICTIONARY}, {"PLAIN", encoding_type::PLAIN}, @@ -69,26 +60,18 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid encoding type.\n\n" "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n" "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n" - "DELTA_BYTE_ARRAY\n" - "\n" - "Exiting...\n"); + "DELTA_BYTE_ARRAY\n\n"); } -/** - * @brief Get compression type from the keyword - * - * @param name compression keyword name - * @return corresponding compression type - */ -[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name) +cudf::io::compression_type get_compression_type(std::string name) { using compression_type = cudf::io::compression_type; - static const std::unordered_map<std::string, compression_type> map = { + static std::unordered_map<std::string, compression_type> const map = { {"NONE", compression_type::NONE}, {"AUTO", compression_type::AUTO}, {"SNAPPY", compression_type::SNAPPY}, @@ -97,30 +80,58 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_ std::transform(name.begin(), name.end(), name.begin(), ::toupper); if (map.find(name) != map.end()) { return map.at(name); } - throw std::invalid_argument("FATAL: " + std::string(name) + + throw std::invalid_argument(name + " is not a valid compression type.\n\n" - "Available compression_type types: NONE, AUTO, SNAPPY,\n" - "LZ4, ZSTD\n" - "\n" - "Exiting...\n"); + "Available compression types: NONE, AUTO, SNAPPY,\n" + "LZ4, ZSTD\n\n"); } -/** - * @brief Get the optional page size stat frequency from they keyword - * - * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON - * @return optional page statistics frequency set to full (STATISTICS_COLUMN) - */ -[[nodiscard]] std::optional<cudf::io::statistics_freq> get_page_size_stats(std::string use_stats) +bool get_boolean(std::string input) { - std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper); + std::transform(input.begin(), input.end(), input.begin(), ::toupper); // Check if the input string matches to any of the following - if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or - not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) { - // Full column and offset indices - STATISTICS_COLUMN - return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN); + return input == "ON" or input == "TRUE" or input == "YES" or input == "Y" or input == "T"; +} + +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table) +{ + try { + // Left anti-join the original and transcoded tables + // identical tables should not throw an exception and + // return an empty indices vector + auto const
indices = cudf::left_anti_join(lhs_table, rhs_table, cudf::null_equality::EQUAL); + + // No exception thrown, check indices + auto const valid = indices->size() == 0; + std::cout << "Tables identical: " << valid << "\n\n"; + } catch (std::exception& e) { + std::cerr << e.what() << std::endl << std::endl; + throw std::runtime_error("Tables identical: false\n\n"); } } - return std::nullopt; +std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables, + rmm::cuda_stream_view stream) +{ + if (tables.size() == 1) { return std::move(tables[0]); } + + std::vector<cudf::table_view> table_views; + table_views.reserve(tables.size()); + std::transform( + tables.begin(), tables.end(), std::back_inserter(table_views), [&](auto const& tbl) { + return tbl->view(); + }); + // Construct the final table + return cudf::concatenate(table_views, stream); +} + +std::string current_date_and_time() +{ + auto const time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + auto const local_time = *std::localtime(&time); + // Stringstream to format the date and time + std::stringstream ss; + ss << std::put_time(&local_time, "%Y-%m-%d-%H-%M-%S"); + return ss.str(); }
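A quick sketch of how these keyword helpers compose (values invented for illustration; parsing is case-insensitive, and invalid keywords throw std::invalid_argument):

auto const encoding    = get_encoding_type("delta_byte_array");
auto const compression = get_compression_type("zstd");
bool const page_stats  = get_boolean("yes");  // also accepts ON/TRUE/Y/T
auto mr                = create_memory_resource(true /*pooled*/);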
diff --git a/cpp/examples/parquet_io/common_utils.hpp b/cpp/examples/parquet_io/common_utils.hpp new file mode 100644 index 00000000000..12896e61a0d --- /dev/null +++ b/cpp/examples/parquet_io/common_utils.hpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include +#include + +/** + * @file common_utils.hpp + * @brief Common utilities for `parquet_io` examples + * + */ + +/** + * @brief Create memory resource for libcudf functions + * + * @param is_pool_used Whether to use a pool memory resource. + * @return Memory resource instance + */ +std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used); + +/** + * @brief Get encoding type from the keyword + * + * @param name encoding keyword name + * @return corresponding column encoding type + */ +[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name); + +/** + * @brief Get compression type from the keyword + * + * @param name compression keyword name + * @return corresponding compression type + */ +[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name); + +/** + * @brief Get boolean from the keyword + * + * @param input keyword affirmation string such as: Y, T, YES, TRUE, ON + * @return true or false + */ +[[nodiscard]] bool get_boolean(std::string input); + +/** + * @brief Check if two tables are identical, throw an error otherwise + * + * @param lhs_table View to lhs table + * @param rhs_table View to rhs table + */ +void check_tables_equal(cudf::table_view const& lhs_table, cudf::table_view const& rhs_table); + +/** + * @brief Concatenate a vector of tables and return the resultant table + * + * @param tables Vector of tables to concatenate + * @param stream CUDA stream to use + * + * @return Unique pointer to the resultant concatenated table. + */ +std::unique_ptr<cudf::table> concatenate_tables(std::vector<std::unique_ptr<cudf::table>> tables, + rmm::cuda_stream_view stream); + +/** + * @brief Returns a string containing current date and time + * + */ +std::string current_date_and_time(); diff --git a/cpp/examples/parquet_io/io_source.cpp b/cpp/examples/parquet_io/io_source.cpp new file mode 100644 index 00000000000..019b3f96474 --- /dev/null +++ b/cpp/examples/parquet_io/io_source.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io_source.hpp" + +#include +#include + +#include +#include + +#include + +#include +#include +#include + +rmm::host_async_resource_ref pinned_memory_resource() +{ + static auto mr = rmm::mr::pinned_host_memory_resource{}; + return mr; +} + +io_source_type get_io_source_type(std::string name) +{ + static std::unordered_map<std::string, io_source_type> const map = { + {"FILEPATH", io_source_type::FILEPATH}, + {"HOST_BUFFER", io_source_type::HOST_BUFFER}, + {"PINNED_BUFFER", io_source_type::PINNED_BUFFER}, + {"DEVICE_BUFFER", io_source_type::DEVICE_BUFFER}}; + + std::transform(name.begin(), name.end(), name.begin(), ::toupper); + if (map.find(name) != map.end()) { + return map.at(name); + } else { + throw std::invalid_argument(name + + " is not a valid io source type.
Available: FILEPATH,\n" + "HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER.\n\n"); + } +} + +io_source::io_source(std::string_view file_path, io_source_type type, rmm::cuda_stream_view stream) + : pinned_buffer({pinned_memory_resource(), stream}), d_buffer{0, stream} +{ + std::string const file_name{file_path}; + auto const file_size = std::filesystem::file_size(file_name); + + // For filepath make a quick source_info and return early + if (type == io_source_type::FILEPATH) { + source_info = cudf::io::source_info(file_name); + return; + } + + std::ifstream file{file_name, std::ifstream::binary}; + + // Copy file contents to the specified io source buffer + switch (type) { + case io_source_type::HOST_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + source_info = cudf::io::source_info(h_buffer.data(), file_size); + break; + } + case io_source_type::PINNED_BUFFER: { + pinned_buffer.resize(file_size); + file.read(pinned_buffer.data(), file_size); + source_info = cudf::io::source_info(pinned_buffer.data(), file_size); + break; + } + case io_source_type::DEVICE_BUFFER: { + h_buffer.resize(file_size); + file.read(h_buffer.data(), file_size); + d_buffer.resize(file_size, stream); + CUDF_CUDA_TRY(cudaMemcpyAsync( + d_buffer.data(), h_buffer.data(), file_size, cudaMemcpyDefault, stream.value())); + + source_info = cudf::io::source_info(d_buffer); + break; + } + default: { + throw std::runtime_error("Encountered unexpected source type\n\n"); + } + } +} diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp new file mode 100644 index 00000000000..a614d348fae --- /dev/null +++ b/cpp/examples/parquet_io/io_source.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +#include + +#include + +/** + * @file io_source.hpp + * @brief Utilities for constructing the specified IO sources from the input parquet files. + * + */ + +/** + * @brief Available IO source types + */ +enum class io_source_type { FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER }; + +/** + * @brief Get io source type from the string keyword argument + * + * @param name io source type keyword name + * @return io source type + */ +[[nodiscard]] io_source_type get_io_source_type(std::string name); + +/** + * @brief Create and return a reference to a static pinned memory pool + * + * @return Reference to a static pinned memory pool + */ +rmm::host_async_resource_ref pinned_memory_resource(); + +/** + * @brief Custom allocator for pinned_buffer via RMM. 
+ */ +template <typename T> +struct pinned_allocator : public std::allocator<T> { + pinned_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr{_mr}, stream{_stream} + { + } + + T* allocate(std::size_t n) + { + auto ptr = mr.allocate_async(n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + stream.synchronize(); + return static_cast<T*>(ptr); + } + + void deallocate(T* ptr, std::size_t n) + { + mr.deallocate_async(ptr, n * sizeof(T), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + +/** + * @brief Class to create a cudf::io::source_info of given type from the input parquet file + * + */ +class io_source { + public: + io_source(std::string_view file_path, io_source_type io_type, rmm::cuda_stream_view stream); + + // Get the internal source info + [[nodiscard]] cudf::io::source_info get_source_info() const { return source_info; } + + private: + // alias for pinned vector + template <typename T> + using pinned_vector = thrust::host_vector<T, pinned_allocator<T>>; + cudf::io::source_info source_info; + std::vector<char> h_buffer; + pinned_vector<char> pinned_buffer; + rmm::device_uvector<std::byte> d_buffer; +};
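The allocator above is what lets an ordinary Thrust host vector draw from RMM's pinned pool; a minimal sketch, assuming a Thrust version whose host_vector accepts an allocator instance (buffer size invented):

// Hypothetical staging buffer in pinned memory.
auto const stream = cudf::get_default_stream();
pinned_allocator<char> alloc{pinned_memory_resource(), stream};
thrust::host_vector<char, pinned_allocator<char>> staging(alloc);
staging.resize(1 << 20);  // allocate_async on the pinned pool, then a stream sync, inside the allocator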
diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp index 9cda22d0695..c11b8de82b5 100644 --- a/cpp/examples/parquet_io/parquet_io.cpp +++ b/cpp/examples/parquet_io/parquet_io.cpp @@ -14,11 +14,15 @@ * limitations under the License. */ -#include "parquet_io.hpp" - #include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include -#include +#include /** * @file parquet_io.cpp @@ -81,6 +85,18 @@ void write_parquet(cudf::table_view input, cudf::io::write_parquet(options); } +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout << "\nUsage: parquet_io <input parquet file> <output parquet file> <encoding type>\n" + "                  <compression type> <write page stats: Y/N>\n\n" + "Available encoding types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,\n" + "                          DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY\n\n" + "Available compression types: NONE, AUTO, SNAPPY, LZ4, ZSTD\n\n"; +} + /** * @brief Main for nested_types examples * @@ -97,29 +113,28 @@ void write_parquet(cudf::table_view input, */ int main(int argc, char const** argv) { - std::string input_filepath; - std::string output_filepath; - cudf::io::column_encoding encoding; - cudf::io::compression_type compression; - std::optional page_stats; + std::string input_filepath = "example.parquet"; + std::string output_filepath = "output.parquet"; + cudf::io::column_encoding encoding = get_encoding_type("DELTA_BINARY_PACKED"); + cudf::io::compression_type compression = get_compression_type("ZSTD"); + std::optional page_stats = std::nullopt; switch (argc) { - case 1: - input_filepath = "example.parquet"; - output_filepath = "output.parquet"; - encoding = get_encoding_type("DELTA_BINARY_PACKED"); - compression = get_compression_type("ZSTD"); - break; - case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]]; - case 5: - input_filepath = argv[1]; - output_filepath = argv[2]; - encoding = get_encoding_type(argv[3]); - compression = get_compression_type(argv[4]); - break; - default: - throw std::runtime_error( - "Either provide all command-line arguments, or none to use defaults\n"); + case 6: + page_stats = get_boolean(argv[5]) + ? std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN) + : std::nullopt; + [[fallthrough]]; + case 5: compression = get_compression_type(argv[4]); [[fallthrough]]; + case 4: encoding = get_encoding_type(argv[3]); [[fallthrough]]; + case 3: output_filepath = argv[2]; [[fallthrough]]; + case 2: // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_filepath = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); } // Create and use a memory pool @@ -130,18 +145,16 @@ int main(int argc, char const** argv) // Read input parquet file // We do not want to time the initial read time as it may include // time for nvcomp, cufile loading and RMM growth - std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl; + std::cout << "\nReading " << input_filepath << "...\n"; std::cout << "Note: Not timing the initial parquet read as it may include\n" - "times for nvcomp, cufile loading and RMM growth." - << std::endl - << std::endl; + "times for nvcomp, cufile loading and RMM growth.\n\n"; auto [input, metadata] = read_parquet(input_filepath); // Status string to indicate if page stats are set to be written or not auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats"; // Write parquet file with the specified encoding and compression std::cout << "Writing " << output_filepath << " with encoding, compression and " - << page_stat_string << ".." << std::endl; + << page_stat_string << "..\n"; // `timer` is automatically started here cudf::examples::timer timer; @@ -149,7 +162,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Read the parquet file written with encoding and compression - std::cout << "Reading " << output_filepath << "..." << std::endl; + std::cout << "Reading " << output_filepath << "...\n"; // Reset the timer timer.reset(); @@ -157,23 +170,7 @@ int main(int argc, char const** argv) timer.print_elapsed_millis(); // Check for validity - try { - // Left anti-join the original and transcoded tables - // identical tables should not throw an exception and - // return an empty indices vector - auto const indices = cudf::left_anti_join(input->view(), - transcoded_input->view(), - cudf::null_equality::EQUAL, - cudf::get_default_stream(), - resource.get()); - - // No exception thrown, check indices - auto const valid = indices->size() == 0; - std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl; - } catch (std::exception& e) { - std::cerr << e.what() << std::endl << std::endl; - std::cout << "Transcoding valid: false" << std::endl; - } + check_tables_equal(input->view(), transcoded_input->view()); return 0; } diff --git a/cpp/examples/parquet_io/parquet_io_multithreaded.cpp b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp new file mode 100644 index 00000000000..6ad4b862240 --- /dev/null +++ b/cpp/examples/parquet_io/parquet_io_multithreaded.cpp @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../utilities/timer.hpp" +#include "common_utils.hpp" +#include "io_source.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +/** + * @file parquet_io_multithreaded.cpp + * @brief Demonstrates reading parquet data from the specified io source using multiple threads. + * + * The input parquet data is provided via files which are converted to the specified io source type + * to be read using multiple threads. Optionally, the parquet data read by each thread can be + * written to corresponding files and checked for validity against the input data. + * + * Run: ``parquet_io_multithreaded -h`` to see help with input args and more information. + * + * The following io source types are supported: + * IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER, DEVICE_BUFFER + * + */ + +// Type alias for unique ptr to cudf table +using table_t = std::unique_ptr<cudf::table>; + +/** + * @brief Behavior when handling the read tables by multiple threads + */ +enum class read_mode { + NO_CONCATENATE, ///< Only read and discard tables + CONCATENATE_THREAD, ///< Read and concatenate tables from each thread + CONCATENATE_ALL, ///< Read and concatenate everything to a single table +}; + +/** + * @brief Functor for multithreaded parquet reading based on the provided read_mode + */ +template <read_mode READ_MODE> +struct read_fn { + std::vector<io_source> const& input_sources; + std::vector<table_t>& tables; + int const thread_id; + int const thread_count; + rmm::cuda_stream_view stream; + + void operator()() + { + // Tables read by this thread + std::vector<table_t> tables_this_thread; + + // Sweep the available input files + for (auto curr_file_idx = thread_id; curr_file_idx < input_sources.size(); + curr_file_idx += thread_count) { + auto builder = + cudf::io::parquet_reader_options::builder(input_sources[curr_file_idx].get_source_info()); + auto const options = builder.build(); + if constexpr (READ_MODE != read_mode::NO_CONCATENATE) { + tables_this_thread.push_back(cudf::io::read_parquet(options, stream).tbl); + } else { + cudf::io::read_parquet(options, stream); + } + } + + // Concatenate the tables read by this thread if not NO_CONCATENATE read_mode. + if constexpr (READ_MODE != read_mode::NO_CONCATENATE) { + auto table = concatenate_tables(std::move(tables_this_thread), stream); + stream.synchronize_no_throw(); + tables[thread_id] = std::move(table); + } else { + // Just synchronize this stream and exit + stream.synchronize_no_throw(); + } + } +}; + +/** + * @brief Function to setup and launch multithreaded parquet reading. + * + * @tparam READ_MODE Specifies whether to concatenate and return the actual + * tables, or discard them and return an empty vector + * + * @param input_sources List of input sources to read + * @param thread_count Number of threads + * @param stream_pool CUDA stream pool to use for threads + * + * @return Vector of read tables.
+ */ +template <read_mode READ_MODE> +std::vector<table_t> read_parquet_multithreaded(std::vector<io_source> const& input_sources, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Tables read by each thread + std::vector<table_t> tables(thread_count); + + // Table reading tasks + std::vector<read_fn<READ_MODE>> read_tasks; + read_tasks.reserve(thread_count); + + // Create the read tasks + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + read_tasks.emplace_back( + read_fn<READ_MODE>{input_sources, tables, tid, thread_count, stream_pool.get_stream()}); + }); + + // Create threads with tasks + std::vector<std::thread> threads; + threads.reserve(thread_count); + for (auto& c : read_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } + + // If CONCATENATE_ALL mode, then concatenate to a vector of one final table. + if constexpr (READ_MODE == read_mode::CONCATENATE_ALL) { + auto stream = stream_pool.get_stream(); + auto final_tbl = concatenate_tables(std::move(tables), stream); + stream.synchronize(); + tables.clear(); + tables.emplace_back(std::move(final_tbl)); + } + + return tables; +} + +/** + * @brief Functor for multithreaded parquet writing + */ +struct write_fn { + std::string const& output_path; + std::vector<cudf::table_view> const& table_views; + int const thread_id; + rmm::cuda_stream_view stream; + + void operator()() + { + // Create a sink + cudf::io::sink_info const sink_info{output_path + "/table_" + std::to_string(thread_id) + + ".parquet"}; + // Writer options builder + auto builder = cudf::io::parquet_writer_options::builder(sink_info, table_views[thread_id]); + // Create a new metadata for the table + auto table_metadata = cudf::io::table_input_metadata{table_views[thread_id]}; + + builder.metadata(table_metadata); + auto options = builder.build(); + + // Write parquet data + cudf::io::write_parquet(options, stream); + + // Done with this stream + stream.synchronize_no_throw(); + } +}; + +/** + * @brief Function to setup and launch multithreaded writing parquet files. + * + * @param output_path Path to output directory + * @param tables List of at least `thread_count` table views to be written + * @param thread_count Number of threads to use for writing tables. + * @param stream_pool CUDA stream pool to use for threads + * + */ +void write_parquet_multithreaded(std::string const& output_path, + std::vector<cudf::table_view> const& tables, + int32_t thread_count, + rmm::cuda_stream_pool& stream_pool) +{ + // Table writing tasks + std::vector<write_fn> write_tasks; + write_tasks.reserve(thread_count); + std::for_each( + thrust::make_counting_iterator(0), thrust::make_counting_iterator(thread_count), [&](auto tid) { + write_tasks.emplace_back(write_fn{output_path, tables, tid, stream_pool.get_stream()}); + }); + + // Writer threads + std::vector<std::thread> threads; + threads.reserve(thread_count); + for (auto& c : write_tasks) { + threads.emplace_back(c); + } + for (auto& t : threads) { + t.join(); + } +} + +/** + * @brief Function to print example usage and argument information. + */ +void print_usage() +{ + std::cout + << "\nUsage: parquet_io_multithreaded <comma delimited list of dirs and/or files>\n" + "       <input files multiplier> <io source type> <number of reads>\n" + "       <thread count> <write to temp files and validate: Y/N>\n\n" + "Available IO source types: FILEPATH, HOST_BUFFER, PINNED_BUFFER (Default), " + "DEVICE_BUFFER\n\n" + "Note: Provide as many arguments as you like in the above order. Default values\n" + " for the unprovided arguments will be used.
All input parquet files will\n" " be converted to the specified IO source type before reading\n\n"; } + +/** + * @brief Function to process comma delimited input paths string to parquet files and/or dirs + * and convert them to specified io sources. + * + * Process the input path string containing directories (of parquet files) and/or individual + * parquet files into a list of input parquet files, multiply the list by `input_multiplier`, + * make sure to have at least `thread_count` files to satisfy at least one file per parallel thread, + * and convert the final list of files to a list of `io_source` and return. + * + * @param paths Comma delimited input paths string + * @param input_multiplier Multiplier for the input files list + * @param thread_count Number of threads being used in the example + * @param io_source_type Specified IO source type to convert input files to + * @param stream CUDA stream to use + * + * @return Vector of input sources for the given paths + */ +std::vector<io_source> extract_input_sources(std::string const& paths, + int32_t input_multiplier, + int32_t thread_count, + io_source_type io_source_type, + rmm::cuda_stream_view stream) +{ + // Get the delimited paths to directory and/or files. + std::vector<std::string> const delimited_paths = [&]() { + std::vector<std::string> paths_list; + std::stringstream strstream{paths}; + std::string path; + // Extract the delimited paths. + while (std::getline(strstream, path, char{','})) { + paths_list.push_back(path); + } + return paths_list; + }(); + + // List of parquet files + std::vector<std::string> parquet_files; + std::for_each(delimited_paths.cbegin(), delimited_paths.cend(), [&](auto const& path_string) { + std::filesystem::path path{path_string}; + // If this is a parquet file, add it. + if (std::filesystem::is_regular_file(path)) { + parquet_files.push_back(path_string); + } + // If this is a directory, add all files in the directory. + else if (std::filesystem::is_directory(path)) { + for (auto const& file : std::filesystem::directory_iterator(path)) { + if (std::filesystem::is_regular_file(file.path())) { + parquet_files.push_back(file.path().string()); + } else { + std::cout << "Skipping sub-directory: " << file.path().string() << "\n"; + } + } + } else { + print_usage(); + throw std::runtime_error("Encountered an invalid input path\n"); + } + }); + + // Current size of list of parquet files + auto const initial_size = parquet_files.size(); + if (initial_size == 0) { return {}; } + + // Reserve space + parquet_files.reserve(std::max<size_t>(thread_count, input_multiplier * parquet_files.size())); + + // Append the input files by input_multiplier times + std::for_each(thrust::make_counting_iterator(1), + thrust::make_counting_iterator(input_multiplier), + [&](auto i) { + parquet_files.insert(parquet_files.end(), + parquet_files.begin(), + parquet_files.begin() + initial_size); + }); + + // Cycle append parquet files from the existing ones if less than the thread_count + if (thread_count > static_cast<int32_t>(parquet_files.size())) { + std::cout << "Warning: Number of input sources < thread count. Cycling from\n"
Cycling from\n" + "and appending to current input sources such that the number of\n" + "input source == thread count\n"; + for (size_t idx = 0; thread_count > static_cast(parquet_files.size()); idx++) { + parquet_files.emplace_back(parquet_files[idx % initial_size]); + } + + // Vector of io sources + std::vector input_sources; + input_sources.reserve(parquet_files.size()); + // Transform input files to the specified io sources + std::transform(parquet_files.begin(), + parquet_files.end(), + std::back_inserter(input_sources), + [&](auto const& file_name) { + return io_source{file_name, io_source_type, stream}; + }); + stream.synchronize(); + return input_sources; +} + +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) +{ + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; + + // Set to the provided args + switch (argc) { + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); + } + + // Initialize mr, default stream and stream pool + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); + + // Check if there is nothing to do + if (input_sources.empty()) { + print_usage(); + throw std::runtime_error("No input files to read. 
+ +/** + * @brief The main function + */ +int32_t main(int argc, char const** argv) +{ + // Set arguments to defaults + std::string input_paths = "example.parquet"; + int32_t input_multiplier = 1; + int32_t num_reads = 1; + int32_t thread_count = 1; + io_source_type io_source_type = io_source_type::PINNED_BUFFER; + bool write_and_validate = false; + + // Set to the provided args + switch (argc) { + case 7: write_and_validate = get_boolean(argv[6]); [[fallthrough]]; + case 6: thread_count = std::max(thread_count, std::stoi(std::string{argv[5]})); [[fallthrough]]; + case 5: num_reads = std::max(1, std::stoi(argv[4])); [[fallthrough]]; + case 4: io_source_type = get_io_source_type(argv[3]); [[fallthrough]]; + case 3: + input_multiplier = std::max(input_multiplier, std::stoi(std::string{argv[2]})); + [[fallthrough]]; + case 2: + // Check if instead of input_paths, the first argument is `-h` or `--help` + if (auto arg = std::string{argv[1]}; arg != "-h" and arg != "--help") { + input_paths = std::move(arg); + break; + } + [[fallthrough]]; + default: print_usage(); throw std::runtime_error(""); + } + + // Initialize mr, default stream and stream pool + auto const is_pool_used = true; + auto resource = create_memory_resource(is_pool_used); + auto default_stream = cudf::get_default_stream(); + auto stream_pool = rmm::cuda_stream_pool(thread_count); + auto stats_mr = + rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get()); + rmm::mr::set_current_device_resource(&stats_mr); + + // List of input sources from the input_paths string. + auto const input_sources = extract_input_sources( + input_paths, input_multiplier, thread_count, io_source_type, default_stream); + + // Check if there is nothing to do + if (input_sources.empty()) { + print_usage(); + throw std::runtime_error("No input files to read. Exiting early.\n"); + } + + // Read the same parquet files specified times with multiple threads and discard the read tables + { + // Print status + std::cout << "\nReading " << input_sources.size() << " input sources " << num_reads + << " time(s) using " << thread_count + << " threads and discarding output " + "tables..\n"; + + if (io_source_type == io_source_type::FILEPATH) { + std::cout << "Note that the first read may include times for nvcomp, cufile loading and RMM " + "growth.\n\n"; + } + + cudf::examples::timer timer; + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_reads), + [&](auto i) { // Read parquet files and discard the tables + std::ignore = read_parquet_multithreaded<read_mode::NO_CONCATENATE>( + input_sources, thread_count, stream_pool); + }); + default_stream.synchronize(); + timer.print_elapsed_millis(); + } + + // Write parquet files and validate if needed + if (write_and_validate) { + // read_mode::CONCATENATE_THREAD returns a vector of `thread_count` tables + auto const tables = read_parquet_multithreaded<read_mode::CONCATENATE_THREAD>( + input_sources, thread_count, stream_pool); + default_stream.synchronize(); + + // Construct a vector of table views for write_parquet_multithreaded + auto const table_views = [&tables]() { + std::vector<cudf::table_view> table_views; + table_views.reserve(tables.size()); + std::transform( + tables.cbegin(), tables.cend(), std::back_inserter(table_views), [](auto const& tbl) { + return tbl->view(); + }); + return table_views; + }(); + + // Write tables to parquet + std::cout << "Writing parquet output files..\n"; + + // Create a directory at the tmpdir path. + std::string output_path = + std::filesystem::temp_directory_path().string() + "/output_" + current_date_and_time(); + std::filesystem::create_directory({output_path}); + cudf::examples::timer timer; + write_parquet_multithreaded(output_path, table_views, thread_count, stream_pool); + default_stream.synchronize(); + timer.print_elapsed_millis(); + + // Verify the output + std::cout << "Verifying output..\n"; + + // Simply concatenate the previously read tables from input sources + auto const input_table = cudf::concatenate(table_views, default_stream); + + // Sources from written parquet files + auto const written_pq_sources = extract_input_sources( + output_path, input_multiplier, thread_count, io_source_type, default_stream); + + // read_mode::CONCATENATE_ALL returns a concatenated vector of 1 table only + auto const transcoded_table = std::move(read_parquet_multithreaded<read_mode::CONCATENATE_ALL>( + written_pq_sources, thread_count, stream_pool) + .back()); + default_stream.synchronize(); + + // Check if the tables are identical + check_tables_equal(input_table->view(), transcoded_table->view()); + + // Remove the created temp directory and parquet data + std::filesystem::remove_all(output_path); + } + + // Print peak memory + std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n\n"; + + return 0; +}
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index c3b68b52c36..6bbe32de134 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -378,6 +378,26 @@ std::unique_ptr<column> make_strings_column( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Construct a batch of STRING type columns given an array of device spans of pointer/size + * pairs. + * + * This function has input/output expectation similar to the `make_strings_column()` API that + * accepts only one device span of pointer/size pairs. The difference is that this is designed to + * create many strings columns at once with minimal overhead of multiple kernel launches and + * stream synchronizations. + * + * @param input Array of device spans of pointer/size pairs, where each pointer is a device memory + * address or `nullptr` (indicating a null string), and size is string length (in bytes) + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used for memory allocation of the output columns + * @return Array of constructed strings columns + */ +std::vector<std::unique_ptr<column>> make_strings_column_batch( + std::vector<cudf::device_span<thrust::pair<char const*, size_type> const>> const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Construct a STRING type column given a device span of string_view. *
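The batch API is easiest to see next to the single-span overload it generalizes; a sketch, assuming the spans were prepared as in the make_strings_column.cu benchmark earlier in this diff (build_spans() is a hypothetical helper):

using string_index_pair = thrust::pair<char const*, cudf::size_type>;
// One device span of {pointer, length} pairs per output column.
std::vector<cudf::device_span<string_index_pair const>> spans = build_spans();
// One batched call amortizes kernel launches and stream synchronizations...
auto columns = cudf::make_strings_column_batch(spans);
// ...compared to constructing each column independently:
// for (auto const& s : spans) { columns.push_back(cudf::make_strings_column(s)); }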
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 7359a0d5fde..1eaea5b6374 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -38,6 +38,22 @@ namespace datetime { * @file */ +/** + * @brief Types of datetime components that may be extracted. + */ +enum class datetime_component : uint8_t { + YEAR, + MONTH, + DAY, + WEEKDAY, + HOUR, + MINUTE, + SECOND, + MILLISECOND, + MICROSECOND, + NANOSECOND +}; + /** * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. @@ -207,6 +223,24 @@ std::unique_ptr<cudf::column> extract_nanosecond_fraction( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Extracts the specified datetime component from any datetime type and + * returns an int16_t cudf::column. + * + * @param column cudf::column_view of the input datetime values + * @param component The datetime component to extract + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t datetime component + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr<cudf::column> extract_datetime_component( + cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group /** * @addtogroup datetime_compute
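A sketch of the unified extraction path added above (`timestamps` is assumed to be some TIMESTAMP column):

// Equivalent to cudf::datetime::extract_hour(timestamps->view()), but dispatched
// through the single component-parameterized entry point; the result is INT16.
auto hours = cudf::datetime::extract_datetime_component(
  timestamps->view(), cudf::datetime::datetime_component::HOUR);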
update_target_element; auto value = static_cast(source.element(source_index)); cudf::detail::atomic_add(&target.element(target_index), value * value); @@ -267,8 +269,6 @@ struct update_target_element; cudf::detail::atomic_mul(&target.element(target_index), static_cast(source.element(source_index))); @@ -286,8 +286,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; cudf::detail::atomic_add(&target.element(target_index), Target{1}); @@ -323,8 +321,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMAX_SENTINEL, source_index); @@ -349,8 +345,6 @@ struct update_target_element< column_device_view source, size_type source_index) const noexcept { - if (source.is_null(source_index)) { return; } - using Target = target_type_t; auto old = cudf::detail::atomic_cas( &target.element(target_index), ARGMIN_SENTINEL, source_index); @@ -376,6 +370,9 @@ struct elementwise_aggregator { column_device_view source, size_type source_index) const noexcept { + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } update_target_element{}(target, target_index, source, source_index); } }; diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index dfb646c66c4..4159e324472 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include -#include #include #include @@ -256,7 +256,7 @@ struct scatter_gather_functor { cudf::detail::grid_1d grid{input.size(), block_size, per_thread}; - rmm::device_scalar null_count{0, stream}; + cudf::detail::device_scalar null_count{0, stream}; if (output.nullable()) { // Have to initialize the output mask to all zeros because we may update // it with atomicOr(). 
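
The deletions above remove the per-specialization `if (source.is_null(source_index)) { return; }` guards; a single guard now lives in `elementwise_aggregator::operator()`, with `COUNT_ALL` exempt because it must count null rows too. The pattern, reduced to a self-contained sketch with stand-in types:

@code{.cpp}
enum class Kind { SUM, MIN, MAX, COUNT_ALL };

// Kind-specific update; body elided. Stand-in for update_target_element.
template <Kind k>
struct update_element {
  void operator()() const { /* atomic update for kind k */ }
};

// One guard in the dispatcher instead of one copy per specialization.
template <Kind k>
void elementwise_update(bool source_is_null)
{
  if constexpr (k != Kind::COUNT_ALL) {
    if (source_is_null) { return; }
  }
  update_element<k>{}();
}
@endcode
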
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh index a70cd5a0661..5dc75b1a3fb 100644 --- a/cpp/include/cudf/detail/copy_if_else.cuh +++ b/cpp/include/cudf/detail/copy_if_else.cuh @@ -19,12 +19,11 @@ #include #include #include +#include #include #include #include -#include - #include #include @@ -171,7 +170,7 @@ std::unique_ptr copy_if_else(bool nullable, // if we have validity in the output if (nullable) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; // call the kernel copy_if_else_kernel diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index 3aa136d630b..fcb80fe45f7 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -154,7 +154,7 @@ void copy_range(SourceValueIterator source_value_begin, auto grid = cudf::detail::grid_1d{num_items, block_size, 1}; if (target.nullable()) { - rmm::device_scalar null_count(target.null_count(), stream); + cudf::detail::device_scalar null_count(target.null_count(), stream); auto kernel = copy_range_kernel; diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 9db7e48498f..df3050d6494 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -115,6 +115,16 @@ std::unique_ptr extract_nanosecond_fraction(cudf::column_view cons rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +/** + * @copydoc cudf::extract_datetime_component(cudf::column_view const&, datetime_component, + * rmm::cuda_stream_view, rmm::device_async_resource_ref) + * + */ +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + /** * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::cuda_stream_view, * rmm::device_async_resource_ref) diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp new file mode 100644 index 00000000000..16ca06c6561 --- /dev/null +++ b/cpp/include/cudf/detail/device_scalar.hpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +template +class device_scalar : public rmm::device_scalar { + public: +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + ~device_scalar() = default; + +// Implementation is the same as what compiler should generate +// Could not use default move constructor as 11.8 compiler fails to generate it +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + device_scalar(device_scalar&& other) noexcept + : rmm::device_scalar{std::move(other)}, bounce_buffer{std::move(other.bounce_buffer)} + { + } + device_scalar& operator=(device_scalar&&) noexcept = default; + + device_scalar(device_scalar const&) = delete; + device_scalar& operator=(device_scalar const&) = delete; + + device_scalar() = delete; + + explicit device_scalar( + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + explicit device_scalar( + T const& initial_value, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + bounce_buffer[0] = initial_value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + device_scalar(device_scalar const& other, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) + : rmm::device_scalar(other, stream, mr), bounce_buffer{make_host_vector(1, stream)} + { + } + + [[nodiscard]] T value(rmm::cuda_stream_view stream) const + { + cuda_memcpy(bounce_buffer, device_span{this->data(), 1}, stream); + return bounce_buffer[0]; + } + + void set_value_async(T const& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = value; + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_async(T&& value, rmm::cuda_stream_view stream) + { + bounce_buffer[0] = std::move(value); + cuda_memcpy_async(device_span{this->data(), 1}, bounce_buffer, stream); + } + + void set_value_to_zero_async(rmm::cuda_stream_view stream) { set_value_async(T{}, stream); } + + private: + mutable cudf::detail::host_vector bounce_buffer; +}; + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh deleted file mode 100644 index 7de79b31bc7..00000000000 --- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
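
The `device_scalar` above keeps the `rmm::device_scalar` interface but stages every host-side transfer through a pinned `host_vector` bounce buffer, so `value()` and `set_value_async()` never touch pageable memory. A minimal caller sketch, with the kernel launches elided:

@code{.cpp}
#include <cudf/detail/device_scalar.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>

// Sketch: a counter that kernels update on the device and the host reads
// back once; both directions of the copy go through the pinned buffer.
cudf::size_type run_count(rmm::cuda_stream_view stream)
{
  cudf::detail::device_scalar<cudf::size_type> count{0, stream};
  // ... launch kernels that atomically increment *count.data() on `stream` ...
  return count.value(stream);  // async copy into the bounce buffer, then sync
}
@endcode
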
- */ - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf::detail { - -using hash_map_type = cuco::legacy:: - static_map>; - -/** - * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are - * rows that compared equal. - * - * TODO: We need to switch to use `static_reduction_map` when it is ready - * (https://github.com/NVIDIA/cuCollections/pull/98). - */ -template -struct reduce_by_row_fn_base { - protected: - MapView const d_map; - KeyHasher const d_hasher; - KeyEqual const d_equal; - OutputType* const d_output; - - reduce_by_row_fn_base(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - OutputType* const d_output) - : d_map{d_map}, d_hasher{d_hasher}, d_equal{d_equal}, d_output{d_output} - { - } - - /** - * @brief Return a pointer to the output array at the given index. - * - * @param idx The access index - * @return A pointer to the given index in the output array - */ - __device__ OutputType* get_output_ptr(size_type const idx) const - { - auto const iter = d_map.find(idx, d_hasher, d_equal); - - if (iter != d_map.end()) { - // Only one (undetermined) index value of the duplicate rows could be inserted into the map. - // As such, looking up for all indices of duplicate rows always returns the same value. - auto const inserted_idx = iter->second.load(cuda::std::memory_order_relaxed); - - // All duplicate rows will have concurrent access to this same output slot. - return &d_output[inserted_idx]; - } else { - // All input `idx` values have been inserted into the map before. - // Thus, searching for an `idx` key resulting in the `end()` iterator only happens if - // `d_equal(idx, idx) == false`. - // Such situations are due to comparing nulls or NaNs which are considered as always unequal. - // In those cases, all rows containing nulls or NaNs are distinct. Just return their direct - // output slot. - return &d_output[idx]; - } - } -}; - -/** - * @brief Perform a reduction on groups of rows that are compared equal. - * - * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared - * equal. A hash table is used to find groups of equal rows. - * - * At the beginning of the operation, the entire output array is filled with a value given by - * the `init` parameter. Then, the reduction result for each row group is written into the output - * array at the index of an unspecified row in the group. - * - * @tparam ReduceFuncBuilder The builder class that must have a `build()` method returning a - * reduction functor derived from `reduce_by_row_fn_base` - * @tparam OutputType Type of the reduction results - * @param map The auxiliary map to perform reduction - * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row - * comparisons - * @param num_rows The number of all input rows - * @param has_nulls Indicate whether the input rows has any nulls at any nested levels - * @param has_nested_columns Indicates whether the input table has any nested columns - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether NaN values in floating point column should be - * considered equal. 
- * @param init The initial value for reduction of each row group - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned vector - * @return A device_uvector containing the reduction results - */ -template -rmm::device_uvector hash_reduce_by_row( - hash_map_type const& map, - std::shared_ptr const preprocessed_input, - size_type num_rows, - cudf::nullate::DYNAMIC has_nulls, - bool has_nested_columns, - null_equality nulls_equal, - nan_equality nans_equal, - ReduceFuncBuilder func_builder, - OutputType init, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const map_dview = map.get_device_view(); - auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); - auto const key_hasher = row_hasher.device_hasher(has_nulls); - auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - - auto reduction_results = rmm::device_uvector(num_rows, stream, mr); - thrust::uninitialized_fill( - rmm::exec_policy(stream), reduction_results.begin(), reduction_results.end(), init); - - auto const reduce_by_row = [&](auto const value_comp) { - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, nulls_equal, value_comp); - thrust::for_each( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_rows), - func_builder.build(map_dview, key_hasher, key_equal, reduction_results.begin())); - } - }; - - if (nans_equal == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - reduce_by_row(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; - reduce_by_row(nan_unequal_comparator{}); - } - - return reduction_results; -} - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh index 4349e1b70fd..30f36d6a5da 100644 --- a/cpp/include/cudf/detail/iterator.cuh +++ b/cpp/include/cudf/detail/iterator.cuh @@ -38,18 +38,19 @@ #include #include +#include +#include #include #include #include #include -#include - namespace cudf { namespace detail { /** * @brief Convenience wrapper for creating a `thrust::transform_iterator` over a - * `thrust::counting_iterator`. + * `thrust::counting_iterator` within the range [0, INT_MAX]. + * * * Example: * @code{.cpp} @@ -62,14 +63,21 @@ namespace detail { * iter[n] == n * n * @endcode * - * @param start The starting value of the counting iterator + * @param start The starting value of the counting iterator (must be size_type or smaller type). * @param f The unary function to apply to the counting iterator. 
* @return A transform iterator that applies `f` to a counting iterator */ -template -CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(cudf::size_type start, +template +CUDF_HOST_DEVICE inline auto make_counting_transform_iterator(CountingIterType start, UnaryFunction f) { + // Check if the `start` for counting_iterator is of size_type or a smaller integral type + static_assert( + cuda::std::is_integral_v and + cuda::std::numeric_limits::digits <= + cuda::std::numeric_limits::digits, + "The `start` for the counting_transform_iterator must be size_type or smaller type"); + return thrust::make_transform_iterator(thrust::make_counting_iterator(start), f); } diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 327c732716c..482265d633e 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include #include @@ -165,7 +165,7 @@ size_type inplace_bitmask_binop(Binop op, "Mask pointer cannot be null"); rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_counter{0, stream, mr}; + cudf::detail::device_scalar d_counter{0, stream, mr}; rmm::device_uvector d_masks(masks.size(), stream, mr); rmm::device_uvector d_begin_bits(masks_begin_bits.size(), stream, mr); diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 18b1e9b2d2e..0f852db0c54 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -59,7 +59,7 @@ std::unique_ptr true_if(InputIterator begin, auto output_mutable_view = output->mutable_view(); auto output_data = output_mutable_view.data(); - thrust::transform(rmm::exec_policy(stream), begin, end, output_data, p); + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, output_data, p); return output; } diff --git a/cpp/include/cudf/detail/utilities/batched_memcpy.hpp b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp new file mode 100644 index 00000000000..ed0ab9e6e5b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/batched_memcpy.hpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +namespace CUDF_EXPORT cudf { +namespace detail { + +/** + * @brief A helper function that copies a vector of vectors from source to destination addresses in + * a batched manner. 
+ * + * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator + * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator + * @tparam SizeIterator **[inferred]** The type of device-accessible buffer size iterator + * + * @param src_iter Device-accessible iterator to source addresses + * @param dst_iter Device-accessible iterator to destination addresses + * @param size_iter Device-accessible iterator to the buffer sizes (in bytes) + * @param num_buffs Number of buffers to be copied + * @param stream CUDA stream to use + */ +template +void batched_memcpy_async(SrcIterator src_iter, + DstIterator dst_iter, + SizeIterator size_iter, + size_t num_buffs, + rmm::cuda_stream_view stream) +{ + size_t temp_storage_bytes = 0; + cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_iter, dst_iter, size_iter, num_buffs, stream.value()); + + rmm::device_buffer d_temp_storage{temp_storage_bytes, stream.value()}; + + cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_iter, + dst_iter, + size_iter, + num_buffs, + stream.value()); +} + +} // namespace detail +} // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/detail/utilities/batched_memset.hpp similarity index 95% rename from cpp/include/cudf/io/detail/batched_memset.hpp rename to cpp/include/cudf/detail/utilities/batched_memset.hpp index 1c74be4a9fe..78be5b91248 100644 --- a/cpp/include/cudf/io/detail/batched_memset.hpp +++ b/cpp/include/cudf/detail/utilities/batched_memset.hpp @@ -28,7 +28,7 @@ #include namespace CUDF_EXPORT cudf { -namespace io::detail { +namespace detail { /** * @brief A helper function that takes in a vector of device spans and memsets them to the @@ -53,8 +53,8 @@ void batched_memset(std::vector> const& bufs, cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref()); // get a vector with the sizes of all buffers - auto sizes = cudf::detail::make_counting_transform_iterator( - static_cast(0), + auto sizes = thrust::make_transform_iterator( + thrust::counting_iterator(0), cuda::proclaim_return_type( [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); })); @@ -78,5 +78,5 @@ void batched_memset(std::vector> const& bufs, d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream); } -} // namespace io::detail +} // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 632d5a732ec..4f0c52c5954 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include @@ -25,33 +26,82 @@ namespace detail { enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; +void cuda_memcpy_async_impl( + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); + /** - * @brief Asynchronously copies data between the host and device. + * @brief Asynchronously copies data from host to device memory. * * Implementation may use different strategies depending on the size and type of host data. 
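
The span-based `cuda_memcpy_async` overloads introduced in this hunk replace the raw pointer/size/kind signature; the memory kind is now inferred from `is_device_accessible()` on the host span. A host-to-device sketch under these overloads, with illustrative buffer names:

@code{.cpp}
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

void upload(rmm::cuda_stream_view stream)
{
  auto src = cudf::detail::make_host_vector<int>(1024, stream);
  // ... fill src ...
  rmm::device_uvector<int> dst(src.size(), stream);
  // Sizes must match; PINNED vs PAGEABLE is detected from the host span's
  // is_device_accessible() rather than passed by the caller.
  cudf::detail::cuda_memcpy_async<int>(dst, src, stream);
}
@endcode
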
* - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination device memory + * @param src Source host memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy_async( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = src.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} /** - * @brief Synchronously copies data between the host and device. + * @brief Asynchronously copies data from device to host memory. * * Implementation may use different strategies depending on the size and type of host data. * - * @param dst Destination memory address - * @param src Source memory address - * @param size Number of bytes to copy - * @param kind Type of host memory + * @param dst Destination host memory + * @param src Source device memory * @param stream CUDA stream used for the copy */ -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); +template +void cuda_memcpy_async(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(dst.size() == src.size(), "Mismatched sizes in cuda_memcpy_async"); + auto const is_pinned = dst.is_device_accessible(); + cuda_memcpy_async_impl(dst.data(), + src.data(), + src.size_bytes(), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); +} + +/** + * @brief Synchronously copies data from host to device memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination device memory + * @param src Source host memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(device_span dst, host_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} + +/** + * @brief Synchronously copies data from device to host memory. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination host memory + * @param src Source device memory + * @param stream CUDA stream used for the copy + */ +template +void cuda_memcpy(host_span dst, device_span src, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, stream); + stream.synchronize(); +} } // namespace detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp index 8c1c3c28df8..e7643eb44bd 100644 --- a/cpp/include/cudf/detail/utilities/logger.hpp +++ b/cpp/include/cudf/detail/utilities/logger.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,9 +19,9 @@ #include // Log messages that require computation should only be used at level TRACE and DEBUG -#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_DEBUG(...) 
SPDLOG_LOGGER_DEBUG(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::logger(), __VA_ARGS__) -#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::logger(), __VA_ARGS__) +#define CUDF_LOG_TRACE(...) SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_DEBUG(...) SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_INFO(...) SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_WARN(...) SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_ERROR(...) SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__) +#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 953ae5b9308..1f1e7a2db77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -101,12 +101,7 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - auto const is_pinned = source_data.is_device_accessible(); - cuda_memcpy_async(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + cuda_memcpy_async(ret, source_data, stream); return ret; } @@ -405,13 +400,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = make_host_vector(v.size(), stream); - auto const is_pinned = result.get_allocator().is_device_accessible(); - cuda_memcpy_async(result.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); + auto result = make_host_vector(v.size(), stream); + cuda_memcpy_async(result, v, stream); return result; } diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index cfb2e70bfed..af182b69c3a 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include @@ -25,7 +26,6 @@ #include #include -#include #include @@ -101,7 +101,7 @@ std::pair valid_if(InputIterator begin, size_type null_count{0}; if (size > 0) { - rmm::device_scalar valid_count{0, stream}; + cudf::detail::device_scalar valid_count{0, stream}; constexpr size_type block_size{256}; grid_1d grid{size, block_size}; diff --git a/cpp/include/cudf/hashing/detail/helper_functions.cuh b/cpp/include/cudf/hashing/detail/helper_functions.cuh index 3489fdeccee..ea1accc62a4 100644 --- a/cpp/include/cudf/hashing/detail/helper_functions.cuh +++ b/cpp/include/cudf/hashing/detail/helper_functions.cuh @@ -47,197 +47,3 @@ inline size_t compute_hash_table_size(cudf::size_type num_keys_to_insert, return hash_table_size; } - -template -__forceinline__ __device__ pair_type load_pair_vectorized(pair_type const* __restrict__ const ptr) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.vec_val = *reinterpret_cast(ptr); - return converter.pair_val; - } else { - return *ptr; - } -} - -template -__forceinline__ __device__ void store_pair_vectorized(pair_type* __restrict__ const ptr, - pair_type const val) -{ - if (sizeof(uint4) == sizeof(pair_type)) { - union pair_type2vec_type { - uint4 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0, 0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(uint2) == sizeof(pair_type)) { - union pair_type2vec_type { - uint2 vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0, 0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(int) == sizeof(pair_type)) { - union pair_type2vec_type { - int vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else if (sizeof(short) == sizeof(pair_type)) { - union pair_type2vec_type { - short vec_val; - pair_type pair_val; - }; - pair_type2vec_type converter = {0}; - converter.pair_val = val; - *reinterpret_cast(ptr) = converter.vec_val; - } else { - *ptr = val; - } -} - -template -CUDF_KERNEL void init_hashtbl(value_type* __restrict__ const hashtbl_values, - size_type const n, - key_type const key_val, - elem_type const elem_val) -{ - 
size_type const idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - store_pair_vectorized(hashtbl_values + idx, thrust::make_pair(key_val, elem_val)); - } -} - -template -struct equal_to { - using result_type = bool; - using first_argument_type = T; - using second_argument_type = T; - __forceinline__ __host__ __device__ constexpr bool operator()( - first_argument_type const& lhs, second_argument_type const& rhs) const - { - return lhs == rhs; - } -}; - -template -class cycle_iterator_adapter { - public: - using value_type = typename std::iterator_traits::value_type; - using difference_type = typename std::iterator_traits::difference_type; - using pointer = typename std::iterator_traits::pointer; - using reference = typename std::iterator_traits::reference; - using iterator_type = Iterator; - - cycle_iterator_adapter() = delete; - - __host__ __device__ explicit cycle_iterator_adapter(iterator_type const& begin, - iterator_type const& end, - iterator_type const& current) - : m_begin(begin), m_end(end), m_current(current) - { - } - - __host__ __device__ cycle_iterator_adapter& operator++() - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter const& operator++() const - { - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return *this; - } - - __host__ __device__ cycle_iterator_adapter& operator++(int) - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ cycle_iterator_adapter const& operator++(int) const - { - cycle_iterator_adapter old(m_begin, m_end, m_current); - if (m_end == (m_current + 1)) - m_current = m_begin; - else - ++m_current; - return old; - } - - __host__ __device__ bool equal(cycle_iterator_adapter const& other) const - { - return m_current == other.m_current && m_begin == other.m_begin && m_end == other.m_end; - } - - __host__ __device__ reference& operator*() { return *m_current; } - - __host__ __device__ reference const& operator*() const { return *m_current; } - - __host__ __device__ const pointer operator->() const { return m_current.operator->(); } - - __host__ __device__ pointer operator->() { return m_current; } - - private: - iterator_type m_current; - iterator_type m_begin; - iterator_type m_end; -}; - -template -__host__ __device__ bool operator==(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return lhs.equal(rhs); -} - -template -__host__ __device__ bool operator!=(cycle_iterator_adapter const& lhs, - cycle_iterator_adapter const& rhs) -{ - return !lhs.equal(rhs); -} diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp index 1827ba0e3e6..13a76d50346 100644 --- a/cpp/include/cudf/io/config_utils.hpp +++ b/cpp/include/cudf/io/config_utils.hpp @@ -18,7 +18,8 @@ #include namespace CUDF_EXPORT cudf { -namespace io::cufile_integration { +namespace io { +namespace cufile_integration { /** * @brief Returns true if cuFile and its compatibility mode are enabled. @@ -35,9 +36,15 @@ bool is_gds_enabled(); */ bool is_kvikio_enabled(); -} // namespace io::cufile_integration +/** + * @brief Set kvikIO thread pool size according to the environment variable KVIKIO_NTHREADS. If + * KVIKIO_NTHREADS is not set, use 8 threads by default. 
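
A sketch of how a host application might use the function declared just below; `setenv` is POSIX, and the thread count here is only an example:

@code{.cpp}
#include <cudf/io/config_utils.hpp>

#include <cstdlib>

int main()
{
  // Illustrative: choose the pool size before the first cudf IO call.
  setenv("KVIKIO_NTHREADS", "16", 1 /*overwrite*/);
  cudf::io::cufile_integration::set_thread_pool_nthreads_from_env();
  // ... subsequent reads use a 16-thread kvikIO pool; unset means 8 ...
  return 0;
}
@endcode
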
+ */ +void set_thread_pool_nthreads_from_env(); + +} // namespace cufile_integration -namespace io::nvcomp_integration { +namespace nvcomp_integration { /** * @brief Returns true if all nvCOMP uses are enabled. @@ -49,5 +56,6 @@ bool is_all_enabled(); */ bool is_stable_enabled(); -} // namespace io::nvcomp_integration +} // namespace nvcomp_integration +} // namespace io } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index b12fbe39a57..7d2cc4ad493 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -86,14 +86,21 @@ class datasource { /** * @brief Creates a source from a file path. * + * Parameters `offset` and `max_size_estimate` are hints to the `datasource` implementation about + * the expected range of the data that will be read. The implementation may use these hints to + * optimize the read operation. These parameters are usually based on the byte range option. In + * this case, `max_size_estimate` can include padding after the byte range, to include additional + * data that may be needed for processing. + * * @param[in] filepath Path to the file to use - * @param[in] offset Bytes from the start of the file (the default is zero) - * @param[in] size Bytes from the offset; use zero for entire file (the default is zero) + * @param[in] offset Starting byte offset from which data will be read (the default is zero) + * @param[in] max_size_estimate Upper estimate of the data range that will be read (the default is + * zero, which means the whole file after `offset`) * @return Constructed datasource object */ static std::unique_ptr create(std::string const& filepath, - size_t offset = 0, - size_t size = 0); + size_t offset = 0, + size_t max_size_estimate = 0); /** * @brief Creates a source from a host memory buffer. diff --git a/cpp/include/cudf/reduction/detail/reduction_operators.cuh b/cpp/include/cudf/reduction/detail/reduction_operators.cuh index 4cf8564ab3a..5694362af8f 100644 --- a/cpp/include/cudf/reduction/detail/reduction_operators.cuh +++ b/cpp/include/cudf/reduction/detail/reduction_operators.cuh @@ -31,17 +31,41 @@ namespace detail { // intermediate data structure to compute `var`, `std` template struct var_std { - ResultType value; /// the value - ResultType value_squared; /// the value of squared - - CUDF_HOST_DEVICE inline var_std(ResultType _value = 0, ResultType _value_squared = 0) - : value(_value), value_squared(_value_squared){}; + // Uses the pairwise approach of Chan, Golub, and LeVeque, + // _Algorithms for computing the sample variance: analysis and + // recommendations_ (1983) + // https://doi.org/10.1080/00031305.1983.10483115 + // Also http://www.cs.yale.edu/publications/techreports/tr222.pdf + // This is a modification of Youngs and Cramer's online approach. + ResultType running_sum; + ResultType running_square_deviations; + size_type count; + + CUDF_HOST_DEVICE inline var_std(ResultType t = 0, ResultType s = 0, size_type n = 0) + : running_sum(t), running_square_deviations(s), count(n){}; using this_t = var_std; CUDF_HOST_DEVICE inline this_t operator+(this_t const& rhs) const { - return this_t((this->value + rhs.value), (this->value_squared + rhs.value_squared)); + // Updates as per equations 1.5a and 1.5b in the paper + // T_{1,m+n} = T_{1,m} + T_{m+1,n+1} + // S_{1,m+n} = S_{1,m} + S_{m+1,n+1} + m/(n(m+n)) * (n/m T_{1,m} - T_{m+1,n+1})**2 + // Here the first m samples are in this, the remaining n samples are in rhs. 
+ auto const m = this->count; + auto const n = rhs.count; + // Avoid division by zero. + if (m == 0) { return rhs; } + if (n == 0) { return *this; } + auto const tm = this->running_sum; + auto const tn = rhs.running_sum; + auto const sm = this->running_square_deviations; + auto const sn = rhs.running_square_deviations; + auto const tmn = tm + tn; + auto const diff = ((static_cast(n) / m) * tm) - tn; + // Computing m/n(m+n) as m/n/(m+n) to avoid integer overflow + auto const smn = sm + sn + ((static_cast(m) / n) / (m + n)) * diff * diff; + return {tmn, smn, m + n}; }; }; @@ -50,10 +74,7 @@ template struct transformer_var_std { using OutputType = var_std; - CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) - { - return OutputType(value, value * value); - }; + CUDF_HOST_DEVICE inline OutputType operator()(ResultType const& value) { return {value, 0, 1}; }; }; // ------------------------------------------------------------------------ @@ -257,12 +278,7 @@ struct variance : public compound_op { cudf::size_type const& count, cudf::size_type const& ddof) { - ResultType mean = input.value / count; - ResultType asum = input.value_squared; - cudf::size_type div = count - ddof; - ResultType var = asum / div - ((mean * mean) * count) / div; - - return var; + return input.running_square_deviations / (count - ddof); }; }; }; diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 66be2a12fbe..360dde11fc0 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -94,8 +95,8 @@ class scalar { [[nodiscard]] bool const* validity_data() const; protected: - data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar - rmm::device_scalar _is_valid; ///< Device bool signifying validity + data_type _type{type_id::EMPTY}; ///< Logical type of value in the scalar + cudf::detail::device_scalar _is_valid; ///< Device bool signifying validity /** * @brief Move constructor for scalar. diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp index d6e87f9d543..febc63d8779 100644 --- a/cpp/include/cudf/strings/convert/convert_urls.hpp +++ b/cpp/include/cudf/strings/convert/convert_urls.hpp @@ -28,7 +28,7 @@ namespace strings { */ /** - * @brief Decodes each string using URL encoding. + * @brief Encodes each string using URL encoding. * * Converts mostly non-ascii characters and control characters into UTF-8 hex code-points * prefixed with '%'. For example, the space character must be converted to characters '%20' where @@ -49,7 +49,7 @@ std::unique_ptr url_encode( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Encodes each string using URL encoding. + * @brief Decodes each string using URL encoding. * * Converts all character sequences starting with '%' into character code-points * interpreting the 2 following characters as hex values to create the code-point. 
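
The `var_std` rewrite earlier in this diff replaces the sum and sum-of-squares accumulator, whose deleted finalization `asum / div - ((mean * mean) * count) / div` cancels catastrophically when the mean is large relative to the variance, with the pairwise merge of Chan, Golub, and LeVeque. A host-side sketch of the same merge using plain doubles:

@code{.cpp}
#include <cstddef>

// T and S follow the paper's notation; a single sample x seeds as {x, 0, 1}.
struct acc {
  double t{};       // running sum (T)
  double s{};       // running sum of squared deviations (S)
  std::size_t n{};  // sample count
};

acc merge(acc a, acc b)
{
  if (a.n == 0) { return b; }
  if (b.n == 0) { return a; }
  double const m    = static_cast<double>(a.n);
  double const n    = static_cast<double>(b.n);
  double const diff = (n / m) * a.t - b.t;
  // S_{1,m+n} = S_{1,m} + S_{m+1,m+n} + m/(n*(m+n)) * diff^2  (eq. 1.5b)
  return {a.t + b.t, a.s + b.s + (m / n / (m + n)) * diff * diff, a.n + b.n};
}
// Variance with ddof is then s / (n - ddof). Quick check: merging {1,0,1}
// and {2,0,1} yields s == 0.5, matching (1-1.5)^2 + (2-1.5)^2.
@endcode
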
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index 1283226879b..de2f1770e28 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -29,6 +30,8 @@ #include #include +#include +#include #include #include @@ -38,6 +41,62 @@ namespace cudf { namespace strings { namespace detail { +/** + * @brief Gather characters to create a strings column using the given string-index pair iterator + * + * @tparam IndexPairIterator iterator over type `pair` values + * + * @param offsets The offsets for the output strings column + * @param chars_size The size (in bytes) of the chars data + * @param begin Iterator to the first string-index pair + * @param strings_count The number of strings + * @param stream CUDA stream used for device memory operations + * @param mr Device memory resource used to allocate the returned column's device memory + * @return An array of chars gathered from the input string-index pair iterator + */ +template +rmm::device_uvector make_chars_buffer(column_view const& offsets, + int64_t chars_size, + IndexPairIterator begin, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto chars_data = rmm::device_uvector(chars_size, stream, mr); + auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets); + + auto const src_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([begin] __device__(uint32_t idx) { + // Due to a bug in cub (https://github.com/NVIDIA/cccl/issues/586), + // we have to use `const_cast` to remove `const` qualifier from the source pointer. + // This should be fine as long as we only read but not write anything to the source. + return reinterpret_cast(const_cast(begin[idx].first)); + })); + auto const src_sizes = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [begin] __device__(uint32_t idx) { return begin[idx].second; })); + auto const dst_ptrs = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([offsets = d_offsets, output = chars_data.data()] __device__( + uint32_t idx) { return output + offsets[idx]; })); + + size_t temp_storage_bytes = 0; + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched( + nullptr, temp_storage_bytes, src_ptrs, dst_ptrs, src_sizes, strings_count, stream.value())); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + CUDF_CUDA_TRY(cub::DeviceMemcpy::Batched(d_temp_storage.data(), + temp_storage_bytes, + src_ptrs, + dst_ptrs, + src_sizes, + strings_count, + stream.value())); + + return chars_data; +} + /** * @brief Create an offsets column to be a child of a compound column * diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 6b1b453a752..03240f418fe 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -49,16 +49,6 @@ namespace detail { */ using string_index_pair = thrust::pair; -/** - * @brief Average string byte-length threshold for deciding character-level - * vs. row-level parallel algorithm. 
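
Both `batched_memcpy_async` and `make_chars_buffer` in this diff lean on the same two-phase CUB call: a null first pass that only sizes the temporary storage, then the real batched copy. The shared skeleton, free-standing and not cudf-specific (iterator types are whatever CUB's `DeviceMemcpy::Batched` accepts):

@code{.cpp}
#include <cub/device/device_memcpy.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <cstddef>
#include <cstdint>

template <typename SrcIt, typename DstIt, typename SizeIt>
void batched_copy(SrcIt srcs, DstIt dsts, SizeIt sizes,
                  std::uint32_t num_buffers, rmm::cuda_stream_view stream)
{
  // First call: null temp storage, so CUB only reports the scratch size.
  std::size_t temp_bytes = 0;
  cub::DeviceMemcpy::Batched(
    nullptr, temp_bytes, srcs, dsts, sizes, num_buffers, stream.value());
  rmm::device_buffer temp{temp_bytes, stream};
  // Second call performs all num_buffers copies in a single launch.
  cub::DeviceMemcpy::Batched(
    temp.data(), temp_bytes, srcs, dsts, sizes, num_buffers, stream.value());
}
@endcode
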
- * - * This value was determined by running the factory_benchmark against different - * string lengths and observing the point where the performance is faster for - * long strings. - */ -constexpr size_type FACTORY_BYTES_PER_ROW_THRESHOLD = 64; - /** * @brief Create a strings-type column from iterators of pointer/size pairs * @@ -88,8 +78,6 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer); auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column( offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto const d_offsets = - cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // create null mask auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; }; @@ -99,38 +87,8 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr}; // build chars column - auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] { - auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); - // use a character-parallel kernel for long string lengths - if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const str_begin = thrust::make_transform_iterator( - begin, cuda::proclaim_return_type([] __device__(auto ip) { - return string_view{ip.first, ip.second}; - })); - - return gather_chars(str_begin, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(strings_count), - d_offsets, - bytes, - stream, - mr); - } else { - // this approach is 2-3x faster for a large number of smaller string lengths - auto chars_data = rmm::device_uvector(bytes, stream, mr); - auto d_chars = chars_data.data(); - auto copy_chars = [d_chars] __device__(auto item) { - string_index_pair const str = thrust::get<0>(item); - int64_t const offset = thrust::get<1>(item); - if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second); - }; - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)), - strings_count, - copy_chars); - return chars_data; - } - }(); + auto chars_data = + make_chars_buffer(offsets_column->view(), bytes, begin, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index c6b9bc7e58a..867764b6d9a 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -66,6 +66,35 @@ std::unique_ptr findall( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the starting character index of the first match for the given pattern + * in each row of the input column + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[be]") + * r = find_re(s, p) + * r is now [0, 2, 3, -1] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * A -1 is returned for rows that do not contain a match. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of integers + */ +std::unique_ptr find_re( + strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of doxygen group } // namespace strings } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 762131a174f..15fdad21d9f 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -148,7 +148,7 @@ class table { std::vector columns(std::distance(begin, end)); std::transform( begin, end, columns.begin(), [this](auto index) { return _columns.at(index)->view(); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp index 4a990f67ce4..d41176590ea 100644 --- a/cpp/include/cudf/table/table_view.hpp +++ b/cpp/include/cudf/table/table_view.hpp @@ -241,7 +241,7 @@ class table_view : public detail::table_view_base { { std::vector columns(std::distance(begin, end)); std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); }); - return table_view(columns); + return table_view{columns}; } /** diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp index 45d5d1b12e1..982554a23f5 100644 --- a/cpp/include/cudf/utilities/logger.hpp +++ b/cpp/include/cudf/utilities/logger.hpp @@ -22,6 +22,10 @@ namespace CUDF_EXPORT cudf { +namespace detail { +spdlog::logger& logger(); +} + /** * @brief Returns the global logger. * @@ -43,6 +47,8 @@ namespace CUDF_EXPORT cudf { * * @return spdlog::logger& The logger. 
*/ -spdlog::logger& logger(); +[[deprecated( + "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger& +logger(); } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 914731ea417..21ee4fa9e9b 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -180,18 +180,6 @@ class span_base { return Derived(_data + _size - count, count); } - /** - * @brief Obtains a span that is a view over the `count` elements of this span starting at offset - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A subspan of the sequence, of requested count and offset - */ - [[nodiscard]] constexpr Derived subspan(size_type offset, size_type count) const noexcept - { - return Derived(_data + offset, count); - } - private: pointer _data{nullptr}; size_type _size{0}; @@ -234,6 +222,15 @@ struct host_span : public cudf::detail::span_base, // NOLINT void>* = nullptr> constexpr host_span(host_span const& other) noexcept - : base(other.data(), other.size()) + : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()} { } @@ -299,6 +296,19 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; + } + private: bool _is_device_accessible{false}; }; @@ -368,6 +378,19 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; + } }; /** @} */ // end of group @@ -386,42 +409,38 @@ class base_2dspan { constexpr base_2dspan() noexcept = default; /** - * @brief Constructor a 2D span + * @brief Constructor from a span and number of elements in each row. * - * @param data Pointer to the data - * @param rows Number of rows + * @param flat_view The flattened 2D span * @param columns Number of columns */ - constexpr base_2dspan(T* data, size_t rows, size_t columns) noexcept - : _data{data}, _size{rows, columns} + constexpr base_2dspan(RowType flat_view, size_t columns) + : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns} { + CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size"); } - /** - * @brief Constructor a 2D span - * - * @param data Pointer to the data - * @param size Size of the 2D span as pair - */ - base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{std::move(size)} {} /** * @brief Returns a pointer to the beginning of the sequence. * * @return A pointer to the first element of the span */ - constexpr auto data() const noexcept { return _data; } + [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + /** * @brief Returns the size in the span as pair. * * @return pair representing rows and columns size of the span */ - constexpr auto size() const noexcept { return _size; } + [[nodiscard]] constexpr auto size() const noexcept { return _size; } + /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - constexpr auto count() const noexcept { return size().first * size().second; } + [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + /** * @brief Checks if the span is empty. * @@ -429,19 +448,6 @@ class base_2dspan { */ [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } - /** - * @brief Returns flattened index of the element at the specified 2D position. 
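
With `subspan` now defined directly on `host_span`/`device_span` and `base_2dspan` holding a flattened row-type span plus a column count, 2D views compose from flat buffers. A sketch, assuming the existing `cudf::detail::device_2dspan` alias for `base_2dspan<T, device_span>`:

@code{.cpp}
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

void demo(rmm::cuda_stream_view stream)
{
  rmm::device_uvector<int> data(12, stream);  // 3 rows x 4 columns, flattened
  cudf::detail::device_2dspan<int> grid{cudf::device_span<int>{data}, 4};
  auto row  = grid[1];            // device_span<int> over elements [4, 8)
  auto tail = row.subspan(2, 2);  // subspan now lives on device_span itself
  (void)tail;
}
@endcode
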
- * - * @param row The row index - * @param column The column index - * @param size The size of the 2D span as pair - * @return The flattened index of the element at the specified 2D position - */ - static constexpr size_t flatten_index(size_t row, size_t column, size_type size) noexcept - { - return row * size.second + column; - } - /** * @brief Returns a reference to the row-th element of the sequence. * @@ -453,41 +459,7 @@ class base_2dspan { */ constexpr RowType operator[](size_t row) const { - return {this->data() + flatten_index(row, 0, this->size()), this->size().second}; - } - - /** - * @brief Returns a reference to the first element in the span. - * - * Calling front() on an empty span results in undefined behavior. - * - * @return Reference to the first element in the span - */ - [[nodiscard]] constexpr RowType front() const { return (*this)[0]; } - /** - * @brief Returns a reference to the last element in the span. - * - * Calling back() on an empty span results in undefined behavior. - * - * @return Reference to the last element in the span - */ - [[nodiscard]] constexpr RowType back() const - { - return (*this)[size().first - 1]; - } - - /** - * @brief Obtains a 2D span that is a view over the `num_rows` rows of this span starting at - * `first_row` - * - * @param first_row The first row in the subspan - * @param num_rows The number of rows in the subspan - * @return A subspan of the sequence, of requested starting `first_row` and `num_rows` - */ - constexpr base_2dspan subspan(size_t first_row, size_t num_rows) const noexcept - { - return base_2dspan( - _data + flatten_index(first_row, 0, this->size()), num_rows, this->size().second); + return _flat.subspan(row * _size.second, _size.second); } /** @@ -495,10 +467,7 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - constexpr RowType flat_view() - { - return {this->data(), this->size().first * this->size().second}; - } + [[nodiscard]] constexpr RowType flat_view() const { return _flat; } /** * @brief Construct a 2D span from another 2D span of convertible type @@ -514,13 +483,13 @@ class base_2dspan { RowType>, void>* = nullptr> constexpr base_2dspan(base_2dspan const& other) noexcept - : _data{other.data()}, _size{other.size()} + : _flat{other.flat_view()}, _size{other.size()} { } protected: - T* _data = nullptr; ///< pointer to the first element - size_type _size{0, 0}; ///< rows, columns + RowType _flat; ///< flattened 2D span + size_type _size{0, 0}; ///< num rows, num columns }; /** diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp index 272c91133f8..2bd08f410e0 100644 --- a/cpp/include/cudf_test/testing_main.hpp +++ b/cpp/include/cudf_test/testing_main.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -36,6 +37,12 @@ namespace CUDF_EXPORT cudf { namespace test { +struct config { + std::string rmm_mode; + std::string stream_mode; + std::string stream_error_mode; +}; + /// MR factory functions inline auto make_cuda() { return std::make_shared(); } @@ -157,10 +164,9 @@ inline auto parse_cudf_test_opts(int argc, char** argv) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_memory_resource_adaptor(cudf::test::config const& config) { - auto const rmm_mode = cmd_opts["rmm_mode"].as(); - auto resource = cudf::test::create_memory_resource(rmm_mode); + auto resource = 
cudf::test::create_memory_resource(config.rmm_mode); cudf::set_current_device_resource(resource.get()); return resource; } @@ -176,37 +182,54 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts) * @param cmd_opts Command line options returned by parse_cudf_test_opts * @return Memory resource adaptor */ -inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts) +inline auto make_stream_mode_adaptor(cudf::test::config const& config) { auto resource = cudf::get_current_device_resource_ref(); - auto const stream_mode = cmd_opts["stream_mode"].as<std::string>(); - auto const stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>(); - auto const error_on_invalid_stream = (stream_error_mode == "error"); - auto const check_default_stream = (stream_mode == "new_cudf_default"); + auto const error_on_invalid_stream = (config.stream_error_mode == "error"); + auto const check_default_stream = (config.stream_mode == "new_cudf_default"); auto adaptor = cudf::test::stream_checking_resource_adaptor( resource, error_on_invalid_stream, check_default_stream); - if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) { + if ((config.stream_mode == "new_cudf_default") || (config.stream_mode == "new_testing_default")) { cudf::set_current_device_resource(&adaptor); } return adaptor; } +/** + * @brief Should be called in every test program that uses rmm allocators since it maintains the + * lifespan of the rmm default memory resource. This function parses the command line to customize + * test behavior, like the allocation mode used for creating the default memory resource. + * + */ +inline void init_cudf_test(int argc, char** argv, cudf::test::config const& config_override = {}) +{ + // static lifetime to keep rmm resource alive till tests end + auto const cmd_opts = parse_cudf_test_opts(argc, argv); + cudf::test::config config = config_override; + if (config.rmm_mode.empty()) { config.rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); } + + if (config.stream_mode.empty()) { + config.stream_mode = cmd_opts["stream_mode"].as<std::string>(); + } + + if (config.stream_error_mode.empty()) { + config.stream_error_mode = cmd_opts["stream_error_mode"].as<std::string>(); + } + + [[maybe_unused]] static auto mr = make_memory_resource_adaptor(config); + [[maybe_unused]] static auto adaptor = make_stream_mode_adaptor(config); +} + /** * @brief Macro that defines main function for gtest programs that use rmm * - * Should be included in every test program that uses rmm allocators since - * it maintains the lifespan of the rmm default memory resource. * This `main` function is a wrapper around the google test generated `main`, - * maintaining the original functionality. In addition, this custom `main` - * function parses the command line to customize test behavior, like the - * allocation mode used for creating the default memory resource. + * maintaining the original functionality.
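
With `init_cudf_test` factored out of `CUDF_TEST_PROGRAM_MAIN`, a test binary can pin part of the configuration while leaving the rest to the command line; empty override fields fall through to `parse_cudf_test_opts`. A sketch of such a custom `main` (forcing a pool resource is just an example):

@code{.cpp}
#include <cudf_test/testing_main.hpp>

int main(int argc, char** argv)
{
  ::testing::InitGoogleTest(&argc, argv);
  cudf::test::config cfg{};
  cfg.rmm_mode = "pool";  // fixed here; stream options still come from argv
  init_cudf_test(argc, argv, cfg);
  return RUN_ALL_TESTS();
}
@endcode
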
*/ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - [[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \ - [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + init_cudf_test(argc, argv); \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp index 723ba310a1e..dca590baebf 100644 --- a/cpp/include/nvtext/edit_distance.hpp +++ b/cpp/include/nvtext/edit_distance.hpp @@ -57,7 +57,7 @@ namespace CUDF_EXPORT nvtext { * @param targets Strings to compute edit distance against `input` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New lists column of edit distance values */ std::unique_ptr edit_distance( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 7c909f1a948..42124461cdf 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -41,6 +41,8 @@ namespace CUDF_EXPORT nvtext { * * This function uses MurmurHash3_x86_32 for the hash algorithm. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -51,7 +53,7 @@ namespace CUDF_EXPORT nvtext { * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values for each string in input */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -71,6 +73,8 @@ std::unique_ptr minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -83,7 +87,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash( +[[deprecated]] std::unique_ptr minhash( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -102,6 +106,8 @@ std::unique_ptr minhash( * The hash function returns 2 uint64 values but only the first value * is used with the minhash calculation. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if the width < 2 * * @param input Strings column to compute minhash @@ -112,7 +118,7 @@ std::unique_ptr minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return Minhash values as UINT64 for each string in input */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::numeric_scalar seed = 0, cudf::size_type width = 4, @@ -132,6 +138,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. 
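Note on the testing_main.hpp change above: the new `init_cudf_test` entry point lets a test binary override individual settings programmatically while any field left empty still falls back to the command line. A minimal sketch of a gtest main that forces a specific allocator, assuming `"pool"` is one of the mode strings accepted by `create_memory_resource` (the valid mode strings, and the exact namespace placement of `init_cudf_test`, are not visible in this diff):

    #include <cudf_test/testing_main.hpp>

    int main(int argc, char** argv)
    {
      ::testing::InitGoogleTest(&argc, argv);
      cudf::test::config cfg;  // namespace assumed from the hunk above
      cfg.rmm_mode = "pool";   // assumed mode string, not shown in this diff
      // stream_mode / stream_error_mode stay empty, so they are still taken
      // from --stream_mode and --stream_error_mode on the command line.
      init_cudf_test(argc, argv, cfg);
      return RUN_ALL_TESTS();
    }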
* + * @deprecated Deprecated in 24.12 - to be replaced in a future release + * * @throw std::invalid_argument if the width < 2 * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit @@ -144,7 +152,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr minhash64( +[[deprecated]] std::unique_ptr minhash64( cudf::strings_column_view const& input, cudf::device_span seeds, cudf::size_type width = 4, @@ -164,6 +172,8 @@ std::unique_ptr minhash64( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -173,7 +183,7 @@ std::unique_ptr minhash64( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash( +[[deprecated]] std::unique_ptr word_minhash( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), @@ -193,6 +203,8 @@ std::unique_ptr word_minhash( * * Any null row entries result in corresponding null output rows. * + * @deprecated Deprecated in 24.12 + * * @throw std::invalid_argument if seeds is empty * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit * @@ -202,7 +214,7 @@ std::unique_ptr word_minhash( * @param mr Device memory resource used to allocate the returned column's device memory * @return List column of minhash values for each string per seed */ -std::unique_ptr word_minhash64( +[[deprecated]] std::unique_ptr word_minhash64( cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream = cudf::get_default_stream(), diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp index bbd0503379b..822edcbdb43 100644 --- a/cpp/include/nvtext/replace.hpp +++ b/cpp/include/nvtext/replace.hpp @@ -82,7 +82,7 @@ namespace CUDF_EXPORT nvtext { * The default of empty string will identify tokens using whitespace. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column with replaced strings */ std::unique_ptr replace_tokens( cudf::strings_column_view const& input, @@ -131,7 +131,7 @@ std::unique_ptr replace_tokens( * The default of empty string will identify tokens using whitespace. 
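For reference, the `[[deprecated]]` attribute applied to the minhash overloads above changes nothing at runtime; it only makes the compiler warn at every call site, which is how consumers get advance notice before the 24.12 removals. A self-contained illustration with hypothetical function names:

    // Hypothetical names; only the attribute semantics mirror the diff.
    [[deprecated("use new_api() instead; removal planned in a future release")]]
    int old_api(int x) { return x + 1; }

    int new_api(int x) { return x + 1; }

    int main()
    {
      return old_api(1);  // compiles and runs, but the compiler warns here
    }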
* @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return New strings columns of with replaced strings + * @return New strings column of filtered strings */ std::unique_ptr filter_tokens( cudf::strings_column_view const& input, diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp index 55a4124bfd0..e5b2a4cc21b 100644 --- a/cpp/include/nvtext/stemmer.hpp +++ b/cpp/include/nvtext/stemmer.hpp @@ -51,7 +51,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b1 = is_letter(st, VOWEL, 1) * b1 is now [false, true, true] * @endcode @@ -62,7 +62,7 @@ enum class letter_type { * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * b2 = is_letter(st, CONSONANT, -1) // last letter checked in each string * b2 is now [false, true, false] * @endcode @@ -99,7 +99,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, 1, 4] * b1 = is_letter(st, VOWEL, ix) * b1 is now [true, true, false] @@ -111,7 +111,7 @@ std::unique_ptr is_letter( * * @code{.pseudo} * Example: - * st = ["trouble", "toy", "sygyzy"] + * st = ["trouble", "toy", "syzygy"] * ix = [3, -2, 4] // 2nd to last character in st[1] is checked * b2 = is_letter(st, CONSONANT, ix) * b2 is now [false, false, true] diff --git a/cpp/scripts/run-clang-tidy.py b/cpp/scripts/run-clang-tidy.py deleted file mode 100644 index e5e57dbf562..00000000000 --- a/cpp/scripts/run-clang-tidy.py +++ /dev/null @@ -1,253 +0,0 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import re -import os -import subprocess -import argparse -import json -import multiprocessing as mp -import shutil - - -EXPECTED_VERSION = "16.0.6" -VERSION_REGEX = re.compile(r" LLVM version ([0-9.]+)") -GPU_ARCH_REGEX = re.compile(r"sm_(\d+)") -SPACES = re.compile(r"\s+") -SEPARATOR = "-" * 16 - - -def parse_args(): - argparser = argparse.ArgumentParser("Runs clang-tidy on a project") - argparser.add_argument("-cdb", type=str, - # TODO This is a hack, needs to be fixed - default="cpp/build/cuda-11.5.0/clang-tidy/release/compile_commands.clangd.json", - help="Path to cmake-generated compilation database" - " file. It is always found inside the root of the " - "cmake build folder. 
So make sure that `cmake` has " - "been run once before running this script!") - argparser.add_argument("-exe", type=str, default="clang-tidy", - help="Path to clang-tidy exe") - argparser.add_argument("-ignore", type=str, default="[.]cu$|examples/kmeans/", - help="Regex used to ignore files from checking") - argparser.add_argument("-select", type=str, default=None, - help="Regex used to select files for checking") - argparser.add_argument("-j", type=int, default=-1, - help="Number of parallel jobs to launch.") - args = argparser.parse_args() - if args.j <= 0: - args.j = mp.cpu_count() - args.ignore_compiled = re.compile(args.ignore) if args.ignore else None - args.select_compiled = re.compile(args.select) if args.select else None - ret = subprocess.check_output("%s --version" % args.exe, shell=True) - ret = ret.decode("utf-8") - version = VERSION_REGEX.search(ret) - if version is None: - raise Exception("Failed to figure out clang-tidy version!") - version = version.group(1) - if version != EXPECTED_VERSION: - raise Exception("clang-tidy exe must be v%s found '%s'" % \ - (EXPECTED_VERSION, version)) - if not os.path.exists(args.cdb): - raise Exception("Compilation database '%s' missing" % args.cdb) - return args - - -def get_all_commands(cdb): - with open(cdb) as fp: - return json.load(fp) - - -def get_gpu_archs(command): - archs = [] - for loc in range(len(command)): - if command[loc] != "-gencode": - continue - arch_flag = command[loc + 1] - match = GPU_ARCH_REGEX.search(arch_flag) - if match is not None: - archs.append("--cuda-gpu-arch=sm_%s" % match.group(1)) - return archs - - -def get_index(arr, item): - try: - return arr.index(item) - except: - return -1 - - -def remove_item(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc] - return loc - - -def remove_item_plus_one(arr, item): - loc = get_index(arr, item) - if loc >= 0: - del arr[loc + 1] - del arr[loc] - return loc - - -def get_clang_includes(exe): - dir = os.getenv("CONDA_PREFIX") - if dir is None: - ret = subprocess.check_output("which %s 2>&1" % exe, shell=True) - ret = ret.decode("utf-8") - dir = os.path.dirname(os.path.dirname(ret)) - header = os.path.join(dir, "include", "ClangHeaders") - return ["-I", header] - - -def get_tidy_args(cmd, exe): - command, file = cmd["command"], cmd["file"] - is_cuda = file.endswith(".cu") - command = re.split(SPACES, command) - # compiler is always clang++! - command[0] = "clang++" - # remove compilation and output targets from the original command - remove_item_plus_one(command, "-c") - remove_item_plus_one(command, "-o") - if is_cuda: - # replace nvcc's "-gencode ..." with clang's "--cuda-gpu-arch ..." 
- archs = get_gpu_archs(command) - command.extend(archs) - while True: - loc = remove_item_plus_one(command, "-gencode") - if loc < 0: - break - # "-x cuda" is the right usage in clang - loc = get_index(command, "-x") - if loc >= 0: - command[loc + 1] = "cuda" - remove_item_plus_one(command, "-ccbin") - remove_item(command, "--expt-extended-lambda") - remove_item(command, "--diag_suppress=unrecognized_gcc_pragma") - command.extend(get_clang_includes(exe)) - return command, is_cuda - - -def run_clang_tidy_command(tidy_cmd): - cmd = " ".join(tidy_cmd) - result = subprocess.run(cmd, check=False, shell=True, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - status = result.returncode == 0 - if status: - out = "" - else: - out = "CMD: " + cmd - out += result.stdout.decode("utf-8").rstrip() - return status, out - - -def run_clang_tidy(cmd, args): - command, is_cuda = get_tidy_args(cmd, args.exe) - tidy_cmd = [args.exe, - "-header-filter='.*cudf/cpp/(src|include|bench|comms).*'", - cmd["file"], "--", ] - tidy_cmd.extend(command) - status = True - out = "" - if is_cuda: - tidy_cmd.append("--cuda-device-only") - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - out += out1 - out += "%s" % SEPARATOR - if not ret: - status = ret - tidy_cmd[-2] = "--cuda-host-only" - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - else: - tidy_cmd.append(cmd["file"]) - ret, out1 = run_clang_tidy_command(tidy_cmd) - if not ret: - status = ret - out += out1 - return status, out, cmd["file"] - - -# yikes! global var :( -results = [] -def collect_result(result): - global results - results.append(result) - - -def print_result(passed, stdout, file): - status_str = "PASSED" if passed else "FAILED" - print(f"{SEPARATOR} File:{file} {status_str} {SEPARATOR}") - if stdout: - print(stdout) - print(f"{SEPARATOR} File:{file} ENDS {SEPARATOR}") - - -def print_results(): - global results - status = True - for passed, stdout, file in results: - print_result(passed, stdout, file) - if not passed: - status = False - return status - - -def run_tidy_for_all_files(args, all_files): - pool = None if args.j == 1 else mp.Pool(args.j) - # actual tidy checker - for cmd in all_files: - # skip files that we don't want to look at - if args.ignore_compiled is not None and \ - re.search(args.ignore_compiled, cmd["file"]) is not None: - continue - if args.select_compiled is not None and \ - re.search(args.select_compiled, cmd["file"]) is None: - continue - if pool is not None: - pool.apply_async(run_clang_tidy, args=(cmd, args), - callback=collect_result) - else: - passed, stdout, file = run_clang_tidy(cmd, args) - collect_result((passed, stdout, file)) - if pool is not None: - pool.close() - pool.join() - return print_results() - - -def main(): - args = parse_args() - # Attempt to making sure that we run this script from root of repo always - if not os.path.exists(".git"): - raise Exception("This needs to always be run from the root of repo") - # Check whether clang-tidy exists - # print(args) - if "exe" not in args and shutil.which("clang-tidy") is not None: - print("clang-tidy not found. Exiting...") - return - all_files = get_all_commands(args.cdb) - status = run_tidy_for_all_files(args, all_files) - if not status: - raise Exception("clang-tidy failed! 
Refer to the errors above.") - - -if __name__ == "__main__": - main() diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4ca05f9c335..e6659f76c7c 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -329,7 +329,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, cudf::detail::grid_1d grid(num_words, block_size); - rmm::device_scalar non_zero_count(0, stream); + cudf::detail::device_scalar non_zero_count(0, stream); count_set_bits_kernel <<>>( diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index b8e140f1fa5..d8419760120 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ size_type concatenate_masks(device_span d_views, size_type output_size, rmm::cuda_stream_view stream) { - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(output_size, block_size); concatenate_masks_kernel @@ -265,7 +266,7 @@ std::unique_ptr fused_concatenate(host_span views, auto out_view = out_col->mutable_view(); auto d_out_view = mutable_column_device_view::create(out_view, stream); - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); // Launch kernel constexpr size_type block_size{256}; diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu index 29a28f81d1a..80b0bd5242f 100644 --- a/cpp/src/copying/get_element.cu +++ b/cpp/src/copying/get_element.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -71,7 +72,7 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), @@ -155,8 +156,8 @@ struct get_element_functor { auto device_col = column_device_view::create(input, stream); - rmm::device_scalar temp_data(stream, mr); - rmm::device_scalar temp_valid(stream, mr); + cudf::detail::device_scalar temp_data(stream, mr); + cudf::detail::device_scalar temp_valid(stream, mr); device_single_thread( [buffer = temp_data.data(), diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu index ddb0dbcd96d..a497cedb3bc 100644 --- a/cpp/src/datetime/datetime_ops.cu +++ b/cpp/src/datetime/datetime_ops.cu @@ -44,19 +44,6 @@ namespace cudf { namespace datetime { namespace detail { -enum class datetime_component { - INVALID = 0, - YEAR, - MONTH, - DAY, - WEEKDAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND -}; enum class rounding_function { CEIL, ///< Rounds up to the next integer multiple of the provided frequency @@ -453,90 +440,70 @@ std::unique_ptr extract_year(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::YEAR, stream, mr); } std::unique_ptr extract_month(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - 
return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MONTH, stream, mr); } std::unique_ptr extract_day(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::DAY, stream, mr); } std::unique_ptr extract_weekday(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::WEEKDAY, stream, mr); } std::unique_ptr extract_hour(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::HOUR, stream, mr); } std::unique_ptr extract_minute(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MINUTE, stream, mr); } std::unique_ptr extract_second(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::SECOND, stream, mr); } std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MILLISECOND, stream, mr); } std::unique_ptr extract_microsecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::MICROSECOND, stream, mr); } std::unique_ptr extract_nanosecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - return detail::apply_datetime_op< - detail::extract_component_operator, - cudf::type_id::INT16>(column, stream, mr); + return detail::extract_datetime_component(column, datetime_component::NANOSECOND, stream, mr); } std::unique_ptr last_day_of_month(column_view const& column, @@ -576,6 +543,32 @@ std::unique_ptr extract_quarter(column_view const& column, return apply_datetime_op(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ +#define extract(field) \ + case field: \ + return apply_datetime_op, cudf::type_id::INT16>( \ + column, stream, mr) + + switch (component) { + extract(datetime_component::YEAR); + 
extract(datetime_component::MONTH); + extract(datetime_component::DAY); + extract(datetime_component::WEEKDAY); + extract(datetime_component::HOUR); + extract(datetime_component::MINUTE); + extract(datetime_component::SECOND); + extract(datetime_component::MILLISECOND); + extract(datetime_component::MICROSECOND); + extract(datetime_component::NANOSECOND); + default: CUDF_FAIL("Unsupported datetime component."); + } +#undef extract +} + } // namespace detail std::unique_ptr ceil_datetimes(column_view const& column, @@ -661,6 +654,15 @@ std::unique_ptr extract_second(column_view const& column, return detail::extract_second(column, stream, mr); } +std::unique_ptr extract_datetime_component(cudf::column_view const& column, + datetime_component component, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::extract_datetime_component(column, component, stream, mr); +} + std::unique_ptr extract_millisecond_fraction(column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index a6b6cbbf0b5..2196ee97fee 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -138,7 +138,7 @@ struct timezone_file { std::filesystem::path{tzif_dir.value_or(tzif_system_directory)} / timezone_name; std::ifstream fin; fin.open(tz_filename, ios_base::in | ios_base::binary | ios_base::ate); - CUDF_EXPECTS(fin, "Failed to open the timezone file."); + CUDF_EXPECTS(fin, "Failed to open the timezone file '" + tz_filename.string() + "'"); auto const file_size = fin.tellg(); fin.seekg(0); diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu new file mode 100644 index 00000000000..59457bea694 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
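The new public `cudf::datetime::extract_datetime_component` above folds the ten per-field extractors into a single enum-dispatched entry point (the local `extract` case macro is #undef'd immediately after the switch, so it cannot leak out of the translation unit). A hedged usage sketch, assuming the `datetime_component` enum now lives in the public `cudf::datetime` namespace (its detail-namespace copy is removed above) and that the public declaration defaults its stream and memory-resource parameters like the neighboring extractors do:

    #include <cudf/column/column.hpp>
    #include <cudf/column/column_view.hpp>
    #include <cudf/datetime.hpp>

    // `ts` is assumed to be a timestamp-typed column. Equivalent to the older
    // extract_hour(ts); the per-field extractors document an INT16 result.
    std::unique_ptr<cudf::column> hour_of(cudf::column_view const& ts)
    {
      return cudf::datetime::extract_datetime_component(
        ts, cudf::datetime::datetime_component::HOUR);
    }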
+ */ + +#include "compute_groupby.hpp" +#include "compute_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "sparse_to_dense_results.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector populated_keys(num_keys, stream); + auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); + + populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); + return populated_keys; +} + +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + // convert to int64_t to avoid potential overflow with large `keys` + auto const num_keys = static_cast(keys.num_rows()); + + // Cache of sparse results where the location of aggregate value in each + // column is indexed by the hash set + cudf::detail::result_cache sparse_results(requests.size()); + + auto const set = cuco::static_set{ + num_keys, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, // 50% load factor + cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, + d_row_equal, + probing_scheme_t{d_row_hash}, + cuco::thread_scope_device, + cuco::storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + + auto row_bitmask = + skip_rows_with_nulls + ? cudf::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first + : rmm::device_buffer{}; + + // Compute all single pass aggs first + compute_single_pass_aggs(num_keys, + skip_rows_with_nulls, + static_cast(row_bitmask.data()), + set.ref(cuco::insert_and_find), + requests, + &sparse_results, + stream); + + // Extract the populated indices from the hash set and create a gather map. + // Gathering using this map from sparse results will give dense results. + auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); + + // Compact all results from sparse_results and insert into cache + sparse_to_dense_results(requests, + &sparse_results, + cache, + gather_map, + set.ref(cuco::find), + static_cast(row_bitmask.data()), + stream, + mr); + + return cudf::detail::gather(keys, + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); +} + +template rmm::device_uvector extract_populated_keys( + global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template rmm::device_uvector extract_populated_keys( + nullable_global_set_t const& key_set, size_type num_keys, rmm::cuda_stream_view stream); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template std::unique_ptr
compute_groupby( + table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + nullable_row_comparator_t const& d_row_equal, + row_hash_t const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_groupby.hpp b/cpp/src/groupby/hash/compute_groupby.hpp new file mode 100644 index 00000000000..7bb3a60ff07 --- /dev/null +++ b/cpp/src/groupby/hash/compute_groupby.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes and returns a device vector containing all populated keys in + * `key_set`. + * + * @tparam SetType Type of key hash set + * + * @param key_set Key hash set + * @param num_keys Number of input keys + * @param stream CUDA stream used for device memory operations and kernel launches + * @return An array of unique keys contained in `key_set` + */ +template +rmm::device_uvector extract_populated_keys(SetType const& key_set, + size_type num_keys, + rmm::cuda_stream_view stream); + +/** + * @brief Computes groupby using a hash table. + * + * First, we create a hash table that stores the indices of unique rows in + * `keys`. The upper limit on the number of values in this map is the number + * of rows in `keys`. + * + * To store the results of aggregations, we create temporary sparse columns + * which have the same size as input value columns. Using the hash map, we + * determine the location within the sparse column to write the result of the + * aggregation into. + * + * The sparse column results of all aggregations are stored into the cache + * `sparse_results`. This enables the use of previously calculated results in + * other aggregations. + * + * All the aggregations which can be computed in a single pass are computed + * first, in a combined kernel. Then, using these results, aggregations that + * require multiple passes are computed. + * + * Finally, using the hash map, we generate a vector of indices of populated + * values in sparse result columns. Then, for each aggregation originally + * requested in `requests`, we gather sparse results into a column of dense + * results using the aforementioned index vector. Dense results are stored into + * the in/out parameter `cache`.
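To make the sparse-to-dense step described above concrete: the gather map is simply the list of hash-set slots that actually received a key, so gathering any sparse aggregate column through it compacts the results. A host-side toy illustration (illustrative only; the real code performs this on device via cudf::detail::gather):

    #include <cstdio>
    #include <vector>

    int main()
    {
      // Sparse results: one slot per input row; only slots 1 and 4 were used.
      std::vector<int> sparse_sums = {0, 7, 0, 0, 42};
      std::vector<int> gather_map  = {1, 4};  // populated slots from the hash set

      std::vector<int> dense_sums;
      for (int idx : gather_map) dense_sums.push_back(sparse_sums[idx]);

      for (int v : dense_sums) std::printf("%d\n", v);  // prints 7 then 42
    }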
+ * + * @tparam Equal Device row comparator type + * @tparam Hash Device row hasher type + * + * @param keys Table whose rows act as the groupby keys + * @param requests The set of columns to aggregate and the aggregations to perform + * @param skip_rows_with_nulls Flag indicating whether to ignore nulls or not + * @param d_row_equal Device row comparator + * @param d_row_hash Device row hasher + * @param cache Dense aggregation results + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return Table of unique keys + */ +template +std::unique_ptr compute_groupby(table_view const& keys, + host_span requests, + bool skip_rows_with_nulls, + Equal const& d_row_equal, + Hash const& d_row_hash, + cudf::detail::result_cache* cache, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.cu b/cpp/src/groupby/hash/compute_single_pass_aggs.cu new file mode 100644 index 00000000000..e292543e6e9 --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.cu @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
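Because `compute_groupby` is only declared in this header and defined in the .cu file, the two `template std::unique_ptr ... compute_groupby(...)` lines earlier in the diff are explicit instantiation definitions: they force the compiler to emit code for exactly those comparator/hasher combinations so that other translation units can link against them without ever seeing the template body. The same pattern in miniature, with hypothetical names:

    // widget.hpp -- declaration only; consumers never see the body.
    template <typename T>
    T twice(T value);

    // widget.cpp -- the definition plus the instantiations we promise to provide.
    template <typename T>
    T twice(T value) { return value + value; }

    template int twice<int>(int);           // explicit instantiation definition
    template double twice<double>(double);  // linking with other types would fail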
+ */ + +#include "compute_single_pass_aggs.hpp" +#include "create_sparse_results_table.hpp" +#include "flatten_single_pass_aggs.hpp" +#include "helpers.cuh" +#include "single_pass_functors.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream) +{ + // flatten the aggs to a table that can be operated on by aggregate_row + auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); + + // make table that will hold sparse results + table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); + // prepare to launch kernel to do the actual aggregation + auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); + auto d_values = table_device_view::create(flattened_values, stream); + auto const d_aggs = cudf::detail::make_device_uvector_async( + agg_kinds, stream, cudf::get_current_device_resource_ref()); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_keys, + hash::compute_single_pass_aggs_fn{ + set, *d_values, *d_sparse_table, d_aggs.data(), row_bitmask, skip_rows_with_nulls}); + // Add results back to sparse_results cache + auto sparse_result_cols = sparse_table.release(); + for (size_t i = 0; i < aggs.size(); i++) { + // Note that the cache will make a copy of this temporary aggregation + sparse_results->add_result( + flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); + } +} + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); + +template void compute_single_pass_aggs>( + int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + nullable_hash_set_ref_t set, + host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_single_pass_aggs.hpp b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp new file mode 100644 index 00000000000..a7434bdf61a --- /dev/null +++ b/cpp/src/groupby/hash/compute_single_pass_aggs.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
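The single-pass launch in compute_single_pass_aggs.cu above is a `thrust::for_each_n` over a counting iterator, i.e. one functor invocation per input row index instead of a hand-written kernel. A minimal host-compilable sketch of the same idiom (plain Thrust with the host policy; the diff uses `rmm::exec_policy_nosync` to run the cudf functor on the GPU instead):

    #include <thrust/execution_policy.h>
    #include <thrust/for_each.h>
    #include <thrust/iterator/counting_iterator.h>

    #include <vector>

    int main()
    {
      std::vector<int> out(8);
      // One functor call per index 0..7, mirroring the per-row aggregation loop.
      thrust::for_each_n(thrust::host,
                         thrust::counting_iterator<int>(0),
                         8,
                         [&out](int i) { out[i] = i * i; });
    }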
+ */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Computes all aggregations from `requests` that require a single pass + * over the data and stores the results in `sparse_results` + */ +template +void compute_single_pass_aggs(int64_t num_keys, + bool skip_rows_with_nulls, + bitmask_type const* row_bitmask, + SetType set, + cudf::host_span requests, + cudf::detail::result_cache* sparse_results, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu new file mode 100644 index 00000000000..22fa4fc584c --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.cu @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "create_sparse_results_table.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs, + rmm::cuda_stream_view stream) +{ + // TODO single allocation - room for performance improvement + std::vector> sparse_columns; + sparse_columns.reserve(flattened_values.num_columns()); + std::transform( + flattened_values.begin(), + flattened_values.end(), + aggs.begin(), + std::back_inserter(sparse_columns), + [stream](auto const& col, auto const& agg) { + bool nullable = + (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) + ? false + : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); + auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; + + auto col_type = cudf::is_dictionary(col.type()) + ? cudf::dictionary_column_view(col).keys().type() + : col.type(); + + return make_fixed_width_column( + cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); + }); + + table sparse_table(std::move(sparse_columns)); + mutable_table_view table_view = sparse_table.mutable_view(); + cudf::detail::initialize_with_identity(table_view, aggs, stream); + return sparse_table; +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp new file mode 100644 index 00000000000..c1d4e0d3f20 --- /dev/null +++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
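The `nullable` expression in `create_sparse_results_table` above encodes a small rule worth stating explicitly: COUNT outputs can never be null, VARIANCE/STD outputs start ALL_NULL because they are filled in by a later pass, and everything else needs a mask only when the input column has nulls. Restated as a standalone predicate (a sketch; only the rule itself is taken from the diff):

    #include <cudf/aggregation.hpp>

    // Mirrors the mask selection in create_sparse_results_table above.
    bool starts_all_null(cudf::aggregation::Kind k, bool input_has_nulls)
    {
      using cudf::aggregation;
      // COUNT outputs are always valid, so they get no null mask at all.
      if (k == aggregation::COUNT_VALID || k == aggregation::COUNT_ALL) return false;
      // VARIANCE/STD are computed in a second pass, so they begin ALL_NULL.
      return input_has_nulls || k == aggregation::VARIANCE || k == aggregation::STD;
    }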
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +#include + +namespace cudf::groupby::detail::hash { +// make table that will hold sparse results +cudf::table create_sparse_results_table(table_view const& flattened_values, + std::vector aggs_kinds, + rmm::cuda_stream_view stream); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp new file mode 100644 index 00000000000..b2048a9fbb8 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "flatten_single_pass_aggs.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +class groupby_simple_aggregations_collector final + : public cudf::detail::simple_aggregations_collector { + public: + using cudf::detail::simple_aggregations_collector::visit; + + std::vector> visit(data_type col_type, + cudf::detail::min_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() + : make_min_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::max_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(col_type.id() == type_id::STRING ? 
make_argmax_aggregation() + : make_max_aggregation()); + return aggs; + } + + std::vector> visit(data_type col_type, + cudf::detail::mean_aggregation const&) override + { + (void)col_type; + CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::var_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit(data_type, + cudf::detail::std_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } + + std::vector> visit( + data_type, cudf::detail::correlation_aggregation const&) override + { + std::vector> aggs; + aggs.push_back(make_sum_aggregation()); + // COUNT_VALID + aggs.push_back(make_count_aggregation()); + + return aggs; + } +}; + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests) +{ + std::vector columns; + std::vector> aggs; + std::vector agg_kinds; + + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + + std::unordered_set agg_kinds_set; + auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { + if (agg_kinds_set.insert(agg->kind).second) { + agg_kinds.push_back(agg->kind); + aggs.push_back(std::move(agg)); + columns.push_back(request_values); + } + }; + + auto values_type = cudf::is_dictionary(request.values.type()) + ? cudf::dictionary_column_view(request.values).keys().type() + : request.values.type(); + for (auto&& agg : agg_v) { + groupby_simple_aggregations_collector collector; + + for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { + insert_agg(request.values, std::move(agg_s)); + } + } + } + + return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); +} + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp new file mode 100644 index 00000000000..2bf983e5e90 --- /dev/null +++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
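`flatten_single_pass_aggs` works because compound aggregations decompose into single-pass ones, as the collector above shows: MEAN, VARIANCE, and STD all expand to SUM plus COUNT_VALID, with the finalizer combining them afterwards. A tiny numeric check of the MEAN case (illustrative only):

    #include <cstdio>

    int main()
    {
      // Group values {3, 5, 10}: the single-pass kernel produces SUM and
      // COUNT_VALID; the finalizer then computes MEAN = SUM / COUNT_VALID.
      double sum   = 3 + 5 + 10;  // 18
      int    count = 3;
      std::printf("mean = %f\n", sum / count);  // 6.0
    }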
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::groupby::detail::hash { + +// flatten aggs to filter in single pass aggs +std::tuple, std::vector>> +flatten_single_pass_aggs(host_span requests); + +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/global_memory_aggregator.cuh b/cpp/src/groupby/hash/global_memory_aggregator.cuh new file mode 100644 index 00000000000..50e89c727ff --- /dev/null +++ b/cpp/src/groupby/hash/global_memory_aggregator.cuh @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_gmem { + __device__ void operator()(cudf::mutable_column_device_view, + cudf::size_type, + cudf::column_device_view, + cuda::std::byte*, + cudf::size_type) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_min(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_max(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using DeviceType = cudf::detail::underlying_target_t; + DeviceType* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + if 
(target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// The shared memory will already have it squared +template +struct update_target_element_gmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + Target value = static_cast(source_casted[source_index]); + + cudf::detail::atomic_add(&target.element(target_index), value); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_mul(&target.element(target_index), + static_cast(source_casted[source_index])); + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +// Assuming that the target column of COUNT_VALID, COUNT_ALL would be using fixed_width column and +// non-fixed point column +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_VALID is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + + Target* source_casted = reinterpret_cast(source); + cudf::detail::atomic_add(&target.element(target_index), + static_cast(source_casted[source_index])); + + // It is assumed the output for COUNT_ALL is initialized to be all valid + } +}; + +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmax_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMAX_SENTINEL, source_argmax_index); + if (old != 
cudf::detail::ARGMAX_SENTINEL) { + while (source_column.element(source_argmax_index) > + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmax_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; +template +struct update_target_element_gmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* source_casted = reinterpret_cast(source); + auto source_argmin_index = source_casted[source_index]; + auto old = cudf::detail::atomic_cas( + &target.element(target_index), cudf::detail::ARGMIN_SENTINEL, source_argmin_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source_column.element(source_argmin_index) < + source_column.element(old)) { + old = + cudf::detail::atomic_cas(&target.element(target_index), old, source_argmin_index); + } + } + + if (target.is_null(target_index)) { target.set_valid(target_index); } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in global memory by + * applying an aggregation operation to a corresponding element from a source column in shared + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct gmem_element_aggregator { + template + __device__ void operator()(cudf::mutable_column_device_view target, + cudf::size_type target_index, + cudf::column_device_view source_column, + cuda::std::byte* source, + bool* source_mask, + cudf::size_type source_index) const noexcept + { + // Early exit for all aggregation kinds since shared memory aggregation of + // `COUNT_ALL` is always valid + if (!source_mask[source_index]) { return; } + + update_target_element_gmem{}( + target, target_index, source_column, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index f9a80a048b5..30e1d52fdbf 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -14,58 +14,32 @@ * limitations under the License. 
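The ARGMAX/ARGMIN updates above avoid any per-slot lock by looping on compare-and-swap: a thread publishes its candidate row index, and if another index is already installed it keeps retrying only while its candidate's value is strictly better. The same loop expressed with `std::atomic` on the host (the diff uses `cudf::detail::atomic_cas` in device code; the sentinel value here is a stand-in):

    #include <atomic>
    #include <vector>

    constexpr int SENTINEL = -1;  // stand-in for cudf::detail::ARGMAX_SENTINEL

    void argmax_update(std::atomic<int>& target, int candidate,
                       std::vector<double> const& values)
    {
      int old = SENTINEL;
      // The first writer claims the empty slot outright.
      if (target.compare_exchange_strong(old, candidate)) return;
      // Otherwise keep swapping while our value beats the current holder's;
      // on failure, compare_exchange_strong reloads `old` with the new holder.
      while (values[candidate] > values[old]) {
        if (target.compare_exchange_strong(old, candidate)) return;
      }
    }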
*/ +#include "compute_groupby.hpp" #include "groupby/common/utils.hpp" -#include "groupby/hash/groupby_kernels.cuh" +#include "helpers.cuh" #include -#include -#include -#include -#include -#include #include -#include -#include -#include #include -#include -#include -#include +#include #include #include -#include #include #include -#include #include #include -#include #include #include #include -#include -#include -#include - +#include #include -#include #include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { namespace { - -// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested -// types and `cg_size = 1`for flat data to improve performance -using probing_scheme_type = cuco::linear_probing< - 1, ///< Number of threads used to handle each input key - cudf::experimental::row::hash::device_row_hasher>; - /** * @brief List of aggregation operations that can be computed with a hash-based * implementation. @@ -110,517 +84,33 @@ bool constexpr is_hash_aggregation(aggregation::Kind t) return array_contains(hash_aggregations, t); } -class groupby_simple_aggregations_collector final - : public cudf::detail::simple_aggregations_collector { - public: - using cudf::detail::simple_aggregations_collector::visit; - - std::vector> visit(data_type col_type, - cudf::detail::min_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmin_aggregation() - : make_min_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::max_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(col_type.id() == type_id::STRING ? make_argmax_aggregation() - : make_max_aggregation()); - return aggs; - } - - std::vector> visit(data_type col_type, - cudf::detail::mean_aggregation const&) override - { - (void)col_type; - CUDF_EXPECTS(is_fixed_width(col_type), "MEAN aggregation expects fixed width type"); - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::var_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit(data_type, - cudf::detail::std_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } - - std::vector> visit( - data_type, cudf::detail::correlation_aggregation const&) override - { - std::vector> aggs; - aggs.push_back(make_sum_aggregation()); - // COUNT_VALID - aggs.push_back(make_count_aggregation()); - - return aggs; - } -}; - -template -class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { - column_view col; - data_type result_type; - cudf::detail::result_cache* sparse_results; - cudf::detail::result_cache* dense_results; - device_span gather_map; - SetType set; - bitmask_type const* __restrict__ row_bitmask; - rmm::cuda_stream_view stream; - rmm::device_async_resource_ref mr; - - public: - using cudf::detail::aggregation_finalizer::visit; - - hash_compound_agg_finalizer(column_view col, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bitmask_type const* row_bitmask, - 
rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - : col(col), - sparse_results(sparse_results), - dense_results(dense_results), - gather_map(gather_map), - set(set), - row_bitmask(row_bitmask), - stream(stream), - mr(mr) - { - result_type = cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() - : col.type(); - } - - auto to_dense_agg_result(cudf::aggregation const& agg) - { - auto s = sparse_results->get_result(col, agg); - auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(dense_result_table->release()[0]); - } - - // Enables conversion of ARGMIN/ARGMAX into MIN/MAX - auto gather_argminmax(aggregation const& agg) - { - auto arg_result = to_dense_agg_result(agg); - // We make a view of ARG(MIN/MAX) result without a null mask and gather - // using this map. The values in data buffer of ARG(MIN/MAX) result - // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL - // which is an out of bounds index value (-1) and causes the gathered - // value to be null. - column_view null_removed_map( - data_type(type_to_id()), - arg_result->size(), - static_cast(arg_result->view().template data()), - nullptr, - 0); - auto gather_argminmax = - cudf::detail::gather(table_view({col}), - null_removed_map, - arg_result->nullable() ? cudf::out_of_bounds_policy::NULLIFY - : cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - return std::move(gather_argminmax->release()[0]); - } - - // Declare overloads for each kind of aggregation to dispatch - void visit(cudf::aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::min_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmin_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::max_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - if (result_type.id() == type_id::STRING) { - auto transformed_agg = make_argmax_aggregation(); - dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); - } else { - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - } - - void visit(cudf::detail::mean_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - this->visit(*count_agg); - column_view sum_result = dense_results->get_result(col, *sum_agg); - column_view count_result = dense_results->get_result(col, *count_agg); - - auto result = - cudf::detail::binary_operation(sum_result, - count_result, - binary_operator::DIV, - cudf::detail::target_type(result_type, aggregation::MEAN), - stream, - mr); - dense_results->add_result(col, agg, std::move(result)); - } - - void visit(cudf::detail::var_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - - auto sum_agg = make_sum_aggregation(); - auto count_agg = make_count_aggregation(); - this->visit(*sum_agg); - 
this->visit(*count_agg); - column_view sum_result = sparse_results->get_result(col, *sum_agg); - column_view count_result = sparse_results->get_result(col, *count_agg); - - auto values_view = column_device_view::create(col, stream); - auto sum_view = column_device_view::create(sum_result, stream); - auto count_view = column_device_view::create(count_result, stream); - - auto var_result = make_fixed_width_column( - cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); - auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); - mutable_table_view var_table_view{{var_result->mutable_view()}}; - cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - col.size(), - ::cudf::detail::var_hash_functor{ - set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); - sparse_results->add_result(col, agg, std::move(var_result)); - dense_results->add_result(col, agg, to_dense_agg_result(agg)); - } - - void visit(cudf::detail::std_aggregation const& agg) override - { - if (dense_results->has_result(col, agg)) return; - auto var_agg = make_variance_aggregation(agg._ddof); - this->visit(*dynamic_cast(var_agg.get())); - column_view variance = dense_results->get_result(col, *var_agg); - - auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); - dense_results->add_result(col, agg, std::move(result)); - } -}; -// flatten aggs to filter in single pass aggs -std::tuple, std::vector>> -flatten_single_pass_aggs(host_span requests) +std::unique_ptr
dispatch_groupby(table_view const& keys, + host_span requests, + cudf::detail::result_cache* cache, + bool const keys_have_nulls, + null_policy const include_null_keys, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { - std::vector columns; - std::vector> aggs; - std::vector agg_kinds; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - - std::unordered_set agg_kinds_set; - auto insert_agg = [&](column_view const& request_values, std::unique_ptr&& agg) { - if (agg_kinds_set.insert(agg->kind).second) { - agg_kinds.push_back(agg->kind); - aggs.push_back(std::move(agg)); - columns.push_back(request_values); - } - }; - - auto values_type = cudf::is_dictionary(request.values.type()) - ? cudf::dictionary_column_view(request.values).keys().type() - : request.values.type(); - for (auto&& agg : agg_v) { - groupby_simple_aggregations_collector collector; - - for (auto& agg_s : agg->get_simple_aggregations(values_type, collector)) { - insert_agg(request.values, std::move(agg_s)); - } - } - } - - return std::make_tuple(table_view(columns), std::move(agg_kinds), std::move(aggs)); -} - -/** - * @brief Gather sparse results into dense using `gather_map` and add to - * `dense_cache` - * - * @see groupby_null_templated() - */ -template -void sparse_to_dense_results(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - cudf::detail::result_cache* dense_results, - device_span gather_map, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto row_bitmask = - cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first; - bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - bitmask_type const* row_bitmask_ptr = - skip_key_rows_with_nulls ? static_cast(row_bitmask.data()) : nullptr; - - for (auto const& request : requests) { - auto const& agg_v = request.aggregations; - auto const& col = request.values; - - // Given an aggregation, this will get the result from sparse_results and - // convert and return dense, compacted result - auto finalizer = hash_compound_agg_finalizer( - col, sparse_results, dense_results, gather_map, set, row_bitmask_ptr, stream, mr); - for (auto&& agg : agg_v) { - agg->finalize(finalizer); - } - } -} - -// make table that will hold sparse results -auto create_sparse_results_table(table_view const& flattened_values, - std::vector aggs, - rmm::cuda_stream_view stream) -{ - // TODO single allocation - room for performance improvement - std::vector> sparse_columns; - std::transform( - flattened_values.begin(), - flattened_values.end(), - aggs.begin(), - std::back_inserter(sparse_columns), - [stream](auto const& col, auto const& agg) { - bool nullable = - (agg == aggregation::COUNT_VALID or agg == aggregation::COUNT_ALL) - ? false - : (col.has_nulls() or agg == aggregation::VARIANCE or agg == aggregation::STD); - auto mask_flag = (nullable) ? mask_state::ALL_NULL : mask_state::UNALLOCATED; - - auto col_type = cudf::is_dictionary(col.type()) - ? 
cudf::dictionary_column_view(col).keys().type() - : col.type(); - - return make_fixed_width_column( - cudf::detail::target_type(col_type, agg), col.size(), mask_flag, stream); - }); - - table sparse_table(std::move(sparse_columns)); - mutable_table_view table_view = sparse_table.mutable_view(); - cudf::detail::initialize_with_identity(table_view, aggs, stream); - return sparse_table; -} - -/** - * @brief Computes all aggregations from `requests` that require a single pass - * over the data and stores the results in `sparse_results` - */ -template -void compute_single_pass_aggs(table_view const& keys, - host_span requests, - cudf::detail::result_cache* sparse_results, - SetType set, - bool keys_have_nulls, - null_policy include_null_keys, - rmm::cuda_stream_view stream) -{ - // flatten the aggs to a table that can be operated on by aggregate_row - auto const [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests); - - // make table that will hold sparse results - table sparse_table = create_sparse_results_table(flattened_values, agg_kinds, stream); - // prepare to launch kernel to do the actual aggregation - auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream); - auto d_values = table_device_view::create(flattened_values, stream); - auto const d_aggs = cudf::detail::make_device_uvector_async( - agg_kinds, stream, cudf::get_current_device_resource_ref()); - auto const skip_key_rows_with_nulls = - keys_have_nulls and include_null_keys == null_policy::EXCLUDE; - - auto row_bitmask = - skip_key_rows_with_nulls - ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first - : rmm::device_buffer{}; - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - keys.num_rows(), - hash::compute_single_pass_aggs_fn{set, - *d_values, - *d_sparse_table, - d_aggs.data(), - static_cast(row_bitmask.data()), - skip_key_rows_with_nulls}); - // Add results back to sparse_results cache - auto sparse_result_cols = sparse_table.release(); - for (size_t i = 0; i < aggs.size(); i++) { - // Note that the cache will make a copy of this temporary aggregation - sparse_results->add_result( - flattened_values.column(i), *aggs[i], std::move(sparse_result_cols[i])); - } -} - -/** - * @brief Computes and returns a device vector containing all populated keys in - * `map`. - */ -template -rmm::device_uvector extract_populated_keys(SetType const& key_set, - size_type num_keys, - rmm::cuda_stream_view stream) -{ - rmm::device_uvector populated_keys(num_keys, stream); - auto const keys_end = key_set.retrieve_all(populated_keys.begin(), stream.value()); - - populated_keys.resize(std::distance(populated_keys.begin(), keys_end), stream); - return populated_keys; -} - -/** - * @brief Computes groupby using hash table. - * - * First, we create a hash table that stores the indices of unique rows in - * `keys`. The upper limit on the number of values in this map is the number - * of rows in `keys`. - * - * To store the results of aggregations, we create temporary sparse columns - * which have the same size as input value columns. Using the hash map, we - * determine the location within the sparse column to write the result of the - * aggregation into. - * - * The sparse column results of all aggregations are stored into the cache - * `sparse_results`. This enables the use of previously calculated results in - * other aggregations. 
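The gather map described here is what turns the sparse layout into the final dense one: retrieving all populated slots from the hash set yields one index per distinct key, and gathering every sparse aggregation column through those indices compacts it. A host-side toy version of that compaction, with std::vector standing in for device columns:

#include <cstddef>
#include <vector>

int main()
{
  // Sparse SUM results: slot i holds the aggregate for the key first seen at
  // row i; the remaining slots keep their identity value and are never read.
  std::vector<int> sparse_sums = {10, 0, 7, 0, 0, 42};

  // What retrieve_all() on the hash set would produce: the populated slots.
  std::vector<std::size_t> gather_map = {0, 2, 5};

  // Dense results: one entry per distinct key, in gather-map order.
  std::vector<int> dense_sums;
  for (auto idx : gather_map) { dense_sums.push_back(sparse_sums[idx]); }
  // dense_sums == {10, 7, 42}
  return 0;
}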
- * - * All the aggregations which can be computed in a single pass are computed - * first, in a combined kernel. Then, using these results, aggregations that - * require multiple passes are computed. - * - * Finally, using the hash map, we generate a vector of indices of populated - * values in sparse result columns. Then, for each aggregation originally - * requested in `requests`, we gather sparse results into a column of dense - * results using the aforementioned index vector. Dense results are stored into - * the in/out parameter `cache`. - */ -std::unique_ptr<table>
groupby(table_view const& keys, - host_span requests, - cudf::detail::result_cache* cache, - bool const keys_have_nulls, - null_policy const include_null_keys, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - // convert to int64_t to avoid potential overflow with large `keys` - auto const num_keys = static_cast(keys.num_rows()); - auto const null_keys_are_equal = null_equality::EQUAL; - auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const null_keys_are_equal = null_equality::EQUAL; + auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; + auto const skip_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE; auto preprocessed_keys = cudf::experimental::row::hash::preprocessed_table::create(keys, stream); auto const comparator = cudf::experimental::row::equality::self_comparator{preprocessed_keys}; auto const row_hash = cudf::experimental::row::hash::row_hasher{std::move(preprocessed_keys)}; auto const d_row_hash = row_hash.device_hasher(has_null); - // Cache of sparse results where the location of aggregate value in each - // column is indexed by the hash set - cudf::detail::result_cache sparse_results(requests.size()); - - auto const comparator_helper = [&](auto const d_key_equal) { - auto const set = cuco::static_set{ - num_keys, - 0.5, // desired load factor - cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL}, - d_key_equal, - probing_scheme_type{d_row_hash}, - cuco::thread_scope_device, - cuco::storage<1>{}, - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - - // Compute all single pass aggs first - compute_single_pass_aggs(keys, - requests, - &sparse_results, - set.ref(cuco::insert_and_find), - keys_have_nulls, - include_null_keys, - stream); - - // Extract the populated indices from the hash set and create a gather map. - // Gathering using this map from sparse results will give dense results. - auto gather_map = extract_populated_keys(set, keys.num_rows(), stream); - - // Compact all results from sparse_results and insert into cache - sparse_to_dense_results(keys, - requests, - &sparse_results, - cache, - gather_map, - set.ref(cuco::find), - keys_have_nulls, - include_null_keys, - stream, - mr); - - return cudf::detail::gather(keys, - gather_map, - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr); - }; - if (cudf::detail::has_nested_columns(keys)) { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } else { - auto const d_key_equal = comparator.equal_to(has_null, null_keys_are_equal); - return comparator_helper(d_key_equal); + auto const d_row_equal = comparator.equal_to(has_null, null_keys_are_equal); + return compute_groupby( + keys, requests, skip_rows_with_nulls, d_row_equal, d_row_hash, cache, stream, mr); } } - } // namespace /** @@ -661,11 +151,8 @@ std::pair, std::vector> groupby( cudf::detail::result_cache cache(requests.size()); std::unique_ptr
unique_keys = - groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); + dispatch_groupby(keys, requests, &cache, cudf::has_nulls(keys), include_null_keys, stream, mr); return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu new file mode 100644 index 00000000000..37a61c1a22c --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" +#include "var_hash_functor.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +template +hash_compound_agg_finalizer::hash_compound_agg_finalizer( + column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + : col(col), + sparse_results(sparse_results), + dense_results(dense_results), + gather_map(gather_map), + set(set), + row_bitmask(row_bitmask), + stream(stream), + mr(mr) +{ + result_type = + cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type(); +} + +template +auto hash_compound_agg_finalizer::to_dense_agg_result(cudf::aggregation const& agg) +{ + auto s = sparse_results->get_result(col, agg); + auto dense_result_table = cudf::detail::gather(table_view({std::move(s)}), + gather_map, + out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(dense_result_table->release()[0]); +} + +template +auto hash_compound_agg_finalizer::gather_argminmax(aggregation const& agg) +{ + auto arg_result = to_dense_agg_result(agg); + // We make a view of ARG(MIN/MAX) result without a null mask and gather + // using this map. The values in data buffer of ARG(MIN/MAX) result + // corresponding to null values was initialized to ARG(MIN/MAX)_SENTINEL + // which is an out of bounds index value (-1) and causes the gathered + // value to be null. + column_view null_removed_map( + data_type(type_to_id()), + arg_result->size(), + static_cast(arg_result->view().template data()), + nullptr, + 0); + auto gather_argminmax = + cudf::detail::gather(table_view({col}), + null_removed_map, + arg_result->nullable() ? 
cudf::out_of_bounds_policy::NULLIFY + : cudf::out_of_bounds_policy::DONT_CHECK, + cudf::detail::negative_index_policy::NOT_ALLOWED, + stream, + mr); + return std::move(gather_argminmax->release()[0]); +} + +template +void hash_compound_agg_finalizer::visit(cudf::aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::min_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmin_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::max_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + if (result_type.id() == type_id::STRING) { + auto transformed_agg = make_argmax_aggregation(); + dense_results->add_result(col, agg, gather_argminmax(*transformed_agg)); + } else { + dense_results->add_result(col, agg, to_dense_agg_result(agg)); + } +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::mean_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = dense_results->get_result(col, *sum_agg); + column_view count_result = dense_results->get_result(col, *count_agg); + + auto result = + cudf::detail::binary_operation(sum_result, + count_result, + binary_operator::DIV, + cudf::detail::target_type(result_type, aggregation::MEAN), + stream, + mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::var_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + + auto sum_agg = make_sum_aggregation(); + auto count_agg = make_count_aggregation(); + this->visit(*sum_agg); + this->visit(*count_agg); + column_view sum_result = sparse_results->get_result(col, *sum_agg); + column_view count_result = sparse_results->get_result(col, *count_agg); + + auto values_view = column_device_view::create(col, stream); + auto sum_view = column_device_view::create(sum_result, stream); + auto count_view = column_device_view::create(count_result, stream); + + auto var_result = make_fixed_width_column( + cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream); + auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream); + mutable_table_view var_table_view{{var_result->mutable_view()}}; + cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream); + + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + col.size(), + var_hash_functor{ + set, row_bitmask, *var_result_view, *values_view, *sum_view, *count_view, agg._ddof}); + sparse_results->add_result(col, agg, std::move(var_result)); + dense_results->add_result(col, agg, to_dense_agg_result(agg)); +} + +template +void hash_compound_agg_finalizer::visit(cudf::detail::std_aggregation const& agg) +{ + if (dense_results->has_result(col, agg)) return; + auto var_agg = make_variance_aggregation(agg._ddof); + this->visit(*dynamic_cast(var_agg.get())); + column_view variance = 
dense_results->get_result(col, *var_agg); + + auto result = cudf::detail::unary_operation(variance, unary_operator::SQRT, stream, mr); + dense_results->add_result(col, agg, std::move(result)); +} + +template class hash_compound_agg_finalizer>; +template class hash_compound_agg_finalizer>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp new file mode 100644 index 00000000000..8bee1a92c40 --- /dev/null +++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +class hash_compound_agg_finalizer final : public cudf::detail::aggregation_finalizer { + column_view col; + data_type result_type; + cudf::detail::result_cache* sparse_results; + cudf::detail::result_cache* dense_results; + device_span gather_map; + SetType set; + bitmask_type const* __restrict__ row_bitmask; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + public: + using cudf::detail::aggregation_finalizer::visit; + + hash_compound_agg_finalizer(column_view col, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetType set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + + auto to_dense_agg_result(cudf::aggregation const& agg); + + // Enables conversion of ARGMIN/ARGMAX into MIN/MAX + auto gather_argminmax(cudf::aggregation const& agg); + + // Declare overloads for each kind of aggregation to dispatch + void visit(cudf::aggregation const& agg) override; + + void visit(cudf::detail::min_aggregation const& agg) override; + + void visit(cudf::detail::max_aggregation const& agg) override; + + void visit(cudf::detail::mean_aggregation const& agg) override; + + void visit(cudf::detail::var_aggregation const& agg) override; + + void visit(cudf::detail::std_aggregation const& agg) override; +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh new file mode 100644 index 00000000000..0d117ca35b3 --- /dev/null +++ b/cpp/src/groupby/hash/helpers.cuh @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
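The finalizer above never computes MEAN, VAR, or STD directly: MEAN is derived from the single-pass SUM and COUNT_VALID results, VAR adds one more pass over the values, and STD is a square root applied to VAR at finalize time. A scalar illustration of that decomposition for a single group (plain host code, not the cudf kernels):

#include <cassert>
#include <cmath>

int main()
{
  double const values[] = {1.0, 2.0, 3.0, 6.0};

  // Single-pass aggregations.
  double sum = 0.0;
  int count = 0;
  for (double v : values) { sum += v; ++count; }

  double const mean = sum / count;  // MEAN = SUM / COUNT_VALID
  int const ddof = 1;
  double var = 0.0;
  for (double v : values) { var += (v - mean) * (v - mean); }
  var /= (count - ddof);                  // VAR with delta degrees of freedom
  double const std_dev = std::sqrt(var);  // STD = sqrt(VAR), applied in finalize
  assert(mean == 3.0);
  assert(std_dev > 0.0);
  return 0;
}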
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { +// TODO: similar to `contains_table`, using larger CG size like 2 or 4 for nested +// types and `cg_size = 1` for flat data to improve performance +/// Number of threads to handle each input element +CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1; + +/// Number of slots per thread +CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1; + +/// Thread block size +CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128; + +/// Threshold cardinality to switch between shared memory aggregations and global memory +/// aggregations +CUDF_HOST_DEVICE auto constexpr GROUPBY_CARDINALITY_THRESHOLD = 128; + +// We add an additional `block_size` because, after the number of elements in the local hash set +// exceeds the threshold, all threads in the thread block can still insert one more element. +/// The maximum number of elements handled per block +CUDF_HOST_DEVICE auto constexpr GROUPBY_SHM_MAX_ELEMENTS = + GROUPBY_CARDINALITY_THRESHOLD + GROUPBY_BLOCK_SIZE; + +// GROUPBY_SHM_MAX_ELEMENTS with 0.7 occupancy +/// Shared memory hash set extent type +using shmem_extent_t = + cuco::extent(static_cast(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>; + +/// Number of windows needed by each shared memory hash set +CUDF_HOST_DEVICE auto constexpr window_extent = + cuco::make_window_extent(shmem_extent_t{}); + +/** + * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. + */ +CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) +{ + std::size_t constexpr base = 8; + return cudf::util::div_rounding_up_safe(num, base) * base; +} + +using row_hash_t = + cudf::experimental::row::hash::device_row_hasher; + +/// Probing scheme type used by groupby hash table +using probing_scheme_t = cuco::linear_probing; + +using row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + false, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using nullable_row_comparator_t = cudf::experimental::row::equality::device_row_comparator< + true, + cudf::nullate::DYNAMIC, + cudf::experimental::row::equality::nan_equal_physical_equality_comparator>; + +using global_set_t = cuco::static_set, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +using nullable_global_set_t = cuco::static_set, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cudf::detail::cuco_allocator, + cuco::storage>; + +template +using hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; + +template +using nullable_hash_set_ref_t = cuco::static_set_ref< + cudf::size_type, + cuda::thread_scope_device, + nullable_row_comparator_t, + probing_scheme_t, + cuco::aow_storage_ref>, + Op>; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/shared_memory_aggregator.cuh b/cpp/src/groupby/hash/shared_memory_aggregator.cuh new file mode 100644 index 00000000000..9cbeeb34b86 --- /dev/null +++ b/cpp/src/groupby/hash/shared_memory_aggregator.cuh @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. 
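Worked through with the constants just defined: a block aggregates in shared memory until it has seen GROUPBY_CARDINALITY_THRESHOLD (128) distinct keys, and because all GROUPBY_BLOCK_SIZE (128) threads may each insert one more element after the threshold trips, the per-block set must hold 256 entries; sizing that for a 0.7 load factor multiplies by roughly 1.43. A host-side check of the arithmetic (the real code additionally lets cuco::make_window_extent round the extent up to a size valid for the probing scheme):

#include <cstddef>

constexpr std::size_t cardinality_threshold = 128;  // GROUPBY_CARDINALITY_THRESHOLD
constexpr std::size_t block_size            = 128;  // GROUPBY_BLOCK_SIZE

// Every thread of the block may insert one more element after the threshold trips.
constexpr std::size_t shm_max_elements = cardinality_threshold + block_size;

// Target load factor of 0.7 => capacity ~= n / 0.7 ~= n * 1.43.
constexpr std::size_t shmem_extent =
  static_cast<std::size_t>(static_cast<double>(shm_max_elements) * 1.43);

static_assert(shm_max_elements == 256);
static_assert(shmem_extent == 366);  // before cuco rounds it to a valid extent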
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +struct update_target_element_shmem { + __device__ void operator()( + cuda::std::byte*, bool*, cudf::size_type, cudf::column_device_view, cudf::size_type) const + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MIN, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_min(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::MAX, + cuda::std::enable_if_t() && cudf::has_atomic_support()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_max(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM, + cuda::std::enable_if_t() && cudf::has_atomic_support() && + !cudf::is_timestamp()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using DeviceTarget = cudf::detail::underlying_target_t; + using DeviceSource = cudf::detail::underlying_source_t; + + DeviceTarget* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::SUM_OF_SQUARES, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto value = 
static_cast(source.element(source_index)); + cudf::detail::atomic_add(&target_casted[target_index], value * value); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::PRODUCT, + cuda::std::enable_if_t()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_mul(&target_casted[target_index], + static_cast(source.element(source_index))); + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_VALID, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // The nullability was checked prior to this call in the `shmem_element_aggregator` functor + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::COUNT_ALL, + cuda::std::enable_if_t< + cudf::detail::is_valid_aggregation()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + cudf::detail::atomic_add(&target_casted[target_index], Target{1}); + + // Assumes target is already set to be valid + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMAX, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMAX_SENTINEL, source_index); + if (old != cudf::detail::ARGMAX_SENTINEL) { + while (source.element(source_index) > source.element(old)) { + old = cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +template +struct update_target_element_shmem< + Source, + cudf::aggregation::ARGMIN, + cuda::std::enable_if_t() and + cudf::is_relationally_comparable()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + using Target = cudf::detail::target_type_t; + Target* target_casted = reinterpret_cast(target); + auto old = cudf::detail::atomic_cas( + &target_casted[target_index], cudf::detail::ARGMIN_SENTINEL, source_index); + if (old != cudf::detail::ARGMIN_SENTINEL) { + while (source.element(source_index) < source.element(old)) { + old = 
cudf::detail::atomic_cas(&target_casted[target_index], old, source_index); + } + } + + if (!target_mask[target_index]) { target_mask[target_index] = true; } + } +}; + +/** + * @brief A functor that updates a single element in the target column stored in shared memory by + * applying an aggregation operation to a corresponding element from a source column in global + * memory. + * + * This functor can NOT be used for dictionary columns. + * + * This is a redundant copy replicating the behavior of `elementwise_aggregator` from + * `cudf/detail/aggregation/device_aggregators.cuh`. The key difference is that this functor accepts + * a pointer to raw bytes as the source, as `column_device_view` cannot yet be constructed from + * shared memory. + */ +struct shmem_element_aggregator { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type target_index, + cudf::column_device_view source, + cudf::size_type source_index) const noexcept + { + // Check nullability for all aggregation kinds but `COUNT_ALL` + if constexpr (k != cudf::aggregation::COUNT_ALL) { + if (source.is_null(source_index)) { return; } + } + update_target_element_shmem{}( + target, target_mask, target_index, source, source_index); + } +}; +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/groupby_kernels.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh similarity index 95% rename from cpp/src/groupby/hash/groupby_kernels.cuh rename to cpp/src/groupby/hash/single_pass_functors.cuh index 188d0cff3f1..73791b3aa71 100644 --- a/cpp/src/groupby/hash/groupby_kernels.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -13,22 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once -#include "multi_pass_kernels.cuh" - #include #include #include #include -#include +#include -namespace cudf { -namespace groupby { -namespace detail { -namespace hash { +namespace cudf::groupby::detail::hash { /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys @@ -104,8 +98,4 @@ struct compute_single_pass_aggs_fn { } } }; - -} // namespace hash -} // namespace detail -} // namespace groupby -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.cu b/cpp/src/groupby/hash/sparse_to_dense_results.cu new file mode 100644 index 00000000000..e1c2cd22309 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
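Unlike the global-memory path, the shared-memory updates above write into raw per-block buffers: each specialization reinterprets a cuda::std::byte region as its target type and tracks validity in a separate bool array, and `shmem_element_aggregator` skips null inputs up front for every kind except COUNT_ALL. A stripped-down sketch of one such update (stand-alone device code, not the cudf functor):

#include <cuda/std/cstddef>

__device__ void shmem_min_update(cuda::std::byte* target,
                                 bool* target_mask,
                                 int target_index,
                                 int source_value)
{
  // The per-block result "column" is just raw shared-memory bytes; cast it to
  // the target type before the atomic update.
  auto* target_typed = reinterpret_cast<int*>(target);
  atomicMin(&target_typed[target_index], source_value);

  // Validity lives in a separate boolean array; set it on first contribution.
  if (!target_mask[target_index]) { target_mask[target_index] = true; }
}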
+ */ + +#include "hash_compound_agg_finalizer.hpp" +#include "helpers.cuh" + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + for (auto const& request : requests) { + auto const& agg_v = request.aggregations; + auto const& col = request.values; + + // Given an aggregation, this will get the result from sparse_results and + // convert and return dense, compacted result + auto finalizer = hash_compound_agg_finalizer( + col, sparse_results, dense_results, gather_map, set, row_bitmask, stream, mr); + for (auto&& agg : agg_v) { + agg->finalize(finalizer); + } + } +} + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +template void sparse_to_dense_results>( + host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + nullable_hash_set_ref_t set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/sparse_to_dense_results.hpp b/cpp/src/groupby/hash/sparse_to_dense_results.hpp new file mode 100644 index 00000000000..3a2b3090b99 --- /dev/null +++ b/cpp/src/groupby/hash/sparse_to_dense_results.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
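`sparse_to_dense_results` is declared in a header but defined in a .cu file, with explicit instantiations for exactly the two set-ref types the groupby uses; that keeps the heavy template out of every includer and bounds compile times. The idiom, reduced to a minimal header/source pair (names are illustrative):

// results.hpp - declaration only; includers never see the definition.
template <typename SetRef>
void sparse_to_dense(SetRef set);

// results.cu - the definition plus the only instantiations that will ever exist.
template <typename SetRef>
void sparse_to_dense(SetRef set)
{
  // ... gather/finalize work parameterized on the hash-set ref type ...
}

// Stand-ins for hash_set_ref_t<cuco::find_tag> and its nullable counterpart:
template void sparse_to_dense<int>(int);
template void sparse_to_dense<long>(long);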
+ */ +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +/** + * @brief Gather sparse aggregation results into dense using `gather_map` and add to + * `dense_results` + * + * @tparam SetRef Device hash set ref type + * + * @param[in] requests The set of columns to aggregate and the aggregations to perform + * @param[in] sparse_results Sparse aggregation results + * @param[out] dense_results Dense aggregation results + * @param[in] gather_map Gather map indicating valid elements in `sparse_results` + * @param[in] set Device hash set ref + * @param[in] row_bitmask Bitmask indicating the validity of input keys + * @param[in] stream CUDA stream used for device memory operations and kernel launches + * @param[in] mr Device memory resource used to allocate the returned table + */ +template +void sparse_to_dense_results(host_span requests, + cudf::detail::result_cache* sparse_results, + cudf::detail::result_cache* dense_results, + device_span gather_map, + SetRef set, + bitmask_type const* row_bitmask, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/var_hash_functor.cuh similarity index 69% rename from cpp/src/groupby/hash/multi_pass_kernels.cuh rename to cpp/src/groupby/hash/var_hash_functor.cuh index 7043eafdc10..bb55cc9188c 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/var_hash_functor.cuh @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - #pragma once #include @@ -21,17 +20,14 @@ #include #include #include -#include #include +#include #include +#include -#include - -namespace cudf { -namespace detail { - -template +namespace cudf::groupby::detail::hash { +template struct var_hash_functor { SetType set; bitmask_type const* __restrict__ row_bitmask; @@ -47,13 +43,13 @@ struct var_hash_functor { column_device_view sum, column_device_view count, size_type ddof) - : set(set), - row_bitmask(row_bitmask), - target(target), - source(source), - sum(sum), - count(count), - ddof(ddof) + : set{set}, + row_bitmask{row_bitmask}, + target{target}, + source{source}, + sum{sum}, + count{count}, + ddof{ddof} { } @@ -64,23 +60,21 @@ struct var_hash_functor { } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { CUDF_UNREACHABLE("Invalid source type for std, var aggregation combination."); } template - __device__ std::enable_if_t()> operator()(column_device_view const& source, - size_type source_index, - size_type target_index) noexcept + __device__ cuda::std::enable_if_t()> operator()( + column_device_view const& source, size_type source_index, size_type target_index) noexcept { - using Target = target_type_t; - using SumType = target_type_t; - using CountType = target_type_t; + using Target = cudf::detail::target_type_t; + using SumType = cudf::detail::target_type_t; + using CountType = cudf::detail::target_type_t; - if (source_has_nulls and source.is_null(source_index)) return; + if (source.is_null(source_index)) return; CountType group_size = count.element(target_index); if (group_size == 0 or group_size - ddof <= 0) return; @@ -91,8 +85,9 @@ 
struct var_hash_functor { ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() - if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } + if (target.is_null(target_index)) { target.set_valid(target_index); } } + __device__ inline void operator()(size_type source_index) { if (row_bitmask == nullptr or cudf::bit_is_set(row_bitmask, source_index)) { @@ -110,6 +105,4 @@ struct var_hash_functor { } } }; - -} // namespace detail -} // namespace cudf +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu index 82d557b9f7e..d6c900fb689 100644 --- a/cpp/src/groupby/sort/group_quantiles.cu +++ b/cpp/src/groupby/sort/group_quantiles.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ struct quantiles_functor { auto values_view = column_device_view::create(values, stream); auto group_size_view = column_device_view::create(group_sizes, stream); auto result_view = mutable_column_device_view::create(result->mutable_view(), stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); // For each group, calculate quantile if (!cudf::is_dictionary(values.type())) { diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu index 86ee20dbbe2..c3dfac46502 100644 --- a/cpp/src/groupby/sort/group_std.cu +++ b/cpp/src/groupby/sort/group_std.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -134,7 +134,7 @@ struct var_functor { // set nulls auto result_view = mutable_column_device_view::create(*result, stream); - auto null_count = rmm::device_scalar(0, stream, mr); + auto null_count = cudf::detail::device_scalar(0, stream, mr); auto d_null_count = null_count.data(); thrust::for_each_n( rmm::exec_policy(stream), diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index a2874b46b06..fc1b0226a48 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -35,7 +36,6 @@ #include #include -#include #include #include @@ -60,7 +60,7 @@ template struct is_device_scalar : public std::false_type {}; template -struct is_device_scalar> : public std::true_type {}; +struct is_device_scalar> : public std::true_type {}; template struct is_device_uvector : public std::false_type {}; @@ -232,10 +232,10 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. 
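In the `var_hash_functor` above, each input row contributes (x - mean)^2 / (n - ddof) to its group's slot via an atomic fetch_add, so the per-group sum of contributions is exactly the usual variance estimator; only the final square root for STD is deferred to the finalizer. A scalar check of that decomposition:

#include <cassert>

int main()
{
  double const x[] = {2.0, 4.0, 6.0};
  int const n = 3, ddof = 1;
  double const mean = (2.0 + 4.0 + 6.0) / n;

  // One atomic fetch_add per row in the kernel; a plain sum here.
  double var = 0.0;
  for (double v : x) { var += (v - mean) * (v - mean) / (n - ddof); }

  assert(var == 4.0);  // sample variance of {2, 4, 6}
  return 0;
}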
if (nanoarrow_type == NANOARROW_TYPE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } @@ -466,10 +466,10 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } else { - auto zero = std::make_unique>(0, stream, mr); + auto zero = std::make_unique>(0, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); } diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu index 79fb7550044..8ec0904f1ba 100644 --- a/cpp/src/interop/to_arrow_host.cu +++ b/cpp/src/interop/to_arrow_host.cu @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -52,6 +53,30 @@ namespace detail { namespace { +/* + Enable Transparent Huge Pages (THP) for large (>4MB) allocations. + `buf` is returned untouched. + Enabling THP can improve performance of device-host memory transfers + significantly, see . +*/ +void enable_hugepage(ArrowBuffer* buffer) +{ + if (buffer->size_bytes < (1u << 22u)) { // Smaller than 4 MB + return; + } + +#ifdef MADV_HUGEPAGE + auto const pagesize = sysconf(_SC_PAGESIZE); + void* addr = const_cast(buffer->data); + auto length{static_cast(buffer->size_bytes)}; + if (std::align(pagesize, pagesize, addr, length)) { + // Intentionally not checking for errors that may be returned by older kernel versions; + // optimistically tries enabling huge pages. + madvise(addr, length, MADV_HUGEPAGE); + } +#endif +} + struct dispatch_to_arrow_host { cudf::column_view column; rmm::cuda_stream_view stream; @@ -62,6 +87,7 @@ struct dispatch_to_arrow_host { if (!column.has_nulls()) { return NANOARROW_OK; } NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + enable_hugepage(&bitmap->buffer); CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, (column.offset() > 0) ? 
cudf::detail::copy_bitmask(column, stream, mr).data() @@ -76,6 +102,7 @@ struct dispatch_to_arrow_host { int populate_data_buffer(device_span input, ArrowBuffer* buffer) const { NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + enable_hugepage(buffer); CUDF_CUDA_TRY(cudaMemcpyAsync( buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); return NANOARROW_OK; diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index 03cf6d4a0e0..d5caa4720ac 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -16,6 +16,7 @@ #include "avro.hpp" +#include #include #include @@ -302,7 +303,7 @@ bool schema_parser::parse(std::vector& schema, std::string const& // Empty schema if (json_str == "[]") return true; - char depthbuf[MAX_SCHEMA_DEPTH]; + std::array depthbuf; int depth = 0, parent_idx = -1, entry_idx = -1; json_state_e state = state_attrname; std::string str; diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp index 1af45b41d8e..d4d6f46b99a 100644 --- a/cpp/src/io/comp/uncomp.cpp +++ b/cpp/src/io/comp/uncomp.cpp @@ -538,8 +538,10 @@ size_t decompress_zstd(host_span src, CUDF_EXPECTS(hd_stats[0].status == compression_status::SUCCESS, "ZSTD decompression failed"); // Copy temporary output to `dst` - CUDF_CUDA_TRY(cudaMemcpyAsync( - dst.data(), d_dst.data(), hd_stats[0].bytes_written, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + dst.subspan(0, hd_stats[0].bytes_written), + device_span{d_dst.data(), hd_stats[0].bytes_written}, + stream); return hd_stats[0].bytes_written; } diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 8c32fc85f78..72fca75c56b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -21,6 +21,7 @@ #include "csv_common.hpp" #include "csv_gpu.hpp" +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/io_uncomp.hpp" #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" @@ -275,11 +276,10 @@ std::pair, selected_rows_offsets> load_data_and_gather auto const read_offset = byte_range_offset + input_pos + previous_data_size; auto const read_size = target_pos - input_pos - previous_data_size; if (data.has_value()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - data->data() + read_offset, - target_pos - input_pos - previous_data_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{d_data.data() + previous_data_size, read_size}, + data->subspan(read_offset, read_size), + stream); } else { if (source->is_device_read_preferred(read_size)) { source->device_read(read_offset, @@ -288,12 +288,11 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); } else { auto const buffer = source->host_read(read_offset, read_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data() + previous_data_size, - buffer->data(), - buffer->size(), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); // To prevent buffer going out of scope before we copy the data. + // Use sync version to prevent buffer going out of scope before we copy the data. 
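The pattern throughout these hunks swaps raw cudaMemcpyAsync calls (pointer, byte count, direction, stream, plus a manual synchronize) for the typed-span helpers, which derive size and direction from the spans and make the sync/async choice explicit at the call site. A hypothetical usage sketch, assuming the signatures match the calls shown in this diff:

#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

void copy_example(rmm::cuda_stream_view stream)
{
  std::vector<int> host_src(1024, 7);
  rmm::device_uvector<int> device_dst(host_src.size(), stream);

  // The synchronous variant blocks until the copy completes, so a short-lived
  // host buffer such as host_src is safe to copy from; cuda_memcpy_async would
  // require host_src to outlive the in-flight stream work.
  cudf::detail::cuda_memcpy(cudf::device_span<int>{device_dst},
                            cudf::host_span<int const>{host_src},
                            stream);
}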
+ cudf::detail::cuda_memcpy( + device_span{d_data.data() + previous_data_size, read_size}, + host_span{reinterpret_cast(buffer->data()), buffer->size()}, + stream); } } @@ -311,12 +310,10 @@ std::pair, selected_rows_offsets> load_data_and_gather range_end, skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); // Sum up the rows in each character block, selecting the row count that // corresponds to the current input context. Also stores the now known input @@ -331,11 +328,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // At least one row in range in this batch all_row_offsets.resize(total_rows - skip_rows, stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.device_ptr(), - row_ctx.host_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async(device_span{row_ctx}.subspan(0, num_blocks), + host_span{row_ctx}.subspan(0, num_blocks), + stream); // Pass 2: Output row offsets cudf::io::csv::gpu::gather_row_offsets(parse_opts.view(), @@ -352,12 +347,9 @@ std::pair, selected_rows_offsets> load_data_and_gather stream); // With byte range, we want to keep only one row out of the specified range if (range_end < data_size) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_ctx.device_ptr(), - num_blocks * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, num_blocks), + device_span{row_ctx}.subspan(0, num_blocks), + stream); size_t rows_out_of_range = 0; for (uint32_t i = 0; i < num_blocks; i++) { @@ -401,12 +393,9 @@ std::pair, selected_rows_offsets> load_data_and_gather // Remove header rows and extract header auto const header_row_index = std::max(header_rows, 1) - 1; if (header_row_index + 1 < row_offsets.size()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(row_ctx.host_ptr(), - row_offsets.data() + header_row_index, - 2 * sizeof(uint64_t), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy(host_span{row_ctx}.subspan(0, 2), + device_span{row_offsets.data() + header_row_index, 2}, + stream); auto const header_start = input_pos + row_ctx[0]; auto const header_end = input_pos + row_ctx[1]; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index b84446b5f3e..2bbe05ced84 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -405,13 +406,8 @@ void write_chunked(data_sink* out_sink, out_sink->device_write(ptr_all_bytes, total_num_bytes, stream); } else { // copy the bytes to host to write them out - thrust::host_vector h_bytes(total_num_bytes); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_bytes.data(), - ptr_all_bytes, - total_num_bytes * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const h_bytes = cudf::detail::make_host_vector_sync( + device_span{ptr_all_bytes, total_num_bytes}, stream); out_sink->host_write(h_bytes.data(), total_num_bytes); } diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index de8eea9e99b..a8682e6a760 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -122,14 +122,14 @@ chunked_parquet_writer_options_builder 
chunked_parquet_writer_options::builder( namespace { std::vector> make_datasources(source_info const& info, - size_t range_offset = 0, - size_t range_size = 0) + size_t offset = 0, + size_t max_size_estimate = 0) { switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); for (auto const& filepath : info.filepaths()) { - sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); + sources.emplace_back(cudf::io::datasource::create(filepath, offset, max_size_estimate)); } return sources; } diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 5855f1b5a5f..d06338c6f69 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -134,12 +134,13 @@ std::vector copy_strings_to_host_sync( // build std::string vector from chars and offsets std::vector host_data; host_data.reserve(col.size()); - std::transform( - std::begin(h_offsets), - std::end(h_offsets) - 1, - std::begin(h_offsets) + 1, - std::back_inserter(host_data), - [&](auto start, auto end) { return std::string(h_chars.data() + start, end - start); }); + std::transform(std::begin(h_offsets), + std::end(h_offsets) - 1, + std::begin(h_offsets) + 1, + std::back_inserter(host_data), + [&h_chars](auto start, auto end) { + return std::string(h_chars.data() + start, end - start); + }); return host_data; }; return to_host(d_column_names->view()); @@ -170,643 +171,78 @@ rmm::device_uvector is_all_nulls_each_column(device_span rmm::device_uvector is_all_nulls(num_cols, stream); thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true); - auto parse_opt = parsing_options(options, stream); - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [options = parse_opt.view(), - data = input.data(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { - auto const node_category = column_categories[col_ids[i]]; - if (node_category == NC_STR or node_category == NC_VAL) { - auto const is_null_literal = serialized_trie_contains( - options.trie_na, - {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); - if (!is_null_literal) is_all_nulls[col_ids[i]] = false; - } - }); - return is_all_nulls; -} - -NodeIndexT get_row_array_parent_col_id(device_span col_ids, - bool is_enabled_lines, - rmm::cuda_stream_view stream) -{ - NodeIndexT value = parent_node_sentinel; - if (!col_ids.empty()) { - auto const list_node_index = is_enabled_lines ? 
0 : 1; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - } - return value; -} -/** - * @brief Holds member data pointers of `d_json_column` - * - */ -struct json_column_data { - using row_offset_t = json_column::row_offset_t; - row_offset_t* string_offsets; - row_offset_t* string_lengths; - row_offset_t* child_offsets; - bitmask_type* validity; -}; - -using hashmap_of_device_columns = - std::unordered_map>; - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream); - -/** - * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are inserted into `root`'s children. - * `root` must be a list type. - * - * @param input Input JSON string device data - * @param tree Node tree representation of the JSON string - * @param col_ids Column ids of the nodes in the tree - * @param row_offsets Row offsets of the nodes in the tree - * @param root Root node of the `d_json_column` tree - * @param is_array_of_arrays Whether the tree is an array of arrays - * @param options Parsing options specifying the parsing behaviour - * options affecting behaviour are - * is_enabled_lines: Whether the input is a line-delimited JSON - * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the device memory - * of child_offets and validity members of `d_json_column` - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_lines = options.is_enabled_lines(); - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - // make a copy - auto sorted_col_ids = cudf::detail::make_device_uvector_async( - col_ids, stream, cudf::get_current_device_resource_ref()); - - // sort by {col_id} on {node_ids} stable - rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - sorted_col_ids.begin(), - sorted_col_ids.end(), - node_ids.begin()); - - NodeIndexT const row_array_parent_col_id = - get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); - - // 1. gather column information. 
- auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = - reduce_to_column_tree(tree, - col_ids, - sorted_col_ids, - node_ids, - row_offsets, - is_array_of_arrays, - row_array_parent_col_id, - stream); - auto num_columns = d_unique_col_ids.size(); - std::vector column_names = copy_strings_to_host_sync( - input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); - // array of arrays column names - if (is_array_of_arrays) { - auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; - auto values_column_indices = - get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); - auto h_values_column_indices = - cudf::detail::make_host_vector_sync(values_column_indices, stream); - std::transform(unique_col_ids.begin(), - unique_col_ids.end(), - column_names.cbegin(), - column_names.begin(), - [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( - auto col_id, auto name) mutable { - return column_parent_ids[col_id] == row_array_parent_col_id - ? std::to_string(h_values_column_indices[col_id]) - : name; - }); - } - - auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { - if (is_enabled_mixed_types_as_string) { - return cudf::detail::make_std_vector_sync( - is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); - } - return std::vector(); - }(); - auto const [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); - - scatter_offsets(tree, - col_ids, - row_offsets, - node_ids, - sorted_col_ids, - d_column_tree, - ignore_vals, - columns, - stream); -} - -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); - auto column_categories = - cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); - auto const column_parent_ids = - cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); - auto column_range_beg = - cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); - auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); - auto num_columns = d_unique_col_ids.size(); - stream.synchronize(); - - auto to_json_col_type = [](auto category) { - switch (category) { - case NC_STRUCT: return json_col_t::StructColumn; - case NC_LIST: return json_col_t::ListColumn; - case NC_STR: [[fallthrough]]; - case NC_VAL: return json_col_t::StringColumn; - default: return json_col_t::Unknown; - } - }; - auto init_to_zero = [stream](auto& v) { - thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0); - }; - - auto initialize_json_columns = [&](auto i, 
auto& col, auto column_category) { - if (column_category == NC_ERR || column_category == NC_FN) { - return; - } else if (column_category == NC_VAL || column_category == NC_STR) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - } else if (column_category == NC_LIST) { - col.child_offsets.resize(max_row_offsets[i] + 2, stream); - init_to_zero(col.child_offsets); - } - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = to_json_col_type(column_category); - }; - - auto reinitialize_as_string = [&](auto i, auto& col) { - col.string_offsets.resize(max_row_offsets[i] + 1, stream); - col.string_lengths.resize(max_row_offsets[i] + 1, stream); - init_to_zero(col.string_offsets); - init_to_zero(col.string_lengths); - col.num_rows = max_row_offsets[i] + 1; - col.validity = - cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); - col.type = json_col_t::StringColumn; - // destroy references of all child columns after this step, by calling remove_child_columns - }; - - path_from_tree tree_path{column_categories, - column_parent_ids, - column_names, - is_array_of_arrays, - row_array_parent_col_id}; - - // 2. generate nested columns tree and its device_memory - // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. - auto h_range_col_id_it = - thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<0>(a) < thrust::get<0>(b); - }); - - // use hash map because we may skip field name's col_ids - hashmap_of_device_columns columns; - // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking - std::map, NodeIndexT> mapped_columns; - // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); - std::fill(ignore_vals.begin(), ignore_vals.end(), false); - std::vector is_mixed_type_column(num_columns, 0); - std::vector is_pruned(num_columns, 0); - // for columns that are not mixed type but have been forced as string - std::vector forced_as_string_column(num_columns); - columns.try_emplace(parent_node_sentinel, std::ref(root)); - - std::function remove_child_columns = - [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto const& col_name : col.column_order) { - auto child_id = mapped_columns[{this_col_id, col_name}]; - is_mixed_type_column[child_id] = 1; - remove_child_columns(child_id, col.child_columns.at(col_name)); - mapped_columns.erase({this_col_id, col_name}); - columns.erase(child_id); - } - col.child_columns.clear(); // their references are deleted above. 
- col.column_order.clear(); - }; - - auto name_and_parent_index = [&is_array_of_arrays, - &row_array_parent_col_id, - &column_parent_ids, - &column_categories, - &column_names](auto this_col_id) { - std::string name = ""; - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { - name = column_names[this_col_id]; - } else { - name = list_child_name; - } - } else if (column_categories[parent_col_id] == NC_FN) { - auto field_name_col_id = parent_col_id; - parent_col_id = column_parent_ids[parent_col_id]; - name = column_names[field_name_col_id]; - } else { - CUDF_FAIL("Unexpected parent column category"); - } - return std::pair{name, parent_col_id}; - }; - - // Prune columns that are not required to be parsed. - if (options.is_enabled_prune_columns()) { - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - // get path of this column, and get its dtype if present in options - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) { - is_pruned[this_col_id] = 1; - continue; - } else { - // make sure all its parents are not pruned. - while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) { - is_pruned[parent_col_id] = 0; - parent_col_id = column_parent_ids[parent_col_id]; - } - } - } - } - - // Build the column tree, also, handles mixed types. - for (auto const this_col_id : unique_col_ids) { - if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) { - continue; - } - // Struct, List, String, Value - auto [name, parent_col_id] = name_and_parent_index(this_col_id); - - // if parent is mixed type column or this column is pruned or if parent - // has been forced as string, ignore this column. - if (parent_col_id != parent_node_sentinel && - (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || - forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = true; - if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } - if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } - continue; - } - - // If the child is already found, - // replace if this column is a nested column and the existing was a value column - // ignore this column if this column is a value column and the existing was a nested column - auto it = columns.find(parent_col_id); - CUDF_EXPECTS(it != columns.end(), "Parent column not found"); - auto& parent_col = it->second.get(); - bool replaced = false; - if (mapped_columns.count({parent_col_id, name}) > 0) { - auto const old_col_id = mapped_columns[{parent_col_id, name}]; - // If mixed type as string is enabled, make both of them strings and merge them. - // All child columns will be ignored when parsing. - if (is_enabled_mixed_types_as_string) { - bool const is_mixed_type = [&]() { - // If new or old is STR and they are all not null, make it mixed type, else ignore. 
- if (column_categories[this_col_id] == NC_VAL || - column_categories[this_col_id] == NC_STR) { - if (is_str_column_all_nulls[this_col_id]) return false; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - if (is_str_column_all_nulls[old_col_id]) return false; - } - return true; - }(); - if (is_mixed_type) { - is_mixed_type_column[this_col_id] = 1; - is_mixed_type_column[old_col_id] = 1; - // if old col type (not cat) is list or struct, replace with string. - auto& col = columns.at(old_col_id).get(); - if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) { - reinitialize_as_string(old_col_id, col); - remove_child_columns(old_col_id, col); - // all its children (which are already inserted) are ignored later. - } - col.forced_as_string_column = true; - columns.try_emplace(this_col_id, columns.at(old_col_id)); - continue; - } - } - - if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = true; - continue; - } - if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { - // remap - ignore_vals[old_col_id] = true; - mapped_columns.erase({parent_col_id, name}); - columns.erase(old_col_id); - parent_col.child_columns.erase(name); - replaced = true; // to skip duplicate name in column_order - } else { - // If this is a nested column but we're trying to insert either (a) a list node into a - // struct column or (b) a struct node into a list column, we fail - CUDF_EXPECTS(not((column_categories[old_col_id] == NC_LIST and - column_categories[this_col_id] == NC_STRUCT) or - (column_categories[old_col_id] == NC_STRUCT and - column_categories[this_col_id] == NC_LIST)), - "A mix of lists and structs within the same column is not supported"); - } - } - - auto this_column_category = column_categories[this_col_id]; - // get path of this column, check if it is a struct/list forced as string, and enforce it - auto const nt = tree_path.get_path(this_col_id); - std::optional const user_dtype = get_path_data_type(nt, options); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - this_column_category = NC_STR; - } - - CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); - // move into parent - device_json_column col(stream, mr); - initialize_json_columns(this_col_id, col, this_column_category); - if ((column_categories[this_col_id] == NC_STRUCT or - column_categories[this_col_id] == NC_LIST) and - user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) { - col.forced_as_string_column = true; - forced_as_string_column[this_col_id] = true; - } - - auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second; - CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); - if (not replaced) parent_col.column_order.push_back(name); - columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name))); - mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id); - } - - if (is_enabled_mixed_types_as_string) { - // ignore all children of mixed type columns - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { - is_mixed_type_column[this_col_id] = 1; - 
ignore_vals[this_col_id] = true; - columns.erase(this_col_id); - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 0 and - is_mixed_type_column[this_col_id] == 1) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - } - - // ignore all children of columns forced as string - for (auto const this_col_id : unique_col_ids) { - auto parent_col_id = column_parent_ids[this_col_id]; - if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { - forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = true; - } - // Convert only mixed type columns as string (so to copy), but not its children - if (parent_col_id != parent_node_sentinel and not forced_as_string_column[parent_col_id] and - forced_as_string_column[this_col_id]) - column_categories[this_col_id] = NC_STR; - } - cudf::detail::cuda_memcpy_async(d_column_tree.node_categories.begin(), - column_categories.data(), - column_categories.size() * sizeof(column_categories[0]), - cudf::detail::host_memory_kind::PAGEABLE, - stream); - - // restore unique_col_ids order - std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { - return thrust::get<1>(a) < thrust::get<1>(b); - }); - return {ignore_vals, columns}; -} - -void scatter_offsets(tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t const& d_column_tree, - host_span ignore_vals, - hashmap_of_device_columns const& columns, - rmm::cuda_stream_view stream) -{ - auto const num_nodes = col_ids.size(); - auto const num_columns = d_column_tree.node_categories.size(); - // move columns data to device. - auto columns_data = cudf::detail::make_host_vector(num_columns, stream); - for (auto& [col_id, col_ref] : columns) { - if (col_id == parent_node_sentinel) continue; - auto& col = col_ref.get(); - columns_data[col_id] = json_column_data{col.string_offsets.data(), - col.string_lengths.data(), - col.child_offsets.data(), - static_cast(col.validity.data())}; - } - - auto d_ignore_vals = cudf::detail::make_device_uvector_async( - ignore_vals, stream, cudf::get_current_device_resource_ref()); - auto d_columns_data = cudf::detail::make_device_uvector_async( - columns_data, stream, cudf::get_current_device_resource_ref()); - - // 3. 
scatter string offsets to respective columns, set validity bits - thrust::for_each_n( - rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - num_nodes, - [column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin(), - row_offsets = row_offsets.begin(), - range_begin = tree.node_range_begin.begin(), - range_end = tree.node_range_end.begin(), - d_ignore_vals = d_ignore_vals.begin(), - d_columns_data = d_columns_data.begin()] __device__(size_type i) { - if (d_ignore_vals[col_ids[i]]) return; - auto const node_category = column_categories[col_ids[i]]; - switch (node_category) { - case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; - case NC_STR: [[fallthrough]]; - case NC_VAL: - if (d_ignore_vals[col_ids[i]]) break; - set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); - d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; - d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; - break; - default: break; - } - }); - - // 4. scatter List offset - // copy_if only node's whose parent is list, (node_id, parent_col_id) - // stable_sort by parent_col_id of {node_id}. - // For all unique parent_node_id of (i==0, i-1!=i), write start offset. - // (i==last, i+1!=i), write end offset. - // unique_copy_by_key {parent_node_id} {row_offset} to - // col[parent_col_id].child_offsets[row_offset[parent_node_id]] - - auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids - auto parent_col_id = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [col_ids = col_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { - return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel - : col_ids[parent_node_ids[node_id]]; - })); - auto const list_children_end = thrust::copy_if( - rmm::exec_policy_nosync(stream), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), - thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + - num_nodes, - thrust::make_counting_iterator(0), - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), - [d_ignore_vals = d_ignore_vals.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - column_categories = d_column_tree.node_categories.begin(), - col_ids = col_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; - return parent_node_id != parent_node_sentinel and - column_categories[col_ids[parent_node_id]] == NC_LIST and - (!d_ignore_vals[col_ids[parent_node_id]]); - }); - - auto const num_list_children = - list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), - parent_col_ids.begin(), - parent_col_ids.begin() + num_list_children, - node_ids.begin()); + auto parse_opt = parsing_options(options, stream); thrust::for_each_n( rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - num_list_children, - [node_ids = node_ids.begin(), - parent_node_ids = tree.parent_node_ids.begin(), - parent_col_ids = parent_col_ids.begin(), - row_offsets = row_offsets.begin(), - d_columns_data = d_columns_data.begin(), - num_list_children] __device__(size_type i) { - auto const node_id = node_ids[i]; - auto const parent_node_id = parent_node_ids[node_id]; - // scatter to list_offset - if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = - row_offsets[node_id]; - } - // last value of list child_offset is its size. - if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { - d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = - row_offsets[node_id] + 1; + thrust::counting_iterator(0), + num_nodes, + [options = parse_opt.view(), + data = input.data(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + is_all_nulls = is_all_nulls.begin()] __device__(size_type i) { + auto const node_category = column_categories[col_ids[i]]; + if (node_category == NC_STR or node_category == NC_VAL) { + auto const is_null_literal = serialized_trie_contains( + options.trie_na, + {data + range_begin[i], static_cast(range_end[i] - range_begin[i])}); + if (!is_null_literal) is_all_nulls[col_ids[i]] = false; } }); + return is_all_nulls; +} - // 5. scan on offsets. 
- for (auto& [id, col_ref] : columns) { - auto& col = col_ref.get(); - if (col.type == json_col_t::StringColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.string_offsets.begin(), - col.string_offsets.end(), - col.string_offsets.begin(), - thrust::maximum{}); - } else if (col.type == json_col_t::ListColumn) { - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), - col.child_offsets.begin(), - col.child_offsets.end(), - col.child_offsets.begin(), - thrust::maximum{}); - } - } - stream.synchronize(); +NodeIndexT get_row_array_parent_col_id(device_span col_ids, + bool is_enabled_lines, + rmm::cuda_stream_view stream) +{ + if (col_ids.empty()) { return parent_node_sentinel; } + + auto const list_node_index = is_enabled_lines ? 0 : 1; + auto const value = cudf::detail::make_host_vector_sync( + device_span{col_ids.data() + list_node_index, 1}, stream); + + return value[0]; } +/** + * @brief Holds member data pointers of `d_json_column` + * + */ +struct json_column_data { + using row_offset_t = json_column::row_offset_t; + row_offset_t* string_offsets; + row_offset_t* string_lengths; + row_offset_t* child_offsets; + bitmask_type* validity; +}; + +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); -namespace experimental { +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); std::map unified_schema(cudf::io::json_reader_options const& options) { @@ -836,19 +272,6 @@ std::map unified_schema(cudf::io::json_reader_optio options.get_dtypes()); } -std::pair, hashmap_of_device_columns> build_tree( - device_json_column& root, - host_span is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - /** * @brief Constructs `d_json_column` from node tree representation * Newly constructed columns are inserted into `root`'s children. @@ -1040,7 +463,7 @@ std::pair, hashmap_of_device_columns> build_tree std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); // prune all children of a column, but not self. - auto ignore_all_children = [&](auto parent_col_id) { + auto ignore_all_children = [&adj, &is_pruned](auto parent_col_id) { std::deque offspring; if (adj.count(parent_col_id)) { for (auto const& child : adj[parent_col_id]) { @@ -1391,14 +814,149 @@ std::pair, hashmap_of_device_columns> build_tree column_categories.cbegin(), expected_types.begin(), [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? 
cat : exp; }); - cudaMemcpyAsync(d_column_tree.node_categories.begin(), - expected_types.data(), - expected_types.size() * sizeof(column_categories[0]), - cudaMemcpyDefault, - stream.value()); + cudf::detail::cuda_memcpy_async(d_column_tree.node_categories, expected_types, stream); return {is_pruned, columns}; } -} // namespace experimental + +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) +{ + auto const num_nodes = col_ids.size(); + auto const num_columns = d_column_tree.node_categories.size(); + // move columns data to device. + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); + for (auto& [col_id, col_ref] : columns) { + if (col_id == parent_node_sentinel) continue; + auto& col = col_ref.get(); + columns_data[col_id] = json_column_data{col.string_offsets.data(), + col.string_lengths.data(), + col.child_offsets.data(), + static_cast(col.validity.data())}; + } + + auto d_ignore_vals = cudf::detail::make_device_uvector_async( + ignore_vals, stream, cudf::get_current_device_resource_ref()); + auto d_columns_data = cudf::detail::make_device_uvector_async( + columns_data, stream, cudf::get_current_device_resource_ref()); + + // 3. scatter string offsets to respective columns, set validity bits + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + num_nodes, + [column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin(), + row_offsets = row_offsets.begin(), + range_begin = tree.node_range_begin.begin(), + range_end = tree.node_range_end.begin(), + d_ignore_vals = d_ignore_vals.begin(), + d_columns_data = d_columns_data.begin()] __device__(size_type i) { + if (d_ignore_vals[col_ids[i]]) return; + auto const node_category = column_categories[col_ids[i]]; + switch (node_category) { + case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break; + case NC_STR: [[fallthrough]]; + case NC_VAL: + if (d_ignore_vals[col_ids[i]]) break; + set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); + d_columns_data[col_ids[i]].string_offsets[row_offsets[i]] = range_begin[i]; + d_columns_data[col_ids[i]].string_lengths[row_offsets[i]] = range_end[i] - range_begin[i]; + break; + default: break; + } + }); + + // 4. scatter List offset + // copy_if only node's whose parent is list, (node_id, parent_col_id) + // stable_sort by parent_col_id of {node_id}. + // For all unique parent_node_id of (i==0, i-1!=i), write start offset. + // (i==last, i+1!=i), write end offset. + // unique_copy_by_key {parent_node_id} {row_offset} to + // col[parent_col_id].child_offsets[row_offset[parent_node_id]] + + auto& parent_col_ids = sorted_col_ids; // reuse sorted_col_ids + auto parent_col_id = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [col_ids = col_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) { + return parent_node_ids[node_id] == parent_node_sentinel ? 
parent_node_sentinel + : col_ids[parent_node_ids[node_id]]; + })); + auto const list_children_end = thrust::copy_if( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), + thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + + num_nodes, + thrust::make_counting_iterator(0), + thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), + [d_ignore_vals = d_ignore_vals.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + column_categories = d_column_tree.node_categories.begin(), + col_ids = col_ids.begin()] __device__(size_type node_id) { + auto parent_node_id = parent_node_ids[node_id]; + return parent_node_id != parent_node_sentinel and + column_categories[col_ids[parent_node_id]] == NC_LIST and + (!d_ignore_vals[col_ids[parent_node_id]]); + }); + + auto const num_list_children = + list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin(), + parent_col_ids.begin() + num_list_children, + node_ids.begin()); + thrust::for_each_n( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + num_list_children, + [node_ids = node_ids.begin(), + parent_node_ids = tree.parent_node_ids.begin(), + parent_col_ids = parent_col_ids.begin(), + row_offsets = row_offsets.begin(), + d_columns_data = d_columns_data.begin(), + num_list_children] __device__(size_type i) { + auto const node_id = node_ids[i]; + auto const parent_node_id = parent_node_ids[node_id]; + // scatter to list_offset + if (i == 0 or parent_node_ids[node_ids[i - 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id]] = + row_offsets[node_id]; + } + // last value of list child_offset is its size. + if (i == num_list_children - 1 or parent_node_ids[node_ids[i + 1]] != parent_node_id) { + d_columns_data[parent_col_ids[i]].child_offsets[row_offsets[parent_node_id] + 1] = + row_offsets[node_id] + 1; + } + }); + + // 5. scan on offsets. + for (auto& [id, col_ref] : columns) { + auto& col = col_ref.get(); + if (col.type == json_col_t::StringColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.string_offsets.begin(), + col.string_offsets.end(), + col.string_offsets.begin(), + thrust::maximum{}); + } else if (col.type == json_col_t::ListColumn) { + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), + col.child_offsets.begin(), + col.child_offsets.end(), + col.child_offsets.begin(), + thrust::maximum{}); + } + } + stream.synchronize(); +} } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 912e93d52ae..7e4d975e431 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -485,16 +485,6 @@ std::pair, std::vector> device_json_co } } -template -auto make_device_json_column_dispatch(bool experimental, Args&&... 
args) -{ - if (experimental) { - return experimental::make_device_json_column(std::forward(args)...); - } else { - return make_device_json_column(std::forward(args)...); - } -} - table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -523,16 +513,14 @@ table_with_metadata device_parse_nested_json(device_span d_input, #endif bool const is_array_of_arrays = [&]() { - std::array h_node_categories = {NC_ERR, NC_ERR}; - auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), - gpu_tree.node_categories.data(), - sizeof(node_t) * size_to_copy, - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + if (size_to_copy == 0) return false; + auto const h_node_categories = cudf::detail::make_host_vector_sync( + device_span{gpu_tree.node_categories.data(), size_to_copy}, stream); + if (options.is_enabled_lines()) return h_node_categories[0] == NC_LIST; - return h_node_categories[0] == NC_LIST and h_node_categories[1] == NC_LIST; + return h_node_categories.size() >= 2 and h_node_categories[0] == NC_LIST and + h_node_categories[1] == NC_LIST; }(); auto [gpu_col_id, gpu_row_offsets] = @@ -553,16 +541,15 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column_dispatch(options.is_enabled_experimental(), - d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column(d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 2d435dc8e1a..34a87918e57 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include @@ -316,7 +316,7 @@ void normalize_single_quotes(datasource::owning_buffer& inda stream); rmm::device_buffer outbuf(indata.size() * 2, stream, mr); - rmm::device_scalar outbuf_size(stream, mr); + cudf::detail::device_scalar outbuf_size(stream, mr); parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), static_cast(outbuf.data()), @@ -401,7 +401,7 @@ std:: stream); rmm::device_uvector outbuf_indices(inbuf.size(), stream, mr); - rmm::device_scalar outbuf_indices_size(stream, mr); + cudf::detail::device_scalar outbuf_indices_size(stream, mr); parser.Transduce(inbuf.data(), static_cast(inbuf.size()), thrust::make_discard_iterator(), diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index d949635c1cc..e2fe926ea19 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -264,16 +264,13 @@ tree_meta_t get_tree_representation(device_span tokens, error_count > 0) { auto const error_location = thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); - SymbolOffsetT error_index; - CUDF_CUDA_TRY( - cudaMemcpyAsync(&error_index, - token_indices.data() + thrust::distance(tokens.begin(), error_location), - sizeof(SymbolOffsetT), - cudaMemcpyDefault, - 
stream.value())); - stream.synchronize(); + auto error_index = cudf::detail::make_host_vector_sync( + device_span{ + token_indices.data() + thrust::distance(tokens.begin(), error_location), 1}, + stream); + CUDF_FAIL("JSON Parser encountered an invalid format at location " + - std::to_string(error_index)); + std::to_string(error_index[0])); } auto const num_tokens = tokens.size(); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 3d9a51833e0..f6be4539d7f 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -405,21 +405,6 @@ void make_device_json_column(device_span input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); -namespace experimental { -/** - * @copydoc cudf::io::json::detail::make_device_json_column - */ -void make_device_json_column(device_span input, - tree_meta_t const& tree, - device_span col_ids, - device_span row_offsets, - device_json_column& root, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -} // namespace experimental - /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 76816071d8c..60e78f4763d 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -21,6 +21,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -34,7 +35,6 @@ #include #include -#include #include #include @@ -83,8 +83,7 @@ struct tree_node { void check_input_size(std::size_t input_size) { // Transduce() writes symbol offsets that may be as large input_size-1 - CUDF_EXPECTS(input_size == 0 || - (input_size - 1) <= std::numeric_limits::max(), + CUDF_EXPECTS(input_size == 0 || (input_size - 1) <= std::numeric_limits::max(), "Given JSON input is too large"); } } // namespace @@ -1447,11 +1446,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - rmm::device_scalar d_num_stack_ops(stream); - - // Sequence of stack symbols and their position in the original input (sparse representation) - rmm::device_uvector stack_ops{json_in.size(), stream}; - rmm::device_uvector stack_op_indices{json_in.size(), stream}; + cudf::detail::device_scalar d_num_stack_ops(stream); // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes constexpr auto max_translation_table_size = @@ -1469,11 +1464,26 @@ void get_stack_context(device_span json_in, // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end // of structs/lists + // Run FST to estimate the sizes of translated buffers + json_to_stack_ops_fst.Transduce(json_in.begin(), + static_cast(json_in.size()), + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), + d_num_stack_ops.data(), + to_stack_op::start_state, + stream); + + auto stack_ops_bufsize = d_num_stack_ops.value(stream); + // Sequence of stack symbols and their position in the original input (sparse representation) + rmm::device_uvector stack_ops{stack_ops_bufsize, stream}; + rmm::device_uvector stack_op_indices{stack_ops_bufsize, stream}; + + // Run bracket-brace FST to retrieve starting positions of structs and lists json_to_stack_ops_fst.Transduce(json_in.begin(), static_cast(json_in.size()), stack_ops.data(), 
stack_op_indices.data(), - d_num_stack_ops.data(), + thrust::make_discard_iterator(), to_stack_op::start_state, stream); @@ -1509,6 +1519,7 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; @@ -1520,7 +1531,7 @@ std::pair, rmm::device_uvector> pr stream); auto const mr = cudf::get_current_device_resource_ref(); - rmm::device_scalar d_num_selected_tokens(stream, mr); + cudf::detail::device_scalar d_num_selected_tokens(stream, mr); rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; @@ -1639,26 +1650,33 @@ std::pair, rmm::device_uvector> ge std::size_t constexpr max_tokens_per_struct = 6; auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; - rmm::device_scalar num_written_tokens{stream}; + cudf::detail::device_scalar num_written_tokens{stream}; // In case we're recovering on invalid JSON lines, post-processing the token stream requires to // see a JSON-line delimiter as the very first item SymbolOffsetT const delimiter_offset = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); - rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; - rmm::device_uvector tokens_indices{ - max_token_out_count + delimiter_offset, stream, mr}; + // Run FST to estimate the size of output buffers json_to_tokens_fst.Transduce(zip_in, static_cast(json_in.size()), - tokens.data() + delimiter_offset, - tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + thrust::make_discard_iterator(), num_written_tokens.data(), tokenizer_pda::start_state, stream); auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; - tokens.resize(num_total_tokens, stream); - tokens_indices.resize(num_total_tokens, stream); + rmm::device_uvector tokens{num_total_tokens, stream, mr}; + rmm::device_uvector tokens_indices{num_total_tokens, stream, mr}; + + // Run FST to translate the input JSON string into tokens and indices at which they occur + json_to_tokens_fst.Transduce(zip_in, + static_cast(json_in.size()), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, + thrust::make_discard_iterator(), + tokenizer_pda::start_state, + stream); if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 83c7b663980..d41d137a2c9 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -87,13 +88,25 @@ void validate_token_stream(device_span d_input, { CUDF_FUNC_RANGE(); if (!options.is_strict_validation()) { return; } + + rmm::device_uvector d_invalid = cudf::detail::make_zeroed_device_uvector_async( + tokens.size(), stream, cudf::get_current_device_resource_ref()); + using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto literals = options.get_na_values(); + literals.emplace_back("null"); // added these too to single trie + literals.emplace_back("true"); + 
literals.emplace_back("false"); + + cudf::detail::optional_trie trie_literals = + cudf::detail::create_serialized_trie(literals, stream); + cudf::detail::optional_trie trie_nonnumeric = cudf::detail::create_serialized_trie( + {"NaN", "Infinity", "+INF", "+Infinity", "-INF", "-Infinity"}, stream); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), - trie_na = trie_na_view, + trie_literals = cudf::detail::make_trie_view(trie_literals), + trie_nonnumeric = cudf::detail::make_trie_view(trie_nonnumeric), allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -101,24 +114,15 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || - substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { + auto const is_literal = serialized_trie_contains(trie_literals, {data + start, end - start}); + if (is_literal) { return true; } + if (allow_nonnumeric) { + auto const is_nonnumeric = + serialized_trie_contains(trie_nonnumeric, {data + start, end - start}); + if (is_nonnumeric) { return true; } + } + auto c = data[start]; + if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::START; for (auto at = start; at < end; at++) { @@ -140,9 +144,6 @@ void validate_token_stream(device_span d_input, num_state = number_state::LEADING_ZERO; } else if (c >= '1' && c <= '9') { num_state = number_state::WHOLE; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -273,33 +274,44 @@ void validate_token_stream(device_span d_input, auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); - auto predicate = [tokens = tokens.begin(), - token_indices = token_indices.begin(), - validate_values, - validate_strings] __device__(auto i) -> bool { + auto predicate = cuda::proclaim_return_type([tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { return !validate_values(token_indices[i - 1], token_indices[i]); } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; - }; + }); + + auto conditional_invalidout_it = + cudf::detail::make_tabulate_output_iterator(cuda::proclaim_return_type( + [d_invalid = d_invalid.begin()] __device__(size_type i, bool x) -> void { + if (x) { d_invalid[i] = true; } + })); + 
thrust::transform(rmm::exec_policy_nosync(stream), + count_it, + count_it + num_tokens, + conditional_invalidout_it, + predicate); using scan_type = write_if::scan_type; auto conditional_write = write_if{tokens.begin(), num_tokens}; auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); - auto transform_op = cuda::proclaim_return_type( - [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { - if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; - return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; - }); - auto binary_op = cuda::proclaim_return_type( + auto binary_op = cuda::proclaim_return_type( [] __device__(scan_type prev, scan_type curr) -> scan_type { auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); - return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + return {(curr.second ? curr.first : op_result), prev.second | curr.second}; + }); + auto transform_op = cuda::proclaim_return_type( + [d_invalid = d_invalid.begin(), tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (d_invalid[i]) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; }); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), + thrust::transform_inclusive_scan(rmm::exec_policy_nosync(stream), count_it, count_it + num_tokens, conditional_output_it, diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 99a5b17bce8..8a740ae17ef 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -315,13 +315,12 @@ device_span ingest_raw_input(device_span buffer, // Reading to host because decompression of a single block is much faster on the CPU sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data()); auto uncomp_data = decompress(compression, hbuffer); - CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(), - reinterpret_cast(uncomp_data.data()), - uncomp_data.size() * sizeof(char), - cudaMemcpyHostToDevice, - stream.value())); - stream.synchronize(); - return buffer.first(uncomp_data.size()); + auto ret_buffer = buffer.first(uncomp_data.size()); + cudf::detail::cuda_memcpy( + ret_buffer, + host_span{reinterpret_cast(uncomp_data.data()), uncomp_data.size()}, + stream); + return ret_buffer; } table_with_metadata read_json(host_span> sources, @@ -351,10 +350,16 @@ table_with_metadata read_json(host_span> sources, * JSON inputs. */ std::size_t const total_source_size = sources_size(sources, 0, 0); - std::size_t chunk_offset = reader_opts.get_byte_range_offset(); - std::size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? total_source_size - chunk_offset - : std::min(chunk_size, total_source_size - chunk_offset); + + // Batching is enabled only for JSONL inputs, not regular JSON files + CUDF_EXPECTS( + reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits::max(), + "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported"); + + std::size_t chunk_offset = reader_opts.get_byte_range_offset(); + std::size_t chunk_size = reader_opts.get_byte_range_size(); + chunk_size = !chunk_size ? 
total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); std::size_t const batch_size_upper_bound = get_batch_size_upper_bound(); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index dc7199d7ab1..e1241f8f90c 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,6 +170,9 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + if (column_v.is_empty()) { // empty begets empty + return make_empty_column(type_id::STRING); + } auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 5be75350951..0cb5c382631 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -77,20 +77,6 @@ void rowgroup_char_counts(device_2dspan counts, counts, orc_columns, rowgroup_bounds, str_col_indexes); } -template -CUDF_KERNEL void __launch_bounds__(block_size) - initialize_dictionary_hash_maps_kernel(device_span dictionaries) -{ - auto const dict_map = dictionaries[blockIdx.x].map_slots; - auto const t = threadIdx.x; - for (size_type i = 0; i < dict_map.size(); i += block_size) { - if (t + i < dict_map.size()) { - new (&dict_map[t + i].first) map_type::atomic_key_type{KEY_SENTINEL}; - new (&dict_map[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL}; - } - } -} - struct equality_functor { column_device_view const& col; __device__ bool operator()(size_type lhs_idx, size_type rhs_idx) const @@ -109,6 +95,9 @@ struct hash_functor { } }; +// Probing scheme to use for the hash map +using probing_scheme_type = cuco::linear_probing; + template CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, @@ -121,26 +110,34 @@ CUDF_KERNEL void __launch_bounds__(block_size) auto const& col = columns[dict.column_idx]; // Make a view of the hash map - auto hash_map_mutable = map_type::device_mutable_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); auto const hash_fn = hash_functor{col}; auto const equality_fn = equality_functor{col}; + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
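// Illustrative annotation (not from this PR): two details of the rewritten
// kernel below are easy to miss. First, the ref built here is a non-owning
// view over externally allocated `window_type` slots, scoped to
// `cuco::thread_scope_block` because a single thread block owns each
// (column, stripe) map; concurrently inserting blocks would need
// `thread_scope_device`. Second, the row loop uses a uniform trip count so
// every thread survives to the block-wide reduction that follows. A minimal
// sketch of that loop shape, with hypothetical names:
//
//   for (auto i = thread_index_type{threadIdx.x}; i - threadIdx.x < n;
//        i += blockDim.x) {
//     bool const active = i < n;       // tail threads simply do no work
//     if (active) { /* insert element i */ }
//   }  // all threads reach any __syncthreads()/block reduce after the loop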
+ auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::insert` operator + auto has_map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; size_type entry_count{0}; size_type char_count{0}; + // all threads should loop the same number of times for (thread_index_type cur_row = start_row + t; cur_row - t < end_row; cur_row += block_size) { auto const is_valid = cur_row < end_row and col.is_valid(cur_row); if (is_valid) { // insert element at cur_row to hash map and count successful insertions - auto const is_unique = - hash_map_mutable.insert(std::pair(cur_row, cur_row), hash_fn, equality_fn); + auto const is_unique = has_map_insert_ref.insert(cuco::pair{cur_row, cur_row}); if (is_unique) { ++entry_count; @@ -175,24 +172,23 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (not dict.is_enabled) { return; } auto const t = threadIdx.x; - auto map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - __shared__ cuda::atomic counter; using cuda::std::memory_order_relaxed; if (t == 0) { new (&counter) cuda::atomic{0}; } __syncthreads(); + for (size_type i = 0; i < dict.map_slots.size(); i += block_size) { if (t + i < dict.map_slots.size()) { - auto* slot = reinterpret_cast(map.begin_slot() + t + i); - auto key = slot->first; - if (key != KEY_SENTINEL) { - auto loc = counter.fetch_add(1, memory_order_relaxed); - dict.data[loc] = key; - slot->second = loc; + auto window = dict.map_slots.begin() + t + i; + // Collect all slots from each window. + for (auto& slot : *window) { + auto const key = slot.first; + if (key != KEY_SENTINEL) { + auto loc = counter.fetch_add(1, memory_order_relaxed); + dict.data[loc] = key; + slot.second = loc; + } } } } @@ -205,47 +201,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) { auto const col_idx = blockIdx.x; auto const stripe_idx = blockIdx.y; + auto const t = threadIdx.x; auto const& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; if (not dict.is_enabled) { return; } - auto const t = threadIdx.x; + // Make a view of the hash map + auto const hash_fn = hash_functor{col}; + auto const equality_fn = equality_functor{col}; + + storage_ref_type const storage_ref{dict.map_slots.size(), dict.map_slots.data()}; + // Make a view of the hash map. 
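// Illustrative annotation (not from this PR): same slot storage, different
// operator set. Where the populate kernel rebound its ref with
// `cuco::insert`, this lookup kernel rebinds with `cuco::find`. The lookup
// shape, with hypothetical names:
//
//   auto find_ref = map_ref.rebind_operators(cuco::find);
//   auto const it = find_ref.find(key);
//   if (it != find_ref.end()) { out = it->second; }
//
// Unlike the removed `map_type::device_view` code, no atomic reinterpret is
// needed when reading `it->second`: the map is fully populated before this
// kernel launches and is never mutated during lookups.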
+ auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL}, + cuco::empty_value{VALUE_SENTINEL}, + equality_fn, + probing_scheme_type{hash_fn}, + cuco::thread_scope_block, + storage_ref}; + + // Create a map ref with `cuco::find` operator + auto hash_map_find_ref = hash_map_ref.rebind_operators(cuco::find); + auto const start_row = dict.start_row; auto const end_row = dict.start_row + dict.num_rows; - auto const map = map_type::device_view(dict.map_slots.data(), - dict.map_slots.size(), - cuco::empty_key{KEY_SENTINEL}, - cuco::empty_value{VALUE_SENTINEL}); - - thread_index_type cur_row = start_row + t; - while (cur_row < end_row) { + for (thread_index_type cur_row = start_row + t; cur_row < end_row; cur_row += block_size) { if (col.is_valid(cur_row)) { - auto const hash_fn = hash_functor{col}; - auto const equality_fn = equality_functor{col}; - auto const found_slot = map.find(cur_row, hash_fn, equality_fn); - cudf_assert(found_slot != map.end() && + auto const found_slot = hash_map_find_ref.find(cur_row); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != hash_map_find_ref.end() && "Unable to find value in map in dictionary index construction"); - if (found_slot != map.end()) { - // No need for atomic as this is not going to be modified by any other thread - auto const val_ptr = reinterpret_cast<size_type*>(&found_slot->second); - dict.index[cur_row] = *val_ptr; - } + dict.index[cur_row] = found_slot->second; } - cur_row += block_size; } } -void initialize_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries, - rmm::cuda_stream_view stream) -{ - if (dictionaries.count() == 0) { return; } - constexpr int block_size = 1024; - initialize_dictionary_hash_maps_kernel<block_size> - <<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries.flat_view()); -} - void populate_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries, device_span<orc_column_device_view const> columns, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 790532c9d54..5ab36fdae8e 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -258,7 +258,7 @@ class ProtobufReader { private: template <int index> - friend class FunctionSwitchImpl; + friend struct FunctionSwitchImpl; void skip_bytes(size_t bytecnt) { diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index 8c7ccf0527f..0949fafe9a4 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -21,6 +21,7 @@ #include "io/utilities/column_buffer.hpp" #include "orc.hpp" +#include #include #include #include @@ -40,19 +41,27 @@ namespace gpu { using cudf::detail::device_2dspan; using cudf::detail::host_2dspan; +using key_type = size_type; +using mapped_type = size_type; +using slot_type = cuco::pair<key_type, mapped_type>; +auto constexpr map_cg_size = + 1; ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset. + ///< Note: Adjust insert and find loops to use `cg::tile` if increasing this. +auto constexpr window_size = + 1; ///< Number of concurrent slots (set for best performance) handled by each thread. +auto constexpr occupancy_factor = 1.43f; ///< cuCollections suggests using a hash map of size + ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
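+// For example: a stripe with 1'000'000 string rows gets a map of
+// 1'000'000 * 1.43 = 1'430'000 slots, so it stays at most ~70% full even when
+// every row is distinct.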
+using storage_type = cuco::aow_storage<slot_type, window_size, cuco::extent<std::size_t>, + cudf::detail::cuco_allocator<char>>; +using storage_ref_type = typename storage_type::ref_type; +using window_type = typename storage_type::window_type; +using slot_type = cuco::pair<key_type, mapped_type>; + auto constexpr KEY_SENTINEL = size_type{-1}; auto constexpr VALUE_SENTINEL = size_type{-1}; -using map_type = cuco::legacy::static_map<size_type, size_type>; - -/** - * @brief The alias of `map_type::pair_atomic_type` class. - * - * Declare this struct by trivial subclassing instead of type aliasing so we can have forward - * declaration of this struct somewhere else. - */ -struct slot_type : public map_type::slot_type {}; - struct CompressedStreamInfo { CompressedStreamInfo() = default; explicit constexpr CompressedStreamInfo(uint8_t const* compressed_data_, size_t compressed_size_) @@ -184,11 +193,11 @@ struct StripeStream { */ struct stripe_dictionary { // input - device_span<slot_type> map_slots; // hash map storage - uint32_t column_idx = 0; // column index - size_type start_row = 0; // first row in the stripe - size_type start_rowgroup = 0; // first rowgroup in the stripe - size_type num_rows = 0; // number of rows in the stripe + device_span<window_type> map_slots; // hash map (windows) storage + uint32_t column_idx = 0; // column index + size_type start_row = 0; // first row in the stripe + size_type start_rowgroup = 0; // first rowgroup in the stripe + size_type num_rows = 0; // number of rows in the stripe // output device_span<uint32_t> data; // index of elements in the column to include in the dictionary diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 01ee5ad177d..fcaee9c548e 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -500,6 +500,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) auto const [read_begin, read_end] = merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range); + bool stream_synchronized{false}; + for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) { auto const& read_info = _file_itm_data.data_read_info[read_idx]; auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source; @@ -507,10 +509,17 @@ void reader_impl::load_next_stripe_data(read_mode mode) lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data()); if (source_ptr->is_device_read_preferred(read_info.length)) { - device_read_tasks.push_back( - std::pair(source_ptr->device_read_async( - read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), - read_info.length)); + // `device_read_async` may not use _stream at all. + // Instead, it may use some other stream(s) to sync the H->D memcpy. + // As such, we need to make sure the device buffers in `lvl_stripe_data` are ready first.
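+ // A single synchronization is enough: the buffers in `lvl_stripe_data` were
+ // prepared on _stream before this loop, so draining _stream once makes all of
+ // them safe targets for reads issued on other streams.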
+ if (!stream_synchronized) { + _stream.synchronize(); + stream_synchronized = true; + } + device_read_tasks.emplace_back( + source_ptr->device_read_async( + read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), + read_info.length); } else { auto buffer = source_ptr->host_read(read_info.offset, read_info.length); @@ -659,8 +668,8 @@ void reader_impl::load_next_stripe_data(read_mode mode) if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { auto const& decompressor = *_metadata.per_file_metadata[0].decompressor; - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto compinfo = cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) { auto const& info = stream_info[stream_idx]; auto const dst_base = diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index d628e936cb1..c42348a165f 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -22,6 +22,7 @@ #include "io/utilities/hostdevice_span.hpp" #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include -#include #include #include @@ -451,7 +451,7 @@ void decode_stream_data(int64_t num_dicts, update_null_mask(chunks, out_buffers, stream, mr); } - rmm::device_scalar error_count(0, stream); + cudf::detail::device_scalar error_count(0, stream); gpu::DecodeOrcColumnData(chunks.base_device_ptr(), global_dict.data(), row_groups, @@ -508,21 +508,20 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( prefix_sums_to_update, stream, cudf::get_current_device_resource_ref()); - thrust::for_each( - rmm::exec_policy_nosync(stream), - d_prefix_sums_to_update.begin(), - d_prefix_sums_to_update.end(), - [num_stripes, chunks = cudf::detail::device_2dspan{chunks}] __device__( - auto const& idx_psums) { - auto const col_idx = idx_psums.first; - auto const psums = idx_psums.second; - thrust::transform(thrust::seq, - thrust::make_counting_iterator(0ul), - thrust::make_counting_iterator(num_stripes), - psums, - [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); - thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); - }); + thrust::for_each(rmm::exec_policy_nosync(stream), + d_prefix_sums_to_update.begin(), + d_prefix_sums_to_update.end(), + [num_stripes, chunks = chunks.device_view()] __device__(auto const& idx_psums) { + auto const col_idx = idx_psums.first; + auto const psums = idx_psums.second; + thrust::transform( + thrust::seq, + thrust::make_counting_iterator(0ul), + thrust::make_counting_iterator(num_stripes), + psums, + [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; }); + thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums); + }); // `prefix_sums_to_update` goes out of scope, copy has to be done before we return stream.synchronize(); } @@ -554,12 +553,12 @@ void aggregate_child_meta(std::size_t level, col_meta.num_child_rows_per_stripe.resize(number_of_child_chunks); col_meta.rwgrp_meta.resize(num_of_rowgroups * num_child_cols); - auto child_start_row = cudf::detail::host_2dspan( - col_meta.child_start_row.data(), num_of_stripes, num_child_cols); - auto num_child_rows_per_stripe = cudf::detail::host_2dspan( - col_meta.num_child_rows_per_stripe.data(), num_of_stripes, 
num_child_cols); + auto child_start_row = + cudf::detail::host_2dspan(col_meta.child_start_row, num_child_cols); + auto num_child_rows_per_stripe = + cudf::detail::host_2dspan(col_meta.num_child_rows_per_stripe, num_child_cols); auto rwgrp_meta = cudf::detail::host_2dspan( - col_meta.rwgrp_meta.data(), num_of_rowgroups, num_child_cols); + col_meta.rwgrp_meta, num_child_cols); int index = 0; // number of child column processed @@ -951,8 +950,9 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode) // Setup row group descriptors if using indexes. if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) { - auto compinfo = cudf::detail::hostdevice_span( - hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size()); + auto const compinfo = + cudf::detail::hostdevice_span{hd_compinfo}.subspan( + 0, stream_range.size()); auto decomp_data = decompress_stripe_data(load_stripe_range, stream_range, stripe_count, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 5c70e35fd2e..ed0b6969154 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -20,6 +20,8 @@ #include "orc_gpu.hpp" #include +#include +#include #include #include #include @@ -1087,37 +1089,42 @@ CUDF_KERNEL void __launch_bounds__(block_size) /** * @brief Merge chunked column data into a single contiguous stream * - * @param[in,out] strm_desc StripeStream device array [stripe][stream] - * @param[in,out] streams List of encoder chunk streams [column][rowgroup] + * @param[in] strm_desc StripeStream device array [stripe][stream] + * @param[in] streams List of encoder chunk streams [column][rowgroup] + * @param[out] srcs List of source encoder chunk stream data addresses + * @param[out] dsts List of destination StripeStream data addresses + * @param[out] sizes List of stream sizes in bytes */ // blockDim {compact_streams_block_size,1,1} CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) - gpuCompactOrcDataStreams(device_2dspan strm_desc, - device_2dspan streams) + gpuInitBatchedMemcpy(device_2dspan strm_desc, + device_2dspan streams, + device_span srcs, + device_span dsts, + device_span sizes) { - __shared__ __align__(16) StripeStream ss; - - auto const stripe_id = blockIdx.x; + auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); auto const stream_id = blockIdx.y; - auto const t = threadIdx.x; + if (stripe_id >= strm_desc.size().first) { return; } - if (t == 0) { ss = strm_desc[stripe_id][stream_id]; } - __syncthreads(); + auto const out_id = stream_id * strm_desc.size().first + stripe_id; + StripeStream ss = strm_desc[stripe_id][stream_id]; if (ss.data_ptr == nullptr) { return; } auto const cid = ss.stream_type; auto dst_ptr = ss.data_ptr; for (auto group = ss.first_chunk_id; group < ss.first_chunk_id + ss.num_chunks; ++group) { + auto const out_id = stream_id * streams.size().second + group; + srcs[out_id] = streams[ss.column_id][group].data_ptrs[cid]; + dsts[out_id] = dst_ptr; + + // Also update the stream here, data will be copied in a separate kernel + streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; + auto const len = streams[ss.column_id][group].lengths[cid]; - if (len > 0) { - auto const src_ptr = streams[ss.column_id][group].data_ptrs[cid]; - for (uint32_t i = t; i < len; i += blockDim.x) { - dst_ptr[i] = src_ptr[i]; - } - __syncthreads(); - } - if (t == 0) { streams[ss.column_id][group].data_ptrs[cid] = dst_ptr; } + // len is the size (in bytes) of the current stream. 
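+ // Zero-length streams need no special handling: they become zero-byte entries
+ // in the batch, and a zero-byte copy is a no-op (hence the old `if (len > 0)`
+ // guard is gone).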
+ sizes[out_id] = len; dst_ptr += len; } } @@ -1325,9 +1332,26 @@ void CompactOrcDataStreams(device_2dspan strm_desc, device_2dspan enc_streams, rmm::cuda_stream_view stream) { + auto const num_rowgroups = enc_streams.size().second; + auto const num_streams = strm_desc.size().second; + auto const num_stripes = strm_desc.size().first; + auto const num_chunks = num_rowgroups * num_streams; + auto srcs = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto dsts = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + auto lengths = cudf::detail::make_zeroed_device_uvector_async( + num_chunks, stream, rmm::mr::get_current_device_resource()); + dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuCompactOrcDataStreams<<>>(strm_desc, enc_streams); + dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), + strm_desc.size().second); + gpuInitBatchedMemcpy<<>>( + strm_desc, enc_streams, srcs, dsts, lengths); + + // Copy streams in a batched manner. + cudf::detail::batched_memcpy_async( + srcs.begin(), dsts.begin(), lengths.begin(), lengths.size(), stream); } std::optional CompressOrcDataStreams( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 60a64fb0ee6..d432deb8e79 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -19,7 +19,9 @@ * @brief cuDF-IO ORC writer class implementation */ +#include "cudf/detail/utilities/cuda_memcpy.hpp" #include "io/comp/nvcomp_adapter.hpp" +#include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" #include "writer_impl.hpp" @@ -717,8 +719,8 @@ std::vector> calculate_aligned_rowgroup_bounds( auto d_pd_set_counts_data = rmm::device_uvector( orc_table.num_columns() * segmentation.num_rowgroups(), stream); - auto const d_pd_set_counts = device_2dspan{ - d_pd_set_counts_data.data(), segmentation.num_rowgroups(), orc_table.num_columns()}; + auto const d_pd_set_counts = + device_2dspan{d_pd_set_counts_data, orc_table.num_columns()}; gpu::reduce_pushdown_masks(orc_table.d_columns, segmentation.rowgroups, d_pd_set_counts, stream); auto aligned_rgs = hostdevice_2dvector( @@ -739,7 +741,7 @@ std::vector> calculate_aligned_rowgroup_bounds( [columns = device_span{orc_table.d_columns}, stripes = device_span{d_stripes}, d_pd_set_counts, - out_rowgroups = device_2dspan{aligned_rgs}] __device__(auto& idx) { + out_rowgroups = aligned_rgs.device_view()] __device__(auto& idx) { uint32_t const col_idx = idx / stripes.size(); // No alignment needed for root columns if (not columns[col_idx].parent_index.has_value()) return; @@ -911,7 +913,7 @@ encoded_data encode_columns(orc_table_view const& orc_table, rmm::exec_policy(stream), thrust::make_counting_iterator(0ul), chunks.count(), - [chunks = device_2dspan{chunks}, + [chunks = chunks.device_view(), cols = device_span{orc_table.d_columns}] __device__(auto& idx) { auto const col_idx = idx / chunks.size().second; auto const rg_idx = idx % chunks.size().second; @@ -1407,7 +1409,8 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, num_entries_seen += stripes_per_col; } - std::vector file_stats_merge(num_file_blobs); + auto file_stats_merge = + cudf::detail::make_host_vector(num_file_blobs, stream); for (auto i = 0u; i < num_file_blobs; ++i) { auto col_stats = &file_stats_merge[i]; 
col_stats->col_dtype = per_chunk_stats.col_types[i]; @@ -1417,11 +1420,10 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, } auto d_file_stats_merge = stats_merge.device_ptr(num_stripe_blobs); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_file_stats_merge, - file_stats_merge.data(), - num_file_blobs * sizeof(statistics_merge_group), - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{stats_merge.device_ptr(num_stripe_blobs), num_file_blobs}, + file_stats_merge, + stream); auto file_stat_chunks = stat_chunks.data() + num_stripe_blobs; detail::merge_group_statistics( @@ -1572,7 +1574,7 @@ void write_index_stream(int32_t stripe_id, * @param[in] strm_desc Stream's descriptor * @param[in] enc_stream Chunk's streams * @param[in] compressed_data Compressed stream data - * @param[in,out] stream_out Temporary host output buffer + * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams * @param[in] compression_kind The compression kind @@ -1583,7 +1585,7 @@ void write_index_stream(int32_t stripe_id, std::future write_data_stream(gpu::StripeStream const& strm_desc, gpu::encoder_chunk_streams const& enc_stream, uint8_t const* compressed_data, - uint8_t* stream_out, + host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, CompressionKind compression_kind, @@ -1603,11 +1605,10 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, if (out_sink->is_device_write_preferred(length)) { return out_sink->device_write_async(stream_in, length, stream); } else { - CUDF_CUDA_TRY( - cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + cudf::detail::cuda_memcpy( + bounce_buffer.subspan(0, length), device_span{stream_in, length}, stream); - out_sink->host_write(stream_out, length); + out_sink->host_write(bounce_buffer.data(), length); return std::async(std::launch::deferred, [] {}); } }(); @@ -1897,7 +1898,7 @@ hostdevice_2dvector calculate_rowgroup_bounds(orc_table_view cons thrust::make_counting_iterator(0ul), num_rowgroups, [cols = device_span{orc_table.d_columns}, - rg_bounds = device_2dspan{rowgroup_bounds}, + rg_bounds = rowgroup_bounds.device_view(), rowgroup_size] __device__(auto rg_idx) mutable { thrust::transform( thrust::seq, cols.begin(), cols.end(), rg_bounds[rg_idx].begin(), [&](auto const& col) { @@ -1987,8 +1988,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table, d_tmp_rowgroup_sizes.end(), [src = esizes.data(), col_idx = col_idx, - rg_bounds = device_2dspan{ - segmentation.rowgroups}] __device__(auto idx) { + rg_bounds = segmentation.rowgroups.device_view()] __device__(auto idx) { return src[rg_bounds[idx][col_idx].end - 1]; }); @@ -2050,7 +2050,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table, auto const num_str_cols = orc_table.num_string_columns(); auto counts = rmm::device_uvector(num_str_cols * num_rowgroups, stream); - auto counts_2d_view = device_2dspan(counts.data(), num_str_cols, num_rowgroups); + auto counts_2d_view = device_2dspan(counts, num_rowgroups); gpu::rowgroup_char_counts(counts_2d_view, orc_table.d_columns, rowgroup_bounds, @@ -2110,7 +2110,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, bool sort_dictionaries, rmm::cuda_stream_view stream) { - std::vector>> hash_maps_storage( + // Variable to keep track of the current total map storage size + size_t total_map_storage_size 
= 0; + std::vector> hash_maps_storage_offsets( orc_table.string_column_indices.size()); for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2119,14 +2121,21 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, stripe.size == 0 ? 0 : segmentation.rowgroups[stripe.first + stripe.size - 1][col_idx].end - segmentation.rowgroups[stripe.first][col_idx].begin; - hash_maps_storage[str_column.str_index()].emplace_back(stripe_num_rows * 1.43, stream); + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); + total_map_storage_size += stripe_num_rows * gpu::occupancy_factor; } + hash_maps_storage_offsets[str_column.str_index()].emplace_back(total_map_storage_size); } hostdevice_2dvector stripe_dicts( orc_table.num_string_columns(), segmentation.num_stripes(), stream); if (stripe_dicts.count() == 0) return {std::move(stripe_dicts), {}, {}}; + // Create a single bulk storage to use for all sub-dictionaries + auto map_storage = std::make_unique( + total_map_storage_size, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}); + // Initialize stripe dictionaries for (auto col_idx : orc_table.string_column_indices) { auto& str_column = orc_table.column(col_idx); @@ -2137,7 +2146,9 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, auto const stripe_idx = stripe.id; auto& sd = stripe_dicts[str_col_idx][stripe_idx]; - sd.map_slots = hash_maps_storage[str_col_idx][stripe_idx]; + sd.map_slots = {map_storage->data() + hash_maps_storage_offsets[str_col_idx][stripe_idx], + hash_maps_storage_offsets[str_col_idx][stripe_idx + 1] - + hash_maps_storage_offsets[str_col_idx][stripe_idx]}; sd.column_idx = col_idx; sd.start_row = segmentation.rowgroups[stripe.first][col_idx].begin; sd.start_rowgroup = stripe.first; @@ -2150,7 +2161,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, } stripe_dicts.host_to_device_async(stream); - gpu::initialize_dictionary_hash_maps(stripe_dicts, stream); + map_storage->initialize_async({gpu::KEY_SENTINEL, gpu::VALUE_SENTINEL}, {stream.value()}); gpu::populate_dictionary_hash_maps(stripe_dicts, orc_table.d_columns, stream); // Copy the entry counts and char counts from the device to the host stripe_dicts.device_to_host_sync(stream); @@ -2184,8 +2195,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, col_use_dictionary = true; } else { // Clear hash map storage as dictionary encoding is not used for this stripe - hash_maps_storage[str_col_idx][stripe_idx] = rmm::device_uvector(0, stream); - sd.map_slots = {}; + sd.map_slots = {}; } } // If any stripe uses dictionary encoding, allocate index storage for the whole column @@ -2203,7 +2213,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, gpu::get_dictionary_indices(stripe_dicts, orc_table.d_columns, stream); // deallocate hash map storage, unused after this point - hash_maps_storage.clear(); + map_storage.reset(); // Clear map slots and attach order buffers auto dictionaries_flat = stripe_dicts.host_view().flat_view(); @@ -2606,7 +2616,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, strm_desc, enc_data.streams[strm_desc.column_id][segmentation.stripes[stripe_id].first], compressed_data.data(), - bounce_buffer.data(), + bounce_buffer, &stripe, &streams, _compression_kind, diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 17ccb73c0a8..b85ebf2fa1a 100644 --- 
a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -84,7 +84,7 @@ struct map_insert_fn { storage_ref}; // Create a map ref with `cuco::insert` operator - auto map_insert_ref = hash_map_ref.with_operators(cuco::insert); + auto map_insert_ref = hash_map_ref.rebind_operators(cuco::insert); auto const t = threadIdx.x; // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size @@ -186,7 +186,7 @@ struct map_find_fn { storage_ref}; // Create a map ref with `cuco::find` operator - auto const map_find_ref = hash_map_ref.with_operators(cuco::find); + auto const map_find_ref = hash_map_ref.rebind_operators(cuco::find); auto const t = threadIdx.x; // Note: Adjust the following loop to use `cg::tiles` if needed in the future. @@ -194,17 +194,12 @@ struct map_find_fn { val_idx += block_size) { // Find the key using a single thread for best performance for now. if (data_col.is_valid(val_idx)) { + auto const found_slot = map_find_ref.find(val_idx); + // Fail if we didn't find the previously inserted key. + cudf_assert(found_slot != map_find_ref.end() && + "Unable to find value in map in dictionary index construction"); // No need for atomic as this is not going to be modified by any other thread. - chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() { - auto const found_slot = map_find_ref.find(val_idx); - - // Fail if we didn't find the previously inserted key. - cudf_assert(found_slot != map_find_ref.end() && - "Unable to find value in map in dictionary index construction"); - - // Return the found value. - return found_slot->second; - }(); + chunk->dict_index[val_idx - s_ck_start_val_idx] = found_slot->second; } } } else { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b978799b8bc..d276e946a51 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -228,7 +228,8 @@ class parquet_field_string : public parquet_field { * @return True if field types mismatch or if the process of reading a * string fails */ -struct parquet_field_string_list : public parquet_field_list<std::string, FieldType::BINARY> { +class parquet_field_string_list : public parquet_field_list<std::string, FieldType::BINARY> { + public: parquet_field_string_list(int f, std::vector<std::string>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -308,10 +309,10 @@ class parquet_field_struct : public parquet_field { template <typename E, typename T> class parquet_field_union_struct : public parquet_field { E& enum_val; - cuda::std::optional<T>& val; // union structs are always wrapped in std::optional + std::optional<T>& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, cuda::std::optional<T>& v) + parquet_field_union_struct(int f, E& ev, std::optional<T>& v) : parquet_field(f), enum_val(ev), val(v) { } @@ -396,8 +397,9 @@ class parquet_field_binary : public parquet_field { * @return True if field types mismatch or if the process of reading a * binary fails */ -struct parquet_field_binary_list + class parquet_field_binary_list : public parquet_field_list<std::vector<uint8_t>, FieldType::BINARY> { + public: parquet_field_binary_list(int f, std::vector<std::vector<uint8_t>>& v) : parquet_field_list(f, v) { auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) { @@ -437,10 +439,10 @@ class parquet_field_struct_blob : public parquet_field { */ template class parquet_field_optional : public parquet_field { - cuda::std::optional<T>& val; + std::optional<T>& val; public: -
parquet_field_optional(int f, cuda::std::optional& v) : parquet_field(f), val(v) {} + parquet_field_optional(int f, std::optional& v) : parquet_field(f), val(v) {} inline void operator()(CompactProtocolReader* cpr, int field_type) { diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 8a866141c4b..4522ea7fe56 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -24,6 +24,59 @@ namespace cudf::io::parquet::detail { namespace { +// Unlike cub's algorithm, this provides warp-wide and block-wide results simultaneously. +// Also, this provides the ability to compute warp_bits & lane_mask manually, which we need for +// lists. +struct block_scan_results { + uint32_t warp_bits; + int thread_count_within_warp; + int warp_count; + + int thread_count_within_block; + int block_count; +}; + +template +static __device__ void scan_block_exclusive_sum(int thread_bit, block_scan_results& results) +{ + int const t = threadIdx.x; + int const warp_index = t / cudf::detail::warp_size; + int const warp_lane = t % cudf::detail::warp_size; + uint32_t const lane_mask = (uint32_t(1) << warp_lane) - 1; + + uint32_t warp_bits = ballot(thread_bit); + scan_block_exclusive_sum(warp_bits, warp_lane, warp_index, lane_mask, results); +} + +template +__device__ static void scan_block_exclusive_sum(uint32_t warp_bits, + int warp_lane, + int warp_index, + uint32_t lane_mask, + block_scan_results& results) +{ + // Compute # warps + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + + // Compute the warp-wide results + results.warp_bits = warp_bits; + results.warp_count = __popc(results.warp_bits); + results.thread_count_within_warp = __popc(results.warp_bits & lane_mask); + + // Share the warp counts amongst the block threads + __shared__ int warp_counts[num_warps]; + if (warp_lane == 0) { warp_counts[warp_index] = results.warp_count; } + __syncthreads(); + + // Compute block-wide results + results.block_count = 0; + results.thread_count_within_block = results.thread_count_within_warp; + for (int warp_idx = 0; warp_idx < num_warps; ++warp_idx) { + results.block_count += warp_counts[warp_idx]; + if (warp_idx < warp_index) { results.thread_count_within_block += warp_counts[warp_idx]; } + } +} + template __device__ inline void gpuDecodeFixedWidthValues( page_state_s* s, state_buf* const sb, int start, int end, int t) @@ -194,7 +247,7 @@ struct decode_fixed_width_split_values_func { } }; -template +template static __device__ int gpuUpdateValidityAndRowIndicesNested( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -211,29 +264,28 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( int const row_index_lower_bound = s->row_index_lower_bound; - int const max_depth = s->col.max_nesting_depth - 1; + int const max_depth = s->col.max_nesting_depth - 1; + auto& max_depth_ni = s->nesting_info[max_depth]; + int max_depth_valid_count = max_depth_ni.valid_count; + __syncthreads(); while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 
1 : -1; - } + // definition level + int d = 1; + if (t >= batch_size) { + d = -1; + } else if (def) { + d = static_cast(def[rolling_index(value_count + t)]); } - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store @@ -242,90 +294,75 @@ static __device__ int gpuUpdateValidityAndRowIndicesNested( for (int d_idx = 0; d_idx <= max_depth; d_idx++) { auto& ni = s->nesting_info[d_idx]; - int is_valid; - if constexpr (nullable) { - is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; - } else { - is_valid = in_row_bounds; - } + int const is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0; // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int warp_null_count = 0; - if (write_start >= 0 && ni.valid_map != nullptr) { - int const valid_map_offset = ni.valid_map_offset; - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = - (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = cudf::detail::warp_size - - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. 
- size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. + int warp_null_count = 0; + if (ni.valid_map != nullptr) { + uint32_t const warp_validity_mask = ballot(is_valid); + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const valid_map_offset = ni.valid_map_offset; + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); + } } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. 
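+ // Lane 0 of each warp holds that warp's null count; the block-wide reduction
+ // below sums those single-lane values.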
+ size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // if this is valid and we're at the leaf, output dst_pos - __syncthreads(); // handle modification of ni.value_count from below - if (is_valid && d_idx == max_depth) { - // for non-list types, the value count is always the same across - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (ni.valid_count + thread_valid_count) - 1; - sb->nz_idx[rolling_index(src_pos)] = dst_pos; + if (d_idx == max_depth) { + if (is_valid) { + int const dst_pos = value_count + thread_value_count; + int const src_pos = max_depth_valid_count + thread_valid_count; + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + // update stuff + max_depth_valid_count += block_valid_count; } - __syncthreads(); // handle modification of ni.value_count from below - // update stuff - if (t == 0) { ni.valid_count += block_valid_count; } - } + } // end depth loop value_count += block_value_count; - } + } // end loop if (t == 0) { // update valid value count for decoding and total # of values we've processed - s->nz_count = s->nesting_info[max_depth].valid_count; - s->input_value_count = value_count; - s->input_row_count = value_count; + max_depth_ni.valid_count = max_depth_valid_count; + s->nz_count = max_depth_valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; } - __syncthreads(); - return s->nesting_info[max_depth].valid_count; + return max_depth_valid_count; } -template +template static __device__ int gpuUpdateValidityAndRowIndicesFlat( int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t) { @@ -351,83 +388,67 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( while (value_count < capped_target_value_count) { int const batch_size = min(max_batch_size, capped_target_value_count - value_count); - // definition level. only need to process for nullable columns - int d = 0; - if constexpr (nullable) { - if (def) { - d = t < batch_size - ? static_cast(def[rolling_index(value_count + t)]) - : -1; - } else { - d = t < batch_size ? 1 : -1; - } - } - - int const thread_value_count = t + 1; + int const thread_value_count = t; int const block_value_count = batch_size; // compute our row index, whether we're in row bounds, and validity - int const row_index = (thread_value_count + value_count) - 1; + int const row_index = thread_value_count + value_count; int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + // use definition level & row bounds to determine if is valid int is_valid; - if constexpr (nullable) { - is_valid = ((d > 0) && in_row_bounds) ? 1 : 0; + if (t >= batch_size) { + is_valid = 0; + } else if (def) { + int const def_level = + static_cast(def[rolling_index(value_count + t)]); + is_valid = ((def_level > 0) && in_row_bounds) ? 
1 : 0; } else { is_valid = in_row_bounds; } // thread and block validity count + using block_scan = cub::BlockScan; + __shared__ typename block_scan::TempStorage scan_storage; int thread_valid_count, block_valid_count; - if constexpr (nullable) { - using block_scan = cub::BlockScan; - __shared__ typename block_scan::TempStorage scan_storage; - block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count); - __syncthreads(); - - // validity is processed per-warp - // - // nested schemas always read and write to the same bounds (that is, read and write - // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading - // at the first value, even if that is before first_row, because we cannot trivially jump to - // the correct position to start reading. since we are about to write the validity vector - // here we need to adjust our computed mask to take into account the write row bounds. - int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); - int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store - int warp_null_count = 0; - if (write_start >= 0) { - uint32_t const warp_validity_mask = ballot(is_valid); - // lane 0 from each warp writes out validity - if ((t % cudf::detail::warp_size) == 0) { - int const vindex = (value_count + thread_value_count) - 1; // absolute input value index - int const bit_offset = (valid_map_offset + vindex + write_start) - - first_row; // absolute bit offset into the output validity map - int const write_end = - cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store - int const bit_count = write_end - write_start; - warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); - - store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); - } - } - - // sum null counts. we have to do it this way instead of just incrementing by (value_count - - // valid_count) because valid_count also includes rows that potentially start before our row - // bounds. if we could come up with a way to clean that up, we could remove this and just - // compute it directly at the end of the kernel. - size_type const block_null_count = - cudf::detail::single_lane_block_sum_reduce(warp_null_count); - if (t == 0) { ni.null_count += block_null_count; } - } - // trivial for non-nullable columns - else { - thread_valid_count = thread_value_count; - block_valid_count = block_value_count; + block_scan(scan_storage).ExclusiveSum(is_valid, thread_valid_count, block_valid_count); + uint32_t const warp_validity_mask = ballot(is_valid); + + // validity is processed per-warp + // + // nested schemas always read and write to the same bounds (that is, read and write + // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading + // at the first value, even if that is before first_row, because we cannot trivially jump to + // the correct position to start reading. since we are about to write the validity vector + // here we need to adjust our computed mask to take into account the write row bounds. 
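+ // in_write_row_bounds below is a warp-wide ballot: bit i is set when lane i's row
+ // lies in [first_row, first_row + num_rows); __ffs and __clz then recover the
+ // first and last bits that need to be stored.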
+ int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row); + int const write_start = __ffs(in_write_row_bounds) - 1; // first bit in the warp to store + int warp_null_count = 0; + // lane 0 from each warp writes out validity + if ((write_start >= 0) && ((t % cudf::detail::warp_size) == 0)) { + int const vindex = value_count + thread_value_count; // absolute input value index + int const bit_offset = (valid_map_offset + vindex + write_start) - + first_row; // absolute bit offset into the output validity map + int const write_end = + cudf::detail::warp_size - __clz(in_write_row_bounds); // last bit in the warp to store + int const bit_count = write_end - write_start; + warp_null_count = bit_count - __popc(warp_validity_mask >> write_start); + + store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count); } + // sum null counts. we have to do it this way instead of just incrementing by (value_count - + // valid_count) because valid_count also includes rows that potentially start before our row + // bounds. if we could come up with a way to clean that up, we could remove this and just + // compute it directly at the end of the kernel. + size_type const block_null_count = + cudf::detail::single_lane_block_sum_reduce(warp_null_count); + if (t == 0) { ni.null_count += block_null_count; } + // output offset if (is_valid) { - int const dst_pos = (value_count + thread_value_count) - 1; - int const src_pos = (valid_count + thread_valid_count) - 1; + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; sb->nz_idx[rolling_index(src_pos)] = dst_pos; } @@ -448,6 +469,70 @@ static __device__ int gpuUpdateValidityAndRowIndicesFlat( return valid_count; } +template +static __device__ int gpuUpdateValidityAndRowIndicesNonNullable(int32_t target_value_count, + page_state_s* s, + state_buf* sb, + int t) +{ + constexpr int num_warps = decode_block_size / cudf::detail::warp_size; + constexpr int max_batch_size = num_warps * cudf::detail::warp_size; + + // cap by last row so that we don't process any rows past what we want to output. 
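+ // With no definition levels, every input value in this non-list path corresponds
+ // to exactly one row, so capping the target value count by last_row is exact.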
+ int const first_row = s->first_row; + int const last_row = first_row + s->num_rows; + int const capped_target_value_count = min(target_value_count, last_row); + int const row_index_lower_bound = s->row_index_lower_bound; + + // how many (input) values we've processed in the page so far + int value_count = s->input_value_count; + + int const max_depth = s->col.max_nesting_depth - 1; + auto& ni = s->nesting_info[max_depth]; + int valid_count = ni.valid_count; + + __syncthreads(); + + while (value_count < capped_target_value_count) { + int const batch_size = min(max_batch_size, capped_target_value_count - value_count); + + int const thread_value_count = t; + int const block_value_count = batch_size; + + // compute our row index, whether we're in row bounds, and validity + int const row_index = thread_value_count + value_count; + int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row); + + int const is_valid = in_row_bounds; + int const thread_valid_count = thread_value_count; + int const block_valid_count = block_value_count; + + // if this is valid and we're at the leaf, output dst_pos + if (is_valid) { + // for non-list types, the value count is always the same across + int const dst_pos = value_count + thread_value_count; + int const src_pos = valid_count + thread_valid_count; + + sb->nz_idx[rolling_index(src_pos)] = dst_pos; + } + + // update stuff + value_count += block_value_count; + valid_count += block_valid_count; + } // end loop + + if (t == 0) { + // update valid value count for decoding and total # of values we've processed + ni.valid_count = valid_count; + ni.value_count = value_count; + s->nz_count = valid_count; + s->input_value_count = value_count; + s->input_row_count = value_count; + } + + return valid_count; +} + // is the page marked nullable or not __device__ inline bool is_nullable(page_state_s* s) { @@ -605,7 +690,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) int valid_count = 0; // the core loop. decode batches of level stream data using rle_stream objects // and pass the results to gpuDecodeValues - while (s->error == 0 && processed_count < s->page.num_input_values) { + // For chunked reads we may not process all of the rows on the page; if not stop early + int last_row = s->first_row + s->num_rows; + while ((s->error == 0) && (processed_count < s->page.num_input_values) && + (s->input_row_count <= last_row)) { int next_valid_count; // only need to process definition levels if this is a nullable column @@ -614,10 +702,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) __syncthreads(); if constexpr (has_nesting_t) { - next_valid_count = gpuUpdateValidityAndRowIndicesNested( + next_valid_count = gpuUpdateValidityAndRowIndicesNested( processed_count, s, sb, def, t); } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( + next_valid_count = gpuUpdateValidityAndRowIndicesFlat( processed_count, s, sb, def, t); } } @@ -626,15 +714,8 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t) // nz_idx. gpuDecodeFixedWidthValues would be the only work that happens. 
else { processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count); - - if constexpr (has_nesting_t) { - next_valid_count = - gpuUpdateValidityAndRowIndicesNested( - processed_count, s, sb, nullptr, t); - } else { - next_valid_count = gpuUpdateValidityAndRowIndicesFlat( - processed_count, s, sb, nullptr, t); - } + next_valid_count = + gpuUpdateValidityAndRowIndicesNonNullable(processed_count, s, sb, t); } __syncthreads(); diff --git a/cpp/src/io/parquet/error.hpp b/cpp/src/io/parquet/error.hpp index f0fc9fab3ab..8b3d1d7a6c3 100644 --- a/cpp/src/io/parquet/error.hpp +++ b/cpp/src/io/parquet/error.hpp @@ -26,7 +26,7 @@ namespace cudf::io::parquet { /** - * @brief Wrapper around a `rmm::device_scalar` for use in reporting errors that occur in + * @brief Specialized device scalar for use in reporting errors that occur in * kernel calls. * * The `kernel_error` object is created with a `rmm::cuda_stream_view` which is used throughout diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index e0d50d7ccf9..0d24fa4236f 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -17,8 +17,11 @@ #include "page_data.cuh" #include "page_decode.cuh" +#include + #include +#include #include namespace cudf::io::parquet::detail { @@ -466,4 +469,28 @@ void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span pages, } } +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream) +{ + // Copy offsets to device and create an iterator + auto d_src_data = cudf::detail::make_device_uvector_async( + offsets, stream, cudf::get_current_device_resource_ref()); + // Iterator for the source (scalar) data + auto src_iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [src = d_src_data.begin()] __device__(std::size_t i) { return src + i; })); + + // Copy buffer addresses to device and create an iterator + auto d_dst_addrs = cudf::detail::make_device_uvector_async( + buff_addrs, stream, cudf::get_current_device_resource_ref()); + // size_iter is simply a constant iterator of sizeof(size_type) bytes. + auto size_iter = thrust::make_constant_iterator(sizeof(size_type)); + + // Copy offsets to buffers in batched manner. 
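+ // One batched call replaces a separate cudaMemcpyAsync per output column; each
+ // copy is exactly sizeof(size_type) bytes, hence the constant size iterator above.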
+ cudf::detail::batched_memcpy_async( + src_iter, d_dst_addrs.begin(), size_iter, offsets.size(), stream); +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 7c985643887..2851ef67a65 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,8 +20,6 @@ #include -#include - #include #include #include @@ -94,10 +92,10 @@ struct LogicalType { BSON }; Type type; - cuda::std::optional<DecimalType> decimal_type; - cuda::std::optional<TimeType> time_type; - cuda::std::optional<TimestampType> timestamp_type; - cuda::std::optional<IntType> int_type; + std::optional<DecimalType> decimal_type; + std::optional<TimeType> time_type; + std::optional<TimestampType> timestamp_type; + std::optional<IntType> int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -178,21 +176,21 @@ struct SchemaElement { // 5: nested fields int32_t num_children = 0; // 6: DEPRECATED: record the original type before conversion to parquet type - cuda::std::optional<ConvertedType> converted_type; + std::optional<ConvertedType> converted_type; // 7: DEPRECATED: record the scale for DECIMAL converted type int32_t decimal_scale = 0; // 8: DEPRECATED: record the precision for DECIMAL converted type int32_t decimal_precision = 0; // 9: save field_id from original schema - cuda::std::optional<int32_t> field_id; + std::optional<int32_t> field_id; // 10: replaces converted type - cuda::std::optional<LogicalType> logical_type; + std::optional<LogicalType> logical_type; // extra cudf specific fields bool output_as_byte_array = false; // cudf type determined from arrow:schema - cuda::std::optional<type_id> arrow_type; + std::optional<type_id> arrow_type; // The following fields are filled in later during schema initialization int max_definition_level = 0; @@ -258,21 +256,21 @@ struct SchemaElement { */ struct Statistics { // deprecated max value in signed comparison order - cuda::std::optional<std::vector<uint8_t>> max; + std::optional<std::vector<uint8_t>> max; // deprecated min value in signed comparison order - cuda::std::optional<std::vector<uint8_t>> min; + std::optional<std::vector<uint8_t>> min; // count of null values in the column - cuda::std::optional<int64_t> null_count; + std::optional<int64_t> null_count; // count of distinct values occurring - cuda::std::optional<int64_t> distinct_count; + std::optional<int64_t> distinct_count; // max value for column determined by ColumnOrder - cuda::std::optional<std::vector<uint8_t>> max_value; + std::optional<std::vector<uint8_t>> max_value; // min value for column determined by ColumnOrder - cuda::std::optional<std::vector<uint8_t>> min_value; + std::optional<std::vector<uint8_t>> min_value; // If true, max_value is the actual maximum value for a column - cuda::std::optional<bool> is_max_value_exact; + std::optional<bool> is_max_value_exact; // If true, min_value is the actual minimum value for a column - cuda::std::optional<bool> is_min_value_exact; + std::optional<bool> is_min_value_exact; }; /** @@ -281,7 +279,7 @@ struct Statistics { struct SizeStatistics { // Number of variable-width bytes stored for the page/chunk. Should not be set for anything // but the BYTE_ARRAY physical type. - cuda::std::optional<int64_t> unencoded_byte_array_data_bytes; + std::optional<int64_t> unencoded_byte_array_data_bytes; /** * When present, there is expected to be one element corresponding to each * repetition (i.e. size=max repetition_level+1) where each element * * This value should not be written if max_repetition_level is 0. */ - cuda::std::optional<std::vector<int64_t>> repetition_level_histogram; + std::optional<std::vector<int64_t>> repetition_level_histogram; /** * Same as repetition_level_histogram except for definition levels. * * This value should not be written if max_definition_level is 0 or 1.
*/ - cuda::std::optional<std::vector<int64_t>> definition_level_histogram; + std::optional<std::vector<int64_t>> definition_level_histogram; }; /** @@ -318,7 +316,7 @@ struct OffsetIndex { std::vector<PageLocation> page_locations; // per-page size info. see description of the same field in SizeStatistics. only present for // columns with a BYTE_ARRAY physical type. - cuda::std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes; + std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes; }; /** @@ -329,11 +327,11 @@ struct ColumnIndex { std::vector<std::vector<uint8_t>> min_values; // lower bound for values in each page std::vector<std::vector<uint8_t>> max_values; // upper bound for values in each page BoundaryOrder boundary_order = - BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered - cuda::std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page + BoundaryOrder::UNORDERED; // Indicates if min and max values are ordered + std::optional<std::vector<int64_t>> null_counts; // Optional count of null values per page // Repetition/definition level histograms for the column chunk - cuda::std::optional<std::vector<int64_t>> repetition_level_histogram; - cuda::std::optional<std::vector<int64_t>> definition_level_histogram; + std::optional<std::vector<int64_t>> repetition_level_histogram; + std::optional<std::vector<int64_t>> definition_level_histogram; }; /** @@ -383,11 +381,11 @@ struct ColumnChunkMetaData { Statistics statistics; // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. - cuda::std::optional<std::vector<PageEncodingStats>> encoding_stats; + std::optional<std::vector<PageEncodingStats>> encoding_stats; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. - cuda::std::optional<SizeStatistics> size_statistics; + std::optional<SizeStatistics> size_statistics; }; /** @@ -429,13 +427,13 @@ struct RowGroup { int64_t num_rows = 0; // If set, specifies a sort ordering of the rows in this RowGroup. // The sorting columns can be a subset of all the columns.
- cuda::std::optional<std::vector<SortingColumn>> sorting_columns; + std::optional<std::vector<SortingColumn>> sorting_columns; // Byte offset from beginning of file to first page (data or dictionary) in this row group - cuda::std::optional<int64_t> file_offset; + std::optional<int64_t> file_offset; // Total byte size of all compressed (and potentially encrypted) column data in this row group - cuda::std::optional<int64_t> total_compressed_size; + std::optional<int64_t> total_compressed_size; // Row group ordinal in the file - cuda::std::optional<int16_t> ordinal; + std::optional<int16_t> ordinal; }; /** @@ -460,7 +458,7 @@ struct FileMetaData { std::vector<RowGroup> row_groups; std::vector<KeyValue> key_value_metadata; std::string created_by = ""; - cuda::std::optional<std::vector<ColumnOrder>> column_orders; + std::optional<std::vector<ColumnOrder>> column_orders; }; /** diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index e631e12119d..be502b581af 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -22,14 +22,13 @@ #include "io/parquet/parquet_common.hpp" #include "io/statistics/statistics.cuh" #include "io/utilities/column_buffer.hpp" -#include "io/utilities/hostdevice_vector.hpp" +#include #include #include #include #include -#include #include #include @@ -395,7 +394,7 @@ struct ColumnChunkDesc { uint8_t def_level_bits_, uint8_t rep_level_bits_, Compression codec_, - cuda::std::optional<LogicalType> logical_type_, + std::optional<LogicalType> logical_type_, int32_t ts_clock_rate_, int32_t src_col_index_, int32_t src_col_schema_, @@ -441,12 +440,12 @@ struct ColumnChunkDesc { int32_t num_data_pages{}; // number of data pages int32_t num_dict_pages{}; // number of dictionary pages PageInfo const* dict_page{}; - string_index_pair* str_dict_index{}; // index for string dictionary - bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column - void** column_data_base{}; // base pointers of column data - void** column_string_base{}; // base pointers of column string data - Compression codec{}; // compressed codec enum - cuda::std::optional<LogicalType> logical_type{}; // logical type + string_index_pair* str_dict_index{}; // index for string dictionary + bitmask_type** valid_map_base{}; // base pointers of valid bit map for this column + void** column_data_base{}; // base pointers of column data + void** column_string_base{}; // base pointers of column string data + Compression codec{}; // compressed codec enum + std::optional<LogicalType> logical_type{}; // logical type int32_t ts_clock_rate{}; // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns) int32_t src_col_index{}; // my input column index @@ -797,6 +796,18 @@ void DecodeSplitPageData(cudf::detail::hostdevice_span<PageInfo> pages, kernel_error::pointer error_code, rmm::cuda_stream_view stream); +/** + * @brief Writes the final offsets to the corresponding list and string buffer end addresses in a + * batched manner.
+ * + * @param offsets Host span of final offsets + * @param buff_addrs Host span of corresponding output col buffer end addresses + * @param stream CUDA stream to use + */ +void WriteFinalOffsets(host_span offsets, + host_span buff_addrs, + rmm::cuda_stream_view stream); + /** * @brief Launches kernel for reading the string column data stored in the pages * diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index b90ca36c8c7..32e922b04bb 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -152,7 +152,7 @@ struct stats_caster { } void set_index(size_type index, - cuda::std::optional> const& binary_value, + std::optional> const& binary_value, Type const type) { if (binary_value.has_value()) { @@ -234,8 +234,8 @@ struct stats_caster { max.set_index(stats_idx, max_value, colchunk.meta_data.type); } else { // Marking it null, if column present in row group - min.set_index(stats_idx, cuda::std::nullopt, {}); - max.set_index(stats_idx, cuda::std::nullopt, {}); + min.set_index(stats_idx, std::nullopt, {}); + max.set_index(stats_idx, std::nullopt, {}); } stats_idx++; } @@ -454,15 +454,18 @@ std::optional>> aggregate_reader_metadata::fi CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, "Filter expression must return a boolean column"); - auto num_bitmasks = num_bitmask_words(predicate.size()); - std::vector host_bitmask(num_bitmasks, ~bitmask_type{0}); - if (predicate.nullable()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(host_bitmask.data(), - predicate.null_mask(), - num_bitmasks * sizeof(bitmask_type), - cudaMemcpyDefault, - stream.value())); - } + auto const host_bitmask = [&] { + auto const num_bitmasks = num_bitmask_words(predicate.size()); + if (predicate.nullable()) { + return cudf::detail::make_host_vector_sync( + device_span(predicate.null_mask(), num_bitmasks), stream); + } else { + auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); + std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); + return bitmask; + } + }(); + auto validity_it = cudf::detail::make_counting_transform_iterator( 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index 7d817bde7af..0705ff6f5cc 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -38,7 +38,7 @@ namespace { // be treated as a string. Currently the only logical type that has special handling is DECIMAL. // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which // for now would also be treated as a string). -inline bool is_treat_fixed_length_as_string(cuda::std::optional const& logical_type) +inline bool is_treat_fixed_length_as_string(std::optional const& logical_type) { if (!logical_type.has_value()) { return true; } return logical_type->type != LogicalType::DECIMAL; @@ -78,7 +78,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // TODO: This step is somewhat redundant if size info has already been calculated (nested schema, // chunked reader). 
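// WriteFinalOffsets, declared above, replaces one tiny cudaMemcpyAsync per terminated
// column with a single batched operation. A minimal sketch of one way to express such a
// batched scatter; the kernel name is illustrative, not the in-tree implementation, and
// the host spans would first have to be staged to device memory:
#include <cstddef>

#include <cudf/types.hpp>

__global__ void write_one_value_each(cudf::size_type const* values,
                                     cudf::size_type* const* addrs,
                                     std::size_t n)
{
  auto const i = std::size_t{blockIdx.x} * blockDim.x + threadIdx.x;
  // Each thread writes exactly one terminating offset to the end of its buffer.
  if (i < n) { *addrs[i] = values[i]; }
}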
auto const has_strings = (kernel_mask & STRINGS_MASK) != 0; - std::vector col_string_sizes(_input_columns.size(), 0L); + auto col_string_sizes = cudf::detail::make_host_vector(_input_columns.size(), _stream); if (has_strings) { // need to compute pages bounds/sizes if we lack page indexes or are using custom bounds // TODO: we could probably dummy up size stats for FLBA data since we know the width @@ -371,13 +371,15 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. + // For list and string columns, add the final offset to every offset buffer. // Note : the reason we are doing this here instead of in the decode kernel is // that it is difficult/impossible for a given page to know that it is writing the very // last value that should then be followed by a terminator (because rows can span // page boundaries). + std::vector out_buffers; + std::vector final_offsets; + out_buffers.reserve(_input_columns.size()); + final_offsets.reserve(_input_columns.size()); for (size_t idx = 0; idx < _input_columns.size(); idx++) { input_column_info const& input_col = _input_columns[idx]; @@ -393,25 +395,21 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // the final offset for a list at level N is the size of it's child size_type const offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + (out_buf.size - 1)); + final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { // need to cap off the string offsets column auto const sz = static_cast(col_string_sizes[idx]); if (sz <= strings::detail::get_offset64_threshold()) { - CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast(out_buf.data()) + out_buf.size, - &sz, - sizeof(size_type), - cudaMemcpyDefault, - _stream.value())); + out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); + final_offsets.emplace_back(sz); } } } } + // Write the final offsets for list and string columns in a batched manner + WriteFinalOffsets(final_offsets, out_buffers, _stream); // update null counts in the final column buffers for (size_t idx = 0; idx < subpass.pages.size(); idx++) { diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 62ffc4d3077..3aa9b94ed6b 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -284,7 +284,7 @@ class reader::impl { * * @return Vector of total string data sizes for each column */ - std::vector calculate_page_string_offsets(); + cudf::detail::host_vector calculate_page_string_offsets(); /** * @brief Converts the page data and outputs to columns. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index c588fedb85c..27312a4da89 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -371,11 +371,11 @@ int64_t find_next_split(int64_t cur_pos, * * @return A tuple of Parquet clock rate and Parquet decimal type. 
*/ -[[nodiscard]] std::tuple> conversion_info( +[[nodiscard]] std::tuple> conversion_info( type_id column_type_id, type_id timestamp_type_id, Type physical, - cuda::std::optional logical_type) + std::optional logical_type) { int32_t const clock_rate = is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0; @@ -386,11 +386,11 @@ int64_t find_next_split(int64_t cur_pos, // if decimal but not outputting as float or decimal, then convert to no logical type if (column_type_id != type_id::FLOAT64 and not cudf::is_fixed_point(data_type{column_type_id})) { - return std::make_tuple(clock_rate, cuda::std::nullopt); + return {clock_rate, std::nullopt}; } } - return std::make_tuple(clock_rate, std::move(logical_type)); + return {clock_rate, std::move(logical_type)}; } /** diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index 3a3cdd34a58..a0c2dbd3e44 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -107,7 +107,7 @@ struct subpass_intermediate_data { * rowgroups may represent less than all of the rowgroups to be read for the file. */ struct pass_intermediate_data { - std::vector> raw_page_data; + std::vector raw_page_data; // rowgroup, chunk and page information for the current pass. bool has_compressed_data{false}; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 6d566b5815e..a6562d33de2 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf; namespace { -cuda::std::optional converted_to_logical_type(SchemaElement const& schema) +std::optional converted_to_logical_type(SchemaElement const& schema) { if (schema.converted_type.has_value()) { switch (schema.converted_type.value()) { @@ -66,7 +66,7 @@ cuda::std::optional converted_to_logical_type(SchemaElement const& default: return LogicalType{LogicalType::UNDEFINED}; } } - return cuda::std::nullopt; + return std::nullopt; } } // namespace @@ -246,7 +246,7 @@ void metadata::sanitize_schema() struct_elem.repetition_type = REQUIRED; struct_elem.num_children = schema_elem.num_children; struct_elem.type = UNDEFINED_TYPE; - struct_elem.converted_type = cuda::std::nullopt; + struct_elem.converted_type = std::nullopt; // swap children struct_elem.children_idx = std::move(schema_elem.children_idx); diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 3763c2e8e6d..f03f1214b9a 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -19,9 +19,9 @@ #include #include +#include #include #include -#include #include #include @@ -44,6 +44,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -217,7 +218,7 @@ void generate_depth_remappings( */ [[nodiscard]] std::future read_column_chunks_async( std::vector> const& sources, - std::vector>& page_data, + cudf::host_span page_data, cudf::detail::hostdevice_vector& chunks, size_t begin_chunk, size_t end_chunk, @@ -250,23 +251,24 @@ void generate_depth_remappings( if (source->is_device_read_preferred(io_size)) { // Buffer needs to be padded. // Required by `gpuDecodePageData`. 
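// The padding note above is load-bearing: decode kernels such as gpuDecodePageData read
// in fixed-size chunks past the logical end of the compressed data, so every allocation
// is rounded up. A standalone sketch of the pattern (the helper name here is
// illustrative; BUFFER_PADDING_MULTIPLE is the in-tree constant):
#include <cstddef>

#include <cudf/detail/utilities/integer_utils.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

rmm::device_buffer make_padded_device_buffer(std::size_t io_size,
                                             std::size_t padding_multiple,
                                             rmm::cuda_stream_view stream)
{
  // Round up so a kernel reading whole words never runs off the end of the allocation.
  return rmm::device_buffer{cudf::util::round_up_safe(io_size, padding_multiple), stream};
}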
- auto buffer = + page_data[chunk] = rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream); auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), stream); + io_offset, io_size, static_cast(page_data[chunk].data()), stream); read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); } else { auto const read_buffer = source->host_read(io_offset, io_size); // Buffer needs to be padded. // Required by `gpuDecodePageData`. - auto tmp_buffer = rmm::device_buffer( + page_data[chunk] = rmm::device_buffer( cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - tmp_buffer.data(), read_buffer->data(), read_buffer->size(), cudaMemcpyDefault, stream)); - page_data[chunk] = datasource::buffer::create(std::move(tmp_buffer)); + CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(), + read_buffer->data(), + read_buffer->size(), + cudaMemcpyDefault, + stream)); } - auto d_compdata = page_data[chunk]->data(); + auto d_compdata = static_cast(page_data[chunk].data()); do { chunks[chunk].compressed_data = d_compdata; d_compdata += chunks[chunk].compressed_size; @@ -979,7 +981,7 @@ std::pair> reader::impl::read_column_chunks() std::vector chunk_source_map(num_chunks); // Tracker for eventually deallocating compressed and uncompressed data - raw_page_data = std::vector>(num_chunks); + raw_page_data = std::vector(num_chunks); // Keep track of column chunk file offsets std::vector column_chunk_offsets(num_chunks); @@ -1592,36 +1594,68 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num auto const d_cols_info = cudf::detail::make_device_uvector_async( h_cols_info, _stream, cudf::get_current_device_resource_ref()); - auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size(); - // size iterator. 
indexes pages by sorted order
-  rmm::device_uvector<size_t> size_input{num_keys, _stream};
-  thrust::transform(
-    rmm::exec_policy(_stream),
-    thrust::make_counting_iterator(0),
-    thrust::make_counting_iterator(num_keys),
-    size_input.begin(),
-    get_page_nesting_size{
-      d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
-  auto const reduction_keys =
-    cudf::detail::make_counting_transform_iterator(0, get_reduction_key{subpass.pages.size()});
+  // Vector to store page sizes for each column at each depth
   cudf::detail::hostdevice_vector<size_t> sizes{_input_columns.size() * max_depth, _stream};

-  // find the size of each column
-  thrust::reduce_by_key(rmm::exec_policy(_stream),
-                        reduction_keys,
-                        reduction_keys + num_keys,
-                        size_input.cbegin(),
-                        thrust::make_discard_iterator(),
-                        sizes.d_begin());
-
-  // for nested hierarchies, compute per-page start offset
-  thrust::exclusive_scan_by_key(
-    rmm::exec_policy(_stream),
-    reduction_keys,
-    reduction_keys + num_keys,
-    size_input.cbegin(),
-    start_offset_output_iterator{
-      subpass.pages.device_begin(), 0, d_cols_info.data(), max_depth, subpass.pages.size()});
+  // Total number of keys to process
+  auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size();
+
+  // At most ~1 billion keys are processed per iteration
+  auto constexpr max_keys_per_iter =
+    static_cast<size_t>(std::numeric_limits<size_type>::max() / 2);
+
+  // Number of keys per column
+  auto const num_keys_per_col = max_depth * subpass.pages.size();
+
+  // The largest multiple of `num_keys_per_col` that is <= `num_keys`
+  auto const num_keys_per_iter =
+    num_keys <= max_keys_per_iter
+      ? num_keys
+      : num_keys_per_col * std::max<size_t>(1, max_keys_per_iter / num_keys_per_col);
+
+  // Size iterator. Indexes pages by sorted order
+  rmm::device_uvector<size_t> size_input{num_keys_per_iter, _stream};
+
+  // To keep track of the starting key of an iteration
+  size_t key_start = 0;
+  // Loop until all keys are processed
+  while (key_start < num_keys) {
+    // Number of keys processed in this iteration
+    auto const num_keys_this_iter = std::min(num_keys_per_iter, num_keys - key_start);
+    thrust::transform(
+      rmm::exec_policy_nosync(_stream),
+      thrust::make_counting_iterator(key_start),
+      thrust::make_counting_iterator(key_start + num_keys_this_iter),
+      size_input.begin(),
+      get_page_nesting_size{
+        d_cols_info.data(), max_depth, subpass.pages.size(), subpass.pages.device_begin()});
+
+    // Manually create a size_t `key_start` compatible counting_transform_iterator.
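// cudf::detail::make_counting_transform_iterator counts in cudf::size_type from a fixed
// origin, so a size_t iteration starting at `key_start` is composed by hand instead. The
// same construction in isolation, with an illustrative stand-in for get_reduction_key:
#include <cstddef>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

struct reduction_key_fn {  // stand-in: maps a flat key index to its per-column slot
  std::size_t pages_per_key;
  __host__ __device__ std::size_t operator()(std::size_t i) const { return i / pages_per_key; }
};

inline auto make_reduction_key_iterator(std::size_t start, std::size_t pages_per_key)
{
  // The counting iterator deduces std::size_t from `start`, so keys cannot overflow int.
  return thrust::make_transform_iterator(thrust::make_counting_iterator(start),
                                         reduction_key_fn{pages_per_key});
}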
+ auto const reduction_keys = + thrust::make_transform_iterator(thrust::make_counting_iterator(key_start), + get_reduction_key{subpass.pages.size()}); + + // Find the size of each column + thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + thrust::make_discard_iterator(), + sizes.d_begin() + (key_start / subpass.pages.size())); + + // For nested hierarchies, compute per-page start offset + thrust::exclusive_scan_by_key(rmm::exec_policy_nosync(_stream), + reduction_keys, + reduction_keys + num_keys_this_iter, + size_input.cbegin(), + start_offset_output_iterator{subpass.pages.device_begin(), + key_start, + d_cols_info.data(), + max_depth, + subpass.pages.size()}); + // Increment the key_start + key_start += num_keys_this_iter; + } sizes.device_to_host_sync(_stream); for (size_type idx = 0; idx < static_cast(_input_columns.size()); idx++) { @@ -1656,21 +1690,20 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num } } - cudf::io::detail::batched_memset(memset_bufs, static_cast(0), _stream); + cudf::detail::batched_memset(memset_bufs, static_cast(0), _stream); // Need to set null mask bufs to all high bits - cudf::io::detail::batched_memset( + cudf::detail::batched_memset( nullmask_bufs, std::numeric_limits::max(), _stream); } -std::vector reader::impl::calculate_page_string_offsets() +cudf::detail::host_vector reader::impl::calculate_page_string_offsets() { auto& pass = *_pass_itm_data; auto& subpass = *pass.subpass; auto page_keys = make_page_key_iterator(subpass.pages); - std::vector col_sizes(_input_columns.size(), 0L); - rmm::device_uvector d_col_sizes(col_sizes.size(), _stream); + rmm::device_uvector d_col_sizes(_input_columns.size(), _stream); // use page_index to fetch page string sizes in the proper order auto val_iter = thrust::make_transform_iterator(subpass.pages.device_begin(), @@ -1684,7 +1717,7 @@ std::vector reader::impl::calculate_page_string_offsets() page_offset_output_iter{subpass.pages.device_ptr()}); // now sum up page sizes - rmm::device_uvector reduce_keys(col_sizes.size(), _stream); + rmm::device_uvector reduce_keys(d_col_sizes.size(), _stream); thrust::reduce_by_key(rmm::exec_policy_nosync(_stream), page_keys, page_keys + subpass.pages.size(), @@ -1692,14 +1725,7 @@ std::vector reader::impl::calculate_page_string_offsets() reduce_keys.begin(), d_col_sizes.begin()); - cudaMemcpyAsync(col_sizes.data(), - d_col_sizes.data(), - sizeof(size_t) * col_sizes.size(), - cudaMemcpyDeviceToHost, - _stream); - _stream.synchronize(); - - return col_sizes; + return cudf::detail::make_host_vector_sync(d_col_sizes, _stream); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index ec05f35d405..f865c9a7643 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -183,10 +183,10 @@ struct aggregate_writer_metadata { std::vector row_groups; std::vector key_value_metadata; std::vector offset_indexes; - std::vector> column_indexes; + std::vector> column_indexes; }; std::vector files; - cuda::std::optional> column_orders = cuda::std::nullopt; + std::optional> column_orders = std::nullopt; }; namespace { @@ -472,7 +472,7 @@ struct leaf_schema_fn { std::enable_if_t, void> operator()() { col_schema.type = (timestamp_is_int96) ? 
Type::INT96 : Type::INT64; - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_timestamp64; if (timestamp_is_int96) { col_schema.ts_scale = -1000; // negative value indicates division by absolute value @@ -750,7 +750,7 @@ std::vector construct_parquet_schema_tree( col_schema.type = Type::BYTE_ARRAY; } - col_schema.converted_type = cuda::std::nullopt; + col_schema.converted_type = std::nullopt; col_schema.stats_dtype = statistics_dtype::dtype_byte_array; col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED; col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name(); @@ -1543,12 +1543,7 @@ void encode_pages(hostdevice_2dvector& chunks, d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream); } - auto h_chunks = chunks.host_view(); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(), - d_chunks.data(), - d_chunks.flat_view().size_bytes(), - cudaMemcpyDefault, - stream.value())); + chunks.device_to_host_async(stream); if (comp_stats.has_value()) { comp_stats.value() += collect_compression_statistics(comp_in, comp_res, stream); @@ -2559,12 +2554,11 @@ void writer::impl::write_parquet_data_to_sink( } else { CUDF_EXPECTS(bounce_buffer.size() >= ck.compressed_size, "Bounce buffer was not properly initialized."); - CUDF_CUDA_TRY(cudaMemcpyAsync(bounce_buffer.data(), - dev_bfr + ck.ck_stat_size, - ck.compressed_size, - cudaMemcpyDefault, - _stream.value())); - _stream.synchronize(); + cudf::detail::cuda_memcpy( + host_span{bounce_buffer}.subspan(0, ck.compressed_size), + device_span{dev_bfr + ck.ck_stat_size, ck.compressed_size}, + _stream); + _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size); } @@ -2600,13 +2594,8 @@ void writer::impl::write_parquet_data_to_sink( auto const& column_chunk_meta = row_group.columns[i].meta_data; // start transfer of the column index - std::vector column_idx; - column_idx.resize(ck.column_index_size); - CUDF_CUDA_TRY(cudaMemcpyAsync(column_idx.data(), - ck.column_index_blob, - ck.column_index_size, - cudaMemcpyDefault, - _stream.value())); + auto column_idx = cudf::detail::make_host_vector_async( + device_span{ck.column_index_blob, ck.column_index_size}, _stream); // calculate offsets while the column index is transferring int64_t curr_pg_offset = column_chunk_meta.data_page_offset; @@ -2795,7 +2784,7 @@ std::unique_ptr> writer::merge_row_group_metadata( // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615 for (auto& se : md.schema) { if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) { - se.logical_type = cuda::std::nullopt; + se.logical_type = std::nullopt; } } diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index badcd3f58f9..06069630685 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -74,8 +74,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // Buffer needs to be padded. // Required by `inflate_kernel`. 
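// The cudf::detail::cuda_memcpy_async overloads used below take typed spans: the element
// count, the direction, and (via the host span's pinned-ness) the copy path are derived
// from the arguments instead of being restated at every call site. The pattern in
// isolation:
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>

template <typename T>
void copy_prefix_to_device(cudf::device_span<T> dst,
                           cudf::host_span<T const> src,
                           rmm::cuda_stream_view stream)
{
  // dst may be over-allocated for padding; copy only the first src.size() elements.
  cudf::detail::cuda_memcpy_async(dst.subspan(0, src.size()), src, stream);
}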
device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync( - device.data(), host.data(), host.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{device}.subspan(0, host.size()), host, stream); } struct decompression_blocks { diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 58faa0ebfe4..4baea8655e0 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -87,8 +87,10 @@ class datasource_chunk_reader : public data_chunk_reader { _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -153,8 +155,10 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + cudf::detail::cuda_memcpy_async( + device_span{chunk}.subspan(0, read_size), + host_span{h_ticket.buffer}.subspan(0, read_size), + stream); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -193,12 +197,10 @@ class host_span_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host data to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - _data.data() + _position, - read_size, - cudaMemcpyDefault, - stream.value())); + cudf::detail::cuda_memcpy_async( + cudf::device_span{chunk}.subspan(0, read_size), + cudf::host_span{_data}.subspan(_position, read_size), + stream); _position += read_size; diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a3afbd52896..813743fa7b4 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -19,7 +19,10 @@ #include #include +#include + #include +#include #include #include @@ -53,6 +56,14 @@ bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_ bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } +void set_thread_pool_nthreads_from_env() +{ + static std::once_flag flag{}; + std::call_once(flag, [] { + auto nthreads = getenv_or("KVIKIO_NTHREADS", 8U); + kvikio::defaults::thread_pool_nthreads_reset(nthreads); + }); +} } // namespace cufile_integration namespace nvcomp_integration { @@ -81,5 +92,4 @@ bool is_all_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_policy::STABLE; } } // namespace nvcomp_integration - } // namespace cudf::io diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index f70171eef68..0c49b2e5d78 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -800,7 +801,7 @@ template static std::unique_ptr 
parse_string(string_view_pair_it str_tuples, size_type col_size, rmm::device_buffer&& null_mask, - rmm::device_scalar& d_null_count, + cudf::detail::device_scalar& d_null_count, cudf::io::parse_options_view const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) @@ -930,7 +931,7 @@ std::unique_ptr parse_data( CUDF_FUNC_RANGE(); if (col_size == 0) { return make_empty_column(col_type); } - auto d_null_count = rmm::device_scalar(null_count, stream); + auto d_null_count = cudf::detail::device_scalar(null_count, stream); auto null_count_data = d_null_count.data(); if (null_mask.is_empty()) { null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr); diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 1dbb9369115..a8a275919d8 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -42,6 +42,7 @@ class file_sink : public data_sink { if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); } if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath, "w"); CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? "on" : "off"); @@ -50,7 +51,8 @@ class file_sink : public data_sink { } } - ~file_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~file_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -114,7 +116,8 @@ class host_buffer_sink : public data_sink { public: explicit host_buffer_sink(std::vector* buffer) : buffer_(buffer) {} - ~host_buffer_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~host_buffer_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e4313eba454..4e8908a8942 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -15,8 +15,10 @@ */ #include "file_io_utilities.hpp" +#include "getenv_or.hpp" #include +#include #include #include #include @@ -32,6 +34,7 @@ #include #include +#include namespace cudf { namespace io { @@ -46,6 +49,7 @@ class file_source : public datasource { { detail::force_init_cuda_context(); if (cufile_integration::is_kvikio_enabled()) { + cufile_integration::set_thread_pool_nthreads_from_env(); _kvikio_file = kvikio::FileHandle(filepath); CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", _kvikio_file.is_compat_mode_on() ? 
"on" : "off"); @@ -54,6 +58,30 @@ class file_source : public datasource { } } + std::unique_ptr host_read(size_t offset, size_t size) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + ssize_t const read_size = std::min(size, _file.size() - offset); + + std::vector v(read_size); + CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + return buffer::create(std::move(v)); + } + + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + lseek(_file.desc(), offset, SEEK_SET); + + // Clamp length to available data + auto const read_size = std::min(size, _file.size() - offset); + + CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + "read failed"); + return read_size; + } + ~file_source() override = default; [[nodiscard]] bool supports_device_read() const override @@ -109,27 +137,6 @@ class file_source : public datasource { static constexpr size_t _gds_read_preferred_threshold = 128 << 10; // 128KB }; -/** - * @brief Memoized pageableMemoryAccessUsesHostPageTables device property. - */ -[[nodiscard]] bool pageableMemoryAccessUsesHostPageTables() -{ - static std::unordered_map result_cache{}; - - int deviceId{}; - CUDF_CUDA_TRY(cudaGetDevice(&deviceId)); - - if (result_cache.find(deviceId) == result_cache.end()) { - cudaDeviceProp props{}; - CUDF_CUDA_TRY(cudaGetDeviceProperties(&props, deviceId)); - result_cache[deviceId] = (props.pageableMemoryAccessUsesHostPageTables == 1); - CUDF_LOG_INFO( - "Device {} pageableMemoryAccessUsesHostPageTables: {}", deviceId, result_cache[deviceId]); - } - - return result_cache[deviceId]; -} - /** * @brief Implementation class for reading from a file using memory mapped access. * @@ -138,40 +145,53 @@ class file_source : public datasource { */ class memory_mapped_source : public file_source { public: - explicit memory_mapped_source(char const* filepath, size_t offset, size_t size) + explicit memory_mapped_source(char const* filepath, size_t offset, size_t max_size_estimate) : file_source(filepath) { if (_file.size() != 0) { - map(_file.desc(), offset, size); - register_mmap_buffer(); + // Memory mapping is not exclusive, so we can include the whole region we expect to read + map(_file.desc(), offset, max_size_estimate); } } ~memory_mapped_source() override { - if (_map_addr != nullptr) { - munmap(_map_addr, _map_size); - unregister_mmap_buffer(); - } + if (_map_addr != nullptr) { unmap(); } } std::unique_ptr host_read(size_t offset, size_t size) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); + + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size); + } + + // If the requested range is only partially within the registered region, copy to a new + // host buffer to make the data safe to copy to the device + if (_reg_addr != nullptr and + (offset < _reg_offset or offset + read_size > (_reg_offset + _reg_size))) { + auto const src = static_cast(_map_addr) + (offset - _map_offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + return std::make_unique>>( + std::vector(src, src + read_size)); + } return std::make_unique( - static_cast(_map_addr) + (offset - _map_offset), 
read_size); + static_cast(_map_addr) + offset - _map_offset, read_size); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - CUDF_EXPECTS(offset >= _map_offset, "Requested offset is outside mapping"); + // Clamp length to available data + auto const read_size = std::min(size, +_file.size() - offset); - // Clamp length to available data in the mapped region - auto const read_size = std::min(size, _map_size - (offset - _map_offset)); + // If the requested range is outside of the mapped region, read from the file + if (offset < _map_offset or offset + read_size > (_map_offset + _map_size)) { + return file_source::host_read(offset, read_size, dst); + } auto const src = static_cast(_map_addr) + (offset - _map_offset); std::memcpy(dst, src, read_size); @@ -179,42 +199,6 @@ class memory_mapped_source : public file_source { } private: - /** - * @brief Page-locks (registers) the memory range of the mapped file. - * - * Fixes nvbugs/4215160 - */ - void register_mmap_buffer() - { - if (_map_addr == nullptr or _map_size == 0 or not pageableMemoryAccessUsesHostPageTables()) { - return; - } - - auto const result = cudaHostRegister(_map_addr, _map_size, cudaHostRegisterDefault); - if (result == cudaSuccess) { - _is_map_registered = true; - } else { - CUDF_LOG_WARN("cudaHostRegister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - - /** - * @brief Unregisters the memory range of the mapped file. - */ - void unregister_mmap_buffer() - { - if (not _is_map_registered) { return; } - - auto const result = cudaHostUnregister(_map_addr); - if (result != cudaSuccess) { - CUDF_LOG_WARN("cudaHostUnregister failed with {} ({})", - static_cast(result), - cudaGetErrorString(result)); - } - } - void map(int fd, size_t offset, size_t size) { CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error); @@ -226,52 +210,30 @@ class memory_mapped_source : public file_source { // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); + if (_map_size == 0) { return; } // Check if accessing a region within already mapped area _map_addr = mmap(nullptr, _map_size, PROT_READ, MAP_PRIVATE, fd, _map_offset); CUDF_EXPECTS(_map_addr != MAP_FAILED, "Cannot create memory mapping"); } - private: - size_t _map_size = 0; - size_t _map_offset = 0; - void* _map_addr = nullptr; - bool _is_map_registered = false; -}; - -/** - * @brief Implementation class for reading from a file using `read` calls - * - * Potentially faster than `memory_mapped_source` when only a small portion of the file is read - * through the host. 
- */ -class direct_read_source : public file_source { - public: - explicit direct_read_source(char const* filepath) : file_source(filepath) {} - - std::unique_ptr host_read(size_t offset, size_t size) override + void unmap() { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); - - std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); - return buffer::create(std::move(v)); + if (_map_addr != nullptr) { + auto const result = munmap(_map_addr, _map_size); + if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + _map_addr = nullptr; + } } - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - lseek(_file.desc(), offset, SEEK_SET); - - // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + private: + size_t _map_offset = 0; + size_t _map_size = 0; + void* _map_addr = nullptr; - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), - "read failed"); - return read_size; - } + size_t _reg_offset = 0; + size_t _reg_size = 0; + void* _reg_addr = nullptr; }; /** @@ -286,17 +248,18 @@ class device_buffer_source final : public datasource { size_t host_read(size_t offset, size_t size, uint8_t* dst) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); - CUDF_CUDA_TRY( - cudaMemcpyAsync(dst, _d_buffer.data() + offset, count, cudaMemcpyDefault, stream.value())); - stream.synchronize(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); + cudf::detail::cuda_memcpy(host_span{dst, count}, + device_span{ + reinterpret_cast(_d_buffer.data() + offset), count}, + stream); return count; } std::unique_ptr host_read(size_t offset, size_t size) override { auto const count = std::min(size, this->size() - offset); - auto const stream = cudf::get_default_stream(); + auto const stream = cudf::detail::global_cuda_stream_pool().get_stream(); auto h_data = cudf::detail::make_host_vector_async( cudf::device_span{_d_buffer.data() + offset, count}, stream); stream.synchronize(); @@ -431,16 +394,23 @@ class user_datasource_wrapper : public datasource { std::unique_ptr datasource::create(std::string const& filepath, size_t offset, - size_t size) + size_t max_size_estimate) { -#ifdef CUFILE_FOUND - if (cufile_integration::is_always_enabled()) { - // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + auto const use_memory_mapping = [] { + auto const policy = getenv_or("LIBCUDF_MMAP_ENABLED", std::string{"ON"}); + + if (policy == "ON") { return true; } + if (policy == "OFF") { return false; } + + CUDF_FAIL("Invalid LIBCUDF_MMAP_ENABLED value: " + policy); + }(); + + if (use_memory_mapping) { + return std::make_unique(filepath.c_str(), offset, max_size_estimate); + } else { + // `file_source` reads the file directly, without memory mapping + return std::make_unique(filepath.c_str()); } -#endif - // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); } std::unique_ptr datasource::create(host_buffer const& buffer) diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index d7b54399f8d..98ed9b28f0a 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -239,7 +239,7 @@ 
std::vector> make_sliced_tasks( std::vector> slice_tasks; std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) { return pool.submit_task( - [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); + [=] { return function(ptr + slice.offset, slice.size, offset + slice.offset); }); }); return slice_tasks; } diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp index d9eac423901..a3ddef52dd8 100644 --- a/cpp/src/io/utilities/hostdevice_span.hpp +++ b/cpp/src/io/utilities/hostdevice_span.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -33,31 +34,18 @@ class hostdevice_span { hostdevice_span(hostdevice_span const&) = default; ///< Copy constructor hostdevice_span(hostdevice_span&&) = default; ///< Move constructor - hostdevice_span(T* cpu_data, T* gpu_data, size_t size) - : _size(size), _device_data(gpu_data), _host_data(cpu_data) + hostdevice_span(host_span host_data, T* device_data) + : _host_data{host_data}, _device_data{device_data} { } - /// Constructor from container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> - constexpr hostdevice_span(C& in) : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) - { - } - - /// Constructor from const container - /// @param in The container to construct the span from - template ().host_ptr())> (*)[], - T (*)[]>>* = nullptr> - constexpr hostdevice_span(C const& in) - : hostdevice_span(in.host_ptr(), in.device_ptr(), in.size()) + // Copy construction to support const conversion + /// @param other The span to copy + template , // NOLINT + void>* = nullptr> + constexpr hostdevice_span(hostdevice_span const& other) noexcept + : _host_data{host_span{other}}, _device_data{other.device_ptr()} { } @@ -74,15 +62,13 @@ class hostdevice_span { * @tparam T The device span type. * @return A typed device span of the hostdevice view's data. */ - [[nodiscard]] operator cudf::device_span() { return {_device_data, size()}; } - - /** - * @brief Converts a hostdevice view into a device span of const data. - * - * @tparam T The device span type. - * @return A const typed device span of the hostdevice view's data. - */ - [[nodiscard]] operator cudf::device_span() const { return {_device_data, size()}; } + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator cudf::device_span() const noexcept + { + return {_device_data, size()}; + } /** * @brief Returns the underlying device data. @@ -114,9 +100,12 @@ class hostdevice_span { * @tparam T The host span type. * @return A typed host span of the hostdevice_span's data. */ - [[nodiscard]] operator cudf::host_span() const noexcept + template , // NOLINT + void>* = nullptr> + [[nodiscard]] operator host_span() const noexcept { - return cudf::host_span(_host_data, size()); + return {_host_data}; } /** @@ -125,7 +114,7 @@ class hostdevice_span { * @tparam T The type to cast to * @return T* Typed pointer to underlying data */ - [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data + offset; } + [[nodiscard]] T* host_ptr(size_t offset = 0) const noexcept { return _host_data.data() + offset; } /** * @brief Return first element in host data. @@ -136,19 +125,19 @@ class hostdevice_span { [[nodiscard]] T* host_begin() const noexcept { return host_ptr(); } /** - * @brief Return one past the last elementin host data. + * @brief Return one past the last element in host data. 
* * @tparam T The desired type * @return T const* Pointer to one past the last element */ - [[nodiscard]] T* host_end() const noexcept { return host_begin() + size(); } + [[nodiscard]] T* host_end() const noexcept { return _host_data.end(); } /** * @brief Returns the number of elements in the view * * @return The number of elements in the view */ - [[nodiscard]] std::size_t size() const noexcept { return _size; } + [[nodiscard]] std::size_t size() const noexcept { return _host_data.size(); } /** * @brief Returns true if `size()` returns zero, or false otherwise @@ -159,12 +148,11 @@ class hostdevice_span { [[nodiscard]] size_t size_bytes() const noexcept { return sizeof(T) * size(); } - [[nodiscard]] T& operator[](size_t i) { return _host_data[i]; } - [[nodiscard]] T const& operator[](size_t i) const { return _host_data[i]; } + [[nodiscard]] T& operator[](size_t i) const { return _host_data[i]; } /** - * @brief Obtains a hostdevice_span that is a view over the `count` elements of this - * hostdevice_span starting at offset + * @brief Obtains a `hostdevice_span` that is a view over the `count` elements of this + * hostdevice_span starting at `offset` * * @param offset The offset of the first element in the subspan * @param count The number of elements in the subspan @@ -172,37 +160,37 @@ class hostdevice_span { */ [[nodiscard]] constexpr hostdevice_span subspan(size_t offset, size_t count) const noexcept { - return hostdevice_span(_host_data + offset, _device_data + offset, count); + return hostdevice_span(_host_data.subspan(offset, count), device_ptr(offset)); } - void host_to_device_async(rmm::cuda_stream_view stream) + void host_to_device_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const device memory"); + cudf::detail::cuda_memcpy_async(device_span{device_ptr(), size()}, _host_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) + void host_to_device_sync(rmm::cuda_stream_view stream) const { host_to_device_async(stream); stream.synchronize(); } - void device_to_host_async(rmm::cuda_stream_view stream) + void device_to_host_async(rmm::cuda_stream_view stream) const { - CUDF_CUDA_TRY( - cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + static_assert(not std::is_const_v, "Cannot copy to const host memory"); + cudf::detail::cuda_memcpy_async( + _host_data, device_span{device_ptr(), size()}, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) + void device_to_host_sync(rmm::cuda_stream_view stream) const { device_to_host_async(stream); stream.synchronize(); } private: - size_t _size{}; ///< Number of elements - T* _device_data{}; ///< Pointer to device memory containing elements - T* _host_data{}; ///< Pointer to host memory containing elements + host_span _host_data; + T* _device_data{nullptr}; }; } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index aed745c42dd..f969b45727b 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -117,55 +117,39 @@ class hostdevice_vector { return d_data.element(element_index, stream); } - operator cudf::host_span() { return {host_ptr(), size()}; } - operator cudf::host_span() const { return {host_ptr(), size()}; } + operator cudf::host_span() { return host_span{h_data}.subspan(0, 
size()); } + operator cudf::host_span() const + { + return host_span{h_data}.subspan(0, size()); + } operator cudf::device_span() { return {device_ptr(), size()}; } operator cudf::device_span() const { return {device_ptr(), size()}; } void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(d_data, h_data, stream); } - void host_to_device_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void host_to_device_sync(rmm::cuda_stream_view stream) { cuda_memcpy(d_data, h_data, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); + cuda_memcpy_async(h_data, d_data, stream); } - void device_to_host_sync(rmm::cuda_stream_view stream) - { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); - } + void device_to_host_sync(rmm::cuda_stream_view stream) { cuda_memcpy(h_data, d_data, stream); } /** * @brief Converts a hostdevice_vector into a hostdevice_span. * * @return A typed hostdevice_span of the hostdevice_vector's data */ - [[nodiscard]] operator hostdevice_span() - { - return hostdevice_span{h_data.data(), d_data.data(), size()}; - } + [[nodiscard]] operator hostdevice_span() { return {host_span{h_data}, device_ptr()}; } - /** - * @brief Converts a part of a hostdevice_vector into a hostdevice_span. - * - * @param offset The offset of the first element in the subspan - * @param count The number of elements in the subspan - * @return A typed hostdevice_span of the hostdevice_vector's data - */ - [[nodiscard]] hostdevice_span subspan(size_t offset, size_t count) + [[nodiscard]] operator hostdevice_span() const { - CUDF_EXPECTS(offset < d_data.size(), "Offset is out of bounds."); - CUDF_EXPECTS(count <= d_data.size() - offset, - "The span with given offset and count is out of bounds."); - return hostdevice_span{h_data.data() + offset, d_data.data() + offset, count}; + return {host_span{h_data}, device_ptr()}; } private: @@ -188,38 +172,47 @@ class hostdevice_2dvector { { } - operator device_2dspan() { return {_data.device_ptr(), _size}; } - operator device_2dspan() const { return {_data.device_ptr(), _size}; } + operator device_2dspan() { return {device_span{_data}, _size.second}; } + operator device_2dspan() const { return {device_span{_data}, _size.second}; } device_2dspan device_view() { return static_cast>(*this); } - device_2dspan device_view() const { return static_cast>(*this); } + [[nodiscard]] device_2dspan device_view() const + { + return static_cast>(*this); + } - operator host_2dspan() { return {_data.host_ptr(), _size}; } - operator host_2dspan() const { return {_data.host_ptr(), _size}; } + operator host_2dspan() { return {host_span{_data}, _size.second}; } + operator host_2dspan() const { return {host_span{_data}, _size.second}; } host_2dspan host_view() { return static_cast>(*this); } - host_2dspan host_view() const { return static_cast>(*this); } + [[nodiscard]] host_2dspan host_view() const + { + return static_cast>(*this); + } host_span operator[](size_t row) { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), _size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } host_span operator[](size_t row) const { - return {_data.host_ptr() + host_2dspan::flatten_index(row, 0, _size), 
_size.second}; + return host_span{_data}.subspan(row * _size.second, _size.second); } - auto size() const noexcept { return _size; } - auto count() const noexcept { return _size.first * _size.second; } - auto is_empty() const noexcept { return count() == 0; } + [[nodiscard]] auto size() const noexcept { return _size; } + [[nodiscard]] auto count() const noexcept { return _size.first * _size.second; } + [[nodiscard]] auto is_empty() const noexcept { return count() == 0; } T* base_host_ptr(size_t offset = 0) { return _data.host_ptr(offset); } T* base_device_ptr(size_t offset = 0) { return _data.device_ptr(offset); } - T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } + [[nodiscard]] T const* base_host_ptr(size_t offset = 0) const { return _data.host_ptr(offset); } - T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); } + [[nodiscard]] T const* base_device_ptr(size_t offset = 0) const + { + return _data.device_ptr(offset); + } [[nodiscard]] size_t size_bytes() const noexcept { return _data.size_bytes(); } diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu index 43dc38c4ac6..af32b207d20 100644 --- a/cpp/src/io/utilities/type_inference.cu +++ b/cpp/src/io/utilities/type_inference.cu @@ -18,11 +18,10 @@ #include "io/utilities/string_parsing.hpp" #include "io/utilities/trie.cuh" +#include #include #include -#include - #include #include @@ -242,7 +241,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options, constexpr int block_size = 128; auto const grid_size = (size + block_size - 1) / block_size; - auto d_column_info = rmm::device_scalar(stream); + auto d_column_info = cudf::detail::device_scalar(stream); CUDF_CUDA_TRY(cudaMemsetAsync( d_column_info.data(), 0, sizeof(cudf::io::column_type_histogram), stream.value())); diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 2ec23e0dc6d..40d1c925889 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -81,7 +82,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -94,7 +95,7 @@ std::unique_ptr> conditional_join_anti_semi( join_size = size.value(stream); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); @@ -197,7 +198,7 @@ conditional_join(table_view const& left, join_size = *output_size; } else { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); if (has_nulls) { compute_conditional_join_output_size <<>>( @@ -231,7 +232,7 @@ conditional_join(table_view const& left, std::make_unique>(0, stream, mr)); } - rmm::device_scalar write_index(0, stream); + cudf::detail::device_scalar write_index(0, stream); auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -342,7 +343,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto const shmem_size_per_block = parser.shmem_per_thread * 
config.num_threads_per_block; // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); // Determine number of output rows without actually building the output to simply // find what the size of the output will be. diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index c7294152982..515d28201e8 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index bd8c80652a0..a4ec97af235 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -67,7 +67,7 @@ CUDF_KERNEL void __launch_bounds__(block_size) evaluator, thread_intermediate_storage, swap_tables, equality_probe}; // Create set ref with the new equality comparator - auto const set_ref_equality = set_ref.with_key_eq(equality); + auto const set_ref_equality = set_ref.rebind_key_eq(equality); // Total number of rows to query the set auto const outer_num_rows = left_table.num_rows(); diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 83a55eca50f..62ba558b0bd 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -184,7 +184,8 @@ std::unique_ptr> mixed_join_semi( auto const row_hash = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; auto const hash_probe = row_hash.device_hasher(has_nulls); - hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe); + hash_set_ref_type const row_set_ref = + row_set.ref(cuco::contains).rebind_hash_function(hash_probe); // Vector used to indicate indices from left/probe table which are present in output auto left_table_keep_mask = rmm::device_uvector(probe.num_rows(), stream); diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 84e9be45030..4049ccf35e1 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ std::size_t launch_compute_mixed_join_output_size( rmm::device_async_resource_ref mr) { // Allocate storage for the counter used to get the size of the join output - rmm::device_scalar size(0, stream, mr); + cudf::detail::device_scalar size(0, stream, mr); compute_mixed_join_output_size <<>>( diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 59fdbedf089..fb5cf66dd60 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1031,7 +1032,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - rmm::device_scalar d_valid_count{0, stream}; + cudf::detail::device_scalar d_valid_count{0, stream}; get_json_object_kernel <<>>( diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index b0a84a6d50c..d27420658d6 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1126,12 +1126,8 @@ std::pair, rmm::device_uvector> generate_mer * `max` of 0. * * @param tdv input tdigests. 
The tdigests within this column are grouped by key.
- * @param h_group_offsets a host iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `group_offsets`.
  * @param group_offsets a device iterator of the offsets to the start of each group. A group is
- * counted as one even when the cluster is empty in it. The offsets should have the same values as
- * the ones in `h_group_offsets`.
+ * counted as one even when the cluster is empty in it.
  * @param group_labels a device iterator of the group label for each tdigest cluster including
  * empty clusters.
  * @param num_group_labels the number of unique group labels.
@@ -1142,9 +1138,8 @@ std::pair<rmm::device_uvector<double>, rmm::device_uvector<double>> generate_mer
  *
  * @return A column containing the merged tdigests.
  */
-template <typename HGroupOffsetIter, typename GroupOffsetIter, typename GroupLabelIter>
+template <typename GroupOffsetIter, typename GroupLabelIter>
 std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
-                                       HGroupOffsetIter h_group_offsets,
                                        GroupOffsetIter group_offsets,
                                        GroupLabelIter group_labels,
                                        size_t num_group_labels,
@@ -1313,21 +1308,13 @@ std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
   if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }

-  auto group_offsets_  = group_offsets_fn{input.size()};
-  auto h_group_offsets = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_offsets   = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
-  auto group_labels    = thrust::make_constant_iterator(0);
-  return to_tdigest_scalar(merge_tdigests(tdv,
-                                          h_group_offsets,
-                                          group_offsets,
-                                          group_labels,
-                                          input.size(),
-                                          1,
-                                          max_centroids,
-                                          stream,
-                                          mr),
-                           stream,
-                           mr);
+  auto group_offsets_ = group_offsets_fn{input.size()};
+  auto group_offsets  = cudf::detail::make_counting_transform_iterator(0, group_offsets_);
+  auto group_labels   = thrust::make_constant_iterator(0);
+  return to_tdigest_scalar(
+    merge_tdigests(tdv, group_offsets, group_labels, input.size(), 1, max_centroids, stream, mr),
+    stream,
+    mr);
 }

 std::unique_ptr<column> group_tdigest(column_view const& col,
@@ -1376,16 +1363,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
     return cudf::tdigest::detail::make_empty_tdigests_column(num_groups, stream, mr);
   }

-  // bring group offsets back to the host
-  std::vector<size_type> h_group_offsets(group_offsets.size());
-  cudaMemcpyAsync(h_group_offsets.data(),
-                  group_offsets.begin(),
-                  sizeof(size_type) * group_offsets.size(),
-                  cudaMemcpyDefault,
-                  stream);
-
   return merge_tdigests(tdv,
-                        h_group_offsets.begin(),
                         group_offsets.data(),
                         group_labels.data(),
                         group_labels.size(),
diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu
index 67ea29a2cb1..890625830a5 100644
--- a/cpp/src/reductions/all.cu
+++ b/cpp/src/reductions/all.cu
@@ -16,6 +16,7 @@
 #include "simple.cuh"
+#include <cudf/detail/device_scalar.hpp>
 #include
 #include
 #include
@@ -65,7 +66,8 @@ struct all_fn {
       cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls());
       return thrust::make_transform_iterator(pair_iter, null_iter);
     }();
-    auto d_result = rmm::device_scalar<bool>(1, stream, cudf::get_current_device_resource_ref());
+    auto d_result =
+      cudf::detail::device_scalar<bool>(1, stream, cudf::get_current_device_resource_ref());
     thrust::for_each_n(rmm::exec_policy(stream),
                        thrust::make_counting_iterator(0),
                        input.size(),
diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu
index 057f038c622..d70da369d72 100644
--- a/cpp/src/reductions/any.cu
+++ b/cpp/src/reductions/any.cu
@@ -16,6 +16,7 @@
#include "simple.cuh" +#include #include #include #include @@ -65,7 +66,8 @@ struct any_fn { cudf::dictionary::detail::make_dictionary_pair_iterator(*d_dict, input.has_nulls()); return thrust::make_transform_iterator(pair_iter, null_iter); }(); - auto d_result = rmm::device_scalar(0, stream, cudf::get_current_device_resource_ref()); + auto d_result = + cudf::detail::device_scalar(0, stream, cudf::get_current_device_resource_ref()); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), input.size(), diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh index 6bc8b48832f..cd9fade164a 100644 --- a/cpp/src/reductions/compound.cuh +++ b/cpp/src/reductions/compound.cuh @@ -18,13 +18,18 @@ #include #include +#include #include +#include #include #include #include #include +#include +#include + namespace cudf { namespace reduction { namespace compound { @@ -53,9 +58,17 @@ std::unique_ptr compound_reduction(column_view const& col, { auto const valid_count = col.size() - col.null_count(); + // All null input produces all null output + if (valid_count == 0 || + // Only care about ddof for standard deviation and variance right now + valid_count <= ddof && (std::is_same_v || + std::is_same_v)) { + auto result = cudf::make_fixed_width_scalar(output_dtype, stream, mr); + result->set_valid_async(false, stream); + return result; + } // reduction by iterator auto dcol = cudf::column_device_view::create(col, stream); - std::unique_ptr result; Op compound_op{}; if (!cudf::is_dictionary(col.type())) { @@ -63,25 +76,21 @@ std::unique_ptr compound_reduction(column_view const& col, auto it = thrust::make_transform_iterator( dcol->pair_begin(), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } else { auto it = thrust::make_transform_iterator( dcol->begin(), compound_op.template get_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } } else { auto it = thrust::make_transform_iterator( cudf::dictionary::detail::make_dictionary_pair_iterator(*dcol, col.has_nulls()), compound_op.template get_null_replacing_element_transformer()); - result = cudf::reduction::detail::reduce( + return cudf::reduction::detail::reduce( it, col.size(), compound_op, valid_count, ddof, stream, mr); } - - // set scalar is valid - result->set_valid_async(col.null_count() < col.size(), stream); - return result; }; // @brief result type dispatcher for compound reduction (a.k.a. 
mean, var, std) @@ -137,6 +146,7 @@ struct element_type_dispatcher { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS(ddof >= 0, "ddof must be non-negative", std::domain_error); return cudf::type_dispatcher( output_dtype, result_type_dispatcher(), col, output_dtype, ddof, stream, mr); } diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu index 362b5f74c46..b40b2b6dd2e 100644 --- a/cpp/src/reductions/histogram.cu +++ b/cpp/src/reductions/histogram.cu @@ -15,18 +15,24 @@ */ #include +#include #include -#include #include #include #include +#include #include +#include + +#include +#include #include #include #include #include #include +#include #include @@ -34,61 +40,12 @@ namespace cudf::reduction::detail { namespace { +// A CUDA Cooperative Group of 1 thread for the hash set for histogram +auto constexpr DEFAULT_HISTOGRAM_CG_SIZE = 1; + // Always use 64-bit signed integer for storing count. using histogram_count_type = int64_t; -/** - * @brief The functor to accumulate the frequency of each distinct rows in the input table. - */ -template -struct reduce_fn : cudf::detail::reduce_by_row_fn_base { - CountType const* d_partial_output; - - reduce_fn(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output, - CountType const* const d_partial_output) - : cudf::detail::reduce_by_row_fn_base{d_map, - d_hasher, - d_equal, - d_output}, - d_partial_output{d_partial_output} - { - } - - // Count the number of rows in each group of rows that are compared equal. - __device__ void operator()(size_type const idx) const - { - auto const increment = d_partial_output ? d_partial_output[idx] : CountType{1}; - auto const count = - cuda::atomic_ref(*this->get_output_ptr(idx)); - count.fetch_add(increment, cuda::std::memory_order_relaxed); - } -}; - -/** - * @brief The builder to construct an instance of `reduce_fn` functor. - */ -template -struct reduce_func_builder { - CountType const* const d_partial_output; - - reduce_func_builder(CountType const* const d_partial_output) : d_partial_output{d_partial_output} - { - } - - template - auto build(MapView const& d_map, - KeyHasher const& d_hasher, - KeyEqual const& d_equal, - CountType* const d_output) - { - return reduce_fn{ - d_map, d_hasher, d_equal, d_output, d_partial_output}; - } -}; - /** * @brief Specialized functor to check for not-zero of the second component of the input. */ @@ -163,14 +120,6 @@ compute_row_frequencies(table_view const& input, "Nested types are not yet supported in histogram aggregation.", std::invalid_argument); - auto map = cudf::detail::hash_map_type{ - compute_hash_table_size(input.num_rows()), - cuco::empty_key{-1}, - cuco::empty_value{std::numeric_limits::min()}, - - cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, - stream.value()}; - auto const preprocessed_input = cudf::experimental::row::hash::preprocessed_table::create(input, stream); auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(input)}; @@ -179,51 +128,68 @@ compute_row_frequencies(table_view const& input, auto const key_hasher = row_hasher.device_hasher(has_nulls); auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); - auto const pair_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, - cuda::proclaim_return_type>( - [] __device__(size_type const i) { return cuco::make_pair(i, i); })); - // Always compare NaNs as equal. 
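Aside on the compound_reduction early return added above: for variance and standard deviation the denominator is (valid_count - ddof), so any valid_count <= ddof cannot produce a finite result and the scalar is returned invalid instead. A minimal host-side sketch of that arithmetic (a hypothetical standalone helper, not the libcudf code):

#include <optional>
#include <vector>

// Illustration only: sample variance divides by (n - ddof), so this guard
// mirrors the null-scalar early return in compound_reduction.
std::optional<double> sample_variance(std::vector<double> const& v, int ddof)
{
  auto const n = static_cast<int>(v.size());
  if (n == 0 || n <= ddof) { return std::nullopt; }
  double mean = 0.0;
  for (auto x : v) { mean += x; }
  mean /= n;
  double ssd = 0.0;
  for (auto x : v) { ssd += (x - mean) * (x - mean); }
  return ssd / (n - ddof);  // ddof == 1 is the usual unbiased estimator
}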
using nan_equal_comparator = cudf::experimental::row::equality::nan_equal_physical_equality_comparator; auto const value_comp = nan_equal_comparator{}; + // Hard set the tparam `has_nested_columns` = false for now as we don't yet support nested columns + auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); + + using row_hash = + cudf::experimental::row::hash::device_row_hasher; + + size_t const num_rows = input.num_rows(); + + // Construct a vector to store reduced counts and init to zero + rmm::device_uvector reduction_results(num_rows, stream, mr); + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), + reduction_results.begin(), + reduction_results.end(), + histogram_count_type{0}); + + // Construct a hash set + auto row_set = cuco::static_set{ + cuco::extent{num_rows}, + cudf::detail::CUCO_DESIRED_LOAD_FACTOR, + cuco::empty_key{-1}, + key_equal, + cuco::linear_probing{key_hasher}, + {}, // thread scope + {}, // storage + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; - if (has_nested_columns) { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } else { - auto const key_equal = row_comp.equal_to(has_nulls, null_equality::EQUAL, value_comp); - map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value()); - } - - // Gather the indices of distinct rows. - auto distinct_indices = std::make_unique>( - static_cast(map.get_size()), stream, mr); - - // Store the number of occurrences of each distinct row. - auto distinct_counts = make_numeric_column(data_type{type_to_id()}, - static_cast(map.get_size()), - mask_state::UNALLOCATED, - stream, - mr); + // Device-accessible reference to the hash set with `insert_and_find` operator + auto row_set_ref = row_set.ref(cuco::op::insert_and_find); // Compute frequencies (aka distinct counts) for the input rows. // Note that we consider null and NaNs as always equal. - auto const reduction_results = cudf::detail::hash_reduce_by_row( - map, - preprocessed_input, - input.num_rows(), - has_nulls, - has_nested_columns, - null_equality::EQUAL, - nan_equality::ALL_EQUAL, - reduce_func_builder{ - partial_counts ? partial_counts.value().begin() : nullptr}, - histogram_count_type{0}, - stream, - cudf::get_current_device_resource_ref()); - + thrust::for_each( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_rows), + [set_ref = row_set_ref, + increments = + partial_counts.has_value() ? partial_counts.value().begin() : nullptr, + counts = reduction_results.begin()] __device__(auto const idx) mutable { + auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx); + cuda::atomic_ref count_ref{ + counts[*inserted_idx_ptr]}; + auto const increment = increments ? 
increments[idx] : histogram_count_type{1}; + count_ref.fetch_add(increment, cuda::std::memory_order_relaxed); + }); + + // Set-size is the number of distinct (inserted) rows + auto const set_size = row_set.size(stream); + + // Vector of distinct indices + auto distinct_indices = std::make_unique>(set_size, stream, mr); + // Column of distinct counts + auto distinct_counts = make_numeric_column( + data_type{type_to_id()}, set_size, mask_state::UNALLOCATED, stream, mr); + + // Copy row indices and counts to the output if counts are non-zero auto const input_it = thrust::make_zip_iterator( thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin())); auto const output_it = thrust::make_zip_iterator(thrust::make_tuple( @@ -232,7 +198,7 @@ compute_row_frequencies(table_view const& input, // Reduction results above are either group sizes of equal rows, or `0`. // The final output is non-zero group sizes only. thrust::copy_if( - rmm::exec_policy(stream), input_it, input_it + input.num_rows(), output_it, is_not_zero{}); + rmm::exec_policy_nosync(stream), input_it, input_it + num_rows, output_it, is_not_zero{}); return {std::move(distinct_indices), std::move(distinct_counts)}; } diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu index 139de068050..4f6eb23ce5b 100644 --- a/cpp/src/reductions/minmax.cu +++ b/cpp/src/reductions/minmax.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -69,18 +70,18 @@ struct minmax_pair { * @param num_items number of items to reduce * @param binary_op binary operator used to reduce * @param stream CUDA stream to run kernels on. - * @return rmm::device_scalar + * @return cudf::detail::device_scalar */ template ::type> -rmm::device_scalar reduce_device(InputIterator d_in, - size_type num_items, - Op binary_op, - rmm::cuda_stream_view stream) +auto reduce_device(InputIterator d_in, + size_type num_items, + Op binary_op, + rmm::cuda_stream_view stream) { OutputType identity{}; - rmm::device_scalar result{identity, stream}; + cudf::detail::device_scalar result{identity, stream}; // Allocate temporary storage size_t storage_bytes = 0; diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 1df1549432f..d0e3358cc34 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -137,7 +138,7 @@ struct replace_nulls_column_kernel_forwarder { auto device_out = cudf::mutable_column_device_view::create(output_view, stream); auto device_replacement = cudf::column_device_view::create(replacement, stream); - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); replace<<>>( diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 86ec8cfc91e..0cc97ca05e0 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,6 @@ #include #include -#include #include #include @@ -182,7 +182,7 @@ struct replace_kernel_forwarder { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - rmm::device_scalar valid_counter(0, stream); + cudf::detail::device_scalar valid_counter(0, stream); cudf::size_type* valid_count = valid_counter.data(); auto replace = [&] { diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index 528700137bf..bc0ee2eb519 100644 
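The reduce_device helper above follows CUB's standard two-phase pattern: a first call with a null temp-storage pointer only computes the scratch size, and a second call does the actual reduction. A generic sketch under that assumption (names here are illustrative, not the libcudf signature):

#include <cub/device/device_reduce.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

template <typename InputIt, typename OutputIt, typename Op, typename T>
void reduce_with_cub(
  InputIt d_in, OutputIt d_out, int num_items, Op op, T init, rmm::cuda_stream_view stream)
{
  std::size_t storage_bytes = 0;
  // First call with nullptr: CUB only reports the required scratch size.
  cub::DeviceReduce::Reduce(
    nullptr, storage_bytes, d_in, d_out, num_items, op, init, stream.value());
  rmm::device_buffer d_temp(storage_bytes, stream);
  // Second call performs the reduction using the allocated scratch space.
  cub::DeviceReduce::Reduce(
    d_temp.data(), storage_bytes, d_in, d_out, num_items, op, init, stream.value());
}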
--- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,6 @@ #include #include -#include #include #include @@ -1105,7 +1105,7 @@ struct rolling_window_launcher { auto const d_inp_ptr = column_device_view::create(input, stream); auto const d_default_out_ptr = column_device_view::create(default_outputs, stream); auto const d_out_ptr = mutable_column_device_view::create(output->mutable_view(), stream); - auto d_valid_count = rmm::device_scalar{0, stream}; + auto d_valid_count = cudf::detail::device_scalar{0, stream}; auto constexpr block_size = 256; auto const grid = cudf::detail::grid_1d(input.size(), block_size); @@ -1271,7 +1271,7 @@ std::unique_ptr rolling_window_udf(column_view const& input, udf_agg._output_type, input.size(), cudf::mask_state::UNINITIALIZED, stream, mr); auto output_view = output->mutable_view(); - rmm::device_scalar device_valid_count{0, stream}; + cudf::detail::device_scalar device_valid_count{0, stream}; std::string kernel_name = jitify2::reflection::Template("cudf::rolling::jit::gpu_rolling_new") // diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 4c015f3cbed..6a7c8ea45e9 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -348,7 +349,7 @@ std::unique_ptr convert_case(strings_column_view const& input, // This check incurs ~20% performance hit for smaller strings and so we only use it // after the threshold check above. The check makes very little impact for long strings // but results in a large performance gain when the input contains only single-byte characters. - rmm::device_scalar mb_count(0, stream); + cudf::detail::device_scalar mb_count(0, stream); // cudf::detail::grid_1d is limited to size_type elements auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size); // we only need to check every other byte since either will contain high bit diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 0db1adf1223..f5d052c6657 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -152,12 +153,8 @@ struct format_compiler { } // create program in device memory - d_items.resize(items.size(), stream); - CUDF_CUDA_TRY(cudaMemcpyAsync(d_items.data(), - items.data(), - items.size() * sizeof(items[0]), - cudaMemcpyDefault, - stream.value())); + d_items = cudf::detail::make_device_uvector_sync( + items, stream, cudf::get_current_device_resource_ref()); } format_item const* compiled_format_items() { return d_items.data(); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 1d9d12686eb..9e4ef47ff79 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include -#include #include #include @@ -242,7 +242,7 @@ std::unique_ptr concatenate(host_span columns, } { // Copy offsets columns with single kernel launch - rmm::device_scalar d_valid_count(0, stream); + cudf::detail::device_scalar d_valid_count(0, stream); constexpr size_type block_size{256}; cudf::detail::grid_1d config(offsets_count, block_size); diff --git 
a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu index 7323918dcff..8683a9bdfbe 100644 --- a/cpp/src/strings/extract/extract.cu +++ b/cpp/src/strings/extract/extract.cu @@ -100,9 +100,8 @@ std::unique_ptr
extract(strings_column_view const& input, auto const groups = d_prog->group_counts(); CUDF_EXPECTS(groups > 0, "Group indicators not found in regex pattern"); - auto indices = rmm::device_uvector(input.size() * groups, stream); - auto d_indices = - cudf::detail::device_2dspan(indices.data(), input.size(), groups); + auto indices = rmm::device_uvector(input.size() * groups, stream); + auto d_indices = cudf::detail::device_2dspan(indices, groups); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 51c6e765edd..b923a301f84 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -710,19 +710,17 @@ class regex_parser { std::stack lbra_stack; auto repeat_start_index = -1; - for (std::size_t index = 0; index < in.size(); index++) { - auto const item = in[index]; - + for (auto const item : in) { if (item.type != COUNTED && item.type != COUNTED_LAZY) { out.push_back(item); if (item.type == LBRA || item.type == LBRA_NC) { - lbra_stack.push(index); + lbra_stack.push(out.size() - 1); repeat_start_index = -1; } else if (item.type == RBRA) { repeat_start_index = lbra_stack.top(); lbra_stack.pop(); } else if ((item.type & ITEM_MASK) != OPERATOR_MASK) { - repeat_start_index = index; + repeat_start_index = out.size() - 1; } } else { // item is of type COUNTED or COUNTED_LAZY @@ -731,26 +729,39 @@ class regex_parser { CUDF_EXPECTS(repeat_start_index >= 0, "regex: invalid counted quantifier location"); // range of affected item(s) to repeat - auto const begin = in.begin() + repeat_start_index; - auto const end = in.begin() + index; + auto const begin = out.begin() + repeat_start_index; + auto const end = out.end(); + // count range values auto const n = item.d.count.n; // minimum count auto const m = item.d.count.m; // maximum count - assert(n >= 0 && "invalid repeat count value n"); // zero-repeat edge-case: need to erase the previous items - if (n == 0) { out.erase(out.end() - (index - repeat_start_index), out.end()); } - - // minimum repeats (n) - for (int j = 1; j < n; j++) { - out.insert(out.end(), begin, end); + if (n == 0) { out.erase(begin, end); } + + std::vector repeat_copy(begin, end); + // special handling for quantified capture groups + if ((n > 1) && (*begin).type == LBRA) { + (*begin).type = LBRA_NC; // change first one to non-capture + // add intermediate groups as non-capture + std::vector ncg_copy(begin, end); + for (int j = 1; j < (n - 1); j++) { + out.insert(out.end(), ncg_copy.begin(), ncg_copy.end()); + } + // add the last entry as a regular capture-group + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } else { + // minimum repeats (n) + for (int j = 1; j < n; j++) { + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); + } } // optional maximum repeats (m) if (m >= 0) { for (int j = n; j < m; j++) { out.emplace_back(LBRA_NC, 0); - out.insert(out.end(), begin, end); + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); } for (int j = n; j < m; j++) { out.emplace_back(RBRA, 0); @@ -760,8 +771,9 @@ class regex_parser { // infinite repeats if (n > 0) { // append '+' after last repetition out.emplace_back(item.type == COUNTED ? PLUS : PLUS_LAZY, 0); - } else { // copy it once then append '*' - out.insert(out.end(), begin, end); + } else { + // copy it once then append '*' + out.insert(out.end(), repeat_copy.begin(), repeat_copy.end()); out.emplace_back(item.type == COUNTED ? 
STAR : STAR_LAZY, 0); } } diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu index 8a8001dd81a..957075017ba 100644 --- a/cpp/src/strings/replace/find_replace.cu +++ b/cpp/src/strings/replace/find_replace.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include @@ -21,7 +22,6 @@ #include #include -#include #include #include diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index 352d883bdc5..88f343926c9 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -334,7 +334,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = util::div_rounding_up_safe( util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); count_targets<<>>(fn, chars_bytes, d_count.data()); diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 16df0dbabdf..52ddef76c1a 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -285,7 +285,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // Count the number of targets in the entire column. // Note this may over-count in the case where a target spans adjacent strings. - rmm::device_scalar d_target_count(0, stream); + cudf::detail::device_scalar d_target_count(0, stream); constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; auto const num_blocks = util::div_rounding_up_safe( diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index d8c1b50a94b..21708e48a25 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -126,6 +126,43 @@ std::unique_ptr findall(strings_column_view const& input, mr); } +namespace { +struct find_re_fn { + column_device_view d_strings; + + __device__ size_type operator()(size_type const idx, + reprog_device const prog, + int32_t const thread_idx) const + { + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + + auto const result = prog.find(thread_idx, d_str, d_str.begin()); + return result.has_value() ? 
result.value().first : -1; + } +}; +} // namespace + +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto results = make_numeric_column(data_type{type_to_id()}, + input.size(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + stream, + mr); + if (input.is_empty()) { return results; } + + auto d_results = results->mutable_view().data(); + auto d_prog = regex_device_builder::create_prog_device(prog, stream); + auto const d_strings = column_device_view::create(input.parent(), stream); + launch_transform_kernel(find_re_fn{*d_strings}, *d_prog, d_results, input.size(), stream); + + return results; +} } // namespace detail // external API @@ -139,5 +176,14 @@ std::unique_ptr findall(strings_column_view const& input, return detail::findall(input, prog, stream, mr); } +std::unique_ptr find_re(strings_column_view const& input, + regex_program const& prog, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::find_re(input, prog, stream, mr); +} + } // namespace strings } // namespace cudf diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 81aca001d53..4b777be9d5b 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -361,7 +362,7 @@ std::pair, rmm::device_uvector> split cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // count the number of delimiters in the entire column - rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); if (chars_bytes > 0) { constexpr int64_t block_size = 512; constexpr size_type bytes_per_thread = 4; diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 07516f91dcf..8e00a29f8e9 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -16,36 +16,171 @@ #include #include +#include #include +#include +#include #include -#include -#include #include -#include #include #include +#include #include +#include #include #include +#include +#include namespace cudf { +namespace strings::detail { + namespace { -struct string_view_to_pair { - string_view null_placeholder; - string_view_to_pair(string_view n) : null_placeholder(n) {} - __device__ thrust::pair operator()(string_view const& i) - { - return (i.data() == null_placeholder.data()) - ? thrust::pair{nullptr, 0} - : thrust::pair{i.data(), i.size_bytes()}; + +using column_string_pairs = cudf::device_span; + +template +std::pair>, rmm::device_uvector> +make_offsets_child_column_batch_async(std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + std::vector> offsets_columns(num_columns); + rmm::device_uvector chars_sizes(num_columns, stream); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + auto offsets = make_numeric_column( + data_type{type_to_id()}, string_count + 1, mask_state::UNALLOCATED, stream, mr); + + auto const offsets_transformer = cuda::proclaim_return_type( + [string_count, string_pairs = string_pairs.data()] __device__(size_type idx) -> size_type { + return idx < string_count ? 
string_pairs[idx].second : size_type{0}; + }); + auto const input_it = cudf::detail::make_counting_transform_iterator(0, offsets_transformer); + auto const d_offsets = offsets->mutable_view().template data(); + auto const output_it = cudf::detail::make_sizes_to_offsets_iterator( + d_offsets, d_offsets + string_count + 1, chars_sizes.data() + idx); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + input_it, + input_it + string_count + 1, + output_it, + int64_t{0}); + offsets_columns[idx] = std::move(offsets); } -}; + + return {std::move(offsets_columns), std::move(chars_sizes)}; +} } // namespace +std::vector> make_strings_column_batch( + std::vector const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto const num_columns = input.size(); + + auto [offsets_cols, d_chars_sizes] = + make_offsets_child_column_batch_async(input, stream, mr); + + std::vector null_masks; + null_masks.reserve(num_columns); + + rmm::device_uvector d_valid_counts(num_columns, stream, mr); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_valid_counts.begin(), d_valid_counts.end(), 0); + + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const& string_pairs = input[idx]; + auto const string_count = static_cast(string_pairs.size()); + null_masks.emplace_back( + cudf::create_null_mask(string_count, mask_state::UNINITIALIZED, stream, mr)); + + if (string_count == 0) { continue; } + + constexpr size_type block_size{256}; + auto const grid = + cudf::detail::grid_1d{static_cast(string_count), block_size}; + cudf::detail::valid_if_kernel + <<>>( + reinterpret_cast(null_masks.back().data()), + string_pairs.data(), + string_count, + [] __device__(string_index_pair const pair) -> bool { return pair.first != nullptr; }, + d_valid_counts.data() + idx); + } + + auto const chars_sizes = cudf::detail::make_std_vector_async(d_chars_sizes, stream); + auto const valid_counts = cudf::detail::make_std_vector_async(d_valid_counts, stream); + + // Except for other stream syncs in `CUB` that we cannot control, + // this should be the only stream sync we need in the entire API. + stream.synchronize(); + + auto const threshold = cudf::strings::get_offset64_threshold(); + auto const overflow_count = + std::count_if(chars_sizes.begin(), chars_sizes.end(), [threshold](auto const chars_size) { + return chars_size >= threshold; + }); + CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || overflow_count == 0, + "Size of output exceeds the column size limit", + std::overflow_error); + + if (overflow_count > 0) { + std::vector long_string_input; + std::vector long_string_col_idx; + long_string_input.reserve(overflow_count); + long_string_col_idx.reserve(overflow_count); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + if (chars_sizes[idx] >= threshold) { + long_string_input.push_back(input[idx]); + long_string_col_idx.push_back(idx); + } + } + + [[maybe_unused]] auto [new_offsets_cols, d_new_chars_sizes] = + make_offsets_child_column_batch_async(long_string_input, stream, mr); + + // Update the new offsets columns. + // The new chars sizes should be the same as before, thus we don't need to update them. 
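The offsets computation in make_offsets_child_column_batch_async above is an exclusive scan over the per-string sizes, with the final element doubling as the column's total chars size (the value later checked against the large-strings threshold). A host-side sketch of the same arithmetic, for illustration only:

#include <cstdint>
#include <vector>

// offsets[i] holds the sum of all sizes before i; offsets.back() is the
// total chars size used for the int32-offset overflow check.
std::vector<int64_t> sizes_to_offsets(std::vector<int> const& sizes)
{
  std::vector<int64_t> offsets(sizes.size() + 1, 0);
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    offsets[i + 1] = offsets[i] + sizes[i];
  }
  return offsets;  // e.g. sizes {3, 0, 5} -> offsets {0, 3, 3, 8}
}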
+ for (std::size_t idx = 0; idx < long_string_col_idx.size(); ++idx) { + offsets_cols[long_string_col_idx[idx]] = std::move(new_offsets_cols[idx]); + } + } + + std::vector> output(num_columns); + for (std::size_t idx = 0; idx < num_columns; ++idx) { + auto const strings_count = static_cast(input[idx].size()); + if (strings_count == 0) { + output[idx] = make_empty_column(type_id::STRING); + continue; + } + + auto const chars_size = chars_sizes[idx]; + auto const valid_count = valid_counts[idx]; + + auto chars_data = make_chars_buffer( + offsets_cols[idx]->view(), chars_size, input[idx].data(), strings_count, stream, mr); + + auto const null_count = strings_count - valid_count; + output[idx] = make_strings_column( + strings_count, + std::move(offsets_cols[idx]), + chars_data.release(), + null_count, + null_count ? std::move(null_masks[idx]) : rmm::device_buffer{0, stream, mr}); + } + + return output; +} + +} // namespace strings::detail + // Create a strings-type column from vector of pointer/size pairs std::unique_ptr make_strings_column( device_span const> strings, @@ -53,10 +188,32 @@ std::unique_ptr make_strings_column( rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } +std::vector> make_strings_column_batch( + std::vector const>> const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return cudf::strings::detail::make_strings_column_batch(input, stream, mr); +} + +namespace { +struct string_view_to_pair { + string_view null_placeholder; + string_view_to_pair(string_view n) : null_placeholder(n) {} + __device__ thrust::pair operator()(string_view const& i) + { + return (i.data() == null_placeholder.data()) + ? thrust::pair{nullptr, 0} + : thrust::pair{i.data(), i.size_bytes()}; + } +}; + +} // namespace + std::unique_ptr make_strings_column(device_span string_views, string_view null_placeholder, rmm::cuda_stream_view stream, diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index a87ecb81b9d..997b0278fe2 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,9 @@ namespace nvtext { namespace detail { namespace { +// long strings threshold found with benchmarking +constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64; + /** * @brief Generate ngrams from strings column. * @@ -173,33 +177,39 @@ constexpr cudf::thread_index_type bytes_per_thread = 4; /** * @brief Counts the number of ngrams in each row of the given strings column * - * Each warp processes a single string. + * Each warp/thread processes a single string. * Formula is `count = max(0,str.length() - ngrams + 1)` * If a string has less than ngrams characters, its count is 0. 
*/ CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + cudf::size_type tile_size, cudf::size_type* d_counts) { auto const idx = cudf::detail::grid_1d::global_thread_id(); - auto const str_idx = idx / cudf::detail::warp_size; + auto const str_idx = idx / tile_size; if (str_idx >= d_strings.size()) { return; } if (d_strings.is_null(str_idx)) { d_counts[str_idx] = 0; return; } + auto const d_str = d_strings.element(str_idx); + if (tile_size == 1) { + d_counts[str_idx] = cuda::std::max(0, (d_str.length() + 1 - ngrams)); + return; + } + namespace cg = cooperative_groups; auto const warp = cg::tiled_partition(cg::this_thread_block()); - auto const d_str = d_strings.element(str_idx); - auto const end = d_str.data() + d_str.size_bytes(); + auto const end = d_str.data() + d_str.size_bytes(); auto const lane_idx = warp.thread_rank(); cudf::size_type count = 0; for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end; - itr += cudf::detail::warp_size * bytes_per_thread) { + itr += tile_size * bytes_per_thread) { for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) { count += static_cast(cudf::strings::detail::is_begin_utf8_char(*s)); } @@ -256,19 +266,27 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Parameter ngrams should be an integer value of 2 or greater", std::invalid_argument); - auto const strings_count = input.size(); - if (strings_count == 0) { // if no strings, return an empty column - return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + if (input.is_empty()) { // if no strings, return an empty column + return cudf::lists::detail::make_empty_lists_column( + cudf::data_type{cudf::type_id::STRING}, stream, mr); + } + if (input.size() == input.null_count()) { + return cudf::lists::detail::make_all_nulls_lists_column( + input.size(), cudf::data_type{cudf::type_id::STRING}, stream, mr); } auto const d_strings = cudf::column_device_view::create(input.parent(), stream); auto [offsets, total_ngrams] = [&] { - auto counts = rmm::device_uvector(input.size(), stream); - auto const num_blocks = cudf::util::div_rounding_up_safe( - static_cast(input.size()) * cudf::detail::warp_size, block_size); - count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + auto counts = rmm::device_uvector(input.size(), stream); + auto const avg_char_bytes = (input.chars_size(stream) / (input.size() - input.null_count())); + auto const tile_size = (avg_char_bytes < AVG_CHAR_BYTES_THRESHOLD) + ? 
1 // thread per row + : cudf::detail::warp_size; // warp per row + auto const grid = cudf::detail::grid_1d( + static_cast(input.size()) * tile_size, block_size); + count_char_ngrams_kernel<<>>( + *d_strings, ngrams, tile_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); @@ -277,8 +295,8 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie "Insufficient number of characters in each string to generate ngrams"); character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets}; - auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( - generator, strings_count, total_ngrams, stream, mr); + auto [offsets_column, chars] = + cudf::strings::detail::make_strings_children(generator, input.size(), total_ngrams, stream, mr); auto output = cudf::make_strings_column( total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); @@ -368,7 +386,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto [offsets, total_ngrams] = [&] { auto counts = rmm::device_uvector(input.size(), stream); count_char_ngrams_kernel<<>>( - *d_strings, ngrams, counts.data()); + *d_strings, ngrams, cudf::detail::warp_size, counts.data()); return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr); }(); auto d_offsets = offsets->view().data(); diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index df25950e6d5..89ca8a089d6 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -221,7 +222,7 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const // To minimize memory, count the number of characters so we can // build the output offsets without an intermediate buffer. // In the worst case each byte is a character so the output is 4x the input. 
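Both counting kernels above work byte-by-byte: a byte starts a UTF-8 character unless its top two bits are 10 (a continuation byte), and the per-string ngram count is then max(0, length - ngrams + 1). A standalone sketch of that per-string arithmetic (hypothetical helper names):

#include <algorithm>
#include <cstddef>
#include <cstdint>

// A byte begins a UTF-8 character unless it is a continuation byte (10xxxxxx).
inline bool is_utf8_begin(uint8_t b) { return (b & 0xC0) != 0x80; }

int char_ngram_count(char const* data, std::size_t bytes, int ngrams)
{
  int length = 0;  // character (not byte) length of the string
  for (std::size_t i = 0; i < bytes; ++i) {
    length += is_utf8_begin(static_cast<uint8_t>(data[i]));
  }
  // e.g. a 5-character string with ngrams=2 yields 4 bigrams;
  // strings shorter than ngrams characters yield 0.
  return std::max(0, length - ngrams + 1);
}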
- rmm::device_scalar d_count(0, stream); + cudf::detail::device_scalar d_count(0, stream); auto const num_blocks = cudf::util::div_rounding_up_safe( cudf::util::div_rounding_up_safe(chars_bytes, static_cast(bytes_per_thread)), block_size); diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 0efb881eb3e..c0af27a1748 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -30,7 +30,7 @@ namespace cudf::detail { namespace { // Simple kernel to copy between device buffers -CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +CUDF_KERNEL void copy_kernel(char const* __restrict__ src, char* __restrict__ dst, size_t n) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } @@ -61,7 +61,7 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea }; // namespace -void cuda_memcpy_async( +void cuda_memcpy_async_impl( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { if (kind == host_memory_kind::PINNED) { @@ -73,11 +73,4 @@ void cuda_memcpy_async( } } -void cuda_memcpy( - void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} - } // namespace cudf::detail diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 125b98c4a67..9d8e3cf2fa6 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -115,12 +115,19 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - friend void get_property(fixed_pinned_pool_memory_resource const&, + // clang-tidy will complain about this function because it is completely + // unused at runtime and only exists for tag introspection by CCCL, so we + // ignore linting. This masks a real issue if we ever want to compile with + // clang, though, which is that the function will actually be compiled out by + // clang. If cudf were ever to try to support clang as a compiler we would + // need to force the compiler to emit this symbol. The same goes for the + // other get_property definitions in this file.
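For context on that comment: CCCL discovers these properties purely through argument-dependent lookup at compile time, so the friend only needs to be declared, never called. A minimal sketch outside libcudf (a toy type, assuming only the cuda::mr interfaces already used in this file):

#include <cuda/memory_resource>

struct toy_pinned_resource {
  void* allocate(std::size_t size, std::size_t alignment) { return nullptr; }  // stub
  void deallocate(void* ptr, std::size_t size, std::size_t alignment) {}       // stub
  bool operator==(toy_pinned_resource const&) const { return true; }
  bool operator!=(toy_pinned_resource const& other) const { return !operator==(other); }
  // Found by ADL during concept checking; compiled out if never ODR-used.
  friend void get_property(toy_pinned_resource const&, cuda::mr::host_accessible) noexcept {}
};

static_assert(cuda::mr::resource_with<toy_pinned_resource, cuda::mr::host_accessible>);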
+ friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::device_accessible) noexcept { } - friend void get_property(fixed_pinned_pool_memory_resource const&, + friend void get_property(fixed_pinned_pool_memory_resource const&, // NOLINT cuda::mr::host_accessible) noexcept { } @@ -235,7 +242,9 @@ class new_delete_memory_resource { bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + // NOLINTBEGIN friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} + // NOLINTEND }; static_assert(cuda::mr::resource_with, diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp index d54f5677c4c..e52fffbd8c6 100644 --- a/cpp/src/utilities/logger.cpp +++ b/cpp/src/utilities/logger.cpp @@ -74,8 +74,10 @@ struct logger_wrapper { } // namespace -spdlog::logger& cudf::logger() +spdlog::logger& cudf::detail::logger() { static logger_wrapper wrapped{}; return wrapped.logger_; } + +spdlog::logger& cudf::logger() { return cudf::detail::logger(); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b67d922d377..b78a64d0e55 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -56,8 +56,15 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} - PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" + PRIVATE cudf::cudftestutil + cudf::cudftestutil_impl + GTest::gmock + GTest::gmock_main + GTest::gtest + GTest::gtest_main + nvtx3::nvtx3-cpp + $ + "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -76,6 +83,7 @@ function(ConfigureTest CMAKE_TEST_NAME) "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$" ) endif() + enable_clang_tidy(${CMAKE_TEST_NAME}) endfunction() # ################################################################################################## @@ -385,6 +393,8 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST + utilities_tests/batched_memcpy_tests.cu + utilities_tests/batched_memset_tests.cu utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp @@ -395,7 +405,6 @@ ConfigureTest( utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp utilities_tests/type_list_tests.cpp - utilities_tests/batched_memset_tests.cu ) # ################################################################################################## @@ -602,7 +611,6 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp - text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp @@ -717,6 +725,7 @@ ConfigureTest( streams/strings/contains_test.cpp streams/strings/convert_test.cpp streams/strings/extract_test.cpp + streams/strings/factory_test.cpp streams/strings/filter_test.cpp streams/strings/find_test.cpp streams/strings/replace_test.cpp diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 06e0d193d80..aa5b49567e6 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -557,7 +557,11 @@ auto NullOp_Result(cudf::column_view lhs, cudf::column_view rhs) 
std::transform(thrust::make_counting_iterator(0), thrust::make_counting_iterator(lhs.size()), result.begin(), - [&lhs_data, &lhs_mask, &rhs_data, &rhs_mask, &result_mask](auto i) -> TypeOut { + [&lhs_data = lhs_data, + &lhs_mask = lhs_mask, + &rhs_data = rhs_data, + &rhs_mask = rhs_mask, + &result_mask = result_mask](auto i) -> TypeOut { auto lhs_valid = lhs_mask.data() and cudf::bit_is_set(lhs_mask.data(), i); auto rhs_valid = rhs_mask.data() and cudf::bit_is_set(rhs_mask.data(), i); bool output_valid = lhs_valid or rhs_valid; diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h index d36b48d666a..ef1ccfccab5 100644 --- a/cpp/tests/binaryop/util/operation.h +++ b/cpp/tests/binaryop/util/operation.h @@ -100,7 +100,7 @@ struct Mul { std::enable_if_t<(cudf::is_duration_t::value && std::is_integral_v) || (cudf::is_duration_t::value && std::is_integral_v), void>* = nullptr> - OutT DurationProduct(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationProduct(LhsT x, RhsT y) const { return x * y; } @@ -128,7 +128,7 @@ struct Div { typename LhsT, typename RhsT, std::enable_if_t<(std::is_integral_v || cudf::is_duration()), void>* = nullptr> - OutT DurationDivide(LhsT x, RhsT y) const + [[nodiscard]] OutT DurationDivide(LhsT x, RhsT y) const { return x / y; } diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp index 14b4197de71..631f5150829 100644 --- a/cpp/tests/column/column_test.cpp +++ b/cpp/tests/column/column_test.cpp @@ -340,7 +340,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorNoMask) cudf::column moved_to{std::move(original)}; - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); verify_column_views(moved_to); @@ -359,7 +359,7 @@ TYPED_TEST(TypedColumnTest, MoveConstructorWithMask) cudf::column moved_to{std::move(original)}; verify_column_views(moved_to); - EXPECT_EQ(0, original.size()); + EXPECT_EQ(0, original.size()); // NOLINT EXPECT_EQ(cudf::data_type{cudf::type_id::EMPTY}, original.type()); // Verify move diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp index bebd3d25610..aef0d4ad78a 100644 --- a/cpp/tests/copying/slice_tests.cpp +++ b/cpp/tests/copying/slice_tests.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -370,11 +371,12 @@ TEST_F(SliceStringTableTest, StringWithNulls) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); diff --git a/cpp/tests/copying/slice_tests.cuh b/cpp/tests/copying/slice_tests.cuh index a180740f143..1e037294527 100644 --- a/cpp/tests/copying/slice_tests.cuh +++ b/cpp/tests/copying/slice_tests.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -148,7 +148,7 @@ std::vector create_expected_tables(cudf::size_type num_cols, } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; @@ -163,13 +163,12 @@ inline std::vector create_expected_string_co for (unsigned long index = 0; index < indices.size(); index += 2) { if (not nullable) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1])); + result.emplace_back(strings.begin() + indices[index], strings.begin() + indices[index + 1]); } else { auto valids = cudf::detail::make_counting_transform_iterator( indices[index], [](auto i) { return i % 2 == 0; }); - result.push_back(cudf::test::strings_column_wrapper( - strings.begin() + indices[index], strings.begin() + indices[index + 1], valids)); + result.emplace_back( + strings.begin() + indices[index], strings.begin() + indices[index + 1], valids); } } @@ -184,16 +183,16 @@ inline std::vector create_expected_string_co std::vector result = {}; for (unsigned long index = 0; index < indices.size(); index += 2) { - result.push_back(cudf::test::strings_column_wrapper(strings.begin() + indices[index], - strings.begin() + indices[index + 1], - validity.begin() + indices[index])); + result.emplace_back(strings.begin() + indices[index], + strings.begin() + indices[index + 1], + validity.begin() + indices[index]); } return result; } inline std::vector create_expected_string_tables( - std::vector const strings[2], + std::vector> const strings, std::vector const& indices, bool nullable) { @@ -216,7 +215,7 @@ inline std::vector create_expected_string_tables( } } - result.push_back(cudf::table(std::move(cols))); + result.emplace_back(std::move(cols)); } return result; diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp index ee3e7da5e0f..b56b0f2d3f8 100644 --- a/cpp/tests/copying/split_tests.cpp +++ b/cpp/tests/copying/split_tests.cpp @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::vector create_expected_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], + std::vector> const strings, std::vector const& splits, bool nullable) { @@ -144,8 +145,8 @@ std::vector create_expected_string_tables_for_splits( } std::vector create_expected_string_tables_for_splits( - std::vector const strings[2], - std::vector const validity[2], + std::vector> const strings, + std::vector> const validity, std::vector const& splits) { std::vector indices = splits_to_indices(splits, strings[0].size()); @@ -627,11 +628,12 @@ void split_string_with_invalids(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), 
valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -658,11 +660,12 @@ void split_empty_output_strings_column_value(SplitFunc Split, auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); @@ -684,9 +687,9 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) auto valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector splits{2, 5, 9}; @@ -699,16 +702,17 @@ void split_null_input_strings_column_value(SplitFunc Split, CompareFunc Compare) EXPECT_NO_THROW(Split(empty_table, splits)); } - cudf::test::strings_column_wrapper sw[2] = {{strings[0].begin(), strings[0].end(), no_valids}, - {strings[1].begin(), strings[1].end(), valids}}; + std::array sw{ + {{strings[0].begin(), strings[0].end(), no_valids}, + {strings[1].begin(), strings[1].end(), valids}}}; std::vector> scols; scols.push_back(sw[0].release()); scols.push_back(sw[1].release()); cudf::table src_table(std::move(scols)); auto result = Split(src_table, splits); - std::vector validity_masks[2] = {std::vector(strings[0].size()), - std::vector(strings[0].size())}; + std::vector> validity_masks{std::vector(strings[0].size()), + std::vector(strings[0].size())}; std::generate( validity_masks[1].begin(), validity_masks[1].end(), [i = 0]() mutable { return i++ % 2 == 0; }); @@ -1913,9 +1917,9 @@ TEST_F(ContiguousSplitTableCornerCases, MixedColumnTypes) cudf::size_type start = 0; auto valids = cudf::detail::make_counting_transform_iterator(start, [](auto i) { return true; }); - std::vector strings[2] = { - {"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, - {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}; + std::vector> strings{ + {{"", "this", "is", "a", "column", "of", "strings", "with", "in", "valid"}, + {"", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"}}}; std::vector> cols; @@ -2377,7 +2381,7 @@ TEST_F(ContiguousSplitTableCornerCases, OutBufferToSmall) { // internally, contiguous split chunks GPU work in 1MB contiguous copies // so the output buffer must be 1MB or larger. 
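The corrected EXPECT_THROW below pins down the contract: chunked_pack::create rejects staging buffers smaller than 1 MB. A sketch of the intended calling pattern, to my understanding of the API (assumes a valid table_view and stream in scope):

#include <cstdint>
#include <cudf/contiguous_split.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

void pack_in_chunks(cudf::table_view const& input, rmm::cuda_stream_view stream)
{
  std::size_t const bounce_size = 1 * 1024 * 1024;  // must be at least 1MB
  rmm::device_buffer bounce(bounce_size, stream);
  auto packer = cudf::chunked_pack::create(input, bounce_size);
  while (packer->has_next()) {
    auto const bytes_copied = packer->next(
      cudf::device_span<uint8_t>(static_cast<uint8_t*>(bounce.data()), bounce_size));
    // ... consume `bytes_copied` bytes from `bounce` here ...
  }
}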
- EXPECT_THROW(cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); + EXPECT_THROW(auto _ = cudf::chunked_pack::create({}, 1 * 1024), cudf::logic_error); } TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall) diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 13577c4d0ea..603edb27c7c 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -196,6 +196,136 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) fixed_width_column_wrapper{0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), fixed_width_column_wrapper{766, 424, 623}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1965, 2018, 2023}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::YEAR), + fixed_width_column_wrapper{1969, 1970, 1970}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{10, 7, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MONTH), + fixed_width_column_wrapper{12, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{26, 4, 25}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::DAY), + fixed_width_column_wrapper{31, 1, 1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::WEEKDAY), + fixed_width_column_wrapper{2, 3, 3}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + 
*extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{14, 12, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::HOUR), + fixed_width_column_wrapper{23, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{1, 0, 32}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MINUTE), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{12, 0, 12}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::SECOND), + fixed_width_column_wrapper{59, 0, 0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{762, 0, 929}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MILLISECOND), + fixed_width_column_wrapper{976, 23, 987}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::MICROSECOND), + fixed_width_column_wrapper{675, 432, 234}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_s, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + 
*extract_datetime_component(timestamps_ms, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{0, 0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps_ns, cudf::datetime::datetime_component::NANOSECOND), + fixed_width_column_wrapper{766, 424, 623}); } template diff --git a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp index 4fb8f78b558..0e68050f935 100644 --- a/cpp/tests/hashing/murmurhash3_x64_128_test.cpp +++ b/cpp/tests/hashing/murmurhash3_x64_128_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,8 +22,6 @@ #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - using NumericTypesNoBools = cudf::test::Concat; diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp index cc95c7a2f0f..8bc47c92c6b 100644 --- a/cpp/tests/hashing/sha256_test.cpp +++ b/cpp/tests/hashing/sha256_test.cpp @@ -23,8 +23,6 @@ #include #include -constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS}; - class SHA256HashTest : public cudf::test::BaseFixture {}; TEST_F(SHA256HashTest, EmptyTable) diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp index a4dc7531765..2151ec6e22f 100644 --- a/cpp/tests/interop/from_arrow_device_test.cpp +++ b/cpp/tests/interop/from_arrow_device_test.cpp @@ -270,9 +270,9 @@ TEST_F(FromArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -414,9 +414,9 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType) { std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); cudf::table_view expected_table_view = expected_table.view(); diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp index cbfa4911c3c..ef9936b214c 100644 --- a/cpp/tests/interop/from_arrow_host_test.cpp +++ b/cpp/tests/interop/from_arrow_host_test.cpp @@ -309,9 +309,9 @@ TEST_F(FromArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + 
{{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 81c406c0faf..6e742b9e4cf 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -52,7 +52,7 @@ std::unique_ptr get_cudf_table() .release()); auto col4 = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( {true, false, true, false, true}, {true, false, true, true, false}) .release()); @@ -339,9 +339,9 @@ TEST_F(FromArrowTest, DictionaryIndicesType) std::vector> columns; auto col = cudf::test::fixed_width_column_wrapper({1, 2, 5, 2, 7}, {true, false, true, true, true}); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); - columns.emplace_back(std::move(cudf::dictionary::encode(col))); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); + columns.emplace_back(cudf::dictionary::encode(col)); cudf::table expected_table(std::move(columns)); diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index a961f73d955..8be7e087b6d 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -256,7 +256,8 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - std::memset(out.buffer.data, 0, out.buffer.size_bytes); + // TODO: Investigate clang-tidy issue further after nanoarrow is made compliant + std::memset(out.buffer.data, 0, out.buffer.size_bytes); // NOLINT for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 51216a8512c..7ba586461dc 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -55,7 +55,7 @@ get_nanoarrow_cudf_table(cudf::size_type length) auto col4 = cudf::test::fixed_width_column_wrapper( test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper(test_data.bool_data.begin(), test_data.bool_data.end(), test_data.bool_validity.begin()) @@ -82,8 +82,8 @@ get_nanoarrow_cudf_table(cudf::size_type length) test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( test_data.bool_data_validity.begin(), test_data.bool_data_validity.end())); columns.emplace_back( @@ -575,9 +575,9 @@ TEST_F(ToArrowDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 
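/*
 * The emplace_back changes in these interop tests drop std::move around
 * calls like cudf::dictionary::encode(col): the call already returns a
 * prvalue, so the extra move is redundant and can pessimize copy elision,
 * which is what clang-tidy flags. A minimal before/after sketch:
 *
 *   columns.emplace_back(std::move(cudf::dictionary::encode(col)));  // redundant move
 *   columns.emplace_back(cudf::dictionary::encode(col));             // rvalue binds directly
 *
 * std::move stays where the argument is a named lvalue, as in the
 * cols.push_back(std::move(int_column)) fixes nearby.
 */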
1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp index fc0ed6c9352..fcb4433b42e 100644 --- a/cpp/tests/interop/to_arrow_host_test.cpp +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -436,9 +436,9 @@ TEST_F(ToArrowHostDeviceTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 90ae12cdd90..a6aa4b22eca 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -90,7 +90,7 @@ std::pair, std::shared_ptr> get_table auto col4 = cudf::test::fixed_width_column_wrapper( int64_data.begin(), int64_data.end(), validity.begin()); auto dict_col = cudf::dictionary::encode(col4); - columns.emplace_back(std::move(cudf::dictionary::encode(col4))); + columns.emplace_back(cudf::dictionary::encode(col4)); columns.emplace_back(cudf::test::fixed_width_column_wrapper( bool_data.begin(), bool_data.end(), bool_validity.begin()) .release()); @@ -112,8 +112,8 @@ std::pair, std::shared_ptr> get_table cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin()) .release(); vector_of_columns cols; - cols.push_back(move(int_column)); - cols.push_back(move(str_column)); + cols.push_back(std::move(int_column)); + cols.push_back(std::move(str_column)); auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper( bool_data_validity.begin(), bool_data_validity.end())); columns.emplace_back( @@ -294,9 +294,9 @@ TEST_F(ToArrowTest, StructColumn) auto int_col2 = cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); - auto list_col = - cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) - .release(); + auto list_col = cudf::test::lists_column_wrapper( + {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) // NOLINT + .release(); vector_of_columns cols2; cols2.push_back(std::move(str_col2)); cols2.push_back(std::move(int_col2)); @@ -438,7 +438,7 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge) auto const schema = std::make_shared(schema_vector); auto const expected_arrow_table = arrow::Table::Make(schema, {arr}); - std::vector const metadata = {{"a"}}; + std::vector const metadata = {{"a"}}; // NOLINT ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table)); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index 840cf263ed9..54262dc3b44 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ 
b/cpp/tests/io/comp/decomp_test.cpp @@ -39,19 +39,19 @@ using cudf::device_span; */ template struct DecompressTest : public cudf::test::BaseFixture { - std::vector vector_from_string(char const* str) const + [[nodiscard]] std::vector vector_from_string(std::string const str) const { - return std::vector(reinterpret_cast(str), - reinterpret_cast(str) + strlen(str)); + return {reinterpret_cast(str.c_str()), + reinterpret_cast(str.c_str()) + strlen(str.c_str())}; } - void Decompress(std::vector* decompressed, + void Decompress(std::vector& decompressed, uint8_t const* compressed, size_t compressed_size) { auto stream = cudf::get_default_stream(); rmm::device_buffer src{compressed, compressed_size, stream}; - rmm::device_uvector dst{decompressed->size(), stream}; + rmm::device_uvector dst{decompressed.size(), stream}; cudf::detail::hostdevice_vector> inf_in(1, stream); inf_in[0] = {static_cast(src.data()), src.size()}; @@ -67,7 +67,7 @@ struct DecompressTest : public cudf::test::BaseFixture { static_cast(this)->dispatch(inf_in, inf_out, inf_stat); CUDF_CUDA_TRY(cudaMemcpyAsync( - decompressed->data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); + decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value())); inf_stat.device_to_host_sync(stream); ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS); } @@ -125,49 +125,57 @@ struct NvcompConfigTest : public cudf::test::BaseFixture {}; TEST_F(GzipDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0x1f, 0x8b, 0x8, 0x0, 0x9, 0x63, 0x99, 0x5c, 0x2, 0xff, 0xcb, 0x48, 0xcd, 0xc9, 0xc9, 0x57, 0x28, 0xcf, 0x2f, 0xca, 0x49, 0x1, 0x0, 0x85, 0x11, 0x4a, 0xd, 0xb, 0x0, 0x0, 0x0}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x28, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(SnappyDecompressTest, ShortLiteralAfterLongCopyAtStartup) { - constexpr char uncompressed[] = "Aaaaaaaaaaaah!"; + std::string const uncompressed{"Aaaaaaaaaaaah!"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = {14, 0x0, 'A', 0x0, 'a', (10 - 4) * 4 + 1, 1, 0x4, 'h', '!'}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, compressed, sizeof(compressed)); EXPECT_EQ(output, input); } TEST_F(BrotliDecompressTest, HelloWorld) { - constexpr char uncompressed[] = "hello world"; + std::string const uncompressed{"hello world"}; + // NOLINTBEGIN constexpr uint8_t compressed[] = { 0xb, 0x5, 0x80, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x77, 0x6f, 0x72, 0x6c, 0x64, 0x3}; + // NOLINTEND std::vector input = vector_from_string(uncompressed); std::vector output(input.size()); - Decompress(&output, compressed, sizeof(compressed)); + Decompress(output, 
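/*
 * The Decompress refactor above swaps the out-pointer for an out-reference:
 * a reference cannot be null, so the callee needs no nullptr handling and
 * the call sites lose the address-of noise. A sketch of the signature change:
 *
 *   void Decompress(std::vector<uint8_t>* decompressed, ...);  // before: Decompress(&output, ...)
 *   void Decompress(std::vector<uint8_t>& decompressed, ...);  // after:  Decompress(output, ...)
 *
 * The NOLINTBEGIN/NOLINTEND fences confine the clang-tidy suppression to the
 * raw compressed-byte tables instead of annotating every literal.
 */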
compressed, sizeof(compressed)); EXPECT_EQ(output, input); } diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index dc14824d834..b265dcf9273 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -63,9 +63,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using column = cudf::column; using table = cudf::table; using table_view = cudf::table_view; @@ -954,7 +954,7 @@ TEST_F(CsvReaderTest, Strings) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}, + std::vector{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}, view.column(1)); } @@ -1014,7 +1014,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) ASSERT_EQ(type_id::STRING, view.column(1).type().id()); expect_column_data_equal( - std::vector{"\"abcdef ghi\"", "\"jkl \"\"mno\"\" pqr\"", "stu \"vwx\" yz"}, + std::vector{"\"abcdef ghi\"", R"("jkl ""mno"" pqr")", "stu \"vwx\" yz"}, view.column(1)); } @@ -1830,7 +1830,7 @@ TEST_F(CsvReaderTest, StringsWithWriter) auto int_column = column_wrapper{10, 20, 30}; auto string_column = - column_wrapper{"abc def ghi", "\"jkl mno pqr\"", "stu \"\"vwx\"\" yz"}; + column_wrapper{"abc def ghi", "\"jkl mno pqr\"", R"(stu ""vwx"" yz)"}; cudf::table_view input_table(std::vector{int_column, string_column}); // TODO add quoting style flag? @@ -2516,4 +2516,39 @@ TEST_F(CsvReaderTest, UTF8BOM) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result_view, expected); } +void expect_buffers_equal(cudf::io::datasource::buffer* lhs, cudf::io::datasource::buffer* rhs) +{ + ASSERT_EQ(lhs->size(), rhs->size()); + EXPECT_EQ(0, std::memcmp(lhs->data(), rhs->data(), lhs->size())); +} + +TEST_F(CsvReaderTest, OutOfMapBoundsReads) +{ + // write a lot of data into a file + auto filepath = temp_env->get_temp_dir() + "OutOfMapBoundsReads.csv"; + auto const num_rows = 1 << 20; + auto const row = std::string{"0,1,2,3,4,5,6,7,8,9\n"}; + auto const file_size = num_rows * row.size(); + { + std::ofstream outfile(filepath, std::ofstream::out); + for (size_t i = 0; i < num_rows; ++i) { + outfile << row; + } + } + + // Only memory map the middle of the file + auto source = cudf::io::datasource::create(filepath, file_size / 2, file_size / 4); + auto full_source = cudf::io::datasource::create(filepath); + auto const all_data = source->host_read(0, file_size); + auto ref_data = full_source->host_read(0, file_size); + expect_buffers_equal(ref_data.get(), all_data.get()); + + auto const start_data = source->host_read(file_size / 2, file_size / 2); + expect_buffers_equal(full_source->host_read(file_size / 2, file_size / 2).get(), + start_data.get()); + + auto const end_data = source->host_read(0, file_size / 2 + 512); + expect_buffers_equal(full_source->host_read(0, file_size / 2 + 512).get(), end_data.get()); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 49ad0c408dc..cb6716f4a18 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -68,9 +68,9 @@ auto dtype() template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + 
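/*
 * OutOfMapBoundsReads above pins down the contract of the bounded overload
 * invoked as cudf::io::datasource::create(filepath, offset, size): only a
 * window of the file is memory-mapped, yet host_read ranges that start
 * before, extend past, or span that window must return the same bytes as an
 * unbounded datasource, which the test checks against a full-file reference
 * source with byte-wise comparison. A minimal sketch, assuming a
 * hypothetical 1 MiB file:
 *
 *   auto src = cudf::io::datasource::create(path, 512 << 10, 256 << 10);  // map the middle
 *   auto buf = src->host_read(0, 4096);  // before the mapped window: still valid
 */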
cudf::test::fixed_width_column_wrapper>; cudf::test::TempDirTestEnvironment* const temp_env = static_cast( diff --git a/cpp/tests/io/json/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp index 2c4e29a01b9..39d31c406a5 100644 --- a/cpp/tests/io/json/json_writer.cpp +++ b/cpp/tests/io/json/json_writer.cpp @@ -70,6 +70,43 @@ TEST_F(JsonWriterTest, EmptyInput) EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); } +TEST_F(JsonWriterTest, EmptyLeaf) +{ + cudf::test::strings_column_wrapper col1{""}; + cudf::test::fixed_width_column_wrapper offsets{0, 0}; + auto col2 = make_lists_column(1, + offsets.release(), + cudf::test::strings_column_wrapper{}.release(), + 0, + rmm::device_buffer{}, + cudf::test::get_default_stream()); + auto col3 = cudf::test::lists_column_wrapper::make_one_empty_row_column(); + cudf::table_view tbl_view{{col1, *col2, col3}}; + cudf::io::table_metadata mt{{{"col1"}, {"col2"}, {"col3"}}}; + + std::vector out_buffer; + auto destination = cudf::io::sink_info(&out_buffer); + auto out_options = cudf::io::json_writer_options_builder(destination, tbl_view) + .include_nulls(true) + .metadata(mt) + .lines(false) + .na_rep("null") + .build(); + + // Empty columns in table + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected = R"([{"col1":"","col2":[],"col3":[]}])"; + EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size())); + + // Empty columns in table - JSON Lines + out_buffer.clear(); + out_options.enable_lines(true); + cudf::io::write_json(out_options, cudf::test::get_default_stream()); + std::string const expected_lines = R"({"col1":"","col2":[],"col3":[]})" + "\n"; + EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size())); +} + TEST_F(JsonWriterTest, ErrorCases) { cudf::test::strings_column_wrapper col1{"a", "b", "c"}; diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 84f04f67038..380d66c53f9 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,10 +14,9 @@ * limitations under the License. 
*/ +#include #include -#include - namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu index 8ad1fea649d..5f1aea71f73 100644 --- a/cpp/tests/io/orc_chunked_reader_test.cu +++ b/cpp/tests/io/orc_chunked_reader_test.cu @@ -1358,10 +1358,11 @@ TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow) int64_t constexpr total_rows = num_rows * num_reps; static_assert(total_rows > std::numeric_limits::max()); - auto const it = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) { - return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); - }); - auto const col = data_col(it, it + num_rows); + auto const it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), [num_rows](int64_t i) { + return (i % num_rows) % static_cast(std::numeric_limits::max() / 2); + }); + auto const col = data_col(it, it + num_rows); auto const chunk_table = cudf::table_view{{col}}; std::vector data_buffer; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 89e704f3ed3..cce0adbf317 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -43,9 +43,9 @@ template using column_wrapper = - typename std::conditional, - cudf::test::strings_column_wrapper, - cudf::test::fixed_width_column_wrapper>::type; + std::conditional_t, + cudf::test::strings_column_wrapper, + cudf::test::fixed_width_column_wrapper>; using str_col = column_wrapper; using bool_col = column_wrapper; @@ -1358,21 +1358,22 @@ TEST_P(OrcWriterTestStripes, StripeSize) cols.push_back(col.release()); auto const expected = std::make_unique
(std::move(cols)); - auto validate = [&](std::vector const& orc_buffer) { - auto const expected_stripe_num = - std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); - auto const stats = cudf::io::read_parsed_orc_statistics( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); - EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); - - cudf::io::orc_reader_options in_opts = - cudf::io::orc_reader_options::builder( - cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) - .use_index(false); - auto result = cudf::io::read_orc(in_opts); - - CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); - }; + auto validate = + [&, &size_bytes = size_bytes, &size_rows = size_rows](std::vector const& orc_buffer) { + auto const expected_stripe_num = + std::max(num_rows / size_rows, (num_rows * sizeof(int64_t)) / size_bytes); + auto const stats = cudf::io::read_parsed_orc_statistics( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())); + EXPECT_EQ(stats.stripes_stats.size(), expected_stripe_num); + + cudf::io::orc_reader_options in_opts = + cudf::io::orc_reader_options::builder( + cudf::io::source_info(orc_buffer.data(), orc_buffer.size())) + .use_index(false); + auto result = cudf::io::read_orc(in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result.tbl->view()); + }; { std::vector out_buffer_chunked; diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp index 6141a40bc95..a1b8677eac8 100644 --- a/cpp/tests/io/parquet_common.cpp +++ b/cpp/tests/io/parquet_common.cpp @@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2) int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype) + std::optional const& ctype) { auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN); switch (ptype) { diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp index bd1579eaa1b..c90b81ed27a 100644 --- a/cpp/tests/io/parquet_common.hpp +++ b/cpp/tests/io/parquet_common.hpp @@ -172,7 +172,7 @@ std::pair create_parquet_typed_with_stats(std::string int32_t compare_binary(std::vector const& v1, std::vector const& v2, cudf::io::parquet::detail::Type ptype, - cuda::std::optional const& ctype); + std::optional const& ctype); void expect_compression_stats_empty(std::shared_ptr stats); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index 8b03e94191e..f1286a00d22 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -98,7 +98,7 @@ TYPED_TEST(ParquetWriterDeltaTest, SupportedDeltaListSliced) // list constexpr int vals_per_row = 4; auto c1_offset_iter = cudf::detail::make_counting_transform_iterator( - 0, [vals_per_row](cudf::size_type idx) { return idx * vals_per_row; }); + 0, [](cudf::size_type idx) { return idx * vals_per_row; }); cudf::test::fixed_width_column_wrapper c1_offsets(c1_offset_iter, c1_offset_iter + num_rows + 1); cudf::test::fixed_width_column_wrapper c1_vals( diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp index dc8e68b3a15..7986a3c6d70 100644 --- a/cpp/tests/io/parquet_reader_test.cpp +++ b/cpp/tests/io/parquet_reader_test.cpp @@ -1189,15 +1189,12 @@ TEST_F(ParquetReaderTest, NestingOptimizationTest) cudf::test::fixed_width_column_wrapper values(value_iter, value_iter + num_values, validity); // ~256k values with num_nesting_levels = 16 - int total_values_produced = num_values; - auto 
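/*
 * In the ORC StripeSize test above, the validate lambda trades a blanket
 * [&] for init-captures ([&, &size_bytes = size_bytes, &size_rows =
 * size_rows]). The workaround exists because size_bytes/size_rows evidently
 * come from a structured binding, which lambdas may not capture implicitly
 * before C++20; an init-capture introduces an ordinary same-named reference
 * that is fine in C++17. A minimal sketch with hypothetical names:
 *
 *   auto [bytes, rows] = params;                             // structured binding
 *   auto check = [&, &bytes = bytes] { return bytes > 0; };  // OK pre-C++20
 */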
prev_col = values.release(); + auto prev_col = values.release(); for (int idx = 0; idx < num_nesting_levels; idx++) { - auto const depth = num_nesting_levels - idx; auto const num_rows = (1 << (num_nesting_levels - idx)); auto offsets_iter = cudf::detail::make_counting_transform_iterator( - 0, [depth, rows_per_level](cudf::size_type i) { return i * rows_per_level; }); - total_values_produced += (num_rows + 1); + 0, [](cudf::size_type i) { return i * rows_per_level; }); cudf::test::fixed_width_column_wrapper offsets(offsets_iter, offsets_iter + num_rows + 1); @@ -2727,3 +2724,40 @@ TYPED_TEST(ParquetReaderPredicatePushdownTest, FilterTyped) EXPECT_EQ(result_table.num_columns(), expected->num_columns()); CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result_table); } + +// The test below requires several minutes to complete with memcheck, thus it is disabled by +// default. +TEST_F(ParquetReaderTest, DISABLED_ListsWideTable) +{ + auto constexpr num_rows = 2; + auto constexpr num_cols = 26'755; // for slightly over 2B keys + auto constexpr seed = 0xceed; + + std::mt19937 engine{seed}; + + auto list_list = make_parquet_list_list_col(0, num_rows, 1, 1, false); + auto list_list_nulls = make_parquet_list_list_col(0, num_rows, 1, 1, true); + + // switch between nullable and non-nullable + std::vector cols(num_cols); + bool with_nulls = false; + std::generate_n(cols.begin(), num_cols, [&]() { + auto const view = with_nulls ? list_list_nulls->view() : list_list->view(); + with_nulls = not with_nulls; + return view; + }); + + cudf::table_view expected(cols); + + // Use a host buffer for faster I/O + std::vector buffer; + auto const out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected).build(); + cudf::io::write_parquet(out_opts); + + cudf::io::parquet_reader_options default_in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + auto const [result, _] = cudf::io::read_parquet(default_in_opts); + + CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view()); +} diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp index 7c305235ea6..a0b48f54854 100644 --- a/cpp/tests/io/parquet_v2_test.cpp +++ b/cpp/tests/io/parquet_v2_test.cpp @@ -1302,24 +1302,24 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls) table_view expected({col0, col1, col2, col3, col4, col5, col6, col7}); std::array expected_null_counts{4, 4, 4, 6, 4, 6, 4, 5, 11}; - std::vector const expected_def_hists[] = {{1, 1, 2, 3}, - {1, 3, 10}, - {1, 1, 2, 10}, - {1, 1, 2, 2, 8}, - {1, 1, 1, 1, 10}, - {1, 1, 1, 1, 2, 8}, - {1, 3, 9}, - {1, 3, 1, 8}, - {1, 0, 4, 1, 1, 4, 9}}; - std::vector const expected_rep_hists[] = {{4, 3}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 6}, - {4, 4, 5}, - {4, 4, 5}, - {4, 6, 2, 8}}; + std::vector> const expected_def_hists = {{1, 1, 2, 3}, + {1, 3, 10}, + {1, 1, 2, 10}, + {1, 1, 2, 2, 8}, + {1, 1, 1, 1, 10}, + {1, 1, 1, 1, 2, 8}, + {1, 3, 9}, + {1, 3, 1, 8}, + {1, 0, 4, 1, 1, 4, 9}}; + std::vector> const expected_rep_hists = {{4, 3}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 6}, + {4, 4, 5}, + {4, 4, 5}, + {4, 6, 2, 8}}; auto const filepath = temp_env->get_temp_filepath("ColumnIndexListWithNulls.parquet"); auto out_opts = cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected) diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index 8794f2ee304..6c5e9cdf07a 100644 --- 
a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -290,7 +290,8 @@ class custom_test_data_sink : public cudf::io::data_sink { CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file"); } - ~custom_test_data_sink() override { flush(); } + // Marked as NOLINT because we are calling a virtual method in the destructor + ~custom_test_data_sink() override { flush(); } // NOLINT void host_write(void const* data, size_t size) override { @@ -981,13 +982,15 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation) TEST_F(ParquetWriterTest, BinaryColumnIndexTruncation) { - std::vector truncated_min[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, - {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_min{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}, + {0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; - std::vector truncated_max[] = {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, - {0xff}, - {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + std::array, 3> truncated_max{ + {{0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xff}, + {0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}}; cudf::test::lists_column_wrapper col0{ {0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe}}; diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 93754091b3f..178edc52dd3 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -314,7 +314,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -362,7 +362,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -398,7 +398,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -423,7 +423,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, 
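/*
 * Two clang-tidy-driven patterns appear above. The data-sink destructor
 * keeps its flush() call under a NOLINT: a virtual call in a destructor
 * dispatches to the class currently being destroyed, which is intended here
 * but suspicious in general. And C arrays of std::vector become std::array,
 * making the element count part of the type. A minimal sketch with
 * hypothetical values:
 *
 *   std::array<std::vector<uint8_t>, 2> mins{{{0xfe}, {0xff}}};
 *   static_assert(mins.size() == 2);  // count checked at compile time
 */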
true, false, true, true}); @@ -468,7 +468,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), std::move(get_left_indices(result->size()))}; + auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp index 249319da7f7..7b61be113f9 100644 --- a/cpp/tests/large_strings/large_strings_fixture.cpp +++ b/cpp/tests/large_strings/large_strings_fixture.cpp @@ -123,12 +123,9 @@ LargeStringsData* StringsLargeTest::g_ls_data = nullptr; int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - auto const cmd_opts = parse_cudf_test_opts(argc, argv); - // hardcoding the CUDA memory resource to keep from exceeding the pool - auto mr = cudf::test::make_cuda(); - cudf::set_current_device_resource(mr.get()); - auto adaptor = make_stream_mode_adaptor(cmd_opts); - + cudf::test::config config; + config.rmm_mode = "cuda"; + init_cudf_test(argc, argv, config); // create object to automatically be destroyed at the end of main() auto lsd = cudf::test::StringsLargeTest::get_ls_data(); diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp index 97979e79010..bea044496b3 100644 --- a/cpp/tests/merge/merge_string_test.cpp +++ b/cpp/tests/merge/merge_string_test.cpp @@ -97,7 +97,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyColumns) "hi", "hj"}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -296,7 +296,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns) true, false, false}); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 2e09f25b51f..6208d395f0a 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -349,7 +349,7 @@ TYPED_TEST(MergeTest_, Merge1KeyColumns) cudf::test::fixed_width_column_wrapper expectedDataWrap1(seq_out1, seq_out1 + outputRows); - auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) { + auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) return 0; else @@ -452,7 +452,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) cudf::size_type inputRows = 40; // data: 0 2 4 6 | valid: 1 1 1 0 - auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence1 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 0; // <- no shortcut to this can avoid compiler errors } else { @@ -465,7 +465,7 @@ TYPED_TEST(MergeTest_, Merge1KeyNullColumns) leftColWrap1(sequence1, sequence1 + inputRows, valid_sequence1); // data: 1 3 5 7 | valid: 1 1 1 0 - auto sequence2 = 
cudf::detail::make_counting_transform_iterator(0, [inputRows](auto row) { + auto sequence2 = cudf::detail::make_counting_transform_iterator(0, [](auto row) { if (cudf::type_to_id() == cudf::type_id::BOOL8) { return 1; } else diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp index 1e9e13ded93..bdb98372836 100644 --- a/cpp/tests/reductions/reduction_tests.cpp +++ b/cpp/tests/reductions/reduction_tests.cpp @@ -33,8 +33,12 @@ #include #include +#include #include +#include +#include +#include #include using aggregation = cudf::aggregation; @@ -765,6 +769,25 @@ TYPED_TEST(MultiStepReductionTest, Mean) expected_value_nulls); } +template +double calc_var(std::vector const& v, int ddof, std::vector const& mask = {}) +{ + auto const values = [&]() { + if (mask.empty()) { return v; } + std::vector masked{}; + thrust::copy_if( + v.begin(), v.end(), mask.begin(), std::back_inserter(masked), [](auto m) { return m; }); + return masked; + }(); + auto const valid_count = values.size(); + double const mean = std::accumulate(values.cbegin(), values.cend(), double{0}) / valid_count; + double const sq_sum_of_differences = + std::accumulate(values.cbegin(), values.cend(), double{0}, [mean](double acc, auto const v) { + return acc + std::pow(v - mean, 2); + }); + return sq_sum_of_differences / (valid_count - ddof); +} + // This test is disabled for only a Debug build because a compiler error // documented in cpp/src/reductions/std.cu and cpp/src/reductions/var.cu #ifdef NDEBUG @@ -777,25 +800,12 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [](std::vector& v, cudf::size_type valid_count, int ddof) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls std::vector v = convert_values(int_values); cudf::test::fixed_width_column_wrapper col(v.begin(), v.end()); auto const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -811,23 +821,19 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(v, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(v, host_bools, T{0}); - - double var_nulls = calc_var(replaced_array, valid_count, ddof); - double std_nulls = std::sqrt(var_nulls); + double var_nulls = calc_var(v, ddof, host_bools); + double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + 
col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } // ---------------------------------------------------------------------------- @@ -1139,23 +1145,10 @@ TEST_P(ReductionParamTest, DISABLED_std_var) std::vector int_values({-3, 2, 1, 0, 5, -3, -2, 28}); std::vector host_bools({true, true, false, true, true, true, false, true}); - auto calc_var = [ddof](std::vector& v, cudf::size_type valid_count) { - double mean = std::accumulate(v.begin(), v.end(), double{0}); - mean /= valid_count; - - double sum_of_sq = std::accumulate( - v.begin(), v.end(), double{0}, [](double acc, double i) { return acc + i * i; }); - - cudf::size_type div = valid_count - ddof; - - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::fixed_width_column_wrapper col(int_values.begin(), int_values.end()); - double var = calc_var(int_values, int_values.size()); + double var = calc_var(int_values, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -1172,23 +1165,19 @@ TEST_P(ReductionParamTest, DISABLED_std_var) // test with nulls cudf::test::fixed_width_column_wrapper col_nulls = construct_null_column(int_values, host_bools); - cudf::size_type valid_count = - cudf::column_view(col_nulls).size() - cudf::column_view(col_nulls).null_count(); - auto replaced_array = replace_nulls(int_values, host_bools, int{0}); - - double var_nulls = calc_var(replaced_array, valid_count); + double var_nulls = calc_var(int_values, ddof, host_bools); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - var_nulls); - EXPECT_EQ(this - ->template reduction_test( - col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) - .first, - std_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *var_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + var_nulls); + EXPECT_DOUBLE_EQ(this + ->template reduction_test( + col_nulls, *std_agg, cudf::data_type(cudf::type_id::FLOAT64)) + .first, + std_nulls); } //------------------------------------------------------------------- @@ -2471,21 +2460,11 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector v = convert_values(int_values); cudf::data_type output_type{cudf::type_to_id()}; - auto calc_var = [](std::vector const& v, cudf::size_type valid_count, cudf::size_type ddof) { - double mean = std::accumulate(v.cbegin(), v.cend(), double{0}); - mean /= valid_count; - double sum_of_sq = std::accumulate( - v.cbegin(), v.cend(), double{0}, [](double acc, TypeParam i) { return acc + i * i; }); - auto const div = valid_count - ddof; - double var = sum_of_sq / div - ((mean * mean) * valid_count) / div; - return var; - }; - // test without nulls cudf::test::dictionary_column_wrapper col(v.begin(), v.end()); cudf::size_type const ddof = 1; - double var = calc_var(v, v.size(), ddof); + double var = calc_var(v, ddof); double std = std::sqrt(var); auto var_agg = cudf::make_variance_aggregation(ddof); auto std_agg = cudf::make_std_aggregation(ddof); @@ -2497,15 +2476,13 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd) std::vector validity({true, true, false, true, true, true, false, true}); cudf::test::dictionary_column_wrapper col_nulls(v.begin(), v.end(), validity.begin()); - cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true); - - 
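/*
 * The consolidated calc_var above implements the textbook estimator
 * var = sum((v_i - mean)^2) / (n - ddof) over the mask-filtered values,
 * replacing three local lambdas that used the cancellation-prone
 * sum-of-squares shortcut. Worked numbers for the shared input
 * {-3, 2, 1, 0, 5, -3, -2, 28} with ddof = 1 and no mask:
 *
 *   mean = 28 / 8 = 3.5
 *   var  = 738 / 7 = 105.428...
 *
 * The move from EXPECT_EQ to EXPECT_DOUBLE_EQ matches: gtest then compares
 * within 4 ULPs rather than bit-for-bit, the right tolerance once the host
 * reference sums in a different order than the device reduction.
 */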
double var_nulls = calc_var(replace_nulls(v, validity, T{0}), valid_count, ddof); + double var_nulls = calc_var(v, ddof, validity); double std_nulls = std::sqrt(var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, - var_nulls); - EXPECT_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, - std_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *var_agg, output_type).first, + var_nulls); + EXPECT_DOUBLE_EQ(this->template reduction_test(col_nulls, *std_agg, output_type).first, + std_nulls); } TYPED_TEST(DictionaryReductionTest, NthElement) diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 19996f827cf..bc0321bd40a 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -1092,11 +1092,10 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets) auto aggregates = std::vector>>(); - aggregates.push_back(std::move(cudf::make_max_aggregation())); - aggregates.push_back(std::move(cudf::make_min_aggregation())); - aggregates.push_back(std::move(cudf::make_sum_aggregation())); - aggregates.push_back( - std::move(cudf::make_product_aggregation())); + aggregates.push_back(cudf::make_max_aggregation()); + aggregates.push_back(cudf::make_min_aggregation()); + aggregates.push_back(cudf::make_sum_aggregation()); + aggregates.push_back(cudf::make_product_aggregation()); auto output_type = cudf::data_type{cudf::type_to_id()}; for (auto&& agg : aggregates) { diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp index 1858cd7782e..b12bf08520f 100644 --- a/cpp/tests/replace/replace_tests.cpp +++ b/cpp/tests/replace/replace_tests.cpp @@ -356,7 +356,7 @@ void test_replace(cudf::host_span input_column, for (size_t i = 0; i < values_to_replace_column.size(); i++) { size_t k = 0; - auto pred = [=, &k, &reference_result, &expected_valid, &isReplaced](T element) { + auto pred = [=, &k, &expected_valid, &isReplaced](T element) { bool toBeReplaced = false; if (!isReplaced[k]) { if (!input_has_nulls || expected_valid[k]) { @@ -503,7 +503,7 @@ TYPED_TEST(ReplaceTest, LargeScaleReplaceTest) const size_t REPLACE_SIZE = 10000; thrust::host_vector input_column(DATA_SIZE); - std::generate(std::begin(input_column), std::end(input_column), [REPLACE_SIZE]() { + std::generate(std::begin(input_column), std::end(input_column), []() { return std::rand() % (REPLACE_SIZE); }); diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp index f702dc78371..165e0347785 100644 --- a/cpp/tests/rolling/collect_ops_test.cpp +++ b/cpp/tests/rolling/collect_ops_test.cpp @@ -214,7 +214,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_list_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); @@ -338,7 +338,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 4, 8, 12, 12, 12}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + 
cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -373,7 +373,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) cudf::test::fixed_width_column_wrapper{0, 0, 3, 5, 8, 8, 8}.release(); auto expected_num_rows = expected_offsets->size() - 1; auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; }); auto [null_mask, null_count] = cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows); @@ -1499,7 +1499,7 @@ TYPED_TEST(TypedCollectSetTest, RollingWindowHonoursMinPeriods) *cudf::make_collect_set_aggregation()); auto expected_result_2 = cudf::test::lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 4}, {2, 4, 5}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 0 && i < 4; })}.release(); diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index ec726878b34..0eaab0c9f7a 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ using cudf::test::iterators::nulls_at; auto constexpr null = int32_t{0}; // NULL representation for int32_t; +// clang-tidy doesn't think std::transform can handle a +// thrust::constant_iterator, so this is a workaround that uses nulls_at +// instead of no_nulls +auto no_nulls_list() { return nulls_at({}); } + struct OffsetRowWindowTest : public cudf::test::BaseFixture { static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -210,7 +215,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), - lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, no_nulls}); + lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5}, {}, {7, 8}, {8, 9}, {9}, {}}, + no_nulls_list()}); } TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) @@ -250,7 +256,7 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COLLECT_LIST), lists_column{{{1, 2}, {2, 3}, {3, 4}, {4, 5}, {5, 6}, {6, 7}, {7, 8}, {8, 9}, {9}, {}}, - no_nulls}); + no_nulls_list()}); } // To test that preceding bounds are clamped correctly at group boundaries. 
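A note on the no_nulls_list() helper introduced above: cudf::test::iterators::nulls_at(...) builds a validity iterator that marks exactly the given row indices as null, so an empty index list yields an "every row valid" iterator, behaviorally equivalent to the no_nulls helper that trips the clang-tidy false positive. A minimal sketch of the substitution:

  using cudf::test::iterators::nulls_at;
  auto all_valid = nulls_at({});  // empty index list -> no row is null
  lists_column expected{{{1, 2}, {2, 3}}, all_valid};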
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp index c2c22986975..6e0dc16dca9 100644 --- a/cpp/tests/rolling/rolling_test.cpp +++ b/cpp/tests/rolling/rolling_test.cpp @@ -541,7 +541,7 @@ class RollingTest : public cudf::test::BaseFixture { agg_op op; for (cudf::size_type i = 0; i < num_rows; i++) { - OutputType val = agg_op::template identity(); + auto val = agg_op::template identity(); // load sizes min_periods = std::max(min_periods, 1); // at least one observation is required diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp index 2d37de920d5..2b79911a95a 100644 --- a/cpp/tests/scalar/scalar_test.cpp +++ b/cpp/tests/scalar/scalar_test.cpp @@ -190,7 +190,7 @@ TEST_F(ListScalarTest, MoveConstructorNonNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(data_ptr, s2.view().data()); - EXPECT_EQ(s.view().data(), nullptr); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT } TEST_F(ListScalarTest, MoveConstructorNested) @@ -205,8 +205,8 @@ TEST_F(ListScalarTest, MoveConstructorNested) EXPECT_EQ(mask_ptr, s2.validity_data()); EXPECT_EQ(offset_ptr, s2.view().child(0).data()); EXPECT_EQ(data_ptr, s2.view().child(1).data()); - EXPECT_EQ(s.view().data(), nullptr); - EXPECT_EQ(s.view().num_children(), 0); + EXPECT_EQ(s.view().data(), nullptr); // NOLINT + EXPECT_EQ(s.view().num_children(), 0); // NOLINT } struct StructScalarTest : public cudf::test::BaseFixture {}; diff --git a/cpp/tests/search/search_list_test.cpp b/cpp/tests/search/search_list_test.cpp index 48711c21715..7584003e800 100644 --- a/cpp/tests/search/search_list_test.cpp +++ b/cpp/tests/search/search_list_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,7 +35,6 @@ using strings_col = cudf::test::strings_column_wrapper; constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::FIRST_ERROR}; constexpr int32_t null{0}; // Mark for null child elements at the current level -constexpr int32_t XXX{0}; // Mark for null elements at all levels using TestTypes = cudf::test::Concat> grand_child; - grand_child.push_back(std::move(col4.release())); + grand_child.push_back(col4.release()); auto child_col_2 = cudf::make_structs_column(6, std::move(grand_child), 0, rmm::device_buffer{}); child_columns2.push_back(std::move(child_col_2)); auto struct_col3 = diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp index 4d7d23dc881..d5b6915b520 100644 --- a/cpp/tests/stream_compaction/unique_tests.cpp +++ b/cpp/tests/stream_compaction/unique_tests.cpp @@ -43,7 +43,6 @@ auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; auto constexpr KEEP_LAST = cudf::duplicate_keep_option::KEEP_LAST; auto constexpr KEEP_NONE = cudf::duplicate_keep_option::KEEP_NONE; -auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; using int32s_col = cudf::test::fixed_width_column_wrapper; diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp index 443f4548b2c..07b2d77cc04 100644 --- a/cpp/tests/streams/stream_compaction_test.cpp +++ b/cpp/tests/streams/stream_compaction_test.cpp @@ -29,8 +29,6 @@ #include -auto constexpr null{0}; // null at current level -auto constexpr XXX{0}; // null pushed down from parent level auto constexpr NaN = std::numeric_limits::quiet_NaN(); auto constexpr KEEP_ANY = cudf::duplicate_keep_option::KEEP_ANY; auto constexpr KEEP_FIRST = cudf::duplicate_keep_option::KEEP_FIRST; diff --git a/cpp/tests/streams/strings/factory_test.cpp b/cpp/tests/streams/strings/factory_test.cpp new file mode 100644 index 00000000000..36e595ab9fa --- /dev/null +++ b/cpp/tests/streams/strings/factory_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include + +#include + +#include +#include + +class StringsFactoryTest : public cudf::test::BaseFixture {}; + +using string_pair = thrust::pair; + +TEST_F(StringsFactoryTest, StringConstructionFromPairs) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + auto const input = cudf::device_span{d_input.data(), d_input.size()}; + cudf::make_strings_column(input, stream); +} + +TEST_F(StringsFactoryTest, StringBatchConstruction) +{ + auto const stream = cudf::test::get_default_stream(); + + auto const h_data = std::vector{'a', 'b', 'c'}; + auto const d_data = cudf::detail::make_device_uvector_async( + h_data, stream, cudf::get_current_device_resource_ref()); + + auto const h_input = + std::vector{{d_data.data(), 1}, {d_data.data() + 1, 1}, {d_data.data() + 2, 1}}; + auto const d_input = cudf::detail::make_device_uvector_async( + h_input, stream, cudf::get_current_device_resource_ref()); + + std::vector> input( + 10, cudf::device_span{d_input.data(), d_input.size()}); + cudf::make_strings_column_batch(input, stream); +} diff --git a/cpp/tests/streams/strings/find_test.cpp b/cpp/tests/streams/strings/find_test.cpp index 52839c6fc9f..e5a1ee0988c 100644 --- a/cpp/tests/streams/strings/find_test.cpp +++ b/cpp/tests/streams/strings/find_test.cpp @@ -46,4 +46,5 @@ TEST_F(StringsFindTest, Find) auto const pattern = std::string("[a-z]"); auto const prog = cudf::strings::regex_program::create(pattern); cudf::strings::findall(view, *prog, cudf::test::get_default_stream()); + cudf::strings::find_re(view, *prog, cudf::test::get_default_stream()); } diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index bdfd38267e6..cceec1d3537 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -474,6 +474,54 @@ TEST_F(StringsContainsTests, FixedQuantifier) } } +TEST_F(StringsContainsTests, ZeroRangeQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"a", "", "abc", "XYAZ", "ABC", "ZYXA"}); + auto sv = cudf::strings_column_view(input); + + auto pattern = std::string("A{0,}"); // should match everything + auto prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 4, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + + pattern = std::string("(?:ab){0,3}"); + prog = cudf::strings::regex_program::create(pattern); + + { + auto expected = cudf::test::fixed_width_column_wrapper({1, 1, 1, 1, 1, 1}); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto expected = cudf::test::fixed_width_column_wrapper({2, 1, 3, 5, 4, 5}); + auto results = cudf::strings::count_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(StringsContainsTests, 
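/*
 * ZeroRangeQuantifier above pins down {0,} semantics: "A{0,}" is equivalent
 * to "A*", so it matches at every scan position, zero-width wherever no 'A'
 * is present, and contains_re is true for every row. Because each match
 * here (empty or a lone 'A') advances the scan by one position, count_re
 * returns length + 1 per row: "a" -> 2, "" -> 1, "abc" -> 4, "XYAZ" -> 5.
 * With the multi-character "(?:ab){0,3}", a consumed "ab" collapses two
 * positions into one match, so "abc" drops from 4 to 3.
 */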
NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + cudf::test::fixed_width_column_wrapper expected({true, false, false, true}); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::contains_re(sv, *prog); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(StringsContainsTests, QuantifierErrors) { EXPECT_THROW(cudf::strings::regex_program::create("^+"), cudf::logic_error); diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp index 61246fb098d..7e0338f1bf4 100644 --- a/cpp/tests/strings/extract_tests.cpp +++ b/cpp/tests/strings/extract_tests.cpp @@ -19,7 +19,6 @@ #include #include #include -#include #include #include @@ -240,6 +239,21 @@ TEST_F(StringsExtractTests, SpecialNewLines) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); } +TEST_F(StringsExtractTests, NestedQuantifier) +{ + auto input = cudf::test::strings_column_wrapper({"TEST12 1111 2222 3333 4444 5555", + "0000 AAAA 9999 BBBB 8888", + "7777 6666 4444 3333", + "12345 3333 4444 1111 ABCD"}); + auto sv = cudf::strings_column_view(input); + auto pattern = std::string(R"((\d{4}\s){4})"); + auto prog = cudf::strings::regex_program::create(pattern); + auto results = cudf::strings::extract(sv, *prog); + // fixed quantifier on capture group only honors the last group + auto expected = cudf::test::strings_column_wrapper({"4444 ", "", "", "1111 "}, {1, 0, 0, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected); +} + TEST_F(StringsExtractTests, EmptyExtractTest) { std::vector h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""}; diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 90054e41d36..7eb429da7d9 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -44,6 +45,8 @@ struct StringsFactoriesTest : public cudf::test::BaseFixture {}; +using string_pair = thrust::pair; + TEST_F(StringsFactoriesTest, CreateColumnFromPair) { std::vector h_test_strings{"the quick brown fox jumps over the lazy dog", @@ -61,7 +64,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) cudf::size_type count = (cudf::size_type)h_test_strings.size(); thrust::host_vector h_buffer(memsize); rmm::device_uvector d_buffer(memsize, cudf::get_default_stream()); - thrust::host_vector> strings(count); + thrust::host_vector strings(count); thrust::host_vector h_offsets(count + 1); cudf::size_type offset = 0; cudf::size_type nulls = 0; @@ -69,12 +72,12 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair) for (cudf::size_type idx = 0; idx < count; ++idx) { char const* str = h_test_strings[idx]; if (!str) { - strings[idx] = thrust::pair{nullptr, 0}; + strings[idx] = string_pair{nullptr, 0}; nulls++; } else { auto length = (cudf::size_type)strlen(str); memcpy(h_buffer.data() + offset, str, length); - strings[idx] = thrust::pair{d_buffer.data() + offset, length}; + strings[idx] = string_pair{d_buffer.data() + offset, length}; offset += length; } h_offsets[idx + 1] = offset; @@ -201,14 +204,13 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); 
   cudf::test::expect_column_empty(results->view());
-  rmm::device_uvector<thrust::pair<char const*, cudf::size_type>> d_strings{
-    0, cudf::get_default_stream()};
+  rmm::device_uvector<string_pair> d_strings{0, cudf::get_default_stream()};
   results = cudf::make_strings_column(d_strings);
   cudf::test::expect_column_empty(results->view());
 }
 
 namespace {
-using string_pair = thrust::pair<char const*, cudf::size_type>;
+
 struct string_view_to_pair {
   __device__ string_pair operator()(thrust::pair<cudf::string_view, bool> const& p)
   {
@@ -234,3 +236,198 @@ TEST_F(StringsFactoriesTest, StringPairWithNullsAndEmpty)
   auto result = cudf::make_strings_column(pairs);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), data);
 }
+
+struct StringsBatchConstructionTest : public cudf::test::BaseFixture {};
+
+TEST_F(StringsBatchConstructionTest, EmptyColumns)
+{
+  auto constexpr num_columns = 10;
+  auto const stream          = cudf::get_default_stream();
+
+  auto const d_string_pairs = rmm::device_uvector<string_pair>{0, stream};
+  auto const input          = std::vector<cudf::device_span<string_pair const>>(
+    num_columns, {d_string_pairs.data(), d_string_pairs.size()});
+  auto const output = cudf::make_strings_column_batch(input, stream);
+
+  auto const expected_col = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  for (auto const& col : output) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view());
+  }
+}
+
+TEST_F(StringsBatchConstructionTest, AllNullsColumns)
+{
+  auto constexpr num_columns = 10;
+  auto constexpr num_rows    = 100;
+  auto const stream          = cudf::get_default_stream();
+
+  auto d_string_pairs = rmm::device_uvector<string_pair>{num_rows, stream};
+  thrust::uninitialized_fill_n(rmm::exec_policy(stream),
+                               d_string_pairs.data(),
+                               d_string_pairs.size(),
+                               string_pair{nullptr, 0});
+  auto const input = std::vector<cudf::device_span<string_pair const>>(
+    num_columns, {d_string_pairs.data(), d_string_pairs.size()});
+  auto const output = cudf::make_strings_column_batch(input, stream);
+
+  auto const expected_col = cudf::make_strings_column(d_string_pairs);
+  for (auto const& col : output) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col->view(), col->view());
+  }
+}
+
+namespace {
+
+struct index_to_pair {
+  int const num_test_strings;
+  char const* d_chars;
+  std::size_t const* d_offsets;
+  int const* is_null;
+
+  __device__ string_pair operator()(cudf::size_type idx)
+  {
+    auto const data_idx = idx % num_test_strings;
+    return {is_null[data_idx] ? nullptr : d_chars + d_offsets[data_idx],
+            static_cast<cudf::size_type>(d_offsets[data_idx + 1] - d_offsets[data_idx])};
+  }
+};
+
+}  // namespace
+
+TEST_F(StringsBatchConstructionTest, CreateColumnsFromPairs)
+{
+  auto constexpr num_columns  = 10;
+  auto constexpr max_num_rows = 1000;
+  auto const stream           = cudf::get_default_stream();
+  auto const mr               = cudf::get_current_device_resource_ref();
+
+  std::vector<char const*> h_test_strings{"the quick brown fox jumps over the lazy dog",
+                                          "the fat cat lays next to the other accénted cat",
+                                          "a slow moving turtlé cannot catch the bird",
+                                          "which can be composéd together to form a more complete",
+                                          "thé result does not include the value in the sum in",
+                                          "",
+                                          nullptr,
+                                          "absent stop words"};
+  auto const num_test_strings = static_cast<int>(h_test_strings.size());
+
+  std::vector<std::size_t> h_offsets(num_test_strings + 1, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0);
+  }
+
+  std::vector<char> h_chars(h_offsets.back());
+  std::vector<int> is_null(num_test_strings, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    if (h_test_strings[i]) {
+      memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i]));
+    } else {
+      is_null[i] = 1;
+    }
+  }
+
+  auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr);
+  auto const d_chars   = cudf::detail::make_device_uvector_async(h_chars, stream, mr);
+  auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr);
+
+  std::vector<rmm::device_uvector<string_pair>> d_input;
+  std::vector<cudf::device_span<string_pair const>> input;
+  d_input.reserve(num_columns);
+  input.reserve(num_columns);
+
+  for (int col_idx = 0; col_idx < num_columns; ++col_idx) {
+    // Column sizes increase from `max_num_rows / num_columns` to `max_num_rows`.
+    auto const num_rows =
+      static_cast<cudf::size_type>(static_cast<double>(col_idx + 1) / num_columns * max_num_rows);
+
+    auto string_pairs = rmm::device_uvector<string_pair>(num_rows, stream);
+    thrust::tabulate(
+      rmm::exec_policy_nosync(stream),
+      string_pairs.begin(),
+      string_pairs.end(),
+      index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()});
+
+    d_input.emplace_back(std::move(string_pairs));
+    input.emplace_back(d_input.back());
+  }
+
+  auto const output = cudf::make_strings_column_batch(input, stream, mr);
+
+  for (std::size_t i = 0; i < num_columns; ++i) {
+    auto const string_pairs = input[i];
+    auto const expected     = cudf::make_strings_column(string_pairs, stream, mr);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view());
+  }
+}
+
+// The test below requires a huge amount of memory, thus it is disabled by default.
+TEST_F(StringsBatchConstructionTest, DISABLED_CreateLongStringsColumns)
+{
+  auto constexpr num_columns = 2;
+  auto const stream          = cudf::get_default_stream();
+  auto const mr              = cudf::get_current_device_resource_ref();
+
+  std::vector<char const*> h_test_strings{"the quick brown fox jumps over the lazy dog",
+                                          "the fat cat lays next to the other accénted cat",
+                                          "a slow moving turtlé cannot catch the bird",
+                                          "which can be composéd together to form a more complete",
+                                          "thé result does not include the value in the sum in",
+                                          "",
+                                          nullptr,
+                                          "absent stop words"};
+  auto const num_test_strings = static_cast<int>(h_test_strings.size());
+
+  std::vector<std::size_t> h_offsets(num_test_strings + 1, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    h_offsets[i + 1] = h_offsets[i] + (h_test_strings[i] ? strlen(h_test_strings[i]) : 0);
+  }
+
+  std::vector<char> h_chars(h_offsets.back());
+  std::vector<int> is_null(num_test_strings, 0);
+  for (int i = 0; i < num_test_strings; ++i) {
+    if (h_test_strings[i]) {
+      memcpy(h_chars.data() + h_offsets[i], h_test_strings[i], strlen(h_test_strings[i]));
+    } else {
+      is_null[i] = 1;
+    }
+  }
+
+  auto const d_offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr);
+  auto const d_chars   = cudf::detail::make_device_uvector_async(h_chars, stream, mr);
+  auto const d_is_null = cudf::detail::make_device_uvector_async(is_null, stream, mr);
+
+  // If we create a column by repeating h_test_strings `max_cycles` times,
+  // its size will be around (1.5*INT_MAX) bytes.
+  auto const max_cycles = static_cast<int>(static_cast<double>(std::numeric_limits<int>::max()) *
+                                           1.5 / h_offsets.back());
+
+  std::vector<rmm::device_uvector<string_pair>> d_input;
+  std::vector<cudf::device_span<string_pair const>> input;
+  d_input.reserve(num_columns);
+  input.reserve(num_columns);
+
+  for (int col_idx = 0; col_idx < num_columns; ++col_idx) {
+    // Column sizes increase from `max_cycles * num_test_strings / num_columns` to
+    // `max_cycles * num_test_strings`.
+    auto const num_rows = static_cast<cudf::size_type>(static_cast<double>(col_idx + 1) /
+                                                       num_columns * max_cycles * num_test_strings);
+
+    auto string_pairs = rmm::device_uvector<string_pair>(num_rows, stream);
+    thrust::tabulate(
+      rmm::exec_policy_nosync(stream),
+      string_pairs.begin(),
+      string_pairs.end(),
+      index_to_pair{num_test_strings, d_chars.begin(), d_offsets.begin(), d_is_null.begin()});
+
+    d_input.emplace_back(std::move(string_pairs));
+    input.emplace_back(d_input.back());
+  }
+
+  auto const output = cudf::make_strings_column_batch(input, stream, mr);
+
+  for (std::size_t i = 0; i < num_columns; ++i) {
+    auto const string_pairs = input[i];
+    auto const expected     = cudf::make_strings_column(string_pairs, stream, mr);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected->view(), output[i]->view());
+  }
+}
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 73da4d081e2..4821a7fa999 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -149,6 +150,22 @@ TEST_F(StringsFindallTests, LargeRegex)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
+
+TEST_F(StringsFindallTests, FindTest)
+{
+  auto const valids = cudf::test::iterators::null_at(5);
+  cudf::test::strings_column_wrapper input(
+    {"3A", "May4", "Jan2021", "March", "A9BC", "", "", "abcdef ghijklm 12345"}, valids);
+  auto sv = cudf::strings_column_view(input);
+
+  auto pattern = std::string("\\d+");
+
+  auto prog    = cudf::strings::regex_program::create(pattern);
+  auto results = cudf::strings::find_re(sv, *prog);
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 3, 3, -1, 1, 0, -1, 15}, valids);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+}
+
 TEST_F(StringsFindallTests, NoMatches)
 {
   cudf::test::strings_column_wrapper input({"abc\nfff\nabc", "fff\nabc\nlll", "abc", "", "abc\n"});
@@ -169,10 +186,16 @@ TEST_F(StringsFindallTests, EmptyTest)
   auto prog = cudf::strings::regex_program::create(pattern);
 
   cudf::test::strings_column_wrapper input;
-  auto sv      = cudf::strings_column_view(input);
-  auto results = cudf::strings::findall(sv, *prog);
-
-  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected;
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  auto sv = cudf::strings_column_view(input);
+  {
+    auto results = cudf::strings::findall(sv, *prog);
+    using LCW    = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected;
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  }
+  {
+    auto results  = cudf::strings::find_re(sv, *prog);
+    auto expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{};
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
+  }
 }
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index ce5f68de3c9..26bcfe8028d 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -30,6 +30,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -425,7 +426,7 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex)
   if (v == 0) { return std::string("00"); }  // special handling for single-byte types
   if constexpr (std::is_same_v<TypeParam, int8_t> || std::is_same_v<TypeParam, uint8_t>) {
-    char const hex_digits[16] = {
+    std::array const hex_digits = {
       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};
     std::string str;
     str += hex_digits[(v & 0xF0) >> 4];
diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp
index 9847d8d6bb5..abc12b00a81 100644
--- a/cpp/tests/strings/replace_regex_tests.cpp
+++ b/cpp/tests/strings/replace_regex_tests.cpp
@@ -200,6 +200,34 @@ TEST_F(StringsReplaceRegexTest, ZeroLengthMatch)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
+
+TEST_F(StringsReplaceRegexTest, ZeroRangeQuantifier)
+{
+  auto input = cudf::test::strings_column_wrapper({"a", "", "123", "XYAZ", "abc", "zéyab"});
+  auto sv    = cudf::strings_column_view(input);
+
+  auto pattern  = std::string("A{0,5}");
+  auto prog     = cudf::strings::regex_program::create(pattern);
+  auto repl     = cudf::string_scalar("_");
+  auto expected = cudf::test::strings_column_wrapper(
+    {"_a_", "_", "_1_2_3_", "_X_Y__Z_", "_a_b_c_", "_z_é_y_a_b_"});
+  auto results = cudf::strings::replace_re(sv, *prog, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  pattern = std::string("[a0-9]{0,2}");
+  prog    = cudf::strings::regex_program::create(pattern);
+  expected =
+    cudf::test::strings_column_wrapper({"__", "_", "___", "_X_Y_A_Z_", "__b_c_", "_z_é_y__b_"});
+  results = cudf::strings::replace_re(sv, *prog, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  pattern = std::string("(?:ab){0,3}");
+  prog    = cudf::strings::regex_program::create(pattern);
+  expected =
+    cudf::test::strings_column_wrapper({"_a_", "_", "_1_2_3_", "_X_Y_A_Z_", "__c_", "_z_é_y__"});
+  results = cudf::strings::replace_re(sv, *prog, repl);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsReplaceRegexTest, Multiline)
 {
   auto const multiline = cudf::strings::regex_flags::MULTILINE;
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index f0010fc1ed9..219bd6d8b01 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -635,9 +635,8 @@ TEST_F(StructColumnWrapperTest, TestStructsColumnWithEmptyChild)
   auto mask_vec = std::vector{true, false, false};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(mask_vec.begin(), mask_vec.end());
-  auto structs_col =
-    cudf::make_structs_column(num_rows, std::move(cols), null_count, std::move(null_mask));
-  EXPECT_NO_THROW(structs_col->view());
+  EXPECT_NO_THROW(auto structs_col = cudf::make_structs_column(
+                    num_rows, std::move(cols), null_count, std::move(null_mask)));
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/transform/bools_to_mask_test.cpp b/cpp/tests/transform/bools_to_mask_test.cpp
index 215ca158f37..2684123c08a 100644
--- a/cpp/tests/transform/bools_to_mask_test.cpp
+++ b/cpp/tests/transform/bools_to_mask_test.cpp
@@ -32,7 +32,7 @@ struct MaskToNullTest : public cudf::test::BaseFixture {
   {
     cudf::test::fixed_width_column_wrapper<bool> input_column(
       input.begin(), input.end(), val.begin());
-    std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<bool>());
+    std::transform(val.begin(), val.end(), input.begin(), input.begin(), std::logical_and<>());
 
     auto sample = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
diff --git a/cpp/tests/transform/integration/unary_transform_test.cpp
b/cpp/tests/transform/integration/unary_transform_test.cpp index 1785848ec77..0bdf5b321ac 100644 --- a/cpp/tests/transform/integration/unary_transform_test.cpp +++ b/cpp/tests/transform/integration/unary_transform_test.cpp @@ -47,7 +47,7 @@ void test_udf(char const* udf, Op op, Data data_init, cudf::size_type size, bool TEST_F(UnaryOperationIntegrationTest, Transform_FP32_FP32) { // c = a*a*a*a - char const* cuda = + std::string const cuda = R"***( __device__ inline void fdsf ( float* C, @@ -58,7 +58,7 @@ __device__ inline void fdsf ( } )***"; - char const* ptx = + std::string const ptx = R"***( // // Generated by NVIDIA NVVM Compiler @@ -101,17 +101,17 @@ __device__ inline void fdsf ( auto op = [](dtype a) { return a * a * a * a; }; auto data_init = [](cudf::size_type row) { return row % 3; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) { // c = a * a - a - char const cuda[] = + std::string const cuda = "__device__ inline void f(int* output,int input){*output = input*input - input;}"; - char const* ptx = + std::string const ptx = R"***( .func _Z1fPii( .param .b64 _Z1fPii_param_0, @@ -136,8 +136,8 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT32_INT32) auto op = [](dtype a) { return a * a - a; }; auto data_init = [](cudf::size_type row) { return row % 78; }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) @@ -145,7 +145,7 @@ TEST_F(UnaryOperationIntegrationTest, Transform_INT8_INT8) // Capitalize all the lower case letters // Assuming ASCII, the PTX code is compiled from the following CUDA code - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f( signed char* output, @@ -159,7 +159,7 @@ __device__ inline void f( } )***"; - char const ptx[] = + std::string const ptx = R"***( .func _Z1fPcc( .param .b64 _Z1fPcc_param_0, @@ -191,15 +191,15 @@ __device__ inline void f( auto op = [](dtype a) { return std::toupper(a); }; auto data_init = [](cudf::size_type row) { return 'a' + (row % 26); }; - test_udf(cuda, op, data_init, 500, false); - test_udf(ptx, op, data_init, 500, true); + test_udf(cuda.c_str(), op, data_init, 500, false); + test_udf(ptx.c_str(), op, data_init, 500, true); } TEST_F(UnaryOperationIntegrationTest, Transform_Datetime) { // Add one day to timestamp in microseconds - char const cuda[] = + std::string const cuda = R"***( __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) { @@ -217,7 +217,7 @@ __device__ inline void f(cudf::timestamp_us* output, cudf::timestamp_us input) auto random_eng = cudf::test::UniformRandomGenerator(0, 100000000); auto data_init = [&random_eng](cudf::size_type row) { return random_eng.generate(); }; - test_udf(cuda, op, data_init, 500, false); + test_udf(cuda.c_str(), op, data_init, 500, false); } } // namespace transformation diff --git a/cpp/tests/utilities/table_utilities.cu b/cpp/tests/utilities/table_utilities.cu index 354c0b1b57e..8e4906408de 100644 --- a/cpp/tests/utilities/table_utilities.cu +++ b/cpp/tests/utilities/table_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,10 +15,9 @@ */ #include +#include #include -#include - namespace cudf::test::detail { void expect_table_properties_equal(cudf::table_view lhs, cudf::table_view rhs) { diff --git a/cpp/tests/utilities_tests/batched_memcpy_tests.cu b/cpp/tests/utilities_tests/batched_memcpy_tests.cu new file mode 100644 index 00000000000..98657f8e224 --- /dev/null +++ b/cpp/tests/utilities_tests/batched_memcpy_tests.cu @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +template +struct BatchedMemcpyTest : public cudf::test::BaseFixture {}; + +TEST(BatchedMemcpyTest, BasicTest) +{ + using T1 = int64_t; + + // Device init + auto stream = cudf::get_default_stream(); + auto mr = cudf::get_current_device_resource_ref(); + + // Buffer lengths (in number of elements) + std::vector const h_lens{ + 50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000}; + + // Total number of buffers + auto const num_buffs = h_lens.size(); + + // Exclusive sum of buffer lengths for pointers + std::vector h_lens_excl_sum(num_buffs); + std::exclusive_scan(h_lens.begin(), h_lens.end(), h_lens_excl_sum.begin(), 0); + + // Corresponding buffer sizes (in bytes) + std::vector h_sizes_bytes; + h_sizes_bytes.reserve(num_buffs); + std::transform( + h_lens.cbegin(), h_lens.cend(), std::back_inserter(h_sizes_bytes), [&](auto& size) { + return size * sizeof(T1); + }); + + // Initialize random engine + auto constexpr seed = 0xcead; + std::mt19937 engine{seed}; + using uniform_distribution = + typename std::conditional_t, + std::bernoulli_distribution, + std::conditional_t, + std::uniform_real_distribution, + std::uniform_int_distribution>>; + uniform_distribution dist{}; + + // Generate a src vector of random data vectors + std::vector> h_sources; + h_sources.reserve(num_buffs); + std::transform(h_lens.begin(), h_lens.end(), std::back_inserter(h_sources), [&](auto size) { + std::vector data(size); + std::generate_n(data.begin(), size, [&]() { return T1{dist(engine)}; }); + return data; + }); + // Copy the vectors to device + std::vector> h_device_vecs; + h_device_vecs.reserve(h_sources.size()); + std::transform( + h_sources.begin(), h_sources.end(), std::back_inserter(h_device_vecs), [stream, mr](auto& vec) { + return cudf::detail::make_device_uvector_async(vec, stream, mr); + }); + // Pointers to the source vectors + std::vector h_src_ptrs; + h_src_ptrs.reserve(h_sources.size()); + std::transform( + h_device_vecs.begin(), h_device_vecs.end(), std::back_inserter(h_src_ptrs), [](auto& vec) { + return static_cast(vec.data()); + }); + // Copy the source data pointers to device + auto d_src_ptrs = 
cudf::detail::make_device_uvector_async(h_src_ptrs, stream, mr); + + // Total number of elements in all buffers + auto const total_buff_len = std::accumulate(h_lens.cbegin(), h_lens.cend(), 0); + + // Create one giant buffer for destination + auto d_dst_data = cudf::detail::make_zeroed_device_uvector_async(total_buff_len, stream, mr); + // Pointers to destination buffers within the giant destination buffer + std::vector h_dst_ptrs(num_buffs); + std::for_each(thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(num_buffs), + [&](auto i) { return h_dst_ptrs[i] = d_dst_data.data() + h_lens_excl_sum[i]; }); + // Copy destination data pointers to device + auto d_dst_ptrs = cudf::detail::make_device_uvector_async(h_dst_ptrs, stream, mr); + + // Copy buffer size iterators (in bytes) to device + auto d_sizes_bytes = cudf::detail::make_device_uvector_async(h_sizes_bytes, stream, mr); + + // Run the batched memcpy + cudf::detail::batched_memcpy_async( + d_src_ptrs.begin(), d_dst_ptrs.begin(), d_sizes_bytes.begin(), num_buffs, stream); + + // Expected giant destination buffer after the memcpy + std::vector expected_buffer; + expected_buffer.reserve(total_buff_len); + std::for_each(h_sources.cbegin(), h_sources.cend(), [&expected_buffer](auto& source) { + expected_buffer.insert(expected_buffer.end(), source.begin(), source.end()); + }); + + // Copy over the result destination buffer to host and synchronize the stream + auto result_dst_buffer = + cudf::detail::make_std_vector_sync(cudf::device_span(d_dst_data), stream); + + // Check if both vectors are equal + EXPECT_TRUE( + std::equal(expected_buffer.begin(), expected_buffer.end(), result_dst_buffer.begin())); +} diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu index bed0f40d70e..0eeb7b95318 100644 --- a/cpp/tests/utilities_tests/batched_memset_tests.cu +++ b/cpp/tests/utilities_tests/batched_memset_tests.cu @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include #include #include @@ -78,7 +78,7 @@ TEST(MultiBufferTestIntegral, BasicTest1) }); // Function Call - cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream); + cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream); // Set all buffer regions to 0 for expected comparison std::for_each( diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index d052e20eedb..cfab570833b 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -28,16 +28,17 @@ class LoggerTest : public cudf::test::BaseFixture { std::vector prev_sinks; public: - LoggerTest() : prev_level{cudf::logger().level()}, prev_sinks{cudf::logger().sinks()} + LoggerTest() + : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()} { - cudf::logger().sinks() = {std::make_shared(oss)}; - cudf::logger().set_formatter( + cudf::detail::logger().sinks() = {std::make_shared(oss)}; + cudf::detail::logger().set_formatter( std::unique_ptr(new spdlog::pattern_formatter("%v"))); } ~LoggerTest() override { - cudf::logger().set_level(prev_level); - cudf::logger().sinks() = prev_sinks; + cudf::detail::logger().set_level(prev_level); + cudf::detail::logger().sinks() = prev_sinks; } void clear_sink() { oss.str(""); } @@ -46,32 +47,32 @@ class LoggerTest : public cudf::test::BaseFixture { TEST_F(LoggerTest, Basic) { - cudf::logger().critical("crit msg"); + cudf::detail::logger().critical("crit msg"); 
ASSERT_EQ(this->sink_content(), "crit msg\n"); } TEST_F(LoggerTest, DefaultLevel) { - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); - cudf::logger().info("info"); - cudf::logger().warn("warn"); - cudf::logger().error("error"); - cudf::logger().critical("critical"); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); + cudf::detail::logger().error("error"); + cudf::detail::logger().critical("critical"); ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) { - cudf::logger().set_level(spdlog::level::warn); - cudf::logger().info("info"); - cudf::logger().warn("warn"); + cudf::detail::logger().set_level(spdlog::level::warn); + cudf::detail::logger().info("info"); + cudf::detail::logger().warn("warn"); ASSERT_EQ(this->sink_content(), "warn\n"); this->clear_sink(); - cudf::logger().set_level(spdlog::level::debug); - cudf::logger().trace("trace"); - cudf::logger().debug("debug"); + cudf::detail::logger().set_level(spdlog::level::debug); + cudf::detail::logger().trace("trace"); + cudf::detail::logger().debug("debug"); ASSERT_EQ(this->sink_content(), "debug\n"); } diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index ae7c6fa8b8c..7b8ee840da4 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/utilities/hostdevice_vector.hpp" + #include #include #include @@ -22,10 +24,17 @@ #include #include #include +#include #include #include +using cudf::host_span; +using cudf::detail::host_2dspan; +using cudf::detail::hostdevice_2dvector; +using cudf::detail::hostdevice_span; +using cudf::detail::hostdevice_vector; + class PinnedMemoryTest : public cudf::test::BaseFixture { size_t prev_copy_threshold; size_t prev_alloc_threshold; @@ -125,3 +134,63 @@ TEST_F(PinnedMemoryTest, MakeHostVector) EXPECT_FALSE(vec.get_allocator().is_device_accessible()); } } + +TEST_F(PinnedMemoryTest, HostSpan) +{ + auto test_ctors = [](auto&& vec) { + auto const is_vec_device_accessible = vec.get_allocator().is_device_accessible(); + // Test conversion from a vector + auto const span = host_span{vec}; + EXPECT_EQ(span.is_device_accessible(), is_vec_device_accessible); + // Test conversion from host_span with different type + auto const span_converted = host_span{span}; + EXPECT_EQ(span_converted.is_device_accessible(), is_vec_device_accessible); + }; + + cudf::set_allocate_host_as_pinned_threshold(7); + for (int i = 1; i < 10; i++) { + // some iterations will use pinned memory, some will not + test_ctors(cudf::detail::make_host_vector(i, cudf::get_default_stream())); + } + + auto stream{cudf::get_default_stream()}; + + // hostdevice vectors use pinned memory for the host side; test that host_span can be constructed + // from a hostdevice_vector with correct device accessibility + + hostdevice_vector hd_vec(10, stream); + auto const span = host_span{hd_vec}; + EXPECT_TRUE(span.is_device_accessible()); + + // test host_view and operator[] + { + hostdevice_2dvector hd_2dvec(10, 10, stream); + auto const span2d = hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(span2d.is_device_accessible()); + + auto const span2d_from_cast = host_2dspan{hd_2dvec}; + EXPECT_TRUE(span2d_from_cast.flat_view().is_device_accessible()); + + auto const row_span = hd_2dvec[0]; + 
EXPECT_TRUE(row_span.is_device_accessible()); + } + + // test const versions of host_view and operator[] + { + hostdevice_2dvector const const_hd_2dvec(10, 10, stream); + auto const const_span2d = const_hd_2dvec.host_view().flat_view(); + EXPECT_TRUE(const_span2d.is_device_accessible()); + + auto const const_span2d_from_cast = host_2dspan{const_hd_2dvec}; + EXPECT_TRUE(const_span2d_from_cast.flat_view().is_device_accessible()); + + auto const const_row_span = const_hd_2dvec[0]; + EXPECT_TRUE(const_row_span.is_device_accessible()); + } + + // test hostdevice_span + { + hostdevice_span hd_span(hd_vec); + EXPECT_TRUE(host_span{hd_span}.is_device_accessible()); + } +} diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu index 019d6adc007..5389e1c069d 100644 --- a/cpp/tests/utilities_tests/span_tests.cu +++ b/cpp/tests/utilities_tests/span_tests.cu @@ -336,58 +336,50 @@ auto get_test_hostdevice_vector() TEST(HostDeviceSpanTest, CanCreateFullSubspan) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_equivalent(message_span, message.subspan(0, message_span.size())); + expect_equivalent(message_span.subspan(0, message_span.size()), message_span); } TEST(HostDeviceSpanTest, CanCreateHostSpan) { auto message = get_test_hostdevice_vector(); auto const message_span = host_span(message.host_ptr(), message.size()); - auto const hd_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto const hd_span = cudf::detail::hostdevice_span{message}; expect_equivalent(message_span, cudf::host_span(hd_span)); } TEST(HostDeviceSpanTest, CanTakeSubspanFull) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("hello world", message.subspan(0, 11)); expect_match("hello world", message_span.subspan(0, 11)); } TEST(HostDeviceSpanTest, CanTakeSubspanPartial) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; - expect_match("lo w", message.subspan(3, 4)); expect_match("lo w", message_span.subspan(3, 4)); } TEST(HostDeviceSpanTest, CanGetData) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; EXPECT_EQ(message.host_ptr(), message_span.host_ptr()); } TEST(HostDeviceSpanTest, CanGetSize) { - auto message = get_test_hostdevice_vector(); - auto const message_span = - cudf::detail::hostdevice_span(message.host_ptr(), message.device_ptr(), message.size()); - auto const empty_span = cudf::detail::hostdevice_span(); + auto message = get_test_hostdevice_vector(); + auto const message_span = cudf::detail::hostdevice_span{message}; + auto const empty_span = cudf::detail::hostdevice_span(); 
   EXPECT_EQ(static_cast<std::size_t>(11), message_span.size());
   EXPECT_EQ(static_cast<std::size_t>(0), empty_span.size());
@@ -413,8 +405,7 @@ TEST(HostDeviceSpanTest, CanCopySpan)
   cudf::detail::hostdevice_span<char> message_span_copy;
 
   {
-    auto const message_span =
-      cudf::detail::hostdevice_span<char>(message.host_ptr(), message.device_ptr(), message.size());
+    auto const message_span = cudf::detail::hostdevice_span{message};
     message_span_copy = message_span;
   }
diff --git a/dependencies.yaml b/dependencies.yaml
index ed36a23e5c3..4804f7b00b0 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -6,15 +6,24 @@ files:
       cuda: ["11.8", "12.5"]
       arch: [x86_64]
     includes:
+      # Note that clang-tidy is not included here because cudf's preferred
+      # version conflicts with the rest of RAPIDS as well as its own
+      # clang-format version. Until we update our clang-format version we will
+      # not be able to install both into the same environment. Moreover, using
+      # this version will break compatibility with other RAPIDS libraries that
+      # are still using 16.0.6, and as such would break any unified
+      # environment like that used in devcontainers.
       - build_base
       - build_all
       - build_cpp
       - build_python_common
+      - clang_format
       - cuda
       - cuda_version
      - depends_on_cupy
      - depends_on_libkvikio
      - depends_on_librmm
+      - depends_on_nvcomp
      - depends_on_rmm
      - develop
      - docs
@@ -85,6 +94,16 @@ files:
     includes:
       - develop
       - py_version
+  clang_tidy:
+    output: none
+    includes:
+      - build_all
+      - build_base
+      - clang_tidy
+      - cuda
+      - cuda_version
+      - develop
+      - py_version
   docs:
     output: none
     includes:
@@ -152,6 +171,13 @@ files:
       - build_cpp
       - depends_on_libkvikio
       - depends_on_librmm
+  py_run_libcudf:
+    output: pyproject
+    pyproject_dir: python/libcudf
+    extras:
+      table: project
+    includes:
+      - depends_on_nvcomp
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -367,9 +393,27 @@ dependencies:
           - fmt>=11.0.2,<12
          - flatbuffers==24.3.25
          - librdkafka>=2.5.0,<2.6.0a0
+          - spdlog>=1.14.1,<1.15
+  depends_on_nvcomp:
+    common:
+      - output_types: conda
+        packages:
          # Align nvcomp version with rapids-cmake
          - nvcomp==4.0.1
-          - spdlog>=1.14.1,<1.15
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+            packages:
+              - nvidia-nvcomp-cu12==4.0.1
+          - matrix:
+              cuda: "11.*"
+            packages:
+              - nvidia-nvcomp-cu11==4.0.1
+          - matrix:
+            packages:
+              - nvidia-nvcomp==4.0.1
   rapids_build_skbuild:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -395,9 +439,18 @@ dependencies:
       - cython>=3.0.3
   pyarrow_run:
     common:
-      - output_types: [conda, requirements, pyproject]
+      - output_types: [conda]
         packages:
           - pyarrow>=14.0.0,<18.0.0a0
+      - output_types: [requirements, pyproject]
+        packages:
+          # pyarrow 17.0.0 wheels have a subtle issue around threading that
+          # can cause segmentation faults around imports on arm. It appears to
+          # be highly dependent on the exact build configuration, so we'll just
+          # avoid 17.0.0 for now unless we observe similar issues in future
+          # releases as well.
+          - pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'
+          - pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'
   cuda_version:
     specific:
       - output_types: conda
@@ -518,11 +571,21 @@ dependencies:
          # pre-commit requires identify minimum version 1.0, but clang-format requires textproto support and that was
          # added in 2.5.20, so we need to call out the minimum version needed for our plugins
          - identify>=2.5.20
+      - output_types: conda
+        packages:
+          - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version.
+ clang_format: + common: - output_types: conda packages: - clang==16.0.6 - clang-tools=16.0.6 - - &doxygen doxygen=1.9.1 # pre-commit hook needs a specific version. + clang_tidy: + common: + - output_types: conda + packages: + - clang==19.1.0 + - clang-tools==19.1.0 docs: common: - output_types: [conda] @@ -576,7 +639,7 @@ dependencies: packages: - fsspec>=0.6.0 - &numpy numpy>=1.23,<3.0a0 - - pandas>=2.0,<2.2.3dev0 + - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: - output_types: [conda, requirements, pyproject] @@ -664,7 +727,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.8,<1.9 + - polars>=1.11,<1.12 run_dask_cudf: common: - output_types: [conda, requirements, pyproject] @@ -722,6 +785,10 @@ dependencies: packages: - *numba-cuda-dep - pandas==2.0.* + - matrix: {dependencies: "latest"} + packages: + - numba-cuda==0.0.15 + - pandas==2.2.3 - matrix: packages: - output_types: conda diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 95813907bf4..5942cc16850 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -342,7 +342,7 @@ def clean_all_xml_files(path): "cudf.Series": ("cudf.core.series.Series", "cudf.Series"), "cudf.Index": ("cudf.core.index.Index", "cudf.Index"), "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"), - "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), + "DeviceBuffer": ("rmm.pylibrmm.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"), } diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 34b657488c1..5024747227e 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -181,6 +181,32 @@ There are a few known limitations that you should be aware of: ``` - `cudf.pandas` (and cuDF in general) is only compatible with pandas 2. Version 24.02 of cudf was the last to support pandas 1.5.x. +- In order for `cudf.pandas` to produce a proxy array that ducktypes as a NumPy + array, we create a proxy type that actually subclasses `numpy.ndarray`. We can + verify this with an isinstance check. + + ```python + %load_ext cudf.pandas + import pandas as pd + import numpy as np + + arr = pd.Series([1, 1, 2]).unique() # returns a proxy array + isinstance(arr, np.ndarray) # returns True, where arr is a proxy array + ``` + Because the proxy type ducktypes as a NumPy array, NumPy functions may attempt to + access internal members, such as the [data buffer](https://numpy.org/doc/stable/dev/internals.html#internal-organization-of-numpy-arrays), via the NumPy C API. + However, our proxy mechanism is designed to proxy function calls at the Python + level, which is incompatible with these types of accesses. To handle these + situations, we perform an eager device-to-host (DtoH) copy, which sets the data + buffer correctly but incurs the cost of extra time when creating the proxy array. + In the previous example, creating `arr` performed this kind of implicit DtoH transfer. + + With this approach, we also get compatibility with third party libraries like `torch`. + + ```python + import torch + x = torch.from_numpy(arr) + ``` ## Can I force running on the CPU? 
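The FAQ addition above explains that the `cudf.pandas` proxy array subclasses `numpy.ndarray` and performs an eager device-to-host copy so that consumers of the NumPy C API see a valid data buffer. A minimal sketch of the same flow as a plain script rather than an IPython session (assumes a CUDA-capable GPU with cudf installed; `cudf.pandas.install()` is the script-mode equivalent of the `%load_ext cudf.pandas` magic):

```python
# Sketch only: exercising the cudf.pandas proxy array outside IPython.
import cudf.pandas

cudf.pandas.install()  # must run before pandas is imported

import numpy as np
import pandas as pd

arr = pd.Series([1, 1, 2]).unique()  # proxy array; the implicit DtoH copy happens here
assert isinstance(arr, np.ndarray)   # the proxy type subclasses numpy.ndarray
print(np.add.reduce(arr))            # ufuncs read the data buffer via the NumPy C API
```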
diff --git a/docs/cudf/source/developer_guide/contributing_guide.md b/docs/cudf/source/developer_guide/contributing_guide.md
index 6fce268f309..f4d2c7319b3 100644
--- a/docs/cudf/source/developer_guide/contributing_guide.md
+++ b/docs/cudf/source/developer_guide/contributing_guide.md
@@ -15,8 +15,7 @@ Developers are strongly recommended to set up `pre-commit` prior to any developm
 The `.pre-commit-config.yaml` file at the root of the repo is the primary source of truth for linting.
 Specifically, cuDF uses the following tools:
-- [`ruff`](https://beta.ruff.rs/) checks for general code formatting compliance.
-- [`isort`](https://pycqa.github.io/isort/) ensures imports are sorted consistently.
+- [`ruff`](https://docs.astral.sh/ruff/) checks for general code formatting compliance.
 - [`mypy`](http://mypy-lang.org/) performs static type checking.
   In conjunction with [type hints](https://docs.python.org/3/library/typing.html), `mypy` can help catch various bugs that are otherwise difficult to find.
diff --git a/docs/cudf/source/developer_guide/testing.md b/docs/cudf/source/developer_guide/testing.md
index f12f809d5db..22cc1b5b8de 100644
--- a/docs/cudf/source/developer_guide/testing.md
+++ b/docs/cudf/source/developer_guide/testing.md
@@ -7,6 +7,23 @@ specifically the [`pytest-cov`](https://github.com/pytest-dev/pytest-cov) plugin
 Code coverage reports are uploaded to [Codecov](https://app.codecov.io/gh/rapidsai/cudf).
 Each PR also indicates whether it increases or decreases test coverage.
+
+### Configuring pytest
+
+Pytest will accept configuration in [multiple different
+files](https://docs.pytest.org/en/stable/reference/customize.html),
+with a specified discovery and precedence order. Note in particular
+that there is no automatic "include" mechanism: as soon as a matching
+configuration file is found, discovery stops.
+
+For preference, so that all tool configuration lives in the same
+place, we use `pyproject.toml`-based configuration. Test configuration
+for a given package should live in that package's `pyproject.toml`
+file.
+
+Where tests do not naturally belong to a project, for example the
+`cudf.pandas` integration tests and the cuDF benchmarks, use a
+`pytest.ini` file as close to the tests as possible.
+
 ## Test organization
 
 How tests are organized depends on which of the following two groups they fall into:
diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index 95f5f9734dd..46221b6015b 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -38,10 +38,10 @@
     "import os\n",
     "\n",
     "import cupy as cp\n",
+    "import dask_cudf\n",
     "import pandas as pd\n",
     "\n",
     "import cudf\n",
-    "import dask_cudf\n",
     "\n",
     "cp.random.seed(12)\n",
     "\n",
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index e21536e2e97..62e14a67ee5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
    groupby
    interop
    join
+   json
    labeling
    lists
    merge
@@ -49,3 +50,4 @@ This page provides API documentation for pylibcudf.
    io/index.rst
    strings/index.rst
io/index.rst strings/index.rst + nvtext/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst new file mode 100644 index 00000000000..bb38d179a57 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/json.rst @@ -0,0 +1,6 @@ +==== +json +==== + +.. automodule:: pylibcudf.json + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst new file mode 100644 index 00000000000..abb45e426a8 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/edit_distance.rst @@ -0,0 +1,6 @@ +============= +edit_distance +============= + +.. automodule:: pylibcudf.nvtext.edit_distance + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst new file mode 100644 index 00000000000..d68199271bd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/generate_ngrams.rst @@ -0,0 +1,6 @@ +=============== +generate_ngrams +=============== + +.. automodule:: pylibcudf.nvtext.generate_ngrams + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst new file mode 100644 index 00000000000..e0735a197fd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/index.rst @@ -0,0 +1,14 @@ +nvtext +====== + +.. toctree:: + :maxdepth: 1 + + edit_distance + generate_ngrams + jaccard + minhash + ngrams_tokenize + normalize + replace + stemmer diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst new file mode 100644 index 00000000000..ea59657c25e --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/jaccard.rst @@ -0,0 +1,6 @@ +======= +jaccard +======= + +.. automodule:: pylibcudf.nvtext.jaccard + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst new file mode 100644 index 00000000000..b8ec02fca35 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/minhash.rst @@ -0,0 +1,6 @@ +======= +minhash +======= + +.. automodule:: pylibcudf.nvtext.minhash + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst new file mode 100644 index 00000000000..ce6db76f889 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/ngrams_tokenize.rst @@ -0,0 +1,6 @@ +=============== +ngrams_tokenize +=============== + +.. automodule:: pylibcudf.nvtext.ngrams_tokenize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst new file mode 100644 index 00000000000..e496f6a45da --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/normalize.rst @@ -0,0 +1,6 @@ +========= +normalize +========= + +.. 
automodule:: pylibcudf.nvtext.normalize + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst new file mode 100644 index 00000000000..04cee972dc1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/replace.rst @@ -0,0 +1,6 @@ +======= +replace +======= + +.. automodule:: pylibcudf.nvtext.replace + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst new file mode 100644 index 00000000000..b407ff8451a --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/nvtext/stemmer.rst @@ -0,0 +1,6 @@ +======= +stemmer +======= + +.. automodule:: pylibcudf.nvtext.stemmer + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst new file mode 100644 index 00000000000..38a46641200 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/combine.rst @@ -0,0 +1,6 @@ +======= +combine +======= + +.. automodule:: pylibcudf.strings.combine + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst new file mode 100644 index 00000000000..de62221456f --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_booleans.rst @@ -0,0 +1,6 @@ +================ +convert_booleans +================ + +.. automodule:: pylibcudf.strings.convert.convert_booleans + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst new file mode 100644 index 00000000000..fc5d5204ab3 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_datetime.rst @@ -0,0 +1,6 @@ +================ +convert_datetime +================ + +.. automodule:: pylibcudf.strings.convert.convert_datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst new file mode 100644 index 00000000000..e80b0c15a61 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_durations.rst @@ -0,0 +1,6 @@ +================= +convert_durations +================= + +.. automodule:: pylibcudf.strings.convert.convert_durations + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst new file mode 100644 index 00000000000..16d971a6849 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_fixed_point.rst @@ -0,0 +1,6 @@ +=================== +convert_fixed_point +=================== + +.. 
automodule:: pylibcudf.strings.convert.convert_fixed_point + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst new file mode 100644 index 00000000000..9ae4004cea9 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_floats.rst @@ -0,0 +1,6 @@ +============== +convert_floats +============== + +.. automodule:: pylibcudf.strings.convert.convert_floats + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst new file mode 100644 index 00000000000..71d146c0379 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_integers.rst @@ -0,0 +1,6 @@ +================ +convert_integers +================ + +.. automodule:: pylibcudf.strings.convert.convert_integers + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst new file mode 100644 index 00000000000..4ead8677a69 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_ipv4.rst @@ -0,0 +1,6 @@ +============ +convert_ipv4 +============ + +.. automodule:: pylibcudf.strings.convert.convert_ipv4 + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst new file mode 100644 index 00000000000..33a719a42e1 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_lists.rst @@ -0,0 +1,6 @@ +============= +convert_lists +============= + +.. automodule:: pylibcudf.strings.convert.convert_lists + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst new file mode 100644 index 00000000000..f20d95e0cdd --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/convert_urls.rst @@ -0,0 +1,6 @@ +============ +convert_urls +============ + +.. automodule:: pylibcudf.strings.convert.convert_urls + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst new file mode 100644 index 00000000000..3d07c1271b4 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/convert/index.rst @@ -0,0 +1,15 @@ +convert +======= + +.. toctree:: + :maxdepth: 1 + + convert_booleans + convert_datetime + convert_durations + convert_fixed_point + convert_floats + convert_integers + convert_ipv4 + convert_lists + convert_urls diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst new file mode 100644 index 00000000000..8e86b33b1a0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find_multiple.rst @@ -0,0 +1,6 @@ +============= +find_multiple +============= + +.. 
automodule:: pylibcudf.strings.find_multiple + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst index 9b1a6b72a88..ae670b5bd8a 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst @@ -6,13 +6,26 @@ strings capitalize char_types + combine contains extract find + find_multiple findall + padding regex_flags regex_program repeat + replace_re replace + side_type slice + split strip + wrap + +.. toctree:: + :maxdepth: 2 + :caption: Subpackages + + convert/index.rst diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst new file mode 100644 index 00000000000..5b417024fd5 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/padding.rst @@ -0,0 +1,6 @@ +======= +padding +======= + +.. automodule:: pylibcudf.strings.padding + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst new file mode 100644 index 00000000000..5bf715ef657 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace_re.rst @@ -0,0 +1,6 @@ +========== +replace_re +========== + +.. automodule:: pylibcudf.strings.replace_re + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst new file mode 100644 index 00000000000..d5aef9c4f75 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/side_type.rst @@ -0,0 +1,6 @@ +========= +side_type +========= + +.. automodule:: pylibcudf.strings.side_type + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst new file mode 100644 index 00000000000..cba96e86f45 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/split.rst @@ -0,0 +1,6 @@ +===== +split +===== + +.. automodule:: pylibcudf.strings.split + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst new file mode 100644 index 00000000000..bd825f78568 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/wrap.rst @@ -0,0 +1,6 @@ +==== +wrap +==== + +.. automodule:: pylibcudf.strings.wrap + :members: diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 75eafcc5387..abfe5a1b178 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -101,6 +101,8 @@ "outputs": [], "source": [ "# define a scalar function\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -247,6 +249,8 @@ "outputs": [], "source": [ "# redefine the same function from above\n", + "\n", + "\n", "def f(x):\n", " return x + 1" ] @@ -1622,6 +1626,8 @@ "outputs": [], "source": [ "# a user defined aggregation function.\n", + "\n", + "\n", "def udaf(df):\n", " return df[\"b\"].max() - df[\"b\"].min() / 2" ] diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 5a429bdc739..4b5379cf0f1 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -64,7 +64,8 @@ cmake .. 
-G"${CMAKE_GENERATOR}" \ -DBUILD_TESTS=$BUILD_CPP_TESTS \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS \ -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL \ - -DBUILD_SHARED_LIBS=OFF + -DBUILD_SHARED_LIBS=OFF \ + -DKvikIO_REMOTE_SUPPORT=OFF if [[ -z "${PARALLEL_LEVEL}" ]]; then cmake --build . diff --git a/java/src/main/java/ai/rapids/cudf/RegexFlag.java b/java/src/main/java/ai/rapids/cudf/RegexFlag.java index 7ed8e0354c9..68a3856f37d 100644 --- a/java/src/main/java/ai/rapids/cudf/RegexFlag.java +++ b/java/src/main/java/ai/rapids/cudf/RegexFlag.java @@ -28,7 +28,16 @@ public enum RegexFlag { DEFAULT(0), // default MULTILINE(8), // the '^' and '$' honor new-line characters DOTALL(16), // the '.' matching includes new-line characters - ASCII(256); // use only ASCII when matching built-in character classes + ASCII(256), // use only ASCII when matching built-in character classes + /** + * EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters + * - NEXT_LINE ('\u0085') + * - LINE_SEPARATOR ('\u2028') + * - PARAGRAPH_SEPARATOR ('\u2029') + * - CARRIAGE_RETURN ('\r') + * - NEW_LINE ('\n') + */ + EXT_NEWLINE(512); final int nativeId; // Native id, for use with libcudf. private RegexFlag(int nativeId) { // Only constant values should be used diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 708744569df..14c290b300a 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -31,6 +31,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.EnumSet; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; @@ -3877,6 +3878,43 @@ void testExtractRe() { } } + @Test +void testExtractReWithMultiLineDelimiters() { + String NEXT_LINE = "\u0085"; + String LINE_SEPARATOR = "\u2028"; + String PARAGRAPH_SEPARATOR = "\u2029"; + String CARRIAGE_RETURN = "\r"; + String NEW_LINE = "\n"; + + try (ColumnVector input = ColumnVector.fromStrings( + "boo:" + NEXT_LINE + "boo::" + LINE_SEPARATOR + "boo:::", + "boo:::" + LINE_SEPARATOR + "zzé" + CARRIAGE_RETURN + "lll", + "boo::", + "", + "boo::" + NEW_LINE, + "boo::" + CARRIAGE_RETURN, + "boo:" + NEXT_LINE + "boo::" + PARAGRAPH_SEPARATOR, + "boo:" + NEW_LINE + "boo::" + LINE_SEPARATOR, + "boo:" + NEXT_LINE + "boo::" + NEXT_LINE); + Table expected_ext_newline = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::") + .build(); + Table expected_default = new Table.TestBuilder() + .column("boo:::", null, "boo::", null, "boo::", null, null, null, null) + .build()) { + + // Regex pattern to match 'boo:' followed by one or more colons at the end of the string + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.EXT_NEWLINE)))) { + assertColumnsAreEqual(expected_ext_newline.getColumns()[0], found.getColumns()[0]); + } + + try (Table found = input.extractRe(new RegexProgram("(boo:+)$", EnumSet.of(RegexFlag.DEFAULT)))) { + assertColumnsAreEqual(expected_default.getColumns()[0], found.getColumns()[0]); + } + } + } + + @Test void testExtractAllRecord() { String pattern = "([ab])(\\d)"; diff --git a/java/src/test/java/ai/rapids/cudf/ReductionTest.java b/java/src/test/java/ai/rapids/cudf/ReductionTest.java index 8cc7df1ce7f..6bd6603d71b 100644 --- a/java/src/test/java/ai/rapids/cudf/ReductionTest.java +++ 
b/java/src/test/java/ai/rapids/cudf/ReductionTest.java @@ -612,13 +612,13 @@ void testWithSetOutputType() { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.666667f); + try (Scalar expected = Scalar.fromFloat(1.6666666f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.variance(DType.FLOAT32)) { assertEquals(expected, result); } - try (Scalar expected = Scalar.fromFloat(1.2909945f); + try (Scalar expected = Scalar.fromFloat(1.2909944f); ColumnVector cv = ColumnVector.fromBytes(new byte[]{1, 2, 3, 4}); Scalar result = cv.standardDeviation(DType.FLOAT32)) { assertEquals(expected, result); diff --git a/pyproject.toml b/pyproject.toml index 8f9aa165e5a..6933484f4e7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,8 @@ select = [ "F", # pycodestyle Warning "W", + # isort + "I", # no-blank-line-before-function "D201", # one-blank-line-after-class @@ -90,6 +92,8 @@ select = [ "UP007", # Import from `collections.abc` instead: `Callable` "UP035", + # usage of legacy `np.random` function calls + "NPY002", ] ignore = [ # whitespace before : diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index 93109838900..f902111b0db 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -72,12 +72,13 @@ def bench_pivot_table_simple(benchmark, dataframe): @pytest_cases.parametrize("nr", NUM_ROWS) def bench_crosstab_simple(benchmark, nr): + rng = np.random.default_rng(seed=0) series_a = np.array(["foo", "bar"] * nr) series_b = np.array(["one", "two"] * nr) series_c = np.array(["dull", "shiny"] * nr) - np.random.shuffle(series_a) - np.random.shuffle(series_b) - np.random.shuffle(series_c) + rng.shuffle(series_a) + rng.shuffle(series_b) + rng.shuffle(series_c) series_a = cudf.Series(series_a) series_b = cudf.Series(series_b) series_c = cudf.Series(series_c) diff --git a/python/cudf/benchmarks/API/bench_multiindex.py b/python/cudf/benchmarks/API/bench_multiindex.py index 6268bcc4267..77004c3313e 100644 --- a/python/cudf/benchmarks/API/bench_multiindex.py +++ b/python/cudf/benchmarks/API/bench_multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of MultiIndex methods.""" @@ -11,16 +11,18 @@ @pytest.fixture def pidx(): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) return pd.MultiIndex.from_arrays([a, b], names=("a", "b")) @pytest.fixture def midx(pidx): num_elements = int(1e3) - a = np.random.randint(0, num_elements // 10, num_elements) - b = np.random.randint(0, num_elements // 10, num_elements) + rng = np.random.default_rng(seed=0) + a = rng.integers(0, num_elements // 10, num_elements) + b = rng.integers(0, num_elements // 10, num_elements) df = cudf.DataFrame({"a": a, "b": b}) return cudf.MultiIndex.from_frame(df) diff --git a/python/cudf/benchmarks/conftest.py b/python/cudf/benchmarks/conftest.py index 7b2b71cf216..0e4afadccf5 100644 --- a/python/cudf/benchmarks/conftest.py +++ b/python/cudf/benchmarks/conftest.py @@ -56,27 +56,23 @@ # into the main repo. 
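The corrected ReductionTest constants above are the properly rounded float32 values: the sample variance (ddof=1) of {1, 2, 3, 4} is 5/3 = 1.6666..., and its square root is 1.29099445..., which print as 1.6666666 and 1.2909944 in float32 rather than the previously hard-coded 1.666667f and 1.2909945f. A quick NumPy check:

    import numpy as np

    x = np.array([1, 2, 3, 4], dtype=np.float64)
    print(np.float32(x.var(ddof=1)))  # 1.6666666 (5/3 as float32)
    print(np.float32(x.std(ddof=1)))  # 1.2909944 (sqrt(5/3) as float32)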
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) -from config import cudf # noqa: W0611, E402, F401 -from utils import ( # noqa: E402 - OrderedSet, - collapse_fixtures, - column_generators, - make_fixture, -) - # Turn off isort until we upgrade to 5.8.0 # https://github.com/pycqa/isort/issues/1594 -# isort: off from config import ( # noqa: W0611, E402, F401 NUM_COLS, NUM_ROWS, collect_ignore, + cudf, # noqa: W0611, E402, F401 pytest_collection_modifyitems, pytest_sessionfinish, pytest_sessionstart, ) - -# isort: on +from utils import ( # noqa: E402 + OrderedSet, + collapse_fixtures, + column_generators, + make_fixture, +) @pytest_cases.fixture(params=[0, 1], ids=["AxisIndex", "AxisColumn"]) diff --git a/python/cudf/cudf/_fuzz_testing/avro.py b/python/cudf/cudf/_fuzz_testing/avro.py index d9974037daa..172193aa672 100644 --- a/python/cudf/cudf/_fuzz_testing/avro.py +++ b/python/cudf/cudf/_fuzz_testing/avro.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy import io @@ -68,12 +68,12 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/6604 - cudf.utils.dtypes.TIMEDELTA_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) + self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -100,17 +100,18 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py index 67211a1c4bf..fa3ed40ce91 100644 --- a/python/cudf/cudf/_fuzz_testing/csv.py +++ b/python/cudf/cudf/_fuzz_testing/csv.py @@ -54,7 +54,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -77,25 +77,22 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "usecols": col_size = self._rand(len(self._df.columns)) - col_val = np.random.choice( + col_val = rng.choice( [ None, - np.unique( - np.random.choice(self._df.columns, col_size) - ), + np.unique(rng.choice(self._df.columns, col_size)), ] ) params_dict[param] = ( col_val if col_val is None else list(col_val) ) elif param == "dtype": - dtype_val = np.random.choice( - [None, self._df.dtypes.to_dict()] - ) + dtype_val = rng.choice([None, self._df.dtypes.to_dict()]) if dtype_val is not None: 
dtype_val = { col_name: "category" @@ -105,25 +102,25 @@ def set_rand_params(self, params): } params_dict[param] = dtype_val elif param == "header": - header_val = np.random.choice( - ["infer", np.random.randint(low=0, high=len(self._df))] + header_val = rng.choice( + ["infer", rng.integers(low=0, high=len(self._df))] ) params_dict[param] = header_val elif param == "skiprows": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "skipfooter": - params_dict[param] = np.random.randint( + params_dict[param] = rng.integers( low=0, high=len(self._df) ) elif param == "nrows": - nrows_val = np.random.choice( - [None, np.random.randint(low=0, high=len(self._df))] + nrows_val = rng.choice( + [None, rng.integers(low=0, high=len(self._df))] ) params_dict[param] = nrows_val else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -159,7 +156,7 @@ def generate_input(self): random.seed(seed) dtypes_list = list(cudf.utils.dtypes.ALL_TYPES) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed @@ -182,26 +179,25 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._current_buffer.columns)) params_dict[param] = list( np.unique( - np.random.choice( - self._current_buffer.columns, col_size - ) + rng.choice(self._current_buffer.columns, col_size) ) ) elif param == "chunksize": - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, - np.random.randint( + rng.integers( low=1, high=max(1, len(self._current_buffer)) ), ] ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) diff --git a/python/cudf/cudf/_fuzz_testing/io.py b/python/cudf/cudf/_fuzz_testing/io.py index ffb7171a855..a4b8e18d8b4 100644 --- a/python/cudf/cudf/_fuzz_testing/io.py +++ b/python/cudf/cudf/_fuzz_testing/io.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
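These fuzz-testing and benchmark hunks all apply the same NPY002 recipe: build one np.random.Generator up front, then swap each legacy module-level call for its Generator method. The correspondences used throughout, as a sketch (note that legacy randint and Generator.integers both exclude the high endpoint, while the long-deprecated random_integers included it; endpoint=True restores that behavior where it matters):

    import numpy as np

    rng = np.random.default_rng(seed=0)

    rng.integers(0, 10)                 # was np.random.randint(0, 10)
    rng.integers(1, 10, endpoint=True)  # was np.random.random_integers(1, 10)
    rng.choice([None, "a", "b"])        # was np.random.choice(...)
    arr = np.arange(5)
    rng.shuffle(arr)                    # was np.random.shuffle(arr); in place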
 
 import copy
 import json
@@ -91,8 +91,9 @@ def get_next_regression_params(self):
         return dtypes_meta, num_rows, num_cols, seed
 
     def set_rand_params(self, params):
+        rng = np.random.default_rng(seed=None)
         params_dict = {
-            param: np.random.choice(values) for param, values in params.items()
+            param: rng.choice(values) for param, values in params.items()
         }
         self._current_params["test_kwargs"] = self.process_kwargs(
             params_dict=params_dict
diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
index e987529c8ba..45d2c8d8cf0 100644
--- a/python/cudf/cudf/_fuzz_testing/json.py
+++ b/python/cudf/cudf/_fuzz_testing/json.py
@@ -80,7 +80,7 @@ def generate_input(self):
             # https://github.com/rapidsai/cudf/issues/7086
             # dtypes_list.extend(["list"])
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-                self, dtypes_list
+                self, dtypes_list, seed
             )
             self._current_params["dtypes_meta"] = dtypes_meta
             self._current_params["seed"] = seed
@@ -105,14 +105,15 @@ def write_data(self, file_name):
 
     def set_rand_params(self, params):
         params_dict = {}
+        rng = np.random.default_rng(seed=None)
         for param, values in params.items():
             if param == "dtype" and values == ALL_POSSIBLE_VALUES:
-                dtype_val = np.random.choice(
+                dtype_val = rng.choice(
                     [True, self._current_buffer.dtypes.to_dict()]
                 )
                 params_dict[param] = _get_dtype_param_value(dtype_val)
             else:
-                params_dict[param] = np.random.choice(values)
+                params_dict[param] = rng.choice(values)
 
         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
 
@@ -155,7 +156,7 @@ def generate_input(self):
             # https://github.com/rapidsai/cudf/issues/7086
             # dtypes_list.extend(["list"])
             dtypes_meta, num_rows, num_cols = _generate_rand_meta(
-                self, dtypes_list
+                self, dtypes_list, seed
             )
             self._current_params["dtypes_meta"] = dtypes_meta
             self._current_params["seed"] = seed
@@ -180,12 +181,13 @@ def write_data(self, file_name):
 
     def set_rand_params(self, params):
         params_dict = {}
+        rng = np.random.default_rng(seed=None)
         for param, values in params.items():
             if param == "dtype" and values == ALL_POSSIBLE_VALUES:
-                dtype_val = np.random.choice(
+                dtype_val = rng.choice(
                     [True, self._current_buffer.dtypes.to_dict()]
                 )
                 params_dict[param] = _get_dtype_param_value(dtype_val)
             else:
-                params_dict[param] = np.random.choice(values)
+                params_dict[param] = rng.choice(values)
         self._current_params["test_kwargs"] = self.process_kwargs(params_dict)
diff --git a/python/cudf/cudf/_fuzz_testing/orc.py b/python/cudf/cudf/_fuzz_testing/orc.py
index ecddc72fa85..4d9e4abb09e 100644
--- a/python/cudf/cudf/_fuzz_testing/orc.py
+++ b/python/cudf/cudf/_fuzz_testing/orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
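Note the deliberate asymmetry in these readers: generate_input seeds its RNG and records the seed, while set_rand_params calls default_rng(seed=None), which pulls fresh OS entropy on every invocation so the sampled reader/writer kwargs keep varying between runs:

    import numpy as np

    np.random.default_rng(seed=0)     # reproducible stream (replayable cases)
    np.random.default_rng(seed=None)  # fresh entropy each call (exploration)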
import copy import io @@ -62,13 +62,11 @@ def generate_input(self): - cudf.utils.dtypes.UNSIGNED_TYPES - {"datetime64[ns]"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) - self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -94,42 +92,41 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if values == ALL_POSSIBLE_VALUES: if param == "columns": col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) elif param == "stripes": f = io.BytesIO(self._current_buffer) orcFile = pa.orc.ORCFile(f) stripes = list(range(orcFile.nstripes)) - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [ None, list( map( int, np.unique( - np.random.choice( - stripes, orcFile.nstripes - ) + rng.choice(stripes, orcFile.nstripes) ), ) ), ] ) elif param == "use_index": - params_dict[param] = np.random.choice([True, False]) + params_dict[param] = rng.choice([True, False]) elif param in ("skiprows", "num_rows"): - params_dict[param] = np.random.choice( + params_dict[param] = rng.choice( [None, self._rand(len(self._df))] ) else: if not isinstance(values, list): raise TypeError("values must be of type list") - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -177,12 +174,11 @@ def generate_input(self): # https://github.com/rapidsai/cudf/issues/7355 - cudf.utils.dtypes.DATETIME_TYPES ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols diff --git a/python/cudf/cudf/_fuzz_testing/parquet.py b/python/cudf/cudf/_fuzz_testing/parquet.py index 2d934e4816d..bd3df1b0847 100644 --- a/python/cudf/cudf/_fuzz_testing/parquet.py +++ b/python/cudf/cudf/_fuzz_testing/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
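Hoisting the seed draw above the _generate_rand_meta call (instead of drawing it afterwards, as before) means the value stored in _current_params["seed"] is the same one that actually drove the metadata RNG, so a failing fuzz case can be regenerated exactly. A self-contained toy of that replay property (generate_meta is a stand-in for _generate_rand_meta, not the real helper):

    import random

    import numpy as np

    def generate_meta(dtypes_list, seed):
        # every draw comes from this one seeded Generator
        rng = np.random.default_rng(seed=seed)
        return list(rng.choice(dtypes_list, size=3)), int(rng.integers(1, 100))

    seed = random.randint(0, 2**32 - 1)
    assert generate_meta(["int64", "float32", "str"], seed) == \
        generate_meta(["int64", "float32", "str"], seed)  # replay is exact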
import logging import random @@ -59,12 +59,11 @@ def generate_input(self): - {"uint32"} | {"list", "decimal64"} ) - + seed = random.randint(0, 2**32 - 1) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta - seed = random.randint(0, 2**32 - 1) self._current_params["seed"] = seed self._current_params["num_rows"] = num_rows self._current_params["num_cols"] = num_cols @@ -96,14 +95,15 @@ def write_data(self, file_name): def set_rand_params(self, params): params_dict = {} + rng = np.random.default_rng(seed=None) for param, values in params.items(): if param == "columns" and values == ALL_POSSIBLE_VALUES: col_size = self._rand(len(self._df.columns)) params_dict[param] = list( - np.unique(np.random.choice(self._df.columns, col_size)) + np.unique(rng.choice(self._df.columns, col_size)) ) else: - params_dict[param] = np.random.choice(values) + params_dict[param] = rng.choice(values) self._current_params["test_kwargs"] = self.process_kwargs(params_dict) @@ -146,7 +146,7 @@ def generate_input(self): | {"list", "decimal64"} ) dtypes_meta, num_rows, num_cols = _generate_rand_meta( - self, dtypes_list + self, dtypes_list, seed ) self._current_params["dtypes_meta"] = dtypes_meta self._current_params["seed"] = seed diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py index 3d070576a12..bbc19dce1a4 100644 --- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py +++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import sys @@ -68,7 +68,9 @@ def parquet_writer_test(pdf): @pythonfuzz( data_handle=ParquetWriter, params={ - "row_group_size": np.random.random_integers(1, 10000, 100), + "row_group_size": np.random.default_rng(seed=0).integers( + 1, 10000, 100 + ), "compression": ["snappy", None], }, ) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 8ce92e1c0f6..4cadb3a109c 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -40,8 +40,11 @@ } -def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): +def _generate_rand_meta( + obj, dtypes_list, null_frequency_override=None, seed=0 +): obj._current_params = {} + rng = np.random.default_rng(seed=seed) num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) @@ -69,12 +72,12 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_string_length"] = obj._max_string_length elif dtype == "list": if obj._max_lists_length is None: - meta["lists_max_length"] = np.random.randint(0, 2000000000) + meta["lists_max_length"] = rng.integers(0, 2000000000) else: meta["lists_max_length"] = obj._max_lists_length if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint( + meta["nesting_max_depth"] = rng.integers( 1, np.iinfo("int64").max ) else: @@ -85,7 +88,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): ) elif dtype == "struct": if obj._max_lists_nesting_depth is None: - meta["nesting_max_depth"] = np.random.randint(2, 10) + meta["nesting_max_depth"] = rng.integers(2, 10) else: meta["nesting_max_depth"] = obj._max_lists_nesting_depth @@ -95,9 +98,7 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None): meta["max_null_frequency"] = 
obj._max_struct_null_frequency if obj._max_struct_types_at_each_level is None: - meta["max_types_at_each_level"] = np.random.randint( - low=1, high=10 - ) + meta["max_types_at_each_level"] = rng.integers(low=1, high=10) else: meta["max_types_at_each_level"] = ( obj._max_struct_types_at_each_level diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8ceea4920e2..8b1d16f0d85 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef class Column: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 99e4c21df8a..065655505b8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -28,7 +28,7 @@ from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 49714091f46..4221e745e65 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -4,11 +4,11 @@ import pickle from libc.stdint cimport uint8_t, uintptr_t from libcpp cimport bool -from libcpp.memory cimport make_shared, shared_ptr, unique_ptr +from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer import pylibcudf @@ -30,10 +30,6 @@ from libcpp.memory cimport make_unique cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.gather cimport ( - segmented_gather as cpp_segmented_gather, -) -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type @@ -339,26 +335,6 @@ def get_element(Column input_column, size_type index): ) -@acquire_spill_lock() -def segmented_gather(Column source_column, Column gather_map): - cdef shared_ptr[lists_column_view] source_LCV = ( - make_shared[lists_column_view](source_column.view()) - ) - cdef shared_ptr[lists_column_view] gather_map_LCV = ( - make_shared[lists_column_view](gather_map.view()) - ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_segmented_gather( - source_LCV.get()[0], gather_map_LCV.get()[0]) - ) - - result = Column.from_unique_ptr(move(c_result)) - return result - - cdef class _CPackedColumns: @staticmethod diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index bc5e085ec39..d844466120f 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -13,12 +13,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.filling cimport calendrical_month_sequence from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type +from pylibcudf.datetime import DatetimeComponent from cudf._lib.column cimport Column from 
cudf._lib.scalar cimport DeviceScalar -import pylibcudf as plc - @acquire_spill_lock() def add_months(Column col, Column months): @@ -40,9 +39,39 @@ def add_months(Column col, Column months): @acquire_spill_lock() def extract_datetime_component(Column col, object field): - result = Column.from_pylibcudf( - plc.datetime.extract_datetime_component(col.to_pylibcudf(mode="read"), field) - ) + + cdef unique_ptr[column] c_result + cdef column_view col_view = col.view() + cdef libcudf_datetime.datetime_component component + + component_names = { + "year": DatetimeComponent.YEAR, + "month": DatetimeComponent.MONTH, + "day": DatetimeComponent.DAY, + "weekday": DatetimeComponent.WEEKDAY, + "hour": DatetimeComponent.HOUR, + "minute": DatetimeComponent.MINUTE, + "second": DatetimeComponent.SECOND, + "millisecond": DatetimeComponent.MILLISECOND, + "microsecond": DatetimeComponent.MICROSECOND, + "nanosecond": DatetimeComponent.NANOSECOND, + } + if field == "day_of_year": + with nogil: + c_result = move(libcudf_datetime.day_of_year(col_view)) + elif field in component_names: + component = component_names[field] + with nogil: + c_result = move( + libcudf_datetime.extract_datetime_component( + col_view, + component + ) + ) + else: + raise ValueError(f"Invalid field: '{field}'") + + result = Column.from_unique_ptr(move(c_result)) if field == "weekday": # Pandas counts Monday-Sunday as 0-6 diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 1dc586bb257..1c9d3a01b80 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,49 +1,22 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cpython cimport pycapsule -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - import pylibcudf -from pylibcudf.libcudf.interop cimport ( - DLManagedTensor, - from_dlpack as cpp_from_dlpack, - to_dlpack as cpp_to_dlpack, -) -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - -from cudf._lib.utils cimport ( - columns_from_pylibcudf_table, - columns_from_unique_ptr, - table_view_from_columns, -) +from cudf._lib.utils cimport columns_from_pylibcudf_table from cudf.core.buffer import acquire_spill_lock from cudf.core.dtypes import ListDtype, StructDtype -def from_dlpack(dlpack_capsule): +def from_dlpack(object dlpack_capsule): """ Converts a DLPack Tensor PyCapsule into a list of columns. DLPack Tensor PyCapsule is expected to have the name "dltensor". """ - cdef DLManagedTensor* dlpack_tensor = pycapsule.\ - PyCapsule_GetPointer(dlpack_capsule, 'dltensor') - pycapsule.PyCapsule_SetName(dlpack_capsule, 'used_dltensor') - - cdef unique_ptr[table] c_result - - with nogil: - c_result = move( - cpp_from_dlpack(dlpack_tensor) - ) - - res = columns_from_unique_ptr(move(c_result)) - dlpack_tensor.deleter(dlpack_tensor) - return res + return columns_from_pylibcudf_table( + pylibcudf.interop.from_dlpack(dlpack_capsule) + ) def to_dlpack(list source_columns): @@ -52,39 +25,13 @@ def to_dlpack(list source_columns): DLPack Tensor PyCapsule will have the name "dltensor". """ - if any(column.null_count for column in source_columns): - raise ValueError( - "Cannot create a DLPack tensor with null values. \ - Input is required to have null count as zero." 
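from_dlpack/to_dlpack are the clearest instance of the migration pattern running through the rest of this diff: the capsule plumbing (PyCapsule_GetPointer, the custom deleter, the nogil calls) moves down into pylibcudf, and the cudf._lib layer shrinks to a to_pylibcudf/from_pylibcudf round trip. The recurring shape, sketched with a call that appears in these hunks:

    import pylibcudf as plc
    from cudf.core.buffer import acquire_spill_lock
    from cudf._lib.column import Column

    @acquire_spill_lock()
    def count_elements(col):
        # before: col.view() -> nogil libcudf call -> Column.from_unique_ptr
        # after: one pylibcudf call; ownership handling lives in pylibcudf
        return Column.from_pylibcudf(
            plc.lists.count_elements(col.to_pylibcudf(mode="read"))
        )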
- ) - - cdef DLManagedTensor *dlpack_tensor - cdef table_view source_table_view = table_view_from_columns(source_columns) - - with nogil: - dlpack_tensor = cpp_to_dlpack( - source_table_view + return pylibcudf.interop.to_dlpack( + pylibcudf.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) - - return pycapsule.PyCapsule_New( - dlpack_tensor, - 'dltensor', - dlmanaged_tensor_pycapsule_deleter ) -cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: - cdef DLManagedTensor* dlpack_tensor = 0 - try: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'used_dltensor') - return # we do not call a used capsule's deleter - except Exception: - dlpack_tensor = pycapsule.PyCapsule_GetPointer( - pycap_obj, 'dltensor') - dlpack_tensor.deleter(dlpack_tensor) - - def gather_metadata(object cols_dtypes): """ Generates a ColumnMetadata vector for each column. diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7e8710bedb6..12432ac6d5d 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport null_order, size_type from cudf._lib.column cimport Column from cudf._lib.utils cimport columns_from_pylibcudf_table -import pylibcudf +import pylibcudf as plc from pylibcudf cimport Scalar @@ -17,7 +17,7 @@ from pylibcudf cimport Scalar @acquire_spill_lock() def count_elements(Column col): return Column.from_pylibcudf( - pylibcudf.lists.count_elements( + plc.lists.count_elements( col.to_pylibcudf(mode="read")) ) @@ -25,8 +25,8 @@ def count_elements(Column col): @acquire_spill_lock() def explode_outer(list source_columns, int explode_column_idx): return columns_from_pylibcudf_table( - pylibcudf.lists.explode_outer( - pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]), + plc.lists.explode_outer( + plc.Table([c.to_pylibcudf(mode="read") for c in source_columns]), explode_column_idx, ) ) @@ -35,7 +35,7 @@ def explode_outer(list source_columns, int explode_column_idx): @acquire_spill_lock() def distinct(Column col, bool nulls_equal, bool nans_all_equal): return Column.from_pylibcudf( - pylibcudf.lists.distinct( + plc.lists.distinct( col.to_pylibcudf(mode="read"), nulls_equal, nans_all_equal, @@ -46,7 +46,7 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal): @acquire_spill_lock() def sort_lists(Column col, bool ascending, str na_position): return Column.from_pylibcudf( - pylibcudf.lists.sort_lists( + plc.lists.sort_lists( col.to_pylibcudf(mode="read"), ascending, null_order.BEFORE if na_position == "first" else null_order.AFTER, @@ -58,7 +58,7 @@ def sort_lists(Column col, bool ascending, str na_position): @acquire_spill_lock() def extract_element_scalar(Column col, size_type index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index, ) @@ -68,7 +68,7 @@ def extract_element_scalar(Column col, size_type index): @acquire_spill_lock() def extract_element_column(Column col, Column index): return Column.from_pylibcudf( - pylibcudf.lists.extract_list_element( + plc.lists.extract_list_element( col.to_pylibcudf(mode="read"), index.to_pylibcudf(mode="read"), ) @@ -78,7 +78,7 @@ def extract_element_column(Column col, Column index): @acquire_spill_lock() def contains_scalar(Column col, py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.contains( + plc.lists.contains( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, ) @@ 
-88,7 +88,7 @@ def contains_scalar(Column col, py_search_key): @acquire_spill_lock() def index_of_scalar(Column col, object py_search_key): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), py_search_key.device_value.c_value, True, @@ -99,7 +99,7 @@ def index_of_scalar(Column col, object py_search_key): @acquire_spill_lock() def index_of_column(Column col, Column search_keys): return Column.from_pylibcudf( - pylibcudf.lists.index_of( + plc.lists.index_of( col.to_pylibcudf(mode="read"), search_keys.to_pylibcudf(mode="read"), True, @@ -110,8 +110,8 @@ def index_of_column(Column col, Column search_keys): @acquire_spill_lock() def concatenate_rows(list source_columns): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_rows( - pylibcudf.Table([ + plc.lists.concatenate_rows( + plc.Table([ c.to_pylibcudf(mode="read") for c in source_columns ]) ) @@ -121,8 +121,18 @@ def concatenate_rows(list source_columns): @acquire_spill_lock() def concatenate_list_elements(Column input_column, dropna=False): return Column.from_pylibcudf( - pylibcudf.lists.concatenate_list_elements( + plc.lists.concatenate_list_elements( input_column.to_pylibcudf(mode="read"), dropna, ) ) + + +@acquire_spill_lock() +def segmented_gather(Column source_column, Column gather_map): + return Column.from_pylibcudf( + plc.lists.segmented_gather( + source_column.to_pylibcudf(mode="read"), + gather_map.to_pylibcudf(mode="read"), + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx index e3c2273345a..3dd99c42d76 100644 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx @@ -2,37 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.edit_distance cimport ( - edit_distance as cpp_edit_distance, - edit_distance_matrix as cpp_edit_distance_matrix, -) +from pylibcudf cimport nvtext from cudf._lib.column cimport Column @acquire_spill_lock() def edit_distance(Column strings, Column targets): - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance(c_strings, c_targets)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def edit_distance_matrix(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_edit_distance_matrix(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.edit_distance.edit_distance_matrix( + strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx index 6591b527eec..7fdf9258b7f 100644 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx @@ -2,75 +2,34 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column 
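segmented_gather moves out of copying.pyx to live with the other list operations and now delegates to plc.lists.segmented_gather. Its semantics, per-row selection of list elements by an index-list column, are easiest to see through the public accessor; a small illustration (Series.list.take is the user-facing counterpart of this gather):

    import cudf

    s = cudf.Series([[10, 20, 30], [40, 50]])
    idx = cudf.Series([[2, 0], [1]])
    print(s.list.take(idx))  # [[30, 10], [50]]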
-from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( - generate_character_ngrams as cpp_generate_character_ngrams, - generate_ngrams as cpp_generate_ngrams, - hash_character_ngrams as cpp_hash_character_ngrams, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def generate_ngrams(Column strings, int ngrams, object py_separator): - - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_ngrams( - c_strings, - c_ngrams, - c_separator[0] - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams, + py_separator.device_value.c_value + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def generate_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_generate_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.generate_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def hash_character_ngrams(Column strings, int ngrams): - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_hash_character_ngrams( - c_strings, - c_ngrams - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.generate_ngrams.hash_character_ngrams( + strings.to_pylibcudf(mode="read"), + ngrams + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx index 0ebf7c281e3..c964d0206b7 100644 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx @@ -2,33 +2,16 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.jaccard cimport ( - jaccard_index as cpp_jaccard_index, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column +from pylibcudf import nvtext + @acquire_spill_lock() def jaccard_index(Column input1, Column input2, int width): - cdef column_view c_input1 = input1.view() - cdef column_view c_input2 = input2.view() - cdef size_type c_width = width - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_jaccard_index( - c_input1, - c_input2, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.jaccard.jaccard_index( + input1.to_pylibcudf(mode="read"), + input2.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 59cb8d51440..5e39cafa47b 100644 --- 
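The nvtext bindings collapse the same way: each Cython wrapper becomes a single call into pylibcudf.nvtext. The functions named in these hunks can also be exercised directly at the pylibcudf layer; a sketch via pyarrow interop:

    import pyarrow as pa

    import pylibcudf as plc

    strings = plc.interop.from_arrow(pa.array(["kitten", "flaw"]))
    targets = plc.interop.from_arrow(pa.array(["sitting", "lawn"]))
    dist = plc.nvtext.edit_distance.edit_distance(strings, targets)
    print(plc.interop.to_arrow(dist))  # Levenshtein distances: [3, 2]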
a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -2,93 +2,44 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.minhash cimport ( - minhash as cpp_minhash, - minhash64 as cpp_minhash64, - word_minhash as cpp_word_minhash, - word_minhash64 as cpp_word_minhash64, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column - -@acquire_spill_lock() -def minhash(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_minhash( - c_strings, - c_seeds, - c_width - ) - ) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def minhash64(Column strings, Column seeds, int width): - - cdef column_view c_strings = strings.view() - cdef size_type c_width = width - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result +def minhash(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) - with nogil: - c_result = move( - cpp_minhash64( - c_strings, - c_seeds, - c_width - ) - ) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def minhash64(Column input, Column seeds, int width=4): + result = nvtext.minhash.minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) @acquire_spill_lock() def word_minhash64(Column input, Column seeds): - - cdef column_view c_input = input.view() - cdef column_view c_seeds = seeds.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_word_minhash64( - c_input, - c_seeds - ) - ) - - return Column.from_unique_ptr(move(c_result)) + result = nvtext.minhash.word_minhash64( + input.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx index dec4f037d98..c125d92a24e 100644 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx @@ -2,48 +2,23 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( - ngrams_tokenize as cpp_ngrams_tokenize, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.types cimport size_type - from 
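minhash and minhash64 keep their (input, seeds, width) shape but gain a width default of 4 and route through pylibcudf. Roughly (the seed column dtype is an assumption here: uint32 for the 32-bit variant, uint64 for minhash64):

    import pyarrow as pa

    import pylibcudf as plc

    strs = plc.interop.from_arrow(pa.array(["the quick brown fox"]))
    seeds = plc.interop.from_arrow(pa.array([0, 1, 2], type=pa.uint32()))
    # one minimum hash per seed, taken over all width-4 character shingles
    hashes = plc.nvtext.minhash.minhash(strs, seeds, 4)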
cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar + +from pylibcudf import nvtext @acquire_spill_lock() def ngrams_tokenize( - Column strings, + Column input, int ngrams, object py_delimiter, object py_separator ): - - cdef DeviceScalar delimiter = py_delimiter.device_value - cdef DeviceScalar separator = py_separator.device_value - - cdef column_view c_strings = strings.view() - cdef size_type c_ngrams = ngrams - cdef const string_scalar* c_separator = separator\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_ngrams_tokenize( - c_strings, - c_ngrams, - c_delimiter[0], - c_separator[0] - ) + return Column.from_pylibcudf( + nvtext.ngrams_tokenize.ngrams_tokenize( + input.to_pylibcudf(mode="read"), + ngrams, + py_delimiter.device_value.c_value, + py_separator.device_value.c_value ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx index 5e86a9ce959..cc45123dd0a 100644 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx @@ -3,36 +3,26 @@ from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) from cudf._lib.column cimport Column - -@acquire_spill_lock() -def normalize_spaces(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_normalize_spaces(c_strings)) - - return Column.from_unique_ptr(move(c_result)) +from pylibcudf import nvtext @acquire_spill_lock() -def normalize_characters(Column strings, bool do_lower=True): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result +def normalize_spaces(Column input): + return Column.from_pylibcudf( + nvtext.normalize.normalize_spaces( + input.to_pylibcudf(mode="read") + ) + ) - with nogil: - c_result = move(cpp_normalize_characters(c_strings, do_lower)) - return Column.from_unique_ptr(move(c_result)) +@acquire_spill_lock() +def normalize_characters(Column input, bool do_lower=True): + return Column.from_pylibcudf( + nvtext.normalize.normalize_characters( + input.to_pylibcudf(mode="read"), + do_lower, + ) + ) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx index 61ae3da5782..bec56ade83c 100644 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ b/python/cudf/cudf/_lib/nvtext/replace.pyx @@ -2,20 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.nvtext.replace cimport ( - filter_tokens as cpp_filter_tokens, - replace_tokens as cpp_replace_tokens, -) -from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar +from pylibcudf import nvtext @acquire_spill_lock() @@ -30,27 +20,14 @@ def replace_tokens(Column strings, 
provided. """ - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef column_view c_targets = targets.view() - cdef column_view c_replacements = replacements.view() - - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_replace_tokens( - c_strings, - c_targets, - c_replacements, - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.replace_tokens( + strings.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) @acquire_spill_lock() @@ -65,24 +42,11 @@ def filter_tokens(Column strings, character provided. """ - cdef DeviceScalar replacement = py_replacement.device_value - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef column_view c_strings = strings.view() - cdef const string_scalar* c_repl = replacement\ - .get_raw_ptr() - cdef const string_scalar* c_delimiter = delimiter\ - .get_raw_ptr() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move( - cpp_filter_tokens( - c_strings, - min_token_length, - c_repl[0], - c_delimiter[0], - ) + return Column.from_pylibcudf( + nvtext.replace.filter_tokens( + strings.to_pylibcudf(mode="read"), + min_token_length, + py_replacement.device_value.c_value, + py_delimiter.device_value.c_value, ) - - return Column.from_unique_ptr(move(c_result)) + ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx index 5bf25562fed..63a389b64d5 100644 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx @@ -1,24 +1,19 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
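replace_tokens and filter_tokens act on whole delimiter-separated tokens, which is what distinguishes them from plain substring replacement. Through the public API (output shown as a comment):

    import cudf

    s = cudf.Series(["this is me", "theme park"])
    print(s.str.replace_tokens(["me"], ["you"]))
    # ["this is you", "theme park"]; "theme" is not the token "me"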
-from cudf.core.buffer import acquire_spill_lock - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from enum import IntEnum -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view +from cudf.core.buffer import acquire_spill_lock + from pylibcudf.libcudf.nvtext.stemmer cimport ( - is_letter as cpp_is_letter, letter_type, - porter_stemmer_measure as cpp_porter_stemmer_measure, underlying_type_t_letter_type, ) from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +from pylibcudf import nvtext + class LetterType(IntEnum): CONSONANT = letter_type.CONSONANT @@ -27,43 +22,34 @@ class LetterType(IntEnum): @acquire_spill_lock() def porter_stemmer_measure(Column strings): - cdef column_view c_strings = strings.view() - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_porter_stemmer_measure(c_strings)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf( + nvtext.stemmer.porter_stemmer_measure( + strings.to_pylibcudf(mode="read"), + ) + ) @acquire_spill_lock() def is_letter(Column strings, object ltype, size_type index): - cdef column_view c_strings = strings.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + index, + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, index)) - - return Column.from_unique_ptr(move(c_result)) @acquire_spill_lock() def is_letter_multi(Column strings, object ltype, Column indices): - cdef column_view c_strings = strings.view() - cdef column_view c_indices = indices.view() - cdef letter_type c_ltype = ( - ltype + return Column.from_pylibcudf( + nvtext.stemmer.is_letter( + strings.to_pylibcudf(mode="read"), + ltype==LetterType.VOWEL, + indices.to_pylibcudf(mode="read"), + ) ) - cdef unique_ptr[column] c_result - - with nogil: - c_result = move(cpp_is_letter(c_strings, c_ltype, c_indices)) - - return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 27095ca02d4..0f9820ed1db 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class DeviceScalar: diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0dde91316fb..56712402919 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from libc.stdint cimport int64_t from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,25 +24,7 @@ cimport pylibcudf.libcudf.types as libcudf_types # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
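In the stemmer binding the letter_type enum no longer crosses the Cython boundary; is_letter now receives a plain boolean, ltype == LetterType.VOWEL. For reference, porter_stemmer_measure computes the m of the classic Porter paper (TREE has m=0, TROUBLE m=1, TROUBLES m=2):

    import pyarrow as pa

    import pylibcudf as plc

    words = plc.interop.from_arrow(pa.array(["tree", "trouble", "troubles"]))
    print(plc.interop.to_arrow(
        plc.nvtext.stemmer.porter_stemmer_measure(words)
    ))  # [0, 1, 2]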
from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport ( - duration_scalar, - list_scalar, - scalar, - struct_scalar, - timestamp_scalar, -) -from pylibcudf.libcudf.wrappers.durations cimport ( - duration_ms, - duration_ns, - duration_s, - duration_us, -) -from pylibcudf.libcudf.wrappers.timestamps cimport ( - timestamp_ms, - timestamp_ns, - timestamp_s, - timestamp_us, -) +from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id @@ -284,62 +265,6 @@ cdef class DeviceScalar: ] -# TODO: Currently the only uses of this function and the one below are in -# _create_proxy_nat_scalar. See if that code path can be simplified to excise -# or at least simplify these implementations. -cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "datetime64[s]": - s.reset( - new timestamp_scalar[timestamp_s](np.int64(value), valid) - ) - elif dtype == "datetime64[ms]": - s.reset( - new timestamp_scalar[timestamp_ms](np.int64(value), valid) - ) - elif dtype == "datetime64[us]": - s.reset( - new timestamp_scalar[timestamp_us](np.int64(value), valid) - ) - elif dtype == "datetime64[ns]": - s.reset( - new timestamp_scalar[timestamp_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else 0 - - if dtype == "timedelta64[s]": - s.reset( - new duration_scalar[duration_s](np.int64(value), valid) - ) - elif dtype == "timedelta64[ms]": - s.reset( - new duration_scalar[duration_ms](np.int64(value), valid) - ) - elif dtype == "timedelta64[us]": - s.reset( - new duration_scalar[duration_us](np.int64(value), valid) - ) - elif dtype == "timedelta64[ns]": - s.reset( - new duration_scalar[duration_ns](np.int64(value), valid) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - - def as_device_scalar(val, dtype=None): if isinstance(val, (cudf.Scalar, DeviceScalar)): if dtype == val.dtype or dtype is None: @@ -361,22 +286,3 @@ def _is_null_host_scalar(slr): return True else: return False - - -def _create_proxy_nat_scalar(dtype): - cdef DeviceScalar result = DeviceScalar.__new__(DeviceScalar) - - dtype = cudf.dtype(dtype) - if dtype.char in 'mM': - nat = dtype.type('NaT').astype(dtype) - if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar( - ( result.c_value).c_obj, nat, dtype, True - ) - return result - else: - raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx index 60a6795a402..06ee07d8e2b 100644 --- a/python/cudf/cudf/_lib/string_casting.pyx +++ b/python/cudf/cudf/_lib/string_casting.pyx @@ -2,80 +2,27 @@ from cudf._lib.column cimport Column -from cudf._lib.scalar import as_device_scalar - -from cudf._lib.scalar cimport DeviceScalar - -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar 
cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_booleans cimport ( - from_booleans as cpp_from_booleans, - to_booleans as cpp_to_booleans, -) -from pylibcudf.libcudf.strings.convert.convert_datetime cimport ( - is_timestamp as cpp_is_timestamp, -) -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - from_floats as cpp_from_floats, - to_floats as cpp_to_floats, -) -from pylibcudf.libcudf.strings.convert.convert_integers cimport ( - from_integers as cpp_from_integers, - hex_to_integers as cpp_hex_to_integers, - integers_to_hex as cpp_integers_to_hex, - is_hex as cpp_is_hex, - to_integers as cpp_to_integers, -) -from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport ( - integers_to_ipv4 as cpp_integers_to_ipv4, - ipv4_to_integers as cpp_ipv4_to_integers, - is_ipv4 as cpp_is_ipv4, -) -from pylibcudf.libcudf.types cimport data_type, type_id - -from cudf._lib.types cimport underlying_type_t_type_id - import pylibcudf as plc +from pylibcudf.types cimport DataType -import cudf +from cudf._lib.scalar import as_device_scalar from cudf._lib.types cimport dtype_to_pylibcudf_type def floating_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_floats( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_floating(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_floats.from_floats( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_floats( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) + - return Column.from_unique_ptr(move(c_result)) +def string_to_floating(Column input_col, DataType out_type): + plc_column = plc.strings.convert.convert_floats.to_floats( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def dtos(Column input_col): @@ -107,7 +54,7 @@ def stod(Column input_col): A Column with strings cast to double """ - return string_to_floating(input_col, cudf.dtype("float64")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64)) def ftos(Column input_col): @@ -139,36 +86,22 @@ def stof(Column input_col): A Column with strings cast to float """ - return string_to_floating(input_col, cudf.dtype("float32")) + return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32)) def integer_to_string(Column input_col): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_integers( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) - - -def string_to_integer(Column input_col, object out_type): - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[out_type] - ) + plc_column = plc.strings.convert.convert_integers.from_integers( + input_col.to_pylibcudf(mode="read"), ) - cdef data_type c_out_type = data_type(tid) - with nogil: - c_result = move( - cpp_to_integers( - input_column_view, - c_out_type)) + return Column.from_pylibcudf(plc_column) - return Column.from_unique_ptr(move(c_result)) + +def string_to_integer(Column input_col, DataType out_type): + 
plc_column = plc.strings.convert.convert_integers.to_integers( + input_col.to_pylibcudf(mode="read"), + out_type + ) + return Column.from_pylibcudf(plc_column) def i8tos(Column input_col): @@ -200,7 +133,7 @@ def stoi8(Column input_col): A Column with strings cast to int8 """ - return string_to_integer(input_col, cudf.dtype("int8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8)) def i16tos(Column input_col): @@ -232,7 +165,7 @@ def stoi16(Column input_col): A Column with strings cast to int16 """ - return string_to_integer(input_col, cudf.dtype("int16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16)) def itos(Column input_col): @@ -264,7 +197,7 @@ def stoi(Column input_col): A Column with strings cast to int32 """ - return string_to_integer(input_col, cudf.dtype("int32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32)) def ltos(Column input_col): @@ -296,7 +229,7 @@ def stol(Column input_col): A Column with strings cast to int64 """ - return string_to_integer(input_col, cudf.dtype("int64")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64)) def ui8tos(Column input_col): @@ -328,7 +261,7 @@ def stoui8(Column input_col): A Column with strings cast to uint8 """ - return string_to_integer(input_col, cudf.dtype("uint8")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8)) def ui16tos(Column input_col): @@ -360,7 +293,7 @@ def stoui16(Column input_col): A Column with strings cast to uint16 """ - return string_to_integer(input_col, cudf.dtype("uint16")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16)) def uitos(Column input_col): @@ -392,7 +325,7 @@ def stoui(Column input_col): A Column with strings cast to uint32 """ - return string_to_integer(input_col, cudf.dtype("uint32")) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32)) def ultos(Column input_col): @@ -424,80 +357,24 @@ def stoul(Column input_col): A Column with strings cast to uint64 """ - return string_to_integer(input_col, cudf.dtype("uint64")) - - -def _to_booleans(Column input_col, object string_true="True"): - """ - Converting/Casting input column of type string to boolean column - - Parameters - ---------- - input_col : input column of type string - string_true : string that represents True - - Returns - ------- - A Column with string values cast to boolean - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* string_scalar_true = ( - str_true.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_to_booleans( - input_column_view, - string_scalar_true[0])) - - return Column.from_unique_ptr(move(c_result)) + return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64)) def to_booleans(Column input_col): - - return _to_booleans(input_col) - - -def _from_booleans( - Column input_col, - object string_true="True", - object string_false="False"): - """ - Converting/Casting input column of type boolean to string column - - Parameters - ---------- - input_col : input column of type boolean - string_true : string that represents True - string_false : string that represents False - - Returns - ------- - A Column with boolean values cast to string - """ - - cdef DeviceScalar str_true = as_device_scalar(string_true) - cdef DeviceScalar str_false = as_device_scalar(string_false) - cdef column_view input_column_view = input_col.view() - cdef const string_scalar* 
string_scalar_true = ( - str_true.get_raw_ptr()) - cdef const string_scalar* string_scalar_false = ( - str_false.get_raw_ptr()) - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_booleans( - input_column_view, - string_scalar_true[0], - string_scalar_false[0])) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_booleans.to_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + ) + return Column.from_pylibcudf(plc_column) def from_booleans(Column input_col): - return _from_booleans(input_col) + plc_column = plc.strings.convert.convert_booleans.from_booleans( + input_col.to_pylibcudf(mode="read"), + as_device_scalar("True").c_value, + as_device_scalar("False").c_value, + ) + return Column.from_pylibcudf(plc_column) def int2timestamp( @@ -520,11 +397,10 @@ def int2timestamp( A Column with date-time represented in string format """ - cdef string c_timestamp_format = format.encode("UTF-8") return Column.from_pylibcudf( plc.strings.convert.convert_datetime.from_timestamps( input_col.to_pylibcudf(mode="read"), - c_timestamp_format, + format, names.to_pylibcudf(mode="read") ) ) @@ -545,12 +421,11 @@ def timestamp2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_datetime.to_timestamps( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -570,18 +445,11 @@ def istimestamp(Column input_col, str format): A Column of boolean values identifying strings that matched the format. """ - if input_col.size == 0: - return cudf.core.column.column_empty(0, dtype=cudf.dtype("bool")) - cdef column_view input_column_view = input_col.view() - cdef string c_timestamp_format = str(format).encode('UTF-8') - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_is_timestamp( - input_column_view, - c_timestamp_format)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_datetime.is_timestamp( + input_col.to_pylibcudf(mode="read"), + format + ) + return Column.from_pylibcudf(plc_column) def timedelta2int(Column input_col, dtype, format): @@ -599,12 +467,11 @@ def timedelta2int(Column input_col, dtype, format): """ dtype = dtype_to_pylibcudf_type(dtype) - cdef string c_timestamp_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.to_durations( input_col.to_pylibcudf(mode="read"), dtype, - c_timestamp_format + format ) ) @@ -623,12 +490,10 @@ def int2timedelta(Column input_col, str format): A Column with Timedelta represented in string format """ - - cdef string c_duration_format = format.encode('UTF-8') return Column.from_pylibcudf( plc.strings.convert.convert_durations.from_durations( input_col.to_pylibcudf(mode="read"), - c_duration_format + format ) ) @@ -646,14 +511,10 @@ def int2ip(Column input_col): A Column with integer represented in string ipv4 format """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_ipv4(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def ip2int(Column input_col): @@ -669,14 +530,10 @@ def ip2int(Column input_col): A Column with ipv4 represented as 
integer """ - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_ipv4_to_integers(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers( + input_col.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) def is_ipv4(Column source_strings): @@ -685,18 +542,13 @@ def is_ipv4(Column source_strings): that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn where nnn is integer digits in [0,255]. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_ipv4( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_ipv4.is_ipv4( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) -def htoi(Column input_col, **kwargs): +def htoi(Column input_col): """ Converting input column of type string having hex values to integer of out_type @@ -709,22 +561,11 @@ def htoi(Column input_col, **kwargs): ------- A Column of integers parsed from hexadecimal string values. """ - - cdef column_view input_column_view = input_col.view() - cdef type_id tid = ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[cudf.dtype("int64")] - ) + plc_column = plc.strings.convert.convert_integers.hex_to_integers( + input_col.to_pylibcudf(mode="read"), + plc.DataType(plc.TypeId.INT64) ) - cdef data_type c_out_type = data_type(tid) - - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_hex_to_integers(input_column_view, - c_out_type)) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) def is_hex(Column source_strings): @@ -732,15 +573,10 @@ def is_hex(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have hex characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_hex( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.is_hex( + source_strings.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) def itoh(Column input_col): @@ -756,11 +592,7 @@ def itoh(Column input_col): ------- A Column of strings with hexadecimal characters. 
""" - - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_integers_to_hex(input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_integers.integers_to_hex( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index 4bf8a9b1a8f..ffa5e603408 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -71,16 +71,9 @@ startswith_multiple, ) from cudf._lib.strings.find_multiple import find_multiple -from cudf._lib.strings.findall import findall -from cudf._lib.strings.json import GetJsonObjectOptions, get_json_object -from cudf._lib.strings.padding import ( - SideType, - center, - ljust, - pad, - rjust, - zfill, -) +from cudf._lib.strings.findall import find_re, findall +from cudf._lib.strings.json import get_json_object +from cudf._lib.strings.padding import center, ljust, pad, rjust, zfill from cudf._lib.strings.repeat import repeat_scalar, repeat_sequence from cudf._lib.strings.replace import ( insert, diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 76cc13db0da..0f7b27d85d7 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -2,24 +2,11 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +from cudf._lib.column cimport Column -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.combine cimport ( - concatenate as cpp_concatenate, - join_list_elements as cpp_join_list_elements, - join_strings as cpp_join_strings, - output_if_empty_list, - separator_on_nulls, -) -from pylibcudf.libcudf.table.table_view cimport table_view +import pylibcudf as plc -from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport table_view_from_columns +import cudf @acquire_spill_lock() @@ -31,26 +18,12 @@ def concatenate(list source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef table_view source_view = table_view_from_columns(source_strings) - - cdef const string_scalar* scalar_separator = \ - (separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = ( - narep.get_raw_ptr() + plc_column = plc.strings.combine.concatenate( + plc.Table([col.to_pylibcudf(mode="read") for col in source_strings]), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_concatenate( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -62,27 +35,12 @@ def join(Column source_strings, with the specified `sep` between each column and `na`/`None` values are replaced by `na_rep` """ - - cdef DeviceScalar separator = sep.device_value - cdef DeviceScalar narep = na_rep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const 
string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_strings( + source_strings.to_pylibcudf(mode="read"), + sep.device_value.c_value, + na_rep.device_value.c_value, ) - - with nogil: - c_result = move(cpp_join_strings( - source_view, - scalar_separator[0], - scalar_narep[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -96,29 +54,15 @@ def join_lists_with_scalar( between each string in lists and `<NA>`/`None` values are replaced by `py_narep` """ - - cdef DeviceScalar separator = py_separator.device_value - cdef DeviceScalar narep = py_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_separator = \ - <const string_scalar*>(separator.get_raw_ptr()) - cdef const string_scalar* scalar_narep = <const string_scalar*>( - narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + py_separator.device_value.c_value, + py_narep.device_value.c_value, + cudf._lib.scalar.DeviceScalar("", cudf.dtype("object")).c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - scalar_separator[0], - scalar_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -135,28 +79,12 @@ def join_lists_with_column( `<NA>`/`None` values in `separator_strings` are replaced by `py_separator_narep` """ - - cdef DeviceScalar source_narep = py_source_narep.device_value - cdef DeviceScalar separator_narep = py_separator_narep.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view separator_view = separator_strings.view() - - cdef const string_scalar* scalar_source_narep = \ - <const string_scalar*>(source_narep.get_raw_ptr()) - cdef const string_scalar* scalar_separator_narep = <const string_scalar*>( - separator_narep.get_raw_ptr() + plc_column = plc.strings.combine.join_list_elements( + source_strings.to_pylibcudf(mode="read"), + separator_strings.to_pylibcudf(mode="read"), + py_separator_narep.device_value.c_value, + py_source_narep.device_value.c_value, + plc.strings.combine.SeparatorOnNulls.YES, + plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - - with nogil: - c_result = move(cpp_join_list_elements( - source_view, - separator_view, - scalar_separator_narep[0], - scalar_source_narep[0], - separator_on_nulls.YES, - output_if_empty_list.NULL_ELEMENT - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx index a8df8c9a92c..96dcd021c3b 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx @@ -1,22 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION.
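# The rewrite below follows the pylibcudf round-trip used throughout this
# PR: unwrap the cudf Column with to_pylibcudf(mode="read"), call the
# matching plc.* routine, and wrap the result back with Column.from_pylibcudf.
# A minimal sketch of the pattern, using the from_fixed_point call that
# appears below in this file:
#
#     plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point(
#         input_col.to_pylibcudf(mode="read"),
#     )
#     return Column.from_pylibcudf(plc_column)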
-import cudf - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport ( - from_fixed_point as cpp_from_fixed_point, - is_fixed_point as cpp_is_fixed_point, - to_fixed_point as cpp_to_fixed_point, -) -from pylibcudf.libcudf.types cimport data_type, type_id - from cudf._lib.column cimport Column +from cudf._lib.types cimport dtype_to_pylibcudf_type + +import pylibcudf as plc @acquire_spill_lock() @@ -32,14 +21,10 @@ def from_decimal(Column input_col): ------- A column of strings representing the input decimal values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - with nogil: - c_result = move( - cpp_from_fixed_point( - input_column_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point( + input_col.to_pylibcudf(mode="read"), + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,25 +42,11 @@ def to_decimal(Column input_col, object out_type): ------- A column of decimals parsed from the string values. """ - cdef column_view input_column_view = input_col.view() - cdef unique_ptr[column] c_result - cdef int scale = out_type.scale - cdef data_type c_out_type - if isinstance(out_type, cudf.Decimal32Dtype): - c_out_type = data_type(type_id.DECIMAL32, -scale) - elif isinstance(out_type, cudf.Decimal64Dtype): - c_out_type = data_type(type_id.DECIMAL64, -scale) - elif isinstance(out_type, cudf.Decimal128Dtype): - c_out_type = data_type(type_id.DECIMAL128, -scale) - else: - raise TypeError("should be a decimal dtype") - with nogil: - c_result = move( - cpp_to_fixed_point( - input_column_view, - c_out_type)) - - result = Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(out_type), + ) + result = Column.from_pylibcudf(plc_column) result.dtype.precision = out_type.precision return result @@ -98,14 +69,8 @@ def is_fixed_point(Column input_col, object dtype): ------- A Column of booleans indicating valid decimal conversion. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = input_col.view() - cdef int scale = dtype.scale - cdef data_type c_dtype = data_type(type_id.DECIMAL64, -scale) - with nogil: - c_result = move(cpp_is_fixed_point( - source_view, - c_dtype - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point( + input_col.to_pylibcudf(mode="read"), + dtype_to_pylibcudf_type(dtype), + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx index 7965b588703..5da6e3f10cc 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_floats cimport ( - is_float as cpp_is_float, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def is_float(Column source_strings): @@ -20,12 +13,7 @@ def is_float(Column source_strings): Returns a Column of boolean values with True for `source_strings` that have floats. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_is_float( - source_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.convert.convert_floats.is_float( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx index 73aebf8ab35..3a2cb4bd5c7 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx @@ -1,23 +1,13 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.convert.convert_lists cimport ( - format_list_column as cpp_format_list_column, -) - from cudf._lib.column cimport Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.scalar cimport DeviceScalar - @acquire_spill_lock() def format_list_column(Column source_list, Column separators): @@ -34,19 +24,9 @@ def format_list_column(Column source_list, Column separators): ------- Formatted strings column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_list.view() - cdef column_view separators_view = separators.view() - # Use 'None' as null-replacement string - cdef DeviceScalar str_na_rep = as_device_scalar("None") - cdef const string_scalar* string_scalar_na_rep = ( - str_na_rep.get_raw_ptr()) - - with nogil: - c_result = move(cpp_format_list_column( - source_view, string_scalar_na_rep[0], separators_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_lists.format_list_column( + source_list.to_pylibcudf(mode="read"), + as_device_scalar("None").c_value, + separators.to_pylibcudf(mode="read"), ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx index e52116d6247..d5c2f771970 100644 --- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx +++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx @@ -1,17 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.convert.convert_urls cimport ( - url_decode as cpp_url_decode, - url_encode as cpp_url_encode, -) - from cudf._lib.column cimport Column @@ -28,17 +20,10 @@ def url_decode(Column source_strings): ------- URL decoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_decode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_decode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -57,14 +42,7 @@ def url_encode(Column source_strings): ------- URL encoded string column """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_url_encode( - source_view - )) - - return Column.from_unique_ptr( - move(c_result) + plc_column = plc.strings.convert.convert_urls.url_encode( + source_strings.to_pylibcudf(mode="read") ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx index 1358f8e3c2c..39e0013769f 100644 --- a/python/cudf/cudf/_lib/strings/find_multiple.pyx +++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx @@ -1,18 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.find_multiple cimport ( - find_multiple as cpp_find_multiple, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def find_multiple(Column source_strings, Column target_strings): @@ -20,14 +13,8 @@ def find_multiple(Column source_strings, Column target_strings): Returns a column with character position values where each of the `target_strings` are found in each string of `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view target_view = target_strings.view() - - with nogil: - c_result = move(cpp_find_multiple( - source_view, - target_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.find_multiple.find_multiple( + source_strings.to_pylibcudf(mode="read"), + target_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx index 0e758d5b322..3e7a504d535 100644 --- a/python/cudf/cudf/_lib/strings/findall.pyx +++ b/python/cudf/cudf/_lib/strings/findall.pyx @@ -23,3 +23,19 @@ def findall(Column source_strings, object pattern, uint32_t flags): prog, ) return Column.from_pylibcudf(plc_result) + + +@acquire_spill_lock() +def find_re(Column source_strings, object pattern, uint32_t flags): + """ + Returns character positions where the pattern first matches + the elements in source_strings. 
+ """ + prog = plc.strings.regex_program.RegexProgram.create( + str(pattern), flags + ) + plc_result = plc.strings.findall.find_re( + source_strings.to_pylibcudf(mode="read"), + prog, + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx index c9b0bba088d..374a104635a 100644 --- a/python/cudf/cudf/_lib/strings/json.pyx +++ b/python/cudf/cudf/_lib/strings/json.pyx @@ -1,84 +1,26 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move +import pylibcudf as plc +from pylibcudf.json cimport GetJsonObjectOptions from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.json cimport ( - get_json_object as cpp_get_json_object, - get_json_object_options, -) - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() def get_json_object( - Column col, object py_json_path, GetJsonObjectOptions options): + Column col, + object py_json_path, + GetJsonObjectOptions options +): """ Apply a JSONPath string to all rows in an input column of json strings. """ - cdef unique_ptr[column] c_result - - cdef column_view col_view = col.view() - cdef DeviceScalar json_path = py_json_path.device_value - - cdef const string_scalar* scalar_json_path = ( - json_path.get_raw_ptr() + plc_column = plc.json.get_json_object( + col.to_pylibcudf(mode="read"), + py_json_path.device_value.c_value, + options ) - - with nogil: - c_result = move(cpp_get_json_object( - col_view, - scalar_json_path[0], - options.options, - )) - - return Column.from_unique_ptr(move(c_result)) - - -cdef class GetJsonObjectOptions: - cdef get_json_object_options options - - def __init__( - self, - *, - allow_single_quotes=False, - strip_quotes_from_single_strings=True, - missing_fields_as_nulls=False - ): - self.options.set_allow_single_quotes(allow_single_quotes) - self.options.set_strip_quotes_from_single_strings( - strip_quotes_from_single_strings - ) - self.options.set_missing_fields_as_nulls(missing_fields_as_nulls) - - @property - def allow_single_quotes(self): - return self.options.get_allow_single_quotes() - - @property - def strip_quotes_from_single_strings(self): - return self.options.get_strip_quotes_from_single_strings() - - @property - def missing_fields_as_nulls(self): - return self.options.get_missing_fields_as_nulls() - - @allow_single_quotes.setter - def allow_single_quotes(self, val): - self.options.set_allow_single_quotes(val) - - @strip_quotes_from_single_strings.setter - def strip_quotes_from_single_strings(self, val): - self.options.set_strip_quotes_from_single_strings(val) - - @missing_fields_as_nulls.setter - def missing_fields_as_nulls(self, val): - self.options.set_missing_fields_as_nulls(val) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx index d0239e91ec3..015a2ebab8a 100644 --- a/python/cudf/cudf/_lib/strings/padding.pyx +++ b/python/cudf/cudf/_lib/strings/padding.pyx @@ -1,64 +1,31 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from enum import IntEnum - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.strings.padding cimport ( - pad as cpp_pad, - zfill as cpp_zfill, -) -from pylibcudf.libcudf.strings.side_type cimport ( - side_type, - underlying_type_t_side_type, -) - - -class SideType(IntEnum): - LEFT = side_type.LEFT - RIGHT = side_type.RIGHT - BOTH = side_type.BOTH +import pylibcudf as plc @acquire_spill_lock() def pad(Column source_strings, size_type width, fill_char, - side=SideType.LEFT): + side=plc.strings.side_type.SideType.LEFT): """ Returns a Column by padding strings in `source_strings` up to the given `width`. Direction of padding is to be specified by `side`. The additional characters being filled can be changed by specifying `fill_char`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - cdef side_type pad_direction = ( - side + plc_result = plc.strings.padding.pad( + source_strings.to_pylibcudf(mode="read"), + width, + side, + fill_char, ) - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - pad_direction, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -68,19 +35,13 @@ def zfill(Column source_strings, Returns a Column by prepending strings in `source_strings` with '0' characters up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_zfill( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.padding.zfill( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) -@acquire_spill_lock() def center(Column source_strings, size_type width, fill_char): @@ -89,23 +50,9 @@ def center(Column source_strings, in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.BOTH, - f_char - )) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.BOTH) - return Column.from_unique_ptr(move(c_result)) - -@acquire_spill_lock() def ljust(Column source_strings, size_type width, fill_char): @@ -113,23 +60,9 @@ def ljust(Column source_strings, Returns a Column by filling right side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.RIGHT) - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.RIGHT, - f_char - )) - return Column.from_unique_ptr(move(c_result)) - - -@acquire_spill_lock() def rjust(Column source_strings, size_type width, fill_char): @@ -137,17 +70,4 @@ def rjust(Column source_strings, Returns a Column by filling left side of strings in `source_strings` with additional character, `fill_char` up to the given `width`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string f_char = str(fill_char).encode() - - with nogil: - c_result = move(cpp_pad( - source_view, - width, - side_type.LEFT, - f_char - )) - - return Column.from_unique_ptr(move(c_result)) + return pad(source_strings, width, fill_char, plc.strings.side_type.SideType.LEFT) diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx index fffc8b7c3f6..462d5c903e8 100644 --- a/python/cudf/cudf/_lib/strings/replace_re.pyx +++ b/python/cudf/cudf/_lib/strings/replace_re.pyx @@ -1,26 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move -from libcpp.vector cimport vector +from pylibcudf.libcudf.types cimport size_type +import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.replace_re cimport ( - replace_re as cpp_replace_re, - replace_with_backrefs as cpp_replace_with_backrefs, -) -from pylibcudf.libcudf.types cimport size_type - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar @acquire_spill_lock() @@ -34,28 +19,16 @@ def replace_re(Column source_strings, `n` indicates the number of resplacements to be made from start. (-1 indicates all) """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef const string_scalar* scalar_repl = \ - (repl.get_raw_ptr()) - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_re( - source_view, - dereference(c_prog), - scalar_repl[0], - n - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + py_repl.device_value.c_value, + n + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -68,50 +41,29 @@ def replace_with_backrefs( new string with the extracted elements found using `pattern` regular expression in `source_strings`. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef string pattern_string = str(pattern).encode() - cdef string repl_string = str(repl).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_replace_with_backrefs( - source_view, - dereference(c_prog), - repl_string - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_with_backrefs( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT + ), + repl + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() def replace_multi_re(Column source_strings, - object patterns, + list patterns, Column repl_strings): """ Returns a Column after replacing occurrences of multiple regular expressions `patterns` with their corresponding strings in `repl_strings` in `source_strings`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef column_view repl_view = repl_strings.view() - - cdef int pattern_size = len(patterns) - cdef vector[string] patterns_vector - patterns_vector.reserve(pattern_size) - - for pattern in patterns: - patterns_vector.push_back(str.encode(pattern)) - - with nogil: - c_result = move(cpp_replace_re( - source_view, - patterns_vector, - repl_view - )) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.replace_re.replace_re( + source_strings.to_pylibcudf(mode="read"), + patterns, + repl_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx index a81fb18e752..5319addc41c 100644 --- a/python/cudf/cudf/_lib/strings/split/partition.pyx +++ b/python/cudf/cudf/_lib/strings/split/partition.pyx @@ -1,21 +1,10 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.split.partition cimport ( - partition as cpp_partition, - rpartition as cpp_rpartition, -) -from pylibcudf.libcudf.table.table cimport table - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -25,25 +14,11 @@ def partition(Column source_strings, Returns data by splitting the `source_strings` column at the first occurrence of the specified `py_delimiter`. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_partition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.partition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -53,22 +28,8 @@ def rpartition(Column source_strings, Returns a Column by splitting the `source_strings` column at the last occurrence of the specified `py_delimiter`. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rpartition( - source_view, - scalar_str[0] - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.partition.rpartition( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx index f481fea4c51..4ec6c7073d8 100644 --- a/python/cudf/cudf/_lib/strings/split/split.pyx +++ b/python/cudf/cudf/_lib/strings/split/split.pyx @@ -1,33 +1,12 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cython.operator cimport dereference -from libcpp.memory cimport unique_ptr -from libcpp.string cimport string -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.regex_flags cimport regex_flags -from pylibcudf.libcudf.strings.regex_program cimport regex_program -from pylibcudf.libcudf.strings.split.split cimport ( - rsplit as cpp_rsplit, - rsplit_re as cpp_rsplit_re, - rsplit_record as cpp_rsplit_record, - rsplit_record_re as cpp_rsplit_record_re, - split as cpp_split, - split_re as cpp_split_re, - split_record as cpp_split_record, - split_record_re as cpp_split_record_re, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.utils cimport data_from_unique_ptr + +import pylibcudf as plc @acquire_spill_lock() @@ -39,26 +18,12 @@ def split(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. 
""" - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -70,25 +35,12 @@ def split_record(Column source_strings, column around the specified `py_delimiter`. The split happens from beginning. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_split_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -100,26 +52,12 @@ def rsplit(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit( - source_view, - scalar_str[0], - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -131,25 +69,12 @@ def rsplit_record(Column source_strings, column around the specified `py_delimiter`. The split happens from the end. """ - - cdef DeviceScalar delimiter = py_delimiter.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef const string_scalar* scalar_str = ( - delimiter.get_raw_ptr() - ) - - with nogil: - c_result = move(cpp_rsplit_record( - source_view, - scalar_str[0], - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record( + source_strings.to_pylibcudf(mode="read"), + py_delimiter.device_value.c_value, + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -160,24 +85,15 @@ def split_re(Column source_strings, Returns data by splitting the `source_strings` column around the delimiters identified by `pattern`. 
""" - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.split_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -189,24 +105,15 @@ def rsplit_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. """ - cdef unique_ptr[table] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return data_from_unique_ptr( - move(c_result), - column_names=range(0, c_result.get()[0].num_columns()) + plc_table = plc.strings.split.split.rsplit_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns())) @acquire_spill_lock() @@ -217,23 +124,15 @@ def split_record_re(Column source_strings, Returns a Column by splitting the `source_strings` column around the delimiters identified by `pattern`. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_split_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.split_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -245,20 +144,12 @@ def rsplit_record_re(Column source_strings, column around the delimiters identified by `pattern`. The delimiters are searched starting from the end of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - cdef string pattern_string = str(pattern).encode() - cdef regex_flags c_flags = regex_flags.DEFAULT - cdef unique_ptr[regex_program] c_prog - - with nogil: - c_prog = move(regex_program.create(pattern_string, c_flags)) - c_result = move(cpp_rsplit_record_re( - source_view, - dereference(c_prog), - maxsplit - )) - - return Column.from_unique_ptr( - move(c_result), + plc_column = plc.strings.split.split.rsplit_record_re( + source_strings.to_pylibcudf(mode="read"), + plc.strings.regex_program.RegexProgram.create( + str(pattern), + plc.strings.regex_flags.RegexFlags.DEFAULT, + ), + maxsplit, ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx index 38ecb21a94c..982c5a600e7 100644 --- a/python/cudf/cudf/_lib/strings/strip.pyx +++ b/python/cudf/cudf/_lib/strings/strip.pyx @@ -1,18 +1,8 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from pylibcudf.libcudf.strings.side_type cimport side_type -from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip - from cudf._lib.column cimport Column -from cudf._lib.scalar cimport DeviceScalar import pylibcudf as plc @@ -24,15 +14,12 @@ def strip(Column source_strings, The set of characters need be stripped from left and right side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - return Column.from_pylibcudf( - plc.strings.strip.strip( - source_strings.to_pylibcudf(mode="read"), - plc.strings.SideType.BOTH, - repl.c_value - ) + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.BOTH, + py_repl.device_value.c_value, ) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -43,24 +30,12 @@ def lstrip(Column source_strings, The set of characters need be stripped from left side can be specified by `py_repl`. """ - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.LEFT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.LEFT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) @acquire_spill_lock() @@ -71,21 +46,9 @@ def rstrip(Column source_strings, The set of characters need be stripped from right side can be specified by `py_repl`. 
""" - - cdef DeviceScalar repl = py_repl.device_value - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - cdef const string_scalar* scalar_str = ( - repl.get_raw_ptr() + plc_result = plc.strings.strip.strip( + source_strings.to_pylibcudf(mode="read"), + plc.strings.side_type.SideType.RIGHT, + py_repl.device_value.c_value, ) - - with nogil: - c_result = move(cpp_strip( - source_view, - side_type.RIGHT, - scalar_str[0] - )) - - return Column.from_unique_ptr(move(c_result)) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx index eed5cf33b10..2b40f01f818 100644 --- a/python/cudf/cudf/_lib/strings/wrap.pyx +++ b/python/cudf/cudf/_lib/strings/wrap.pyx @@ -1,17 +1,13 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - from cudf.core.buffer import acquire_spill_lock -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap from pylibcudf.libcudf.types cimport size_type from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def wrap(Column source_strings, @@ -21,14 +17,8 @@ def wrap(Column source_strings, in the Column to be formatted in paragraphs with length less than a given `width`. """ - - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_wrap( - source_view, - width - )) - - return Column.from_unique_ptr(move(c_result)) + plc_result = plc.strings.wrap.wrap( + source_strings.to_pylibcudf(mode="read"), + width + ) + return Column.from_pylibcudf(plc_result) diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx index 78fc9f08bd8..dd2fafbe07f 100644 --- a/python/cudf/cudf/_lib/strings_udf.pyx +++ b/python/cudf/cudf/_lib/strings_udf.pyx @@ -23,7 +23,8 @@ from pylibcudf.libcudf.strings_udf cimport ( to_string_view_array as cpp_to_string_view_array, udf_string, ) -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.column cimport Column diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 6e8ad556b08..3b13cc258ab 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
import sys -from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union +from collections.abc import Callable, Iterable +from typing import TYPE_CHECKING, Any, TypeVar, Union import numpy as np from pandas import Period, Timedelta, Timestamp @@ -42,7 +42,7 @@ SeriesOrSingleColumnIndex = Union["cudf.Series", "cudf.core.index.Index"] # Groupby aggregation -AggType = Union[str, Callable] -MultiColumnAggType = Union[ - AggType, Iterable[AggType], Dict[Any, Iterable[AggType]] +AggType = Union[str, Callable] # noqa: UP007 +MultiColumnAggType = Union[ # noqa: UP007 + AggType, Iterable[AggType], dict[Any, Iterable[AggType]] ] diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py index e2bdecbe67a..871ffc6269d 100644 --- a/python/cudf/cudf/core/_compat.py +++ b/python/cudf/cudf/core/_compat.py @@ -3,7 +3,7 @@ import pandas as pd from packaging import version -PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2") +PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.3") PANDAS_VERSION = version.parse(pd.__version__) diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index 32ae8c5ee53..ffa306bf93f 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Literal, Mapping +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -18,6 +18,9 @@ from cudf.core.abc import Serializable from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from collections.abc import Mapping + def host_memory_allocation(nbytes: int) -> memoryview: """Allocate host memory using NumPy @@ -284,7 +287,7 @@ def memoryview( """Read-only access to the buffer through host memory.""" size = self._size if size is None else size host_buf = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self.get_ptr(mode="read") + offset, host_buf ) return memoryview(host_buf).toreadonly() diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 0bd8d6054b3..ecf9807cfc2 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,13 +2,16 @@ from __future__ import annotations -from typing import Literal, Mapping +from typing import TYPE_CHECKING, Literal from typing_extensions import Self import cudf from cudf.core.buffer.buffer import Buffer, BufferOwner +if TYPE_CHECKING: + from collections.abc import Mapping + class ExposureTrackedBuffer(Buffer): """An exposure tracked buffer. 
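The typing changes in buffer.py and exposure_tracked_buffer.py above (and repeated in the column modules below) all apply one idiom: names used only in annotations, such as Mapping, now come from collections.abc and are imported under an `if TYPE_CHECKING:` guard, so type checkers still see them while no import happens at runtime. A minimal, self-contained sketch of that idiom (the total_size function is illustrative, not part of this diff):

from __future__ import annotations  # annotations stay as unevaluated strings

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen by type checkers only; skipped when the module actually runs.
    from collections.abc import Mapping


def total_size(buffers: Mapping[str, int]) -> int:
    # Mapping is never looked up at runtime thanks to postponed evaluation.
    return sum(buffers.values())


print(total_size({"device": 32, "spill": 8}))  # prints 40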
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 4c9e524ee05..b40c56c9a6b 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -207,7 +207,7 @@ def spill(self, target: str = "cpu") -> None: domain="cudf_python-spill", ): host_mem = host_memory_allocation(self.size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr, host_mem ) self._ptr_desc["memoryview"] = host_mem @@ -352,7 +352,7 @@ def memoryview( else: assert self._ptr_desc["type"] == "gpu" ret = host_memory_allocation(size) - rmm._lib.device_buffer.copy_ptr_to_host( + rmm.pylibrmm.device_buffer.copy_ptr_to_host( self._ptr + offset, ret ) return ret diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py index 06791df7dc0..a1e87d04bc9 100644 --- a/python/cudf/cudf/core/column/__init__.py +++ b/python/cudf/cudf/core/column/__init__.py @@ -29,4 +29,3 @@ Decimal128Column, DecimalBaseColumn, ) -from cudf.core.column.interval import IntervalColumn # noqa: F401 diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 864e87b5377..087d0ed65f5 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -4,7 +4,7 @@ import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -26,6 +26,7 @@ if TYPE_CHECKING: from collections import abc + from collections.abc import Mapping, Sequence import numba.cuda diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7674565e2c3..d2cd6e8ac8f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -4,10 +4,11 @@ import pickle from collections import abc +from collections.abc import MutableSequence, Sequence from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index d0ea4612a1b..b6dc250e64d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Literal, Sequence, cast +from typing import TYPE_CHECKING, Literal, cast import numpy as np import pandas as pd @@ -31,6 +31,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ( ColumnBinaryOperand, DatetimeLikeScalar, @@ -480,6 +482,11 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike: def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: if dtype == self.dtype: return self + elif isinstance(dtype, pd.DatetimeTZDtype): + raise TypeError( + "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. " + "Use tz_localize instead." 
+ ) return libcudf.unary.cast(self, dtype=dtype) def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] @@ -940,6 +947,16 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn: def as_string_column(self) -> cudf.core.column.StringColumn: return self._local_time.as_string_column() + def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn: + if isinstance(dtype, pd.DatetimeTZDtype) and dtype != self.dtype: + if dtype.unit != self.time_unit: + # TODO: Doesn't check that new unit is valid. + casted = self._with_type_metadata(dtype) + else: + casted = self + return casted.tz_convert(str(dtype.tz)) + return super().as_datetime_column(dtype) + def get_dt_field(self, field: str) -> ColumnBase: return libcudf.datetime.extract_datetime_component( self._local_time, field diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8803ebd6791..8ae06f72d1e 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -3,8 +3,9 @@ from __future__ import annotations import warnings +from collections.abc import Sequence from decimal import Decimal -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import cupy as cp import numpy as np diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index c6a39199e3b..6b25e568f00 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -11,7 +11,6 @@ from typing_extensions import Self import cudf -from cudf._lib.copying import segmented_gather from cudf._lib.lists import ( concatenate_list_elements, concatenate_rows, @@ -22,6 +21,7 @@ extract_element_scalar, index_of_column, index_of_scalar, + segmented_gather, sort_lists, ) from cudf._lib.strings.convert.convert_lists import format_list_column @@ -34,6 +34,8 @@ from cudf.core.missing import NA if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 05a0ab2e09a..a91c080fe21 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,9 +2,7 @@ from __future__ import annotations -from typing import Union, overload - -from typing_extensions import Literal +from typing import Literal, Union, overload import cudf import cudf.core.column diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 78d2814ed26..620cae65374 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import TYPE_CHECKING, Any, Sequence, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd @@ -28,7 +28,7 @@ from .numerical_base import NumericalBaseColumn if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import Callable, Sequence from cudf._typing import ( ColumnBinaryOperand, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 
3b8dd05c13a..f6ab91f2f01 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -180,9 +180,12 @@ def var( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "var", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def std( self, @@ -190,9 +193,12 @@ def std( min_count: int = 0, ddof=1, ): - return self._reduce( + result = self._reduce( "std", skipna=skipna, min_count=min_count, ddof=ddof ) + if result is NA: + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return result def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4463e3280df..856ce0f75de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,12 +5,14 @@ import re import warnings from functools import cached_property -from typing import TYPE_CHECKING, Sequence, cast, overload +from typing import TYPE_CHECKING, cast, overload import numpy as np import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf import cudf.api.types from cudf import _lib as libcudf @@ -33,6 +35,8 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + from collections.abc import Sequence + import cupy import numba.cuda @@ -996,7 +1000,7 @@ def replace( return self._return_or_inplace( libstrings.replace_multi_re( self._column, - pat, + list(pat), column.as_column(repl, dtype="str"), ) if regex @@ -2383,8 +2387,7 @@ def get_json_object( 0 [\n { "category": "reference",\n ... dtype: object """ - - options = libstrings.GetJsonObjectOptions( + options = plc.json.GetJsonObjectOptions( allow_single_quotes=allow_single_quotes, strip_quotes_from_single_strings=( strip_quotes_from_single_strings @@ -2546,9 +2549,9 @@ def split( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.split_re(self._column, pat, n) + data = libstrings.split_re(self._column, pat, n) else: - data, _ = libstrings.split( + data = libstrings.split( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2719,9 +2722,9 @@ def rsplit( result_table = {0: self._column.copy()} else: if regex is True: - data, _ = libstrings.rsplit_re(self._column, pat, n) + data = libstrings.rsplit_re(self._column, pat, n) else: - data, _ = libstrings.rsplit( + data = libstrings.rsplit( self._column, cudf.Scalar(pat, "str"), n ) if len(data) == 1 and data[0].null_count == len(self._column): @@ -2820,7 +2823,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.partition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.partition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2885,7 +2888,7 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - libstrings.rpartition(self._column, cudf.Scalar(sep, "str"))[0], + libstrings.rpartition(self._column, cudf.Scalar(sep, "str")), expand=expand, ) @@ -2966,7 +2969,7 @@ def pad( raise TypeError(msg) try: - side = libstrings.SideType[side.upper()] + side = plc.strings.side_type.SideType[side.upper()] except KeyError: raise ValueError( "side has to be either one of {'left', 'right', 'both'}" @@ -3624,6 
+3627,46 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: data = libstrings.findall(self._column, pat, flags) return self._return_or_inplace(data) + def find_re(self, pat: str, flags: int = 0) -> SeriesOrIndex: + """ + Find first occurrence of pattern or regular expression in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 (no flags) + Flags to pass through to the regex engine (e.g. re.MULTILINE) + + Returns + ------- + Series + A Series of position values where the pattern first matches + each string. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['Lion', 'Monkey', 'Rabbit', 'Cat']) + >>> s.str.find_re('[ti]') + 0 1 + 1 -1 + 2 4 + 3 2 + dtype: int32 + """ + if isinstance(pat, re.Pattern): + flags = pat.flags & ~re.U + pat = pat.pattern + if not _is_supported_regex_flags(flags): + raise NotImplementedError( + "Unsupported value for `flags` parameter" + ) + + data = libstrings.find_re(self._column, pat, flags) + return self._return_or_inplace(data) + def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: """ Find all first occurrences of patterns in the Series/Index. diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 2fda3b2c434..8f16ba4e15b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -68,12 +68,7 @@ def base_size(self): return self.size + self.offset def to_arrow(self) -> pa.Array: - children = [ - pa.nulls(len(child)) - if len(child) == child.null_count - else child.to_arrow() - for child in self.children - ] + children = [child.to_arrow() for child in self.children] pa_type = pa.struct( { diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 6b6f3e517a8..087d6474e7f 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import TYPE_CHECKING, Sequence, cast +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -19,6 +19,8 @@ from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from collections.abc import Sequence + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype _unit_to_nanoseconds_conversion = { diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index bc093fdaa9a..496e86ed709 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -5,8 +5,9 @@ import itertools import sys from collections import abc +from collections.abc import Mapping from functools import cached_property, reduce -from typing import TYPE_CHECKING, Any, Mapping, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np import pandas as pd diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 79ed5a0e187..bf1c39b23da 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -13,8 +13,8 @@ import textwrap import warnings from collections import abc, defaultdict -from collections.abc import Callable, Iterator -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from collections.abc import Callable, Iterator, MutableMapping +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numba @@ -781,9 +781,15 @@ def __init__( ) elif isinstance(data, ColumnAccessor): raise TypeError( - 
"Use cudf.Series._from_data for constructing a Series from " + "Use cudf.DataFrame._from_data for constructing a DataFrame from " "ColumnAccessor" ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.DataFrame._from_arrays for constructing a DataFrame from " + "ColumnBase or Use cudf.DataFrame._from_data by passing a dict " + "of column name and column as key-value pair." + ) elif hasattr(data, "__cuda_array_interface__"): arr_interface = data.__cuda_array_interface__ # descr is an optional field of the _cuda_ary_iface_ @@ -5118,11 +5124,12 @@ def info( useful for big DataFrames and fine-tune memory optimization: >>> import numpy as np - >>> random_strings_array = np.random.choice(['a', 'b', 'c'], 10 ** 6) + >>> rng = np.random.default_rng(seed=0) + >>> random_strings_array = rng.choice(['a', 'b', 'c'], 10 ** 6) >>> df = cudf.DataFrame({ - ... 'column_1': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_2': np.random.choice(['a', 'b', 'c'], 10 ** 6), - ... 'column_3': np.random.choice(['a', 'b', 'c'], 10 ** 6) + ... 'column_1': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_2': rng.choice(['a', 'b', 'c'], 10 ** 6), + ... 'column_3': rng.choice(['a', 'b', 'c'], 10 ** 6) ... }) >>> df.info(memory_usage='deep') @@ -5883,7 +5890,7 @@ def _from_arrays( f"records dimension expected 1 or 2 but found: {array_data.ndim}" ) - if data.ndim == 2: + if array_data.ndim == 2: num_cols = array_data.shape[1] else: # Since we validate ndim to be either 1 or 2 above, diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 5250a741d3d..aa601a2b322 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -3,7 +3,7 @@ import enum from collections import abc -from typing import Any, Iterable, Mapping, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import cupy as cp import numpy as np @@ -20,6 +20,9 @@ build_column, ) +if TYPE_CHECKING: + from collections.abc import Iterable, Mapping, Sequence + # Implementation of interchange protocol classes # ---------------------------------------------- @@ -61,7 +64,7 @@ class _MaskKind(enum.IntEnum): _DtypeKind.BOOL, _DtypeKind.STRING, } -ProtoDtype = Tuple[_DtypeKind, int, str, str] +ProtoDtype = tuple[_DtypeKind, int, str, str] class _CuDFBuffer: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 37ad6b8fabb..205edd91d9d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import pickle import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. 
@@ -36,6 +36,7 @@ from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import MutableMapping from types import ModuleType from cudf._typing import Dtype, ScalarLike diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 81b20488d8d..6630bd96c01 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -8,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Iterable, Literal +from typing import TYPE_CHECKING, Any, Literal import cupy as cp import numpy as np @@ -36,6 +36,8 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: + from collections.abc import Iterable + from cudf._typing import ( AggType, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index cd07c58c5d9..1b90e9f9df0 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,10 +5,10 @@ import operator import pickle import warnings -from collections.abc import Hashable +from collections.abc import Hashable, MutableMapping from functools import cache, cached_property from numbers import Number -from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast +from typing import TYPE_CHECKING, Any, Literal, cast import cupy import numpy as np diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 5952815deef..e031f2a4e8e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -10,9 +10,7 @@ from typing import ( TYPE_CHECKING, Any, - Callable, Literal, - MutableMapping, TypeVar, cast, ) @@ -63,6 +61,8 @@ from cudf.utils.utils import _warn_no_dask_cudf if TYPE_CHECKING: + from collections.abc import Callable, MutableMapping + from cudf._typing import ( ColumnLike, DataFrameOrSeries, diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 8182e5cede2..ce6a5c960dd 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -3,9 +3,7 @@ from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Union - -from typing_extensions import TypeAlias +from typing import Any, TypeAlias import cudf from cudf.api.types import _is_scalar_or_zero_d_array, is_integer @@ -46,11 +44,11 @@ class ScalarIndexer: key: GatherMap -IndexingSpec: TypeAlias = Union[ - EmptyIndexer, MapIndexer, MaskIndexer, ScalarIndexer, SliceIndexer -] +IndexingSpec: TypeAlias = ( + EmptyIndexer | MapIndexer | MaskIndexer | ScalarIndexer | SliceIndexer +) -ColumnLabels: TypeAlias = List[str] +ColumnLabels: TypeAlias = list[str] def destructure_iloc_key( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 92d094d9de5..bfff62f0a89 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,7 +8,7 @@ import pickle import warnings from functools import cached_property -from typing import TYPE_CHECKING, Any, MutableMapping +from typing import TYPE_CHECKING, Any import cupy as cp import numpy as np @@ -36,7 +36,7 @@ from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name if TYPE_CHECKING: - from collections.abc import Generator, Hashable + from collections.abc import Generator, Hashable, MutableMapping from typing_extensions import Self diff --git 
a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 6e5abb2b82b..3d132c92d54 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -681,7 +681,7 @@ def _tile(A, reps): nval = len(value_vars) dtype = min_unsigned_type(nval) - if not var_name: + if var_name is None: var_name = "variable" if not value_vars: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index acd97c2047c..9b60424c924 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,7 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import TYPE_CHECKING, Any, Literal, MutableMapping +from typing import TYPE_CHECKING, Any, Literal import cupy import numpy as np @@ -71,6 +71,8 @@ from cudf.utils.performance_tracking import _performance_tracking if TYPE_CHECKING: + from collections.abc import MutableMapping + import pyarrow as pa from cudf._typing import ( @@ -637,10 +639,15 @@ def __init__( column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) if isinstance(data, (pd.Series, Series)): index_from_data = ensure_index(data.index) - elif isinstance(data, (ColumnAccessor, ColumnBase)): + elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " - "ColumnAccessor or a ColumnBase" + "ColumnAccessor" + ) + elif isinstance(data, ColumnBase): + raise TypeError( + "Use cudf.Series._from_column for constructing a Series from " + "a ColumnBase" ) elif isinstance(data, dict): if not data: @@ -2943,7 +2950,7 @@ def corr(self, other, method="pearson", min_periods=None): >>> ser1 = cudf.Series([0.9, 0.13, 0.62]) >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.corr(ser2, method="pearson") - -0.20454263717316112 + -0.20454263717316126 >>> ser1.corr(ser2, method="spearman") -0.5 """ diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 68f34fa28ff..885e7b16644 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -4,7 +4,7 @@ import math import re import warnings -from typing import Literal, Sequence +from typing import TYPE_CHECKING, Literal import numpy as np import pandas as pd @@ -20,6 +20,9 @@ from cudf.core import column from cudf.core.index import ensure_index +if TYPE_CHECKING: + from collections.abc import Sequence + # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112 _unit_map = { "year": "year", diff --git a/python/cudf/cudf/core/udf/strings_typing.py b/python/cudf/cudf/core/udf/strings_typing.py index 43604ab21a7..a0cbe7ada19 100644 --- a/python/cudf/cudf/core/udf/strings_typing.py +++ b/python/cudf/cudf/core/udf/strings_typing.py @@ -99,7 +99,7 @@ def prepare_args(self, ty, val, **kwargs): ty.dtype, (StringView, UDFString) ): return types.uint64, val.ptr if isinstance( - val, rmm._lib.device_buffer.DeviceBuffer + val, rmm.pylibrmm.device_buffer.DeviceBuffer ) else val.get_ptr(mode="read") else: return ty, val diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index c364d55e677..73afde407db 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -10,9 +10,9 @@ import pickle import types import warnings -from collections.abc import Callable, Iterator +from collections.abc import Callable, Iterator, Mapping from enum import IntEnum -from typing import Any, Literal, Mapping 
+from typing import Any, Literal import numpy as np diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index f82e300e83d..38103a71908 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, NamedTuple +from typing import Any, ContextManager, NamedTuple # noqa: UP035 from typing_extensions import Self diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py index 8870fbc5c28..bb2fc00d9fc 100644 --- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py +++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py @@ -9,6 +9,7 @@ python analyze-test-failures.py Example: +------- python analyze-test-failures.py log.json frame/* """ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index d12d2697729..59966a5ff0c 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -35,7 +35,7 @@ def null_assert_warnings(*args, **kwargs): @pytest.fixture(scope="session", autouse=True) # type: ignore def patch_testing_functions(): - tm.assert_produces_warning = null_assert_warnings + tm.assert_produces_warning = null_assert_warnings # noqa: F821 pytest.raises = replace_kwargs({"match": None})(pytest.raises) diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py index 4ea0b3b4413..a0ad872e4c7 100644 --- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py +++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py @@ -5,7 +5,8 @@ """ Summarizes the test results per module. 
-Examples: +Examples +-------- python summarize-test-results.py log.json python summarize-test-results.py log.json --output json python summarize-test-results.py log.json --output table diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 8cb9efa873c..a5dc8a5498c 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -92,7 +92,8 @@ def random_bitmask(size): number of bits """ sz = bitmask_allocation_size_bytes(size) - data = np.random.randint(0, 255, dtype="u1", size=sz) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 255, dtype="u1", size=sz) return data.view("i1") @@ -209,9 +210,10 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): def gen_rand(dtype, size, **kwargs): + rng = np.random.default_rng(seed=kwargs.get("seed", 0)) dtype = cudf.dtype(dtype) if dtype.kind == "f": - res = np.random.random(size=size).astype(dtype) + res = rng.random(size=size).astype(dtype) if kwargs.get("positive_only", False): return res else: @@ -219,25 +221,23 @@ def gen_rand(dtype, size, **kwargs): elif dtype == np.int8 or dtype == np.int16: low = kwargs.get("low", -32) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "i": low = kwargs.get("low", -10000) high = kwargs.get("high", 10000) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype == np.uint8 or dtype == np.uint16: low = kwargs.get("low", 0) high = kwargs.get("high", 32) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "u": low = kwargs.get("low", 0) high = kwargs.get("high", 128) - return np.random.randint(low=low, high=high, size=size).astype(dtype) + return rng.integers(low=low, high=high, size=size).astype(dtype) elif dtype.kind == "b": low = kwargs.get("low", 0) high = kwargs.get("high", 2) - return np.random.randint(low=low, high=high, size=size).astype( - np.bool_ - ) + return rng.integers(low=low, high=high, size=size).astype(np.bool_) elif dtype.kind == "M": low = kwargs.get("low", 0) time_unit, _ = np.datetime_data(dtype) @@ -246,14 +246,14 @@ def gen_rand(dtype, size, **kwargs): int(1e18) / _unit_to_nanoseconds_conversion[time_unit], ) return pd.to_datetime( - np.random.randint(low=low, high=high, size=size), unit=time_unit + rng.integers(low=low, high=high, size=size), unit=time_unit ) elif dtype.kind in ("O", "U"): low = kwargs.get("low", 10) high = kwargs.get("high", 11) - nchars = np.random.randint(low=low, high=high, size=1)[0] + nchars = rng.integers(low=low, high=high, size=1)[0] char_options = np.array(list(string.ascii_letters + string.digits)) - all_chars = "".join(np.random.choice(char_options, nchars * size)) + all_chars = "".join(rng.choice(char_options, nchars * size)) return np.array( [all_chars[nchars * i : nchars * (i + 1)] for i in range(size)] ) diff --git a/python/cudf/cudf/testing/dataset_generator.py b/python/cudf/cudf/testing/dataset_generator.py index 13c194d6be0..99b686406fb 100644 --- a/python/cudf/cudf/testing/dataset_generator.py +++ b/python/cudf/cudf/testing/dataset_generator.py @@ -48,16 +48,22 @@ def __init__( self, cardinality=100, null_frequency=0.1, - generator=lambda: [ - _generate_string(string.ascii_letters, random.randint(4, 8)) - for _ in range(100) - ], + 
generator=None, is_sorted=True, dtype=None, ): self.cardinality = cardinality self.null_frequency = null_frequency - self.generator = generator + if generator is None: + rng = np.random.default_rng(seed=0) + self.generator = lambda: [ + _generate_string( + string.ascii_letters, rng, rng.integers(4, 8).item() + ) + for _ in range(100) + ] + else: + self.generator = generator self.is_sorted = is_sorted self.dtype = dtype @@ -96,7 +102,7 @@ def _write(tbl, path, format): tbl.to_parquet(path, row_group_size=format["row_group_size"]) -def _generate_column(column_params, num_rows): +def _generate_column(column_params, num_rows, rng): # If cardinality is specified, we create a set to sample from. # Otherwise, we simply use the given generator to generate each value. @@ -115,10 +121,8 @@ def _generate_column(column_params, num_rows): ) return pa.DictionaryArray.from_arrays( dictionary=vals, - indices=np.random.randint( - low=0, high=len(vals), size=num_rows - ), - mask=np.random.choice( + indices=rng.integers(low=0, high=len(vals), size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -142,7 +146,7 @@ def _generate_column(column_params, num_rows): column_params.generator, names=column_params.dtype.fields.keys(), mask=pa.array( - np.random.choice( + rng.choice( [True, False], size=num_rows, p=[ @@ -163,10 +167,10 @@ def _generate_column(column_params, num_rows): type=arrow_type, ) vals = pa.array( - np.random.choice(column_params.generator, size=num_rows) + rng.choice(column_params.generator, size=num_rows) if isinstance(arrow_type, pa.lib.Decimal128Type) - else np.random.choice(vals, size=num_rows), - mask=np.random.choice( + else rng.choice(vals, size=num_rows), + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -189,7 +193,7 @@ def _generate_column(column_params, num_rows): # Generate data for current column return pa.array( column_params.generator, - mask=np.random.choice( + mask=rng.choice( [True, False], size=num_rows, p=[ @@ -233,7 +237,9 @@ def generate( def get_dataframe(parameters, use_threads): # Initialize seeds if parameters.seed is not None: - np.random.seed(parameters.seed) + rng = np.random.default_rng(seed=parameters.seed) # noqa: F841 + else: + rng = np.random.default_rng(seed=0) # noqa: F841 # For each column, invoke the data generator for column_params in parameters.column_parameters: @@ -281,14 +287,16 @@ def get_dataframe(parameters, use_threads): if not use_threads: for i, column_params in enumerate(parameters.column_parameters): column_data[i] = _generate_column( - column_params, parameters.num_rows + column_params, + parameters.num_rows, + rng, ) else: pool = Pool(pa.cpu_count()) column_data = pool.starmap( _generate_column, [ - (column_params, parameters.num_rows) + (column_params, parameters.num_rows, rng) for i, column_params in enumerate(parameters.column_parameters) ], ) @@ -336,7 +344,7 @@ def rand_dataframe( """ # Apply seed random.seed(seed) - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) column_params = [] for meta in dtypes_meta: @@ -348,7 +356,7 @@ def rand_dataframe( lists_max_length = meta["lists_max_length"] nesting_max_depth = meta["nesting_max_depth"] value_type = meta["value_type"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) dtype = cudf.core.dtypes.ListDtype(value_type) @@ -368,6 +376,7 @@ def rand_dataframe( size=cardinality, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ), is_sorted=False, dtype=dtype, @@ -377,10 +386,11 
@@ def rand_dataframe( nesting_max_depth = meta["nesting_max_depth"] max_types_at_each_level = meta["max_types_at_each_level"] max_null_frequency = meta["max_null_frequency"] - nesting_depth = np.random.randint(1, nesting_max_depth) + nesting_depth = rng.integers(1, nesting_max_depth) structDtype = create_nested_struct_type( max_types_at_each_level=max_types_at_each_level, nesting_level=nesting_depth, + rng=rng, ) column_params.append( @@ -392,6 +402,7 @@ def rand_dataframe( cardinality=cardinality, size=rows, max_null_frequency=max_null_frequency, + rng=rng, ), is_sorted=False, dtype=structDtype, @@ -401,14 +412,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal64Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal64Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -417,14 +430,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal32Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal32Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -433,14 +448,16 @@ def rand_dataframe( max_precision = meta.get( "max_precision", cudf.Decimal128Dtype.MAX_PRECISION ) - precision = np.random.randint(1, max_precision) - scale = np.random.randint(0, precision) + precision = rng.integers(1, max_precision) + scale = rng.integers(0, precision) dtype = cudf.Decimal128Dtype(precision=precision, scale=scale) column_params.append( ColumnParameters( cardinality=cardinality, null_frequency=null_frequency, - generator=decimal_generator(dtype=dtype, size=cardinality), + generator=decimal_generator( + dtype=dtype, size=cardinality, rng=rng + ), is_sorted=False, dtype=dtype, ) @@ -469,6 +486,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -484,6 +502,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=dtype, @@ -497,7 +516,8 @@ def rand_dataframe( generator=lambda cardinality=cardinality: [ _generate_string( string.printable, - np.random.randint( + rng, + rng.integers( low=0, high=meta.get("max_string_length", 1000), size=1, @@ -519,6 +539,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -534,6 +555,7 @@ def rand_dataframe( size=cardinality, min_bound=meta.get("min_bound", None), max_bound=meta.get("max_bound", None), + rng=rng, ), is_sorted=False, dtype=cudf.dtype(dtype), @@ -544,7 +566,7 @@ def rand_dataframe( ColumnParameters( cardinality=cardinality, 
null_frequency=null_frequency, - generator=boolean_generator(cardinality), + generator=boolean_generator(cardinality, rng), is_sorted=False, dtype=cudf.dtype(dtype), ) @@ -567,7 +589,7 @@ def rand_dataframe( return df -def int_generator(dtype, size, min_bound=None, max_bound=None): +def int_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for int data """ @@ -577,7 +599,7 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo(dtype) low, high = iinfo.min, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=low, high=high, size=size, @@ -585,13 +607,13 @@ def int_generator(dtype, size, min_bound=None, max_bound=None): ) -def float_generator(dtype, size, min_bound=None, max_bound=None): +def float_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for float data """ if min_bound is not None and max_bound is not None: low, high = min_bound, max_bound - return lambda: np.random.uniform( + return lambda: rng.uniform( low=low, high=high, size=size, @@ -599,7 +621,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): else: finfo = np.finfo(dtype) return ( - lambda: np.random.uniform( + lambda: rng.uniform( low=finfo.min / 2, high=finfo.max / 2, size=size, @@ -608,7 +630,7 @@ def float_generator(dtype, size, min_bound=None, max_bound=None): ) -def datetime_generator(dtype, size, min_bound=None, max_bound=None): +def datetime_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for datetime data """ @@ -618,14 +640,14 @@ def datetime_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.datetime64(low, "ns").astype(dtype).astype("int"), high=np.datetime64(high, "ns").astype(dtype).astype("int"), size=size, ) -def timedelta_generator(dtype, size, min_bound=None, max_bound=None): +def timedelta_generator(dtype, size, rng, min_bound=None, max_bound=None): """ Generator for timedelta data """ @@ -635,25 +657,25 @@ def timedelta_generator(dtype, size, min_bound=None, max_bound=None): iinfo = np.iinfo("int64") low, high = iinfo.min + 1, iinfo.max - return lambda: np.random.randint( + return lambda: rng.integers( low=np.timedelta64(low, "ns").astype(dtype).astype("int"), high=np.timedelta64(high, "ns").astype(dtype).astype("int"), size=size, ) -def boolean_generator(size): +def boolean_generator(size, rng): """ Generator for bool data """ - return lambda: np.random.choice(a=[False, True], size=size) + return lambda: rng.choice(a=[False, True], size=size) -def decimal_generator(dtype, size): +def decimal_generator(dtype, size, rng): max_integral = 10 ** (dtype.precision - dtype.scale) - 1 max_float = (10**dtype.scale - 1) if dtype.scale != 0 else 0 return lambda: ( - np.random.uniform( + rng.uniform( low=-max_integral, high=max_integral + (max_float / 10**dtype.scale), size=size, @@ -661,32 +683,33 @@ def decimal_generator(dtype, size): ) -def get_values_for_nested_data(dtype, lists_max_length=None, size=None): +def get_values_for_nested_data(dtype, rng, lists_max_length=None, size=None): """ Returns list of values based on dtype. 
""" if size is None: - cardinality = np.random.randint(0, lists_max_length) + cardinality = rng.integers(0, lists_max_length) else: cardinality = size dtype = cudf.dtype(dtype) if dtype.kind in ("i", "u"): - values = int_generator(dtype=dtype, size=cardinality)() + values = int_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind == "f": - values = float_generator(dtype=dtype, size=cardinality)() + values = float_generator(dtype=dtype, size=cardinality, rng=rng)() elif dtype.kind in ("U", "O"): values = [ _generate_string( string.printable, + rng, 100, ) for _ in range(cardinality) ] elif dtype.kind == "M": - values = datetime_generator(dtype=dtype, size=cardinality)().astype( - dtype - ) + values = datetime_generator( + dtype=dtype, size=cardinality, rng=rng + )().astype(dtype) elif dtype.kind == "m": values = timedelta_generator(dtype=dtype, size=cardinality)().astype( dtype @@ -699,14 +722,14 @@ def get_values_for_nested_data(dtype, lists_max_length=None, size=None): return values -def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): +def make_lists(dtype, lists_max_length, nesting_depth, top_level_list, rng): """ Helper to create random list of lists with `nesting_depth` and specified value type `dtype`. """ nesting_depth -= 1 if nesting_depth >= 0: - L = np.random.randint(1, lists_max_length) + L = rng.integers(1, lists_max_length) for i in range(L): top_level_list.append( make_lists( @@ -714,11 +737,14 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) else: top_level_list = get_values_for_nested_data( - dtype=dtype, lists_max_length=lists_max_length + dtype=dtype, + lists_max_length=lists_max_length, + rng=rng, ) # To ensure numpy arrays are not passed as input to # list constructor, returning a python list object here. @@ -728,22 +754,22 @@ def make_lists(dtype, lists_max_length, nesting_depth, top_level_list): return top_level_list -def make_array_for_struct(dtype, cardinality, size, max_null_frequency): +def make_array_for_struct(dtype, cardinality, size, max_null_frequency, rng): """ Helper to create a pa.array with `size` and `dtype` for a `StructArray`. """ - null_frequency = np.random.uniform(low=0, high=max_null_frequency) - local_cardinality = max(np.random.randint(low=0, high=cardinality), 1) + null_frequency = rng.uniform(low=0, high=max_null_frequency) + local_cardinality = max(rng.integers(low=0, high=cardinality), 1) data = get_values_for_nested_data( - dtype=dtype.type.to_pandas_dtype(), size=local_cardinality + dtype=dtype.type.to_pandas_dtype(), size=local_cardinality, rng=rng ) - vals = np.random.choice(data, size=size) + vals = rng.choice(data, size=size) return pa.array( vals, - mask=np.random.choice( + mask=rng.choice( [True, False], size=size, p=[null_frequency, 1 - null_frequency], @@ -756,7 +782,7 @@ def make_array_for_struct(dtype, cardinality, size, max_null_frequency): ) -def get_nested_lists(dtype, size, nesting_depth, lists_max_length): +def get_nested_lists(dtype, size, nesting_depth, lists_max_length, rng): """ Returns a list of nested lists with random nesting depth and random nested lists length. 
@@ -770,13 +796,14 @@ def get_nested_lists(dtype, size, nesting_depth, lists_max_length): lists_max_length=lists_max_length, nesting_depth=nesting_depth, top_level_list=[], + rng=rng, ) ) return list_of_lists -def get_nested_structs(dtype, cardinality, size, max_null_frequency): +def get_nested_structs(dtype, cardinality, size, max_null_frequency, rng): """ Returns a list of arrays with random data corresponding to the dtype provided. @@ -787,7 +814,7 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): for name, col_dtype in dtype.fields.items(): if isinstance(col_dtype, cudf.StructDtype): result_arrays = get_nested_structs( - col_dtype, cardinality, size, max_null_frequency + col_dtype, cardinality, size, max_null_frequency, rng ) result_arrays = pa.StructArray.from_arrays( result_arrays, names=col_dtype.fields.keys() @@ -798,13 +825,14 @@ def get_nested_structs(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) list_of_arrays.append(result_arrays) return list_of_arrays -def list_generator(dtype, size, nesting_depth, lists_max_length): +def list_generator(dtype, size, nesting_depth, lists_max_length, rng): """ Generator for list data """ @@ -813,10 +841,11 @@ def list_generator(dtype, size, nesting_depth, lists_max_length): size=size, nesting_depth=nesting_depth, lists_max_length=lists_max_length, + rng=rng, ) -def struct_generator(dtype, cardinality, size, max_null_frequency): +def struct_generator(dtype, cardinality, size, max_null_frequency, rng): """ Generator for struct data """ @@ -825,25 +854,26 @@ def struct_generator(dtype, cardinality, size, max_null_frequency): cardinality=cardinality, size=size, max_null_frequency=max_null_frequency, + rng=rng, ) -def create_nested_struct_type(max_types_at_each_level, nesting_level): +def create_nested_struct_type(max_types_at_each_level, nesting_level, rng): dtypes_list = cudf.utils.dtypes.ALL_TYPES - picked_types = np.random.choice(list(dtypes_list), max_types_at_each_level) + picked_types = rng.choice(list(dtypes_list), max_types_at_each_level) type_dict = {} for name, type_ in enumerate(picked_types): if type_ == "struct": type_dict[str(name)] = create_nested_struct_type( - max_types_at_each_level, nesting_level - 1 + max_types_at_each_level, nesting_level - 1, rng ) else: type_dict[str(name)] = cudf.dtype(type_) return cudf.StructDtype(type_dict) -def _generate_string(str_seq: str, length: int = 10) -> str: - return "".join(random.choices(str_seq, k=length)) +def _generate_string(str_seq: str, rng, length: int = 10) -> str: + return "".join(rng.choice(list(str_seq), size=length)) def _unique_string() -> str: diff --git a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt index 84b13c9d946..566ac2c337d 100644 --- a/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt +++ b/python/cudf/cudf/tests/data/subword_tokenizer_data/bert_base_cased_sampled/vocab-hash.txt @@ -1,4382 +1,4382 @@ -26899 -27424 +19535 +9039 875 -7428432802425011718 0 -5054974408289448963 6 -18358444369622338053 9 -5716902217424485892 14 -8236612966193239043 18 -15282833726017872390 21 -15533348956988973570 27 -9001315167781089284 29 -7621090240282984451 33 -15337888141402371590 36 -16169070283077377537 42 -15615300272936709634 43 -12338784885023498756 45 -3175624061711419395 49 -9436392785812228615 52 
[... remainder of the 4382-line regenerated vocab-hash table omitted: uniformly (hash, offset) integer pairs with no individually meaningful entries ...]
-9678333478858482691 3362 -14661606109051090440 3365 -9504123850532876291 3373 -14299233528797568008 3376 -10370491504729965060 3384 -286239823911254530 3388 -7969121812144744451 3390 -16606218867148559880 3393 -11756345184017143302 3401 -8204961944753809412 3407 -12456910480062157316 3411 -7569786299014196739 3415 -3372309516929818119 3418 -16631131943564946948 3425 -4436969913528429575 3429 -14467771002258720772 3436 -15278270405312088583 3440 -6638334178561090565 3447 -8154814430089498114 3452 -17289464348431017987 3454 -13185969354886446085 3457 -4725380864147687429 3462 -14933071000620043778 3467 -12471883028204926466 3469 -13286302152236950530 3471 -12020003522260348419 3473 -11784545509165047810 3476 -10311182359550097412 3478 -2262872037167824902 3482 -15672162207595698690 3488 -8479660175647360516 3490 -543122224331105283 3494 -8738610060644560897 3497 -15969479020845567490 3498 +0 0 +1196190418526572547 0 +3117251964976502276 3 +0 7 +3266452963994632202 7 +6701451810090115586 17 +10156473964989528067 19 +6270220596053033473 22 +8689732391113957377 23 +345423933508452359 24 +9048486634542125058 31 +13000119181766437380 33 +1008808785591799299 37 +12586249368236978177 40 +11161089178393358857 41 +0 50 +6900865085865625094 50 +2615908179610132483 56 +1617129254806601731 59 +1607892326666533378 62 +123501381755693059 64 +17180234710792039941 67 +17345742025318016002 72 +7933590365928361474 74 +16187522989672200717 76 +14893593683284454915 89 +6001767212789422083 92 +1805417936920808451 95 +8589625060174958594 98 +13148488988905702416 100 +6759231203841442819 116 +798806762886474754 119 +13949836854106156034 121 +4277844318153606661 123 +18162360468357982216 128 +17429735113921325570 136 +10428297564837543938 138 +10174389176493224450 140 +4782734429389924866 142 +16828613770926935558 144 +16924367891356487169 150 +15473269356473895940 151 +10277883249583756290 155 +7398921953351034881 157 +15672774546004063755 158 +7032338026028942337 169 +12638648541163088900 170 +11956890857542837252 174 +10813991647348979717 178 +698603259209416204 183 +104155371596876289 195 +8849883347580968451 196 +13523964487472320004 199 +12948374094552270339 203 +16624700721113753096 206 +0 214 +630014773304871940 214 +14669827911540386306 218 +16593543947487157254 220 +16189120489289924617 226 +5936869209199720450 235 +6504800368776816645 237 +17628010111075734529 242 +16073662248530872322 243 +15997624981342335497 245 +13519486007586370049 254 +469623719382726661 255 +10478598590185625089 260 +5239294057556035586 261 +17274642882001730567 263 +7924882265216651266 270 +13138720901108912133 272 +13741737182438464004 277 +14608811194009491970 281 +2489742908982890509 283 +14952279757728973318 296 +13432486964055121926 302 +15397241996877524995 308 +7400937882698838020 311 +13309132794101168654 315 +8519404085542453250 329 +2551722931538879493 331 +4492819152473235971 336 +9634175483270757380 339 +5023439465649179147 343 +2912624940235659267 354 +15615524075652075524 357 +15131856319265032196 361 +7560465986110364673 365 +16393161300057821706 366 +6737538541011470849 376 +6394493716971627523 377 +0 380 +6957953643235488257 380 +7533365794097524234 381 +11551517784611555841 391 +0 392 +14017003685401013761 392 +13868858036311946245 393 +609890416048967688 398 +15853752823436186626 406 +13008887538399190534 408 +275598997711474690 414 +612244017304434692 416 +265561555991638021 420 +0 425 +4771730300985403909 425 +14595656195986303489 430 +13010615142623560194 431 +3520044222049365512 433 +4843556531627173889 
441 +9544321596489038851 442 +18097338319835691009 445 +17588488217883868161 446 +4553739803879796748 447 +12247953831639953411 459 +1685939678565356546 462 +2454121115370725890 464 +7699707784321416706 466 +2322428462912444939 468 +4251948422489921028 479 +8009626371771665409 483 +15830912148611917313 484 +15530208627603713027 485 +14550069280077337095 488 +3074860258671426050 495 +9819565310679728648 497 +0 505 +239920763215632386 505 +4479084686100589069 507 +7541436040510714881 520 +0 521 +18361828565940659201 521 +13943609537766478850 522 +1644071836581560844 524 +3325147442114083333 536 +9121949682662027269 541 +5375060563545179653 546 +11461944020052039682 551 +10205876604940857353 553 +17856338086929782276 562 +3964733248608209412 566 +15252617693956101123 570 +5198588053258159617 573 +7294352613378259976 574 +14274593384918848004 582 +12443356879762990084 586 +15967601366558600195 590 +0 593 +1596502676746638348 593 +3447763432008799745 605 +2154246728958848517 606 +1249748142575979010 611 +12802117032328183298 613 +14720455521613154825 615 +14431397366571454983 624 +8968154969419252739 631 +61922506310515202 634 +17332184019644205571 636 +1580044533016865796 639 +0 643 +16037339623732295172 643 +0 647 +6451385579602643969 647 +2249232807147062791 648 +15969372656029624833 655 +9184080755936318981 656 +10444965622910510594 661 +976846907109217284 663 +15036566770534162954 667 +2852219209756952581 677 +14428186506827194885 682 +0 687 +9583345567128655877 687 +8154021185610424842 692 +7639653587249864197 702 +284400846134645765 707 +5822594207495943172 712 +4666916656146452484 716 +10837424823999667726 720 +7662230599689246212 734 +16769958284715374596 738 +14214321919518354947 742 +7700892892210644993 745 +5647486165416790024 746 +12807160877623480835 754 +17202327424132939777 757 +5849043248643779075 758 +18232796011600235523 761 +4957062118189902859 764 +6105730765254667266 775 +8753292226633308675 777 +14066686889142136835 780 +1047708050925830148 783 +5555751253338228747 787 +8205438979066793987 798 +10100035083082646017 801 +3037731532850264067 802 +16470238215781450756 805 +15841867742103541257 809 +8087512074161331714 818 +15493250668750321668 820 +3797087601271950854 824 +2623502875154101252 830 +15159098560356506121 834 +343051006899596292 843 +16668194639613285891 847 +0 850 +9601059867653113858 850 +1570493927206813191 852 +9118300038493915138 859 +9563382677447647747 861 +5285530497249013763 864 +14598000812816350721 867 +15243372398425255435 868 +9815541045508240385 879 +408899826773384197 880 +7463961818871554565 885 +12980371725716597249 890 +15376403281856848903 891 +0 898 +5841652391326774789 898 +6476912065420260354 903 +3963854010828661252 905 +5784218172655345161 909 +15327721657175197701 918 +13180549833166182403 923 +15904501101973266436 926 +0 930 +14206180323061139974 930 +1106786522797875713 936 +17058832169116321282 937 +721828206256696835 939 +0 942 +8561789411832569355 942 +13374043249168898050 953 +15922789491870388229 955 +0 960 +16131878595889026564 960 +5509499768642979336 964 +12415614990376579585 972 +11304605070154481157 973 +7663245502729528834 978 +2692663086158549507 980 +14133757573751133701 983 +6813598296480126979 988 +13616528755765764611 991 +16303994430841145861 994 +12880492472155407874 999 +14023778603465187338 1001 +1658551813664662018 1011 +8148008758896362498 1013 +10688946549204321795 1015 +13274653424094307841 1018 +10847911221158770190 1019 +0 1033 +4643539771717744131 1033 +4169507947260962821 1036 +3126526255358650372 1041 
+13449815687571241992 1045 +9421207081901200898 1053 +6898163624184020997 1055 +7290174431607841794 1060 +2741902156609523715 1062 +15499057183587255302 1065 +16461426401301993476 1071 +11278211202787295747 1075 +0 1078 +9413985875830324739 1078 +4646548733144616463 1081 +7078801759685020673 1096 +5376123263925219331 1097 +14227335667134915589 1100 +0 1105 +7295351152600562699 1105 +0 1116 +1397641409882635269 1116 +2364632016557825025 1121 +7290779788839345158 1122 +223977268476071945 1128 +13026660262516529667 1137 +17998435953459809796 1140 +8522469059272339460 1144 +16293947433309880833 1148 +4576500186674335749 1149 +0 1154 +4042247147937702403 1154 +3034443556411821057 1157 +13667368622259281923 1158 +15202537810082257934 1161 +15337640185400698372 1175 +8308041085868251649 1179 +8832030889396702722 1180 +10436989792260434949 1182 +14898581533124037641 1187 +9317528159836099585 1196 +1612938252083390982 1197 +6278485319310800898 1203 +10612805446261845508 1205 +13787162434835940874 1209 +12133705386992745478 1219 +5227473436681376774 1225 +5656787771058157057 1231 +4433258109319585794 1232 +6704526927800668169 1234 +17440456789764264962 1243 +6979104089888754689 1245 +10768049747876580866 1246 +15707303682313568257 1248 +15148244407999994380 1249 +2841265161354426373 1261 +5252307512862989316 1266 +13331565891980378113 1270 +18159416118263116290 1271 +501516395825858060 1273 +3867012501081805829 1285 +8267472486312505860 1290 +12872828689431491073 1294 +727773195231890946 1295 +7322382021491738631 1297 +5402024496579473921 1304 +6959655625064837122 1305 +10187142685062514177 1307 +3029360479097259523 1308 +3524388403479357447 1311 +5803404108302127107 1318 +3322880653425492483 1321 +14014789072627667972 1324 +0 1328 +17779075582177396743 1328 +11597164340541700097 1335 +18164718194518923266 1336 +0 1338 +3688441162538457604 1338 +12763684824056344584 1342 +6555198237040291843 1350 +8999497138912988675 1353 +9277828726380557826 1356 +1652226711750231042 1358 +6386464493042135559 1360 +11832103051565904386 1367 +7889400420599073793 1369 +5173699340624307713 1370 +9839391635984425985 1371 +9179189546563518985 1372 +8987610858276033026 1381 +14211262843725043205 1383 +9924217736728436740 1388 +4401850895204555779 1392 +5541709837691148811 1395 +10214740045672277507 1406 +14656675767246138369 1409 +5518164076312088578 1410 +8819194535554354691 1412 +1202694809888231436 1415 +9937648736864647683 1427 +4776509399304216066 1430 +3828150896429232641 1432 +9726415758235178498 1433 +15478358790166008844 1435 +0 1447 +447632828248568324 1447 +10254625284015096321 1451 +9602208154038649858 1452 +7918490636759656966 1454 +4464032935723660291 1460 +517803065456797188 1463 +11296051306811729413 1467 +9559870439106258948 1472 +18140734313948729864 1476 +5761393475703308289 1484 +5817187969532432391 1485 +7214411138154648580 1492 +8556555308704695297 1496 +5517275039512219661 1497 +155198283803470849 1510 +12028807386786979841 1511 +9402878779861331461 1512 +7529466829850301953 1517 +3700043109242268166 1518 +7889220073888590849 1524 +9698905706548099588 1525 +950350740255051780 1529 +16659267722661032455 1533 +11934825441675277832 1540 +1840952787151591937 1548 +3181706929772123141 1549 +13084360636440561667 1554 +7392348362663288323 1557 +11299566685738323463 1560 +11865504406956790788 1567 +470806909387516931 1571 +11392390055026286594 1574 +0 1576 +15250035972710306824 1576 +1841748561073501700 1584 +13959366503388518404 1588 +16383575845586120707 1592 +5993903773214649347 1595 
+12927537188954086928 1598 +6310676060569643522 1614 +6823572598110530053 1616 +0 1621 +10355215107753852930 1621 +12991560131813107723 1623 +6463225875312731650 1634 +444925180768886788 1636 +8287375501749122564 1640 +8102699978355624961 1644 +3217121844483982342 1645 +0 1651 +15310893597687290371 1651 +4651888484278436356 1654 +16622466823413339137 1658 +14426029300798547465 1659 +16208338759425902084 1668 +13384891560853317123 1672 +10542264124115582467 1675 +0 1678 +13404868863569442317 1678 +8380728838811013123 1691 +2656782871938641923 1694 +5621105522992570375 1697 +16165957063051496962 1704 +17183335989224497157 1706 +0 1711 +12377944724210268163 1711 +15698714840429098497 1714 +2063306500131813891 1715 +7135499884796623879 1718 +14916197160702468612 1725 +14565364212611500547 1729 +17109666354199615491 1732 +18420265465448709122 1735 +5039636110599831051 1737 +13648715526743665665 1748 +8648155745742680580 1749 +0 1753 +4128476852805537282 1753 +12229435493123252233 1755 +18671114624524289 1764 +0 1765 +4330985506003776003 1765 +4960636854468069379 1768 +2825174586054641673 1771 +8083214972260871169 1780 +1656668836635006471 1781 +15658718806708214274 1788 +1364137667359422465 1790 +5440910769879224326 1791 +1242060995600047617 1797 +6028285323527704577 1798 +9862524515548398083 1799 +14095132043223516673 1802 +5330121798209797643 1803 +3047808178481674242 1814 +7009881287782938629 1816 +3836453927748870146 1821 +4828562734878493698 1823 +6251707885160171534 1825 +13503013357676597250 1839 +13120060435028427777 1841 +17453157023102628866 1842 +6659266074333195266 1844 +12122449770852231175 1846 +76872493233309186 1853 +10510620038219076100 1855 +3104474465142299652 1859 +15145875800387371010 1863 +14514645157364972555 1865 +5990940750853294082 1876 +9568631318395414530 1878 +13307393937882497539 1880 +0 1883 +13432428898749511691 1883 +2851874300532727813 1894 +16127254686981486084 1899 +11152828733555106817 1903 +8099684063905722369 1904 +10726727557015251463 1905 +0 1912 +16773004137299201537 1912 +0 1913 +1737396243104320517 1913 +12312810570815952904 1918 +8420117868402509825 1926 +4468099455608655362 1927 +17181412210024682497 1929 +7344171998747088899 1930 +11200240032637073926 1933 +9773885730549905922 1939 +2888420847349521921 1941 +0 1942 +3301971714535044611 1942 +6622000068430301708 1945 +14679279568503564291 1957 +15312513401406547971 1960 +11219696574507219971 1963 +15557068645919193090 1966 +14518831268196627465 1968 +11306244334020066818 1977 +445302382600591361 1979 +4798518764725378563 1980 +12833053520101596161 1983 +6569110733351726088 1984 +1133142439547627010 1992 +6020738327851480577 1994 +0 1995 +0 1995 +15123217074875560455 1995 +5146261845254048769 2002 +15577303646915962882 2003 +5068854713026915334 2005 +5662217880612308482 2011 +13584286678752042508 2013 +17647669975855288324 2025 +7092182408195844613 2029 +5243600304614296065 2034 +16379641210199802883 2035 +6541142296931350023 2038 +17648968980389751301 2045 +3633167252938199556 2050 +691728008305302531 2054 +7434042972483105284 2057 +1243474674683616271 2061 +439217426838173186 2076 +10460352595647090183 2078 +5080394082232633345 2085 +7346464481151790597 2086 +8068677175549539843 2091 +4859996294860352513 2094 +12470823893961605122 2095 +10033529424736163842 2097 +10769920382809060357 2099 +16128670331104411146 2104 +2973668094989328385 2114 +16323032859702780931 2115 +12227727930958763521 2118 +7302528030871866371 2119 +8967586997946816013 2122 +13935701471042006020 2135 +15676859696752227844 2139 
+0 2143 +2397906929972799494 2143 +731429270944234509 2149 +14629591375919925252 2162 +14201687141277194244 2166 +8813493889730974725 2170 +4967156306307785221 2175 +12152782138863493635 2180 +5716269545878689795 2183 +12118250850399448070 2186 +10079764034817249795 2192 +9905170822798166018 2195 +7330246949116896272 2197 +4975588281894977539 2213 +2377967791858227715 2216 +1711948357573607427 2219 +15733402191778006532 2222 +13617127880905861132 2226 +5413022680339381252 2238 +12001217113207191043 2242 +605362804928389124 2245 +10888521749365150723 2249 +11742554576381655052 2252 +3591551764774430724 2264 +8647496912976230402 2268 +3843626828621262342 2270 +3921763517492323331 2276 +7707493410895858692 2279 +3920334550068498946 2283 +2658528064200329217 2285 +9038122947820533253 2286 +6952499746958836740 2291 +7951530266135717388 2295 +16076637508890388481 2307 +15187897527562671106 2308 +5520701509759360003 2310 +2598679891400145409 2313 +17512255026679867408 2314 +10995766946592999425 2330 +18117038245928559618 2331 +5391766950501834244 2333 +14461374868186265605 2337 +1273598128050393611 2342 +11820949665032480260 2353 +17841646829021216260 2357 +10200569215461547521 2361 +3670141860910412289 2362 +18396940417538187269 2363 +14261984156631670787 2368 +106960762513502723 2371 +16393357936187300353 2374 +7032931990465729538 2375 +15907195827890083338 2377 +16437195285078765571 2387 +17301257309241798147 2390 +8236593629924756481 2393 +1379157623727557125 2394 +14767417508072398345 2399 +16695407490005887489 2408 +1414009372711604744 2409 +499004129948061185 2417 +5775255721778604547 2418 +16754393591199635469 2421 +10568987941526160386 2434 +3311623553148127749 2436 +10255724520964794369 2441 +3121950734017230849 2442 +2129428121322164230 2443 +5233872436075409922 2449 +5115946926893418500 2451 +298818270766586369 2455 +2534391384903305218 2456 +13962240998865999372 2458 +2858192092257344002 2470 +2246014736733727747 2472 +18208224108542041605 2475 +5900635063125726209 2480 +8459478259862856201 2481 +3106812066263162882 2490 +6016756381746226178 2492 +375597697640802819 2494 +2513762961093744131 2497 +15366269329105501700 2500 +10035949288505144322 2504 +427851159373997574 2506 +4274431321888115714 2512 +5253654952100000770 2514 +16894221500064376839 2516 +14687193167626954754 2523 +13771965837935513090 2525 +8874009193925074945 2527 +4974093839237721093 2528 +741620693598341642 2533 +11991618038806280705 2543 +11116672093208526850 2544 +15807249887587362818 2546 +7323942637968351746 2548 +3660270925885407751 2550 +0 2557 +10684033640943126020 2557 +16989816981004759553 2561 +9001924880900419075 2562 +1998443310251235851 2565 +17567979874939109890 2576 +13652482471668535812 2578 +17509569230481751555 2582 +3182500785161561606 2585 +13325982159032983558 2591 +1923914978402147329 2597 +5589189981371284484 2598 +1161601912578541572 2602 +1916235467976744451 2606 +16280831412119656968 2609 +5531274414859838467 2617 +13599333592024061957 2620 +17989155199582565378 2625 +3030922814179764740 2627 +14644007464957335564 2631 +0 2643 +5497605392959732225 2643 +2032331457863458818 2644 +8100338548587463682 2646 +993329328502006794 2648 +6750921732510502913 2658 +13748899324120622595 2659 +15617703054210413571 2662 +13138109094843573761 2665 +6544485718564688390 2666 +4168731610225209858 2672 +7315066071491735044 2674 +11306658702491732995 2678 +1460741416990041090 2681 +8624484085251326469 2683 +4952143576172173826 2688 +11470130411385533445 2690 +8808161070990530055 2695 +3407659004810532870 2702 
+9761503347061253645 2708 +347929962150473217 2721 +15682869073661250565 2722 +12636859761190001153 2727 +2169559175677957635 2728 +6583723534446631435 2731 +11332478688871909892 2742 +3541912969021597188 2746 +15665073567582359041 2750 +6811971824255872515 2751 +17832657550632072714 2754 +8908928359280249862 2764 +16149194899805562374 2770 +16584564148406323202 2776 +8926638669588577795 2778 +8056234806465729542 2781 +20557314279745028 2787 +1574148835258315780 2791 +0 2795 +5593745704266732037 2795 +8450014032945361420 2800 +7024373575570305540 2812 +11737655816003366406 2816 +4727037432569372673 2822 +8600949146786643459 2823 +9003058529087846919 2826 +14052664559056898 2833 +1424791599736305667 2835 +5413427196124183555 2838 +13050600684981920260 2841 +8589685071512056331 2845 +13186761374251900929 2856 +14090913721681066498 2857 +0 2859 +2742241767433926657 2859 +6309431184810384395 2860 +16867533333923942913 2871 +555261403132789763 2872 +5659601479152637444 2875 +18276768397881284098 2879 +6852010445819064844 2881 +16631838326863331329 2893 +246764640492975110 2894 +1313867708490425347 2900 +8944238870676823556 2903 +1060867472129666057 2907 +16635885715046522883 2916 +13334184179287121921 2919 +1341139991463623173 2920 +0 2925 +6310211216600221189 2925 +3521973268169620995 2930 +1462184866304097281 2933 +8359017763585949185 2934 +14138351761235446785 2935 +6817592922583008262 2936 +0 2942 +6385096150346020868 2942 +0 2946 +5484657660585723395 2946 +10615912620259059212 2949 +11956475177743584771 2961 +14617995947569946629 2964 +16460942815259223553 2969 +9814422111234662404 2970 +4608931955518876683 2974 +8617716815688349187 2985 +17740454941921826819 2988 +0 2991 +10586556775954286081 2991 +11028786367153901576 2992 +7561184979369551368 3000 +10180555287637633027 3008 +262376940139235842 3011 +1252244297117510657 3013 +17286434400127825418 3014 +11940732067173687811 3024 +9446744360256471555 3027 +583923543216445954 3030 +8153426984110241281 3032 +8998238685693393417 3033 +11022193474305971204 3042 +18018779292443289604 3046 +13782486654821986817 3050 +1031535266324627457 3051 +17367371162468022278 3052 +16063095350159409665 3058 +16006913374966627331 3059 +0 3062 +317830424679224322 3062 +14882116247225631239 3064 +9977848214775454210 3071 +15016859152309685763 3073 +1451917599200393219 3076 +14163345466838668289 3079 +7124786413716748809 3080 +8972415547684808706 3089 +17905923295565835779 3091 +11508735911159903238 3094 +1060738927182784515 3100 +3235164743035444235 3103 +7249634886133244929 3114 +13627026919527422469 3115 +804144428748921345 3120 +4260278694170215937 3121 +2554890109424057864 3122 +0 3130 +2939022249034957313 3130 +3727916159743203841 3131 +14170274700031256577 3132 +7153627445263524879 3133 +6798175517396767234 3148 +1899052595905691141 3150 +4651137331222245891 3155 +14020723224952528387 3158 +5768869715157669895 3161 +13394211108659571714 3168 +15788932119193980932 3170 +13584005658508513793 3174 +9286626632069867523 3175 +2398026920081879562 3178 +1285989134179298818 3188 +9371873775174273029 3190 +18182246561705410049 3195 +3627164815665507843 3196 +18002283031389555722 3199 +13723140536667785217 3209 +11940684153082156547 3210 +16151440538186193925 3213 +13475891972713434115 3218 +5932226594251481096 3221 +15508203776273810434 3229 +13958242421862434307 3231 +2178759546197172739 3234 +12536204645038731778 3237 +14021691565090239498 3239 +0 3249 +18424936840617633797 3249 +9515558058741110274 3254 +14427656809453646337 3256 +15295479713001905676 3257 
+6924455800485778945 3269 +5547275743159208965 3270 +15965423529103676930 3275 +6276065480049782274 3277 +923852355669990415 3279 +5171389834127005698 3294 +15756927494767584258 3296 +5380717287071449607 3298 +6048706605171842052 3305 +10493631130929582093 3309 +2792686703001238018 3322 +16318095573166788102 3324 +14961739739381704706 3330 +13885085964549002242 3332 +8803999472247604229 3334 +13681809489997040642 3339 +1274343414475602434 3341 +17525390131260455942 3343 +4637625228183366658 3349 +8313154017818126861 3351 +13090076428282480132 3364 +18133227728108545 3368 +8282473413611347970 3369 +107193099920609282 3371 +8505179371271580173 3373 +11102079825957593602 3386 +10212767298703785475 3388 +5215453497761775618 3391 +3298152084179375111 3393 +1095163960428030473 3400 +16887781145875813889 3409 +14786085928210816520 3410 +8581278387803219458 3418 +6241337607249230852 3420 +9254719800476612099 3424 +2568855290428722689 3427 +1289519920250085381 3428 +14618186241114017793 3433 +9612541243912769538 3434 +13926515287424429066 3436 +11093957915681312769 3446 +12010544601346956290 3447 +11839562359654205442 3449 +6839541636025740804 3451 +6012482217637302795 3455 +0 3466 +5775335776577318914 3466 +2685494297938271233 3468 +18186802079969910787 3469 +3127521196291951624 3472 +6934893239724900866 3480 +11630798772510404609 3482 +2767762624498050052 3483 +14135084772626181124 3487 +11643008759045397001 3491 3500 -5303047073946667464 +14107087915135404740 +3545799512027105927 +32996413518841137 +15568274631689570656 +20587511236070012 +2390363305266056430 +3863606920688567965 210658854139 +9870724567599405 +103154228 +3007753865419557454 493093586 -15289397349632312454 -5941764183477191834 -3477193953305167424 -236453760381 -7470284155521404014 -24445261 -16426766960960540026 -14549236 -817365937 +814220189 +538968856 +45810044 +11403474 +2625321602296383846 +3076135121411313050 +16635669954819197974 +5514354727165429372 +18413391979390173264 +3544953467117898450 +6361518319333476776 +5833854247140395797 +518849275 +2752627 +71565807 +9870724570416301 +163316374 +60096910 +817038254 +18411417877468545037 +5993603989931887912 +1873618523431177265 +14787093348585572176 +18413109988782047308 +1283692271244348427 +17461812017531651650 +13165236096819726043 +14883032307819284131 +2789363538679106064 +11161692095903435283 +62914993 +2365498112266798670 +154665586 +13726068529822894439 +5570718 +544604964 +33560368941433940 +819856323 +1873618458944931675 +1873618489039064439 +6156738032733324876 +10259573046193883986 +6208295848581203181 +5991347927496394467 +2272905061487347697 +8972557000702888938 +15289397384024950845 +4767727591019973374 +10758418391935812957 +2292825785040636736 +1545208828 +219257441372 +5569296714050766113 +2207492642904016905 +12612941966326959190 +12426051065400527122 +18331556280207363 +2785415334835848520 +6156737968247080128 +15292217517958891614 +5780604328577598853 +3188833133853148985 +4078298757842341053 +6051485356288903427 +573178715 +102957618 +91488775 +2625321602296187261 +114426460 +22675774 +11206864 +9870724567402585 +5406444726343502428 +68551110 +515834601 +2431124533 +538772246 +11065179658016983681 +8930986418384079868 +4076606646528706921 1873618471841499416 -71893492 -10694515171064744788 -29330183088506125 -61997475 -4653200 -109445719 -8926052536804313893 -7528330190111771360 -1418462186 -5887104182899575287 -2625321597997091447 -23407864425745813 -1647838213 -6152225753094686522 -14151987057237756511 -18058417591402760409 -538510099 
-17855463731522440261 -240752528220 -27920040887059601 -11078361536363433136 -12517601 -15885957841278600403 -518718202 -805438326 -2621553 -1550910461 -2411070513 -59965836 -13012951802392676509 -97518103 -2625321602295859611 -30277976 -546374457 +3701601059573925529 +16166203682344470241 +6101795981361546864 +15289397371128186695 +7569568047215545466 +18411981910273949729 16759426304739641933 -259654328 -27356063970624739 -1873618458944931675 -6209987959894902621 -5728764444739437994 -18413109988782047308 -13885455448020813663 +48431492 +24535874148371011 +14024943 +59900299 +105775699 +10770155859627543824 +71369196 +9870724570219682 +163119765 +2530739313276357975 +5052785364214352114 +805372789 +5652457623480305518 +644809585 +816841645 +2556016 +4501477955215362649 +4502324021619918399 +2150364451440035988 +6156455943246842659 +1873618497637649718 +12309852946450942075 +3660444556051220001 +11103300151687644832 +8714520725396523830 +5461104765611607541 +27356033875641745 +5352348805862394041 +2012415014 +5151629580948802356 +5374107 +154468975 +108593749 +62718382 +16843031 +28311895 +1107456968073808590 +11490081257974859839 +16633695840000739887 +9386257335747873389 +4959080478982475006 +11408348231855703653 13464164481390611573 -5514354709969504081 -6364097374632348674 -2676033351739376985 -1136798196293306910 -5299098874403555921 -2120987217453057458 -17306856587979066781 -1873618532028844481 -5572365145471912335 -18412263926676652075 -105382480 -5303047039553965447 -9881712940254169714 -152830562 -8610102806501591788 -15524263781940136850 -14282671233461718187 -2857298572705729021 -29330122900898936 -10554335258691243263 -8453377129057749572 -18411417864571256842 -811271050 -1873618489038604579 -4657106642463886071 -2676033356038145381 -514654951 -10757572347027851837 -4237766514325588729 -571999061 -9821766011288487605 -7230168968130792223 -2704904949959166469 -1823671323 -103350839 -46006654 -2755882956846859930 -15289397371128186695 -12662636664722033563 -16318735 -18411417894664929297 -5462796894122411284 -9950019064427710530 -6981729909914862956 -1992588707391932346 -63766972 -6422699 -23407808536904833 -15394822466617412826 -16881139139804531782 -14312300901618944289 -2625321593698061230 -9870724570679212 -5780604289886653255 -3870997034531752803 -2531021389865944442 -10908568553618343357 -1860700038481053299 -196215461 -1801847830 -24183115 -18424247431471827427 -14287090 -417019855960 -71631344 -4391052 -61735328 -18413674012989259870 -2625321597996829544 -17957750408840481687 -9870724568648556 -41943405 -2789363542978135882 -18412827950883864637 -548143940 -22151483 -17257283845880874759 -899112529018292807 -538247952 -69599701 -8510664359869943178 -27356081165698156 -27638084672359236 -12255453 -11400819049620310987 -1321272283 -16881139122607162703 -2359405 -3101815889301670444 -518456056 -9232147856523987724 -3758799212073651272 -3591160524196219107 -154600049 -17946608694533885076 -11500631658516907905 -825323275339564903 -9870724566615620 -39911783 -12318365723907459763 -546112310 -18412827980977537092 -536216330 -2676033351739114988 -11069796553860646809 -7880043043777809442 -451412296787 -18411981918872141859 -11678577273375754735 -8856014234050823647 -105120332 -1309344723 -162464400 -681145240220010584 -2626514825137096412 -6589396841525218018 -356832249381 -6156738032733324876 -11202456151687629452 -27638041680086900 -11243723090649876783 -5726358144768542273 -12498251711624252784 -13702827714901707594 -811008904 +15494005608834598990 +1407386597 8192198 
-8714520725396523830 -514392806 -9960543895307946415 -15287141235608259625 -5727354401416546168 +219257244681 +42598769 +811008904 +2573543610120276856 +5356297048398365877 +7595953279435999504 +5726226297114658480 +2723374776553770162 +1543385872365455415 +11535686880442518166 +15289397379726773461 +5565348488711963913 +504169174 +9870724567205432 +14212253575230457510 +5831598111619679502 +2625321602295990612 +572982104 +813826970 +279448324634 +538575636 +11010253 +68354499 +11243723090649876783 +18331491793766525 +15292781563660995825 +5991347884505304103 +9409295256684857617 +3759645248384009814 +5832726134240118664 +14312300901618944289 +20305615210743190 +13001845694847518363 +2652485274356286816 +6151097653090126690 +2203332276215481610 +18412545964574834746 1808894516123993997 -3686437022462641529 +518456056 +2359405 +1321272283 +71172585 +417019398489 +18895516000586505 +162923155 +9870724570023121 +13828334 +2625321864544389907 +816645035 +8453377129057749572 +11949535972653271176 +1873618467543321286 5249797181178709209 -2625321589399030850 -103088691 -3062219857732765097 -830399540494469985 -530117487457144076 -12454108019635062383 -197984938 -8930986418384079868 -818873277 -16056587 -11526999220155450649 -6160551 -63504826 -7621890105505615217 -11847668763332905754 -10377426660276898779 -1873618519132015281 -18092519415945890646 -15882855708139391266 -7993599274919922706 -2789363538679106064 -2150364451440035988 -9870724570416301 -2625321593697799226 -91161094 -1410073577 -23920969 -7513578521803359945 -22279798815198594 -15520597512816297356 -1023125932615797552 -540017436 -8910392170935354895 -195953314 -644809585 -14024943 -71369196 -1873618476141774348 -816841645 -10906583479868327250 -1454041666728626384 -4128904 -18413392005184749654 -108921430 -468609401971 -16204201012116260706 -99025451 -9870724568385196 -18412545943079354421 -11878630053446878902 +5567604589840172352 +3707523343842937215 +17088205463377873568 +2169005683868174908 +9568723490388248888 +6103488088376871190 +4025969582498383295 +62521771 +18276979644936029994 +154272366 +16646420 +544211744 +28766107292140894 +5177496 +509805280 +1873618519132801026 +1873618544926132491 +7676326001635166459 +7676326031729298383 +869984510486186619 +13146357072728951328 +2000487899013646903 +2449021711964768402 +6155298010574883251 +6098975770044401989 +3189961199463959445 +2676033351739376985 +7995587 +19464489 +547029825 +219257046468 +2021331689141374237 +15288269301218674108 +11705421198335413148 +2508194873 +2625321610894575340 +6097847713031849822 +16064731596255856452 +13701595356116683915 +6364097396127827248 +18413391987988365394 +16364556117061994922 +10296839827164892306 +5403008449516603011 +15858116883009440274 +5833854255738587405 +45220217 +194314911 +10813643 +68157888 +56689033 +114033243 +4287350266942457603 +987047180239768912 +813630359 +18411417886066737167 +18413109997380239438 +11548493110908749415 +6364097387529046615 +5561348123192067576 +5835546388547569431 +5246976935469649046 +13884327378110449525 18204249488608200784 -5566476545725367766 -17951898368652543383 -7558005371879033601 -16542141154387102177 -6316393479032998553 -11694336983993944146 -11427331956784106382 -4662073785906890031 -1873618454645640429 -537985804 -12999620585941961275 -2295119206548507606 -11993306 -1597536180772867045 -5299098844309358384 -8294669686619703163 -69337553 -1873618506235448739 -518193910 -5406444726343502428 -16765215479188031591 -5460499803636172954 -3431717683755289915 -28202117477106938 
-5249797172580910311 -5745384143842643344 -14065038233622153931 -14311172801615955497 -16758489844492275047 -5510538272098551989 -11065487220741573048 -9870724566353399 -5679882735784101879 -259130038 -87097857 -3491703471172619422 -545850164 -18271599167641487963 -5991347923196709309 -1873618458944406678 -7033448275620070919 -812778389 -434977997061097911 -3445982126355516078 -2676033351738852867 -3545799512027105927 -1873618484739311861 -12749251354825264418 -14836382508930370955 -2625321585100000596 -21997756618246082 -8716776809328151764 -15580874176502892132 -3332575624131774585 -4445946672738010859 -5780604328577598853 -2848264744227112681 -1873618441749072804 -257098416 -4930631980557601532 -6877319166685482198 -1005889956380019628 -820642761 -17826079 -23125779236849772 -810746758 -7930050 -8929320279979198383 -9654763076979264499 -11949535972653271176 -1873618514832984063 -514130660 -18066207382028748450 -2573543666009114673 -18613585580197092 -1427238547443354327 -2625321589398768544 -102826544 -5903884228619468800 -4279043148 -7036226112429884975 -818611132 -15794439 -3324580943442478547 -1903640920853056624 -5898403 -1873618497637649718 -1133620887485417426 -10156853965084755435 -63242678 -282723005 -13586095437453200186 -9082058141968173941 -1987794462939089941 -13237708531286474753 -5240852582657493474 -1915314009235720841 -9870724570154139 -90898949 -17090754651615726815 -492307151 -195691169 -11050161621988804687 -23658823 -11623400942792738969 -9304480456320748248 -71107048 -816579498 -23971751058934778 -17869638717220195611 -1873618476141513316 -361675971417279818 -61211034 -1873618501936418049 -3866756 -567411536 -5302201063430292982 -8486888319115725460 -12406930521299355297 -9870724568123690 -11034422950646711803 -4287350254045103750 -5566476545725106758 -1923875870 -547619651 -6366353527348595732 +70975974 +9870724569826462 +816448424 +4211213383 +2162794 +12974919760129952993 +105382480 +5459976661309982295 +21433723812579518 +32432320527074663 +1873618497637255436 +9305858029919208637 +10225919154718574351 8597156797828894009 -13590665243542948895 -13237708561380147208 -4254959725487523541 -2907303882175415846 -1873618454645376983 -9230753948926543533 -11731158 -527827717 -5511666307614640107 -1330643932 -69075405 -28202091681942395 -4727296740454696303 -1992881785902860007 -18301216972081072101 -4076606659425995504 -9870724566091296 +12461042340477994821 +1455946274504313841 +9538952396691934382 +927164962728314711 +5782296426993943791 +9714916684781063078 +16449809 +4980885 +819266496 +2625321589399030850 +10907429529076434052 +257295025 39387493 154075756 -5459976644113468289 -545588016 -12461042340477994821 -223556406340 -32432337723721245 -19595563 -2573543610120276856 -24535874149025753 -5196265237615086368 +62325160 +1495925747 +288043895627 +4504298205224635444 +14835085562484362568 +16881139122607162703 +1839046019115124804 +11923578915473263059 +9388513449772451585 +5247593352907982888 +5153885686374731086 +12020808312486431384 +14848239906707278405 +5405598728725530322 +3653991426073234491 +5566476498435442740 +4333982245204396969 +17007720368052373541 +14458654042895551171 +16885259953617962521 +2676033351739180486 +6877309693745106245 +21997713627284659 +7562235540534921217 +2625321610894378836 +5458848587099997499 +1647838213 +288046714075 +1454859013759438228 +1133620887485417426 +237175467 +810615685 +1418462186 +12162857194684744950 +88080898 +19267879 +7798976 +546833214 +6206321690771522709 +21433680821684597 +1873618480439692390 
+3932922014897081298 +2549492329236335496 +5249797112394286460 +12294570438877711433 +2324121364801391676 +3315661715940248009 +8971880411373045432 +5461104782808583112 +18411981918872141859 +15371922320578972378 +361675971417279818 +90898949 +13390152586296232130 +492307151 +13522668389390157414 +538182415 +10617033 +12498251711624252784 +22085946 +1987794462939089941 +425617786716 +1730937871 +5356297014005859746 +5569296739846327213 +16881139139804531782 +4196703391028741586 +1873618476141710425 +821147663836514852 +3158171969379764633 +30176223702288623 17735566651085687884 -6204347601746593065 -1873618484739049815 -812516243 -6152225714402428442 -15291935501556190620 -15505670362359531298 -451411772583 -9484411285755463284 -161940107 -15292499508566297469 -563348302 -506004186 -11238431078799509026 -18323667541285735009 -2625321610894640833 -103179363763488430 -503001580666 -12769025487284210679 -17785259844527786731 -29612147900877606 -15290243377345399572 -17563932 -7667902 -3186488476490139978 -810484612 -1192315333980326167 -1873618514832721746 -15292499491370961900 -513868514 -5347351719937377689 -45220217 -11775490430040476325 -12240192446106372977 -35324256 -2396555433535145871 -7409502855497715015 -7888341864134085054 -4278781002 -1732546121802517809 -2374936041605498895 -21433680820701635 -12189960762281954023 -869984510486186619 -3598203394278688718 -6103488079777762245 -72876542 -16990917635978692369 -818348984 -15532291 -1146796961722731823 -17761874897365304540 -62980530 -4534407021717882867 -5636255 -32714379920409891 -12552846396214610071 -6262673798361580735 -2528483177756102046 -9870724569894177 -9297735470756268616 -5831598115918776853 -32432303331018178 -6064762127302393958 -6156455943246842659 -23396678 -13500652 -16916327697533962956 -70844900 -816317351 -18411699885273055253 -5884848047378859255 -5837238405281154301 -14311736903207619026 -5141736951422061236 -3604608 -31022281504523376 -3599049409094225259 -577045344 -2974323816123992770 -8021450341214588326 -3577503648415550265 -509805280 -9870724567861628 -11098517635487303139 -7462549834646555859 -98501157 -5779476207078475458 -219257375260 -490013379 -4222974949961697922 +1427238547443354327 +10223260478367337870 +10720606758114626648 +70779363 +105185869 +162529937 +9870724569630759 +24904017 +2681814701524780811 +1320879066 +1584661506 +644219759 +13435115 +6097847786116483627 +12477949191893683608 +6925759835249836137 +27920040887322186 +10003084053115964048 +16253198 +153879145 +2625321589398833886 +257098416 +4784274 +9103100569952650951 +12474564753552836994 +1495729137 +62128549 +9774054990929462949 +5356296971014964874 +6153353870293665804 +9568883447315500158 +1915314009235720841 +16655465042802838677 +14866462842593414402 +2676033351738984017 +546636604 +535167753 +42008942 +30540122 +6365225483234117329 +7602365 +282854078 +2625321610894182276 +13307798926833551183 +10913926882465549337 +15906307047154976446 +6104586261131037638 +8483828720841721486 +15287423226215073909 +17785259896117529586 +2785415278947600352 +9000175594581527004 +14425661002709010016 +5513226652957347114 +805679481429165719 +17859691850682797212 +9181555677596944971 +1363739614 +9870724566615620 +537985804 +572392279 +15175534989820758889 +1873618476141513316 +2152780467316001469 +12601357272775920269 +16765215479188031591 +6534429686359852912 6366353553143235674 -3158171969379764633 -21365044 -27638058876667848 -29330140097217635 -1873618454645114642 -2703776923039566000 -68813257 -279448782049 -814285726 
-12237654319976351671 -517669620 -5779476284463187670 -10375505326587315831 -18411699915366727708 -6205475624366966000 -3307734082 -39125348 -1087507565178193378 -545325868 -15986098390340470919 -223556143025 -19177592590632702 -8865366478519731984 -19333416 -32432337723461001 -812254097 -11305519054433421356 -1873618484738787248 -5105416417023100899 -572982104 -505742040 -563086155 -104333894 -8070528080642443989 -11327137566841769230 -2625321610894378836 -16377260960560187819 -15586729198848181726 -1873618441748546884 -18413109971585663048 -4825924017323379312 -5915592292141435844 +12689613402799605860 +9138963602338286574 +104989258 +644023149 +361131345578 +816055205 +9870724569433729 +70582752 +1309213649 +17634738504986593825 +5639662680184522626 +6316393479032998553 +16340493341965880015 +5344573059048999857 +34124461934314600 +5994450030541998229 +2625321589398637514 +2676819007 +15515140772745448064 +498702419026 +227855238971 +4587663 +16893851890367073119 +14264208198271043974 +555090760 +818873277 +61931938 +16056587 +8821966780582857359 +18411699885273055253 +4861149623842704773 +18413391996586557524 +18115578910873816258 5832726151436896491 -17247780946628644032 +365262179507571896 +16896582888638318388 +4445946672738929841 +17186370630874106258 810222466 7405754 -11549275701007551889 -10161648502327149991 -570950482 -1873618514832459339 -313841222762 -4452458274095237609 -1445774942907271091 -6101795934071424788 -92406286 -5293539447540681024 -18331491793766525 -197198505 -11199980773228349986 -32432320526091507 -818086838 -1997667722089860216 -2524806027085153844 -1964966944 -15270143 -1370042529145686776 -5565348523104797810 -18331539082773742 -62718382 -2012415014 -18413110001679335503 -5374107 -14282027259104724924 -10375505339483621145 -9887461037680036022 -1873618544926132491 -4662355883991631380 -18412263939573940270 -157614716 -3295137431799204142 -9870724569630759 -491782859 -214958343888 -16875205763331852041 -7241607903360452069 -5408471212899110030 -23134531 -18411417877468545037 -27356081166681957 -644023149 -70582752 -816055205 -3342460 -5246976952665638015 -14212253575230457510 -576783198 -1842511416005692464 -806159226 -5566476498435574920 -15292217517958891614 -13516735047310051359 -5728764487730398405 -468608617008 -4025969582498383295 -16044698410490725659 -1519546451849645365 -9870724567599405 -5566476545724581156 -5619444426388998007 -98239009 -547095362 -27356033875641745 -219257112483 -8140646021471143544 -4713167439824750602 -16357059045845960667 -5462796881224795644 -9138963602338286574 -21102898 -10905173367761798655 -13701595356116683915 -2477484405147109478 -1880166538706292058 -11206864 -1283692271244348427 -68551110 -5885543833259674054 -18413673995792875610 -2352415791 -14947075702982868 -5299098870103476096 -681145240220994278 -163447447 -331038328206 -38863202 -96207382 -153551462 -2625321606595348609 -5461104757014004985 -10744889200825601240 -1988559907 -258343605 -6517011693716180143 -535167753 -2530175340657839273 -811991951 -15291935475760762248 -4397798264919820154 -18413674025886548065 -12109395139072755174 -475082778886408323 -104071746 -161415815 -8697110475982376165 -15584540329550678645 -13669583335851559254 -2625321610894116800 -1873618441748286746 -18412827963781152832 -819856323 -6209141854797957852 -1783548230307677653 -18411981901675757599 -637928298 -7143606 -15855332315905657597 -2625321864544389907 -12020808312486431384 -3076135121411313050 -10139438201185111279 -6152225744495577231 -33560368941368890 -210659313158 
-4278256712 -27638024483702949 -24904017 -32432320525830439 -13263754581809432790 -817824692 -15007995 -359800716494834349 -18613516794268696 -9839328478246341893 -62456234 -5111959 -18411981931769430054 -16219982623696489082 -6261827792145090364 -7692717626264324682 -42664306 -13806855580317125108 -9870724569368358 -16269555352897260337 -214958081659 -11214563466575480865 -15636771529559117046 -13271165719268362246 -2652485274356286816 -538968856 -3784724792312663401 -18263821886743185772 -1986666427421953426 -5565348480114297669 -5352348827359053328 -12976359 -1873618476140725820 -421319345246 -70320604 -11703165067112811597 -21715697223994697 -3757107087862401328 -60424594 -3080312 -10697899350700788395 -1873618527730534170 -468608354196 -509280991 -50528646 -1193603335023233930 -16635669954819197974 -15426482629288462533 -5460499803637156023 -2625321602296318353 -9870724567336570 -97976862 -8818864638845060491 -14288223544298637564 -88080898 -6996745855548787140 -5566476571519223063 -546833214 -220421203678071202 -31022238513759415 -1873618458945389823 -6406389097441592980 -20840752 -813761433 -27356085465188671 -68288962 -5865888353649363875 -109394696450803010 -12213481117926952067 -18413391987988365394 -10944716 -517145329 -5723537903358642458 -21715753112570631 -7758478083289188556 -10675690836223986039 -153289315 -95945236 -11547019543992076059 -9649086479758069023 -2625321606595086582 -258081459 -544801575 -5887799994573980828 -2845029447323880298 -18809125 -8510103668314541335 -6205475701751155414 -1990332636357069057 -429916882098 -2673382969485886910 -1873618489039064439 -18413392018082037849 -10914208898869168291 -3773122177597967623 -161153669 -103809598 -14107087915135404740 -6366071515245381876 -18412545955976642616 -15289397371128645360 -5462796868327967227 -1402930148 -28202057290482949 -797695489810761887 -16777494 -18116142943679220675 -5142301044413893172 -17219576355390295334 -5249797112394286460 -13735950183222348532 -6881458 -29048192479791616 -16896582888638318388 -14517406836956661503 -5458848655886518922 -313840698753 -5197393273133271298 -3861350810962691992 -6375653898722412075 -16885380374869314205 -361129707266 -210659050964 -29048123694646491 -3017170418691476659 -1873618450347593089 -15290243360149277503 -14745847 -72090103 -14546784569801180959 -7431889721301470079 -6364097387529111599 -2435475427475262665 -1873618497636600365 -6151097734773868363 -62194086 -17083693200934636558 -32150372909516328 -4849811 -3172873313800750756 -2150364429944620611 -3862478902367620470 -9305858029919208637 -2625321597997287853 -2508194873 -491258567 -1408762855 -5015996636573993090 -2414921941537785811 -538706709 -5734260728554980678 -22610237 -12714212 -70058456 -6208295882974168451 -32714336929384395 -16643035121679272213 -20023641798084435 -4770547828131824981 -2818164 -1930668198955452820 -13726068529822894439 -468608091255 -5569296714050766113 -17490170188584258190 -8694008299851745161 -7073102484926630551 -155058804 -97714714 -40370537 -2625321602296056238 -1703347206 -15895039144349470066 -5352348805862656188 -3068049059797011246 -5880738612678821404 -12309852946450942075 -33560429128451329 -15289397384024950845 -4767727591019973374 -10682570 -10233718743719545342 -850088361543927300 -2792183694107936667 -1107456968073808590 -5759560470823897206 -162923155 -29612216687004362 -5875369269012203157 -95683088 -294416195335096411 -22279760122415532 -5639662680184522626 -17619012653768771484 -13237708544183762948 -8550520059753138843 -27356042474686002 -249849483538007723 
[omitted: large diff hunk over a numeric data file; thousands of removed (-) and added (+) lines, each a single large integer value, with no recoverable structure beyond the raw values]
+14443179094226111164 +2192639020 +9870724569761068 +105316943 +25035091 +162661010 +518193910 +5303047078245827995 +1903640920853056624 +18092519415945890646 +4127600455366674792 +6474545510181176536 +7731877951544692100 +11084138473134491150 +2625321589398965240 +1495860210 +154010219 +16384272 +15043322265989680207 +6204347601746593065 +4915348 +62259623 +468608617008 +1966081057 +1192315299587689576 +17256155806064642777 +1873618489038408278 +12662636664722033563 +1654120425802828663 +25099894056749168 +5299098874402571932 +2676033351739114988 +489423554 +30671195 +5411521012994803182 +42140016 +7733439 +2625321610894313322 +7329667560521271617 +6206321690771457172 +5967447917778832244 +2284412389694637269 +2572415553107265488 +18412827963781152832 +16904712944498838074 +15289397349632182266 +29330122899915877 +27356081166681957 +6173800107753209956 +538116878 +10551496 +3919394969679695174 +9870724578216632 +492241614 +8816997369364548341 +4662355849599126556 +16567374854657149772 +12884708026702235763 +6364097417622914469 +1873618532029106835 +8861613698626554738 6890349946557761313 -1411918553413126104 -162267790 -2474797953316292924 -1694703987789596868 -18172096623373846790 -28766090095429261 -1223976979390989739 -3221822110943152678 -104923721 -15185362616787929146 -10003084053115964048 -2625321585100065781 -437798118096833445 -1815348248 -31304323701802109 -152371807 -14046027923586223423 -2021331689141374237 -20869691006257762 -13044533461223476582 -16778219695595128445 -12057002331826554305 -17465760298758178660 -7576852735584046364 -129168850403198609 -820708298 -17891616 -1873618489038145001 -7995587 -11911353550167017696 -4522983015860209939 -12612941966326959190 -102892081 -2625321589398833886 -45547899 -11548493110908749415 -4076606693818764590 -7851156332894489575 -12779163922391107832 -5991347884505304103 -1095239150174145285 -3863606920688567965 -10771469979967884371 -15859976 -14312864964518020808 -17245750799710423012 -5963940 -10655291933708585535 -4162099616697747321 -63308215 -1873618519131818153 -30176189305784773 +5837238474067478018 +5780604294184830225 +11214563466576463780 +29612216687004362 +5516046782590617836 +10156853965084755435 +6151097683183797493 +11613165301442872555 +1986666427421953426 +6155045882728942511 +7033448275620070919 +2907303882175415846 +1320813529 +1584595969 +105120332 +7465404271946632160 +70713826 +24838480 +162464400 +12451287838412704489 +816186278 +644154222 +3735453693364143828 +9870724569564298 +1309344723 +21715680028329254 +13044533461222491710 +1873618497636993704 +3445982126355516078 +7529998377695250462 +12237654319976351671 +4534407021717882867 +3431353251379022211 +494159375523777824 +1136798196293306910 +16426766960960540026 +819004351 +12356593998396393868 +16187661 +3307734082 +14273081993166850387 +4718737 +434977997061097911 +62063012 +2625321589398768544 +39125348 +30458248699315998 +17858552114457937219 +5903884228619468800 +16872385650894636566 +10504814416598927327 +12213481117926952067 +18413674008690163805 +14101026494875963 +4709060078846741586 +2676033351738918494 +9714916620294556051 +13237708535585570818 +810353539 +2625321610894116800 53412232 +434216307724 +7536828 +41943405 +6770804071406897080 +821822415 318140582948 -15611911946388048179 +6365225453139920066 +4502324038816629924 +4030203865610717075 +18411699906768535578 +15290807392954681807 +11966722661119888545 +8618954206934993224 +12189960762281954023 +32432333423379563 +18413392018082037849 +6004915412369541960 +14546784569801180959 
+745740842898098858 +15289397293744523027 +5299098870104394759 +9257009393629660721 +5900805793554762144 +6155045917120857525 +21823800 +1317798870 +537920267 +1730675726 +1535706104 +9870724566550039 +14648423506775771515 +10531295876509927029 +3973490993339565383 +14312864964518020808 +14824583848163281869 +16940553195690134509 +1873618476141446514 +5778348218852443426 +5758903550139959418 +27356016680241600 +13940760354079114484 +5645056620059298667 +347984565637089693 +815989668 +9870724569368358 +5887799994573980828 +162267790 +517800693 +70517215 +15925946803693423456 +2625321597997353452 +16572875051796793000 +575144796 +104923721 +13172970 +14426056237756189706 +5909364179964069981 +5459976691403654584 +4397798273518472097 +27920040887059601 +1873618527730926929 +1873618467542665344 +18613585580197092 +32714392818354350 +18613499598604650 +5780604289886653255 +3865299049198390675 +22279760122415532 +18412545930182066226 +50397573 +153616999 +2625321589398571980 +1736311827 +15991050 +14665059300281706 +4522126 +7792373120121047026 +30458248699119542 +13951205954302381098 +17785259844527786731 +6444189713816225654 +747829823970020707 +8698802578697685482 +14477731067643038195 +18412263939573940270 +14318336791419094928 +15291371425760087333 +12109395139072755174 +30277976 +99090988 +282591932 +546374457 +490103571663 +15580874172203795679 +810156929 +7340217 +638124907 +259654328 +18809125 +18056758355458722638 +5679882735784101879 +7563081637033018620 +8520914754064026558 +283748271730 +67502526 +9870724566353399 +7242736046355514492 +572130134 +514786024 +214958409445 +29048192479791616 +2625321576501808484 +5354604872597767596 +29048106498198701 +2575517759332551933 +6311975551774360856 +14036340911856223966 +32150286927595340 +17291573824845253535 +14926165161459649868 12640696470018459947 -30176223702288623 -9870724570219682 -33278412725750974 -1409876968 -28766150282773591 -1873618450346674286 +17498716255300421272 +3968978683605551949 +16377260960560187819 +19177532404009207 +2625321597997156982 +24445261 +5245848878456439955 +421319345246 +5510538272098551989 +70320604 +3249068390006196153 +5888081980883929307 +1836516380 +12976359 +236453760381 +2141513421469058406 +1873618497636600365 +11878630053446878902 +6156456003434055463 +27638058877519937 +18413109962987470918 +6288511205539515238 +4770547828131824981 +4160689491693538063 +14836382508930370955 +12751507524739009261 +10427987387505837891 +2605266760616185153 +2524806001290315567 +33560429128451329 +4325515 +669516658 +15794439 +807142269 +5303047104041388600 +818611132 +61669791 +12644080653952551280 +6045857707735386835 +11229983338076703492 +2845029447323880298 +18412827972379344962 +6767393152337644543 +2673382969485886910 +15185362616787929146 +17490170188584258190 +4047541379259827663 +15680489859993767209 +546177847 +7143606 +637928298 +7276444624641068235 +12287601267178473523 +31022238513759415 +17698252132056434004 +1732546160493595960 +7036226112429884975 +2676033644081056812 +548995910 +90243587 +571933524 +812778389 +9870724566156739 +214958212644 +1873618446046923526 +3493083035910933027 +15291935501556190620 +14650572868605052119 +6971710725545264615 +17302333254828493968 +6098975847429179176 +4504298213822565083 +505938649 +3579577413 +2786543383251193103 +70123993 +47186305 +2352415791 +4279174221 +2625321597996960522 +1538130937 +161874570 +17082847207615236134 +6206321707968234614 +8854886129749066875 +10908568553618343357 +2785415326238639918 +1873618527730534170 +1873618441748940565 
+5745384143842643344 +18413674017288355935 +16044698410491643447 +9181531069949872018 +10905173367761798655 +13237708544183762948 +3757107087862401328 +1311572948 +2034107431 +15597828 +2538734651 +5727354392818878727 +4128904 +818414521 +95879699 +5727354422913010657 +5245848874158263187 +9664889374910385451 +18411699915366727708 +14851060135220743194 +17958290734101235336 +9319686106503382840 +89657146100418951 +11349795265056081195 +14540810596246030644 +5779476284463187670 +18415907 +156041850 +259261111 +821232589 +809763710 +98697768 +6946995 +5941764153383128192 +17684252729367202593 +10233694917695638297 +970700105235760464 +21715753112570631 +17953636526298302297 +6262673798361580735 +5847102830955857465 +3313969578832561394 +2974323816123992770 +13271165719268362246 +17083693200934636558 +6101795934071424788 +16990917635978692369 +812581780 +16327183209838150989 +21233971 +1535116279 +214958016090 +2625321606595545096 +3232498753 +1500709877 +514392806 +5831598146013367591 +4502324004423927097 +3099205763721988894 15290243360148359553 -14036340911856223966 -6365225461738636619 -816645035 -417019398489 -6206321673575531611 -12057284352529139627 -71172585 -13828334 -7528870385169533979 -5832726134240118664 -2785415334835848520 -2572415553107265488 +1873618476140856959 +3295137431799204142 +14130457194541352666 +8910392170935354895 +3967850626592737364 +18412545938780258356 +12583138 +505742040 +4278977611 +540148509 +24052042 +196084388 +563086155 +104333894 +2625321597996763849 +16324853745603185849 +13586095437453200186 +15804734059994287439 +18005251247539029895 +13516735047310051359 +3493677603186412637 +10159956468397444373 +5249797099496672683 +17763448248357489818 +18412263948172132400 61276571 +7630443591734791098 3932293 -9870724568188981 +72745468 +95683088 +15401217 +4076606693818764590 +15986098390340470919 +1873618519131556994 +9386257309953099582 +8501910827968825512 +168849237244054062 +6750384 +545784627 +2625321585100000596 +1652810939277510396 +580191075 +98501157 +5198803303557629187 +3297856681506178941 +3935742187522887052 +2601013084734032090 +11500631658516907905 +8021450341214588326 +14977809576148535095 +4127600472563058730 +16965951797418331227 +27356081165698156 +491258567 +12804866717273491655 +1408762855 +2573543666009114673 +2200512120787569683 +2625321606595348609 +21037361 +14462121002204464918 +5619444426388998007 +3973491023432910866 +12103109825679658143 +7260902865540482639 +5566476571519223063 +18413109971585663048 +17791918762976347730 +16365628939578247566 +4449074137450482853 +11214563466575480865 +7239069803025663720 +17952462371364276975 +9512531412808567772 +11075097734987253589 +2373415502940997016 +16874702537456224943 +517014256 +2573543627316201844 +4278781002 +69730775 +9870724568582655 +12386527 +12743882002561631754 +10906583475570214623 +104137283 +35324256 +10167863869407233224 +18412827980977537092 +363084051790629688 +11694336983993944146 +1873618441748546884 +32432320525830439 +12654580528992553525 +7241043922144659849 +9391897706793274792 +152830562 +1402930148 +164299420 +5303047073946667464 +3735682 +61079961 +15204606 1873618549225491555 -2360543918673038210 -98828841 -12512221777814685432 -17939922315943150958 -6045857707735386835 -21692726 -4502324038816629924 -11490081257974859839 -17639632887023929831 -1316357237551401394 -6101795994259359091 -11796695 -69140942 -18411699889572151318 -12074216556992400767 -1320813529 -8618954206934993224 -164037275 -4160546838840674266 -12591757708863407913 -555549513 -9870724566156739 
-154141293 -32714414313178248 -545653553 -223556471268 -12613788024133322735 -812581780 -5778348150066318224 -1500709877 -6741138607599781046 -9227353569080969220 -515965674 -13884327378110449525 -18411699919665823773 -16340493341965880015 -162005644 -620757861 -21997756618049241 -17007720368052373541 -13001845694847518363 -227855238971 -17629469 -1737950228 -9288263741171697848 -20305615210743190 -1873618489037883086 -18613533990193666 -7733439 -313841551493 -15288551330518206781 -17302333254828493968 -6153071832396467338 +3188833116656765520 +31586327206235137 +820839372 +464309454125 +18022689 +545588016 +17205553309096938840 +313838798363 +223556406340 +98304546 +15463390673086056969 +4240022615453076686 +10831084194895235709 +11549275701007551889 +155648632 +6553773 +534119176 +4222974949961697922 +8326286517935867839 +1873618454645114642 +1146796961722731823 +5509410202188647833 +1873618514833377412 +3242943116712479419 +29330157293667421 +8882845388820581451 +12608147700378373379 +14465116522071263669 +5461104757014004985 +9649086479758069023 +2625321606595152102 +513999587 +20840752 +2148672322930150296 +10646954815923686447 +10831360821402142464 +313841615983 +10139438201185111279 +16881311723980129501 +18413674025886548065 +2785415274648570354 +5353476789791099071 2979056014524680527 -8857706336766199103 -2625321589398571980 -45285754 -5991347884505041337 -4502324004423927097 -16874702537456224943 -14911447610171655366 +6366071515245381876 +8610102806501591788 +10333839787251271664 +13237708552781955078 +451412690018 +16101055855214332856 +9870724568385196 +12189916 +23658823 +195691169 +5155859771100236117 +69534164 +35127645 +103940672 +11069796609748044689 13944990587222231178 -3308118261903721908 -18413109975884759113 -8412057600244518110 -15597828 -2538734651 -818414521 -17082847207615236134 -18276979644936029994 -5701792 -63046067 -5882159696614657105 -1410790466305853323 -18412263913779363880 -32714379920475611 -539325825270679628 -1873618519131556994 -13536993689470216 -9870724569957729 -43254135 -5153885686374731086 -9387385384162626351 -8336200085500660803 -5303047104041388600 -5512098595943810546 -5717788221838658971 -2324121364801391676 -12012735189037878155 -2192639020 -1873618476141316771 -70910437 -3670145 -2219404100148201532 -2544580112253650683 -61014424 -6155045921420412650 -18412263943873036335 -1873618549225229533 -9870724567926898 -98566694 +27920101074341046 +17298949057997047589 +2908260051937332390 +6364097413323754682 +12350988444867431112 +1223976979390989739 +5782296431293302176 +11098517635487303139 +13525196865559988902 +2374936041605498895 +15007995 +1574765567 +519635711 +5831598103022077418 +576979807 +817824692 +634323816 +3539071 +2446394423 +6206321673575531611 +2360543918673038210 +27638024484621167 +11340219378265033230 +6366071472254485645 +4562124351240801677 29894215892535509 -155910777 -6366353527348399255 -9956242218935388443 -31586340104504804 -219257441372 -13522668389390157414 -18411417881767641102 -11534547 -279448847671 -7242736046355514492 -68878794 -814351263 -1192315299587689576 -2524775482 -34124461934314600 -507839197 -5539270545646881104 -4974759074281293673 -5337229686545450161 -153879145 -12644080653952551280 -30458205707308380 +6153353844499089111 +13070886371126478108 +9181481831755875838 +18067928196024961188 +6981729909914862956 +63701435 +6357162 +15288269305517836796 +17299513133793348673 545391405 -17877509356004052233 +17826079 +820642761 +98107936 +8854886172739175692 +9082058141968173941 +1873618484739049815 
+11514789185312918199 +5778348197355914873 +11130039777759856047 +294416195335096411 +846140170598090257 +2571498445011814318 +18412545947378450486 +1408369638 +2625321606594955469 +5245848947242502849 +365428082633 +5245848917148372136 +10859426818132543221 +15524263781940136850 +2578187325 +17564225130023161250 +811991951 +1694703987789596868 +1873618450346936800 +12105446909435186010 +14975681650483333306 +32432303330887118 +29612220986426501 +11644189250161151139 17520266449292560845 -11065487246536017596 -2011949215506761725 -6155045882728942511 -812319634 -1130753852548581517 -573047641 -5299098874402571932 -18413674000091971675 -18331556280207363 -17269866578628118199 -15289397293744523027 -161743496 -10649664295314066054 -6051485356288903427 -4347925833116091776 -30458188511970924 -104399431 -10184384893691038634 -7401639761433855789 -1308623824 -563151692 -2625321610894444316 -7239069803025663720 -11434534198373320614 -1873618441748613384 -5622264654903379074 -29330122899915877 -15636380174699072146 -820184006 -2597848126 -10233694917695638297 -14585410861575638263 -7471291 -85348920764927349 -6366353492955694732 -18413674030185644130 -4127600472562141528 -35127645 -5780604337176709161 -541328159 -2524806001290315567 -13850612818404510827 -18412827968080248897 -15335680 -3493395603981665996 -17858552114457937219 -62783919 -3875793754648151904 -5564423899624572258 -292345154665 -3489447322753895731 -18411981905974853664 -5439644 -42991988 -9870724569695611 -12269921124804135698 -559088458 -33278386930321618 -15289397353931868100 -214958409445 -6219166245997316001 -15289397379726773461 -30458248699315998 -23200068 -12163381674616883890 -70648289 -9000175594581527004 -806224763 -89657146100418951 -15475002888547338265 -3407997 -60752278 -18411981936068526119 -14267039342724252928 +92275213 +335336768790 +69337553 +7290339324003420579 +17621268802185464283 +161088132 +9870724568188981 +516621038 +11993306 +507299956084 +210659444315 +103744061 +13151687854617134836 +8659114857360722535 +825323275339564903 +103179363763488430 +684134210602468610 +1873618501936418049 +6205475723246636047 +5516046752497929091 +15885957841278600403 +2477484405147109478 +16875205763331852041 +72155640 +472907842721 +14471968401314024391 +806159226 +1712194570 +576783198 +1815413785 +2446197814 +14811384 +507970270 +8929038315166239946 +3342460 +3220426554520570467 +2625321593698192308 +5677488692584514734 +21433663625497129 +2435475427475262665 +16940455997476965252 +6153071806602085789 +5865888353649363875 +17465760298758178660 +13263754581809432790 +8716776809328151764 +13112992413209136128 +6153353788611431303 +3784724792312663401 +12590629664748537952 +2676033356038342054 +14219872676477209184 +11327137566841769230 +63504826 +97911325 +9339868219275806468 13726068525522684375 -1873618527730862181 -4504298213822565083 -155648632 -98304546 -9870724567665640 -13681696359428851594 -219257178788 -24535844054893958 -50011031689890353 -10532987940533372886 -11272401 -23407795639356361 -68616647 -814089116 -15635925519041823968 -1998521381 -163512984 -797977540607610221 -32150286927595340 -4709060078846741586 -5967447917778832244 -5885976078596834724 -2625321606595414132 -153616999 -1744643526947965735 -17461812017531651650 -987047180239768912 -30740239306197230 -15288833278135765839 -525337347 -5885976155981547843 -18413391992287461459 -10532987970627045461 -56689033 -5722409915131627177 -114033243 -10159956468397444373 -18412545930182066226 -5349367342193968413 -13819010092172884 -104137283 
-17953636526298302297 -2224234517276395067 -2789363555875490728 -2625321610894182276 -12426051065400527122 -9355193091131312182 -30740222110861163 -14361095630442006439 -3137288237381257087 -17105177 -819921860 -7209143 -1727529996 -810025856 -805679481429165719 -17298949057997047589 -21997713627284659 -16120716880803858984 -33560368941433940 -1535706104 -10229733804179524009 -18412545960275738681 -9714916620294556051 -4078298775038527628 -5461104765611607541 -210659378559 -92209676 -13418544886826534789 -14264208172476401284 -1917322269 -197001895 -24969554 -5405598728725530322 -15073532 -817890229 -72417787 -1873618471842024407 -17091318705916150977 -5946696443085589628 -5177496 -5847102830955857465 -62521771 -1873618523431831649 +2011949215506761725 +1737950228 +6160551 +9830100417878166271 +155255415 +17629469 +8140646021471143544 +545194794 +8510103668314541335 +18411417868870352907 5835546371351184527 -14824583848163281869 -42729843 -9870724569433729 -5780604315680310424 -16385074671182940805 -214958147231 -3007753865419557454 -491586249 -17943317531893566468 -1801912319444323213 -22937920 -539034393 -27356055371580547 -1873618476140792146 -5198803303557629187 -6103488088376871190 -13041896 -1733362705 -70386141 -2306802734 -643826540 +18413109980183855178 +5249797172580910311 +10532987940533372886 +32714379920409891 +1873618514832984063 +13702827714901707594 +29330157293274228 +220421203678071202 +5565348467217401524 +313841222762 +570950482 +13012951802393594980 +6209141854797957852 +5717788221838658971 +5460499872422693597 +8444237263823374707 +2544580112253650683 +32432303330691092 +14986955239351847842 +4392112055939237960 +16285378285009240167 +6205475671656957491 +11266915032714840583 +15289397375426759758 +17284241873202253123 +1783548230307677653 +195297952 +69140942 +23265605 +11796695 +210659247559 +17257283845880874759 +451412296787 +92078603 +160891523 +539362075 +103547450 +9870724567992379 +11331649863678691999 +12613788024133322735 +13944415416121166662 +15895039144349470066 +8816997365064994109 +1732546121802517809 +13221120945827219803 +3863606942184311140 +12562453432512743836 +7562235583526800081 +9870724570810095 +71959029 +232154598652 +14614773 3145849 -14637903957824965363 519242494 -60490131 +2625321593697995819 +1133620930477295468 +817431474 805962615 -5784522635265967958 -1873618527730601376 -18301216972082383618 -11644189250161151139 -2625321602296383846 -9870724567402585 -98042399 -15741861301866530650 -494403323033 -6729754102968812754 -546898751 -6208295835683456476 -33560403333875446 -14409153078548760239 -15530271666638163275 -1873618458945456185 +4131548706499659810 +60490131 +503001777494 +6206321673575138470 +1258091056198584472 +3573803894998305775 +10967349376607587326 +1873618523431569790 +6153071806601889790 +12749251354825264418 +9625506809262378259 +2676033356038145381 +15635925519041823968 +5885976078596834724 +9484411285755463284 +532291916112267238 +18411981901675757599 +1703347206 +33560368941827284 +5303047039553965447 +40370537 +97714714 +155058804 +6261263733545503259 +5963940 +63308215 +1130753852548581517 +5570988833963444820 +18157949162008873831 +8021450371307931626 +2861086850442987769 +1873618489039455401 +18413674034484740195 +1873618458945324208 +32714349826081871 +18424247431471827427 +1842511416005692464 +6589396841525218018 +5782296448490276391 +13237708561380147208 +27356055371580547 +5462796868326918190 +1860700038481053299 +5458848587100981064 +3580814869236944221 +5566476545725106758 +28202091681875145 
+5915592292141435844 +11434534198373320614 +15740733274947783803 +10161648502327149991 +15287141235608259625 +12779163922391107832 +68944331 +814416800 +1823671323 +23068994 +210659050964 +46006654 +516227820 +11600084 +103350839 +361129707266 +13750803869111880047 +103179363763095696 +1873618501936022824 +2933734509745341832 +7230168968130792223 +14517406836956661503 +17619012718254098754 +12406930521299355297 +4408861808311732424 +2949238 +9870724570613070 +60293520 +503001580666 +14947075702982868 +1998521381 +2625321593697799226 +14418163 +163512984 +71762418 +5722409915131627177 +11599686562536949325 +1873618493337242815 16951650337051970851 -5144036663261072615 -813826970 -12133908888583014197 -68354499 -11010253 -279448324634 -14749580058850363919 -6633286351216577743 -2089265852158774334 -8929038315166239946 -31586271318836879 -13678484518713821516 -105906772 -96010773 -2625321606595152102 -153354852 -10831360821402142464 -5652457623480305518 -8503320935775669540 -16483453074211931840 -363084051790629688 -544867112 -258146996 -5944020284604679310 -5782296431293302176 -28484176870181368 -23407778443758207 -3973491023432910866 -5778348175860436286 -1873618514834032208 -5438906422044199526 -103875135 -7697026996393675938 -1709507593 -161219206 -13237708548482859013 -3701601059573925529 -879419277503368073 -3822179681402096264 +2676033356037948725 +18412545955976642616 5565348445721659362 -532291916112267238 -256115374 -1460339693 -13351948495571782591 -14665351642484132 -3008657884776564221 -2341393787733871788 -16904712944497920326 -3967850626592737364 -16843031 -4131548702199581670 -6946995 -809763710 -1928986057181235415 -11964228788262537512 -2989761681675848960 -1873618519132801026 -7276444624641068235 -5994450030542718433 -12284124821458521275 -111739480 -4076606646528706921 -13650504529854072320 -15804734059994287439 -14425661019905001872 -2395604016 -14465116522071263669 -210659116497 -15290243360149343057 -15777957523720635747 -10167863869407233224 -18331517588211470 -12884708026702235763 -14811384 -72155640 -7042731044489660311 -15288269305517836796 -5675796551176948530 -14264208198271043974 -1495860210 -5787083718919720300 -25099894056749168 -683965395648908415 -62259623 -4915348 -12974919760129952993 -6155045917120857525 -1873618523431569790 -9013091190501541709 -4392112055939237960 -2625321597997353452 -15897908900500866947 -6177363174264606048 -15872788267758849077 -491324104 -33560399034844286 -22675774 -17542946455516547053 -2431124533 -538772246 -27920040887322186 -8704274751914773568 -12085352355710699032 -6153353775713551670 -70123993 -27356081166223293 -7885152524183078888 -60227983 -2883701 -11700344903086704893 -7329667560521271617 -518980348 -5833854255738521265 -8618954206935976415 -3901910077209972079 -1713308683 -1992881785903908578 -4530582984922301900 -16130159995999161574 -155124341 -2625321602296121720 -1884114794138700522 -5778348218852443426 -97780251 -4240022615453076686 -6097847786116483627 -6361518319333476776 -30540122 -28484146776247610 -546636604 -5741055947585816645 -6100103891543657570 -8807886331112851129 -813564822 -10223260478367337870 -746324852 -15287423226215073909 -11226550812567014265 -1491796976 -8097653480026868144 -5995296157134227520 -1873618532029106835 -1539245050 -48300418 -331037869860 -95748625 -6314795724398267312 -5888081980883929307 -544604964 -34124418943289166 -5245848947242502849 -32432363517642192 -2676033356038407648 -811533196 -1317733333 -8920676095134336910 -17149817495305717193 -918014392040164136 -103612987 
diff --git a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
index a009802bab0..a8c5ae3b6a3 100644
--- a/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
+++ b/python/cudf/cudf/tests/groupby/test_ordering_pandas_compat.py
@@ -14,9 +14,10 @@ def with_nulls(request):
 @pytest.mark.parametrize("nrows", [30, 300, 300_000])
 @pytest.mark.parametrize("nkeys", [1, 2, 4])
 def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
+    rng = np.random.default_rng(seed=0)
     key_names = [f"key{key}" for key in range(nkeys)]
-    key_values = [np.random.randint(100, size=nrows) for _ in key_names]
-    value = np.random.randint(-100, 100, size=nrows)
+    key_values = [rng.integers(100, size=nrows) for _ in key_names]
+    value = rng.integers(-100, 100, size=nrows)
     df = cudf.DataFrame(dict(zip(key_names, key_values), value=value))
     if with_nulls:
         for key in key_names:
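The hunk above is the first of many in this changeset that replace the legacy global-state `np.random.*` calls with a locally seeded `np.random.default_rng` Generator. As a quick reference for the pattern (an illustrative sketch, not part of the patch), the legacy calls map onto Generator methods roughly as follows:

```python
import numpy as np

# Seeded Generator replaces global seeding via np.random.seed(0).
rng = np.random.default_rng(seed=0)

rng.integers(0, 100, size=10)    # was: np.random.randint(0, 100, 10)
rng.random(10)                   # was: np.random.random(10) / np.random.rand(10)
rng.standard_normal(10)          # was: np.random.randn(10)
rng.choice([True, False], 10)    # was: np.random.choice([True, False], 10)

arr = np.arange(10)
rng.shuffle(arr)                 # was: np.random.shuffle(arr); still shuffles in place
```

Unlike the global API, each Generator carries its own state, so a test that seeds its own `rng` is deterministic regardless of which other tests ran before it.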
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
deleted file mode 100644
index 496a322ff80..00000000000
--- a/python/cudf/cudf/tests/pytest.ini
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-
-[pytest]
-markers =
-    spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`
-xfail_strict = true
-filterwarnings =
-    error
-    ignore:::.*xdist.*
-    ignore:::.*pytest.*
-    # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
-    ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore
-    # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
-    ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
-    # PerformanceWarning from cupy warming up the JIT cache
-    ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning
-    # Ignore numba PEP 456 warning specific to arm machines
-    ignore:FNV hashing is not implemented in Numba.*:UserWarning
-addopts = --tb=native
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index cea86a5499e..691da224f44 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -266,3 +266,25 @@ def test_pandas_compatible_non_zoneinfo_raises(klass):
     with cudf.option_context("mode.pandas_compatible", True):
         with pytest.raises(NotImplementedError):
             cudf.from_pandas(pandas_obj)
+
+
+def test_astype_naive_to_aware_raises():
+    ser = cudf.Series([datetime.datetime(2020, 1, 1)])
+    with pytest.raises(TypeError):
+        ser.astype("datetime64[ns, UTC]")
+    with pytest.raises(TypeError):
+        ser.to_pandas().astype("datetime64[ns, UTC]")
+
+
+@pytest.mark.parametrize("unit", ["ns", "us"])
+def test_astype_aware_to_aware(unit):
+    ser = cudf.Series(
+        [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)]
+    )
+    result = ser.astype(f"datetime64[{unit}, US/Pacific]")
+    expected = ser.to_pandas().astype(f"datetime64[{unit}, US/Pacific]")
+    zoneinfo_type = pd.DatetimeTZDtype(
+        expected.dtype.unit, zoneinfo.ZoneInfo(str(expected.dtype.tz))
+    )
+    expected = ser.astype(zoneinfo_type)
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 979c936a182..af9a6c7e696 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -33,7 +33,7 @@ def __array_function__(self, *args, **kwargs):
 missing_arrfunc_reason = "NEP-18 support is not available in NumPy"

-np.random.seed(0)
+rng = np.random.default_rng(seed=0)

 @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason)
@@ -49,7 +49,7 @@ def __array_function__(self, *args, **kwargs):
     ],
 )
 def test_array_func_cudf_series(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_ser = cudf.Series(np_ar)
     expect = func(np_ar)
     got = func(cudf_ser)
@@ -74,7 +74,7 @@ def test_array_func_cudf_series(func):
     ],
 )
 def test_array_func_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     expect = func(pd_df)
     got = func(cudf_df)
@@ -91,7 +91,7 @@ def test_array_func_cudf_dataframe(func):
     ],
 )
 def test_array_func_missing_cudf_dataframe(func):
-    pd_df = pd.DataFrame(np.random.uniform(size=(100, 10)))
+    pd_df = pd.DataFrame(rng.uniform(size=(100, 10)))
     cudf_df = cudf.from_pandas(pd_df)
     with pytest.raises(TypeError):
         func(cudf_df)
@@ -105,7 +105,7 @@ def test_array_func_missing_cudf_dataframe(func):
     ],
 )
 def test_array_func_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     expect = func(np_ar)
     got = func(cudf_index)
@@ -125,7 +125,7 @@ def test_array_func_cudf_index(func):
     ],
 )
 def test_array_func_missing_cudf_index(func):
-    np_ar = np.random.random(100)
+    np_ar = rng.random(100)
     cudf_index = cudf.Index(cudf.Series(np_ar))
     with pytest.raises(TypeError):
         func(cudf_index)
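One observation on the `test_array_function.py` hunks above: the seeded Generator is created once at module scope, so the values any given test draws depend on how many tests in the module ran before it (the module is reproducible as a whole; individual tests are not). A per-test fixture, sketched below as a hypothetical alternative rather than what the patch does, would make each test independently reproducible:

```python
import numpy as np
import pytest


@pytest.fixture
def rng():
    # Fresh Generator per test: draws no longer depend on execution order.
    return np.random.default_rng(seed=0)


def test_example(rng):
    assert rng.random(3).shape == (3,)
```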
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 5acdf36de80..17ef033ea9e 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -600,12 +600,12 @@ def test_avro_reader_multiblock(
     else:
         assert dtype in ("float32", "float64")
         avro_type = "float" if dtype == "float32" else "double"
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # We don't use rand_dataframe() here, because it increases the
     # execution time of each test by a factor of 10 or more (it appears
     # to use a very costly approach to generating random data).
     # See also: https://github.com/rapidsai/cudf/issues/13128
-    values = np.random.rand(total_rows).astype(dtype)
+    values = rng.random(total_rows).astype(dtype)
     bytes_per_row = values.dtype.itemsize

     # The sync_interval is the number of bytes between sync blocks. We know
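A side note on the avro hunk: `rng.random(total_rows)` always draws `float64`, which is then cast with `.astype(dtype)`. `Generator.random` can also fill `float32` directly via its `dtype` argument, avoiding the temporary `float64` array, though the two approaches are not guaranteed to produce bit-identical values (illustrative sketch, not part of the patch):

```python
import numpy as np

rng = np.random.default_rng(seed=0)

# What the patch does: draw float64, then downcast.
a = rng.random(8).astype(np.float32)

# Direct float32 generation; skips the float64 temporary.
b = np.random.default_rng(seed=0).random(8, dtype=np.float32)
```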
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 2e8519509e2..949fa909b5b 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -2,7 +2,6 @@
 import decimal
 import operator
-import random
 import warnings
 from itertools import combinations_with_replacement, product
@@ -179,7 +178,13 @@
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("binop", _binops)
-def test_series_binop(binop, obj_class):
+def test_series_binop(request, binop, obj_class):
+    request.applymarker(
+        pytest.mark.xfail(
+            binop is operator.floordiv,
+            reason="https://github.com/rapidsai/cudf/issues/17073",
+        )
+    )
     nelem = 1000
     arr1 = utils.gen_rand("float64", nelem) * 10000
     # Keeping a low value because CUDA 'pow' has 2 full range error
@@ -187,13 +192,15 @@
     sr1 = Series(arr1)
     sr2 = Series(arr2)
+    psr1 = sr1.to_pandas()
+    psr2 = sr2.to_pandas()

     if obj_class == "Index":
         sr1 = Index(sr1)
         sr2 = Index(sr2)

+    expect = binop(psr1, psr2)
     result = binop(sr1, sr2)
-    expect = binop(pd.Series(arr1), pd.Series(arr2))

     if obj_class == "Index":
         result = Series(result)
@@ -204,7 +211,8 @@
 @pytest.mark.parametrize("binop", _binops)
 def test_series_binop_concurrent(binop):
     def func(index):
-        arr = np.random.random(100) * 10
+        rng = np.random.default_rng(seed=0)
+        arr = rng.random(100) * 10
         sr = Series(arr)

         result = binop(sr.astype("int32"), sr)
@@ -223,8 +231,9 @@ def func(index):
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
 @pytest.mark.parametrize("nelem,binop", list(product([1, 2, 100], _binops)))
 def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
-    arr = np.random.random(nelem)
-    rhs = random.choice(arr).item()
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(nelem)
+    rhs = rng.choice(arr).item()

     sr = Series(arr)
     if obj_class == "Index":
@@ -247,10 +256,11 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
     "lhs_dtype,rhs_dtype", list(product(_int_types, _int_types))
 )
 def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
-    arr1 = (np.random.random(100) * 100).astype(lhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = (rng.random(100) * 100).astype(lhs_dtype)
     sr1 = Series(arr1)

-    arr2 = (np.random.random(100) * 100).astype(rhs_dtype)
+    arr2 = (rng.random(100) * 100).astype(rhs_dtype)
     sr2 = Series(arr2)

     if obj_class == "Index":
@@ -271,8 +281,9 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
     "dtype", ["int8", "int32", "int64", "float32", "float64", "datetime64[ms]"]
 )
 def test_series_compare(cmpop, obj_class, dtype):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
-    arr2 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
+    arr2 = rng.integers(0, 100, 100).astype(dtype)

     sr1 = Series(arr1)
     sr2 = Series(arr2)
@@ -438,9 +449,10 @@ def test_str_series_compare_num_reflected(
 def test_series_compare_scalar(
     nelem, cmpop, obj_class, dtype, use_cudf_scalar
 ):
-    arr1 = np.random.randint(0, 100, 100).astype(dtype)
+    rng = np.random.default_rng(seed=0)
+    arr1 = rng.integers(0, 100, 100).astype(dtype)
     sr1 = Series(arr1)
-    rhs = random.choice(arr1).item()
+    rhs = rng.choice(arr1).item()

     if use_cudf_scalar:
         rhs = cudf.Scalar(rhs)
@@ -465,9 +477,9 @@ def test_series_compare_scalar(
 @pytest.mark.parametrize("nelem", [1, 7, 8, 9, 32, 64, 128])
 @pytest.mark.parametrize("lhs_nulls,rhs_nulls", list(product(_nulls, _nulls)))
 def test_validity_add(nelem, lhs_nulls, rhs_nulls):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     # LHS
-    lhs_data = np.random.random(nelem)
+    lhs_data = rng.random(nelem)
     if lhs_nulls == "some":
         lhs_mask = utils.random_bitmask(nelem)
         lhs_bitmask = utils.expand_bits_to_bytes(lhs_mask)[:nelem]
@@ -478,7 +490,7 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
     else:
         lhs = Series(lhs_data)
     # RHS
-    rhs_data = np.random.random(nelem)
+    rhs_data = rng.random(nelem)
     if rhs_nulls == "some":
         rhs_mask = utils.random_bitmask(nelem)
         rhs_bitmask = utils.expand_bits_to_bytes(rhs_mask)[:nelem]
@@ -525,8 +537,9 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
 )
 def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 10
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)

     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -550,8 +563,9 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
     nelem = 5
-    lhs = (np.random.random(nelem) * nelem).astype(lhs_dtype)
-    rhs = (np.random.random(nelem) * nelem).astype(rhs_dtype)
+    rng = np.random.default_rng(seed=0)
+    lhs = (rng.random(nelem) * nelem).astype(lhs_dtype)
+    rhs = (rng.random(nelem) * nelem).astype(rhs_dtype)

     sr1 = Series(lhs)
     sr2 = Series(rhs)
@@ -574,8 +588,7 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
 )
 def test_series_reflected_ops_scalar(func, dtype, obj_class):
     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)

     # gpu series
     gs = Series(random_series)
@@ -631,8 +644,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
     cpu_func, gpu_func = funcs

     # create random series
-    np.random.seed(12)
-    random_series = utils.gen_rand(dtype, 100, low=10)
+    random_series = utils.gen_rand(dtype, 100, low=10, seed=12)

     # gpu series
     gs = Series(random_series)
@@ -774,7 +786,8 @@ def test_df_different_index_shape(df2, binop):
 @pytest.mark.parametrize("op", [operator.eq, operator.ne])
 def test_boolean_scalar_binop(op):
-    psr = pd.Series(np.random.choice([True, False], 10))
+    rng = np.random.default_rng(seed=0)
+    psr = pd.Series(rng.choice([True, False], 10))
     gsr = cudf.from_pandas(psr)
     assert_eq(op(psr, True), op(gsr, True))
     assert_eq(op(psr, False), op(gsr, False))
@@ -923,16 +936,17 @@ def test_operator_func_dataframe(func, nulls, fill_value, other):
     num_cols = 3

     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase

-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -954,21 +968,21 @@ def gen_df():
 @pytest.mark.parametrize("nulls", _nulls)
 @pytest.mark.parametrize("other", ["df", "scalar"])
 def test_logical_operator_func_dataframe(func, nulls, other):
-    np.random.seed(0)
     num_rows = 100
     num_cols = 3

     def gen_df():
+        rng = np.random.default_rng(seed=0)
         pdf = pd.DataFrame()
         from string import ascii_lowercase

-        cols = np.random.choice(num_cols + 5, num_cols, replace=False)
+        cols = rng.choice(num_cols + 5, num_cols, replace=False)
         for i in range(num_cols):
             colname = ascii_lowercase[cols[i]]
             data = utils.gen_rand("float64", num_rows) * 10000
             if nulls == "some":
-                idx = np.random.choice(
+                idx = rng.choice(
                     num_rows, size=int(num_rows / 2), replace=False
                 )
                 data[idx] = np.nan
@@ -977,8 +991,12 @@ def gen_df():
     pdf1 = gen_df()
     pdf2 = gen_df() if other == "df" else 59.0

-    gdf1 = cudf.DataFrame.from_pandas(pdf1)
-    gdf2 = cudf.DataFrame.from_pandas(pdf2) if other == "df" else 59.0
+    gdf1 = cudf.DataFrame.from_pandas(pdf1, nan_as_null=False)
+    gdf2 = (
+        cudf.DataFrame.from_pandas(pdf2, nan_as_null=False)
+        if other == "df"
+        else 59.0
+    )

     got = getattr(gdf1, func)(gdf2)
     expect = getattr(pdf1, func)(pdf2)[list(got._data)]
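The last `test_binops.py` hunk switches `from_pandas` to `nan_as_null=False` so that the cudf frames keep floating-point NaN instead of converting it to a null, which keeps the logical-operator comparison against pandas apples-to-apples. A minimal sketch of the difference (my own example, assuming default, non-pandas-compatibility mode):

```python
import numpy as np
import pandas as pd

import cudf

pdf = pd.DataFrame({"a": [1.0, np.nan]})

# Default behavior: NaN becomes a cudf null.
assert cudf.DataFrame.from_pandas(pdf)["a"].null_count == 1

# nan_as_null=False: NaN stays a float NaN, matching pandas semantics.
assert cudf.DataFrame.from_pandas(pdf, nan_as_null=False)["a"].null_count == 0
```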
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index cd1ad21ae59..db41f689255 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -252,10 +252,10 @@ def test_cat_series_binop_error():
 @pytest.mark.parametrize("num_elements", [10, 100, 1000])
 def test_categorical_unique(num_elements):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=12)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
+            rng.choice(
                 list(string.ascii_letters + string.digits), num_elements
             ),
             dtype="category",
@@ -279,12 +279,10 @@ def test_categorical_unique(num_elements):
 @pytest.mark.parametrize("nelem", [20, 50, 100])
 def test_categorical_unique_count(nelem):
     # create categorical series
-    np.random.seed(12)
+    rng = np.random.default_rng(seed=0)
     pd_cat = pd.Categorical(
         pd.Series(
-            np.random.choice(
-                list(string.ascii_letters + string.digits), nelem
-            ),
+            rng.choice(list(string.ascii_letters + string.digits), nelem),
             dtype="category",
         )
     )
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 4aa7fb27c9b..65947efc2df 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -31,12 +31,13 @@
 @pytest.fixture(params=dtypes, ids=dtypes)
 def pandas_input(request):
     dtype = request.param
-    rng = np.random.default_rng()
+    rng = np.random.default_rng(seed=0)
     size = 100

     def random_ints(dtype, size):
         dtype_min = np.iinfo(dtype).min
         dtype_max = np.iinfo(dtype).max
+        rng = np.random.default_rng(seed=0)
         return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype)

     try:
@@ -154,7 +155,9 @@ def test_column_slicing(pandas_input, offset, size):
     [cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype],
 )
 def test_decimal_column_slicing(offset, size, precision, scale, decimal_type):
-    col = cudf.core.column.as_column(pd.Series(np.random.rand(1000)))
+    col = cudf.core.column.as_column(
+        pd.Series(np.random.default_rng(seed=0).random(1000))
+    )
     col = col.astype(decimal_type(precision, scale))
     column_slicing_test(col, offset, size, True)
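Worth noting in the `test_column.py` fixture above: `random_ints` now constructs its own seeded Generator on every call (shadowing the enclosing fixture's `rng`), so repeated calls return identical data. A small sketch of that behavior, under the same assumptions as the patch:

```python
import numpy as np


def random_ints(dtype, size):
    # Re-seeding on each call makes every call deterministic -- and identical.
    rng = np.random.default_rng(seed=0)
    dtype_min = np.iinfo(dtype).min
    dtype_max = np.iinfo(dtype).max
    return rng.integers(dtype_min, dtype_max, size=size, dtype=dtype)


assert (random_ints(np.int32, 5) == random_ints(np.int32, 5)).all()
```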
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 8da589ba45b..ab0f1767cd6 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -30,6 +30,7 @@ def _hide_concat_empty_dtype_warning():

 def make_frames(index=None, nulls="none"):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
             "x": range(10),
@@ -51,7 +52,7 @@ def make_frames(index=None, nulls="none"):
         df2.y = np.full_like(df2.y, np.nan)
     if nulls == "some":
         mask = np.arange(10)
-        np.random.shuffle(mask)
+        rng.shuffle(mask)
         mask = mask[:5]
         df.loc[mask, "y"] = np.nan
         df2.loc[mask, "y"] = np.nan
@@ -203,10 +204,9 @@ def test_concat_misordered_columns():
 @pytest.mark.parametrize("axis", [1, "columns"])
 def test_concat_columns(axis):
-    pdf1 = pd.DataFrame(np.random.randint(10, size=(5, 3)), columns=[1, 2, 3])
-    pdf2 = pd.DataFrame(
-        np.random.randint(10, size=(5, 4)), columns=[4, 5, 6, 7]
-    )
+    rng = np.random.default_rng(seed=0)
+    pdf1 = pd.DataFrame(rng.integers(10, size=(5, 3)), columns=[1, 2, 3])
+    pdf2 = pd.DataFrame(rng.integers(10, size=(5, 4)), columns=[4, 5, 6, 7])
     gdf1 = cudf.from_pandas(pdf1)
     gdf2 = cudf.from_pandas(pdf2)
@@ -1398,11 +1398,12 @@ def test_concat_single_object(ignore_index, typ):
     ],
 )
 def test_concat_decimal_dataframe(ltype, rtype):
+    rng = np.random.default_rng(seed=0)
     gdf1 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
+        {"id": rng.integers(0, 10, 3), "val": ["22.3", "59.5", "81.1"]}
     )
     gdf2 = cudf.DataFrame(
-        {"id": np.random.randint(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
+        {"id": rng.integers(0, 10, 3), "val": ["2.35", "5.59", "8.14"]}
     )

     gdf1["val"] = gdf1["val"].astype(ltype)
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 9b6f82ec705..f33cfe268a3 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -16,8 +16,9 @@
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
 def test_repeat(dtype):
-    arr = np.random.rand(10) * 10
-    repeats = np.random.randint(10, size=10)
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
+    repeats = rng.integers(10, size=10)
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)

@@ -25,18 +26,20 @@ def test_repeat(dtype):

 def test_repeat_index():
+    rng = np.random.default_rng(seed=0)
     arrays = [[1, 1, 2, 2], ["red", "blue", "red", "blue"]]
     psr = pd.MultiIndex.from_arrays(arrays, names=("number", "color"))
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)

     assert_eq(psr.repeat(repeats), gsr.repeat(repeats))


 def test_repeat_dataframe():
+    rng = np.random.default_rng(seed=0)
     psr = pd.DataFrame({"a": [1, 1, 2, 2]})
     gsr = cudf.from_pandas(psr)
-    repeats = np.random.randint(10, size=4)
+    repeats = rng.integers(10, size=4)

     # pd.DataFrame doesn't have repeat() so as a workaround, we are
     # comparing pd.Series.repeat() with cudf.DataFrame.repeat()['a']
@@ -45,7 +48,8 @@ def test_repeat_dataframe():

 @pytest.mark.parametrize("dtype", NUMERIC_TYPES)
 def test_repeat_scalar(dtype):
-    arr = np.random.rand(10) * 10
+    rng = np.random.default_rng(seed=0)
+    arr = rng.random(10) * 10
     repeats = 10
     psr = pd.Series(arr).astype(dtype)
     gsr = cudf.from_pandas(psr)
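Two small semantic points the `test_concat.py` and `test_copying.py` hunks above rely on (sketch for illustration only, not part of the patch): with a single bound, `Generator.integers` behaves like `np.random.randint`, treating the argument as an exclusive upper limit with an implicit low of 0, and `Generator.shuffle`, like its legacy counterpart, permutes in place and returns `None`:

```python
import numpy as np

rng = np.random.default_rng(seed=0)

x = rng.integers(10, size=4)          # values drawn from [0, 10)
assert ((0 <= x) & (x < 10)).all()

mask = np.arange(10)
rng.shuffle(mask)                     # in-place permutation, returns None
assert sorted(mask) == list(range(10))
```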
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b6efc8ebd88..8800275bf67 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1764,13 +1764,13 @@ def test_csv_writer_multiindex(tmpdir):
     pdf_df_fname = tmpdir.join("pdf_df_3.csv")
     gdf_df_fname = tmpdir.join("gdf_df_3.csv")

-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     gdf = cudf.DataFrame(
         {
-            "a": np.random.randint(0, 5, 20),
-            "b": np.random.randint(0, 5, 20),
+            "a": rng.integers(0, 5, 20),
+            "b": rng.integers(0, 5, 20),
             "c": range(20),
-            "d": np.random.random(20),
+            "d": rng.random(20),
         }
     )
     gdg = gdf.groupby(["a", "b"]).mean()
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6f88d942746..0f2b41888fa 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -428,7 +428,7 @@ def test_series_init_none():

 def test_dataframe_basic():
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)
     df = cudf.DataFrame()

     # Populate with cuda memory
@@ -437,7 +437,7 @@ def test_dataframe_basic():
     assert len(df) == 10

     # Populate with numpy array
-    rnd_vals = np.random.random(10)
+    rnd_vals = rng.random(10)
     df["vals"] = rnd_vals
     np.testing.assert_equal(df["vals"].to_numpy(), rnd_vals)
     assert len(df) == 10
@@ -1238,8 +1238,9 @@ def test_empty_dataframe_to_cupy():
     df = cudf.DataFrame()

     nelem = 123
+    rng = np.random.default_rng(seed=0)
     for k in "abc":
-        df[k] = np.random.random(nelem)
+        df[k] = rng.random(nelem)

     # Check all columns in empty dataframe.
     mat = df.head(0).to_cupy()
@@ -1250,8 +1251,9 @@ def test_dataframe_to_cupy():
     df = cudf.DataFrame()

     nelem = 123
+    rng = np.random.default_rng(seed=0)
     for k in "abcd":
-        df[k] = np.random.random(nelem)
+        df[k] = rng.random(nelem)

     # Check all columns
     mat = df.to_cupy()
@@ -1279,8 +1281,9 @@ def test_dataframe_to_cupy_null_values():
     na = -10000
     refvalues = {}
+    rng = np.random.default_rng(seed=0)
     for k in "abcd":
-        df[k] = data = np.random.random(nelem)
+        df[k] = data = rng.random(nelem)
         bitmask = utils.random_bitmask(nelem)
         df[k] = df[k]._column.set_mask(bitmask)
         boolmask = np.asarray(
@@ -1321,10 +1324,11 @@ def test_dataframe_append_empty():

 def test_dataframe_setitem_from_masked_object():
-    ary = np.random.randn(100)
+    rng = np.random.default_rng(seed=0)
+    ary = rng.standard_normal(100)
     mask = np.zeros(100, dtype=bool)
     mask[:20] = True
-    np.random.shuffle(mask)
+    rng.shuffle(mask)
     ary[mask] = np.nan

     test1_null = cudf.Series(ary, nan_as_null=True)
@@ -1534,14 +1538,12 @@ def test_dataframe_hash_values_xxhash64():
 @pytest.mark.parametrize("nparts", [1, 2, 8, 13])
 @pytest.mark.parametrize("nkeys", [1, 2])
 def test_dataframe_hash_partition(nrows, nparts, nkeys):
-    np.random.seed(123)
-    gdf = cudf.DataFrame()
-    keycols = []
-    for i in range(nkeys):
-        keyname = f"key{i}"
-        gdf[keyname] = np.random.randint(0, 7 - i, nrows)
-        keycols.append(keyname)
-    gdf["val1"] = np.random.randint(0, nrows * 2, nrows)
+    rng = np.random.default_rng(seed=0)
+    gdf = cudf.DataFrame(
+        {f"key{i}": rng.integers(0, 7 - i, nrows) for i in range(nkeys)}
+    )
+    keycols = gdf.columns.to_list()
+    gdf["val1"] = rng.integers(0, nrows * 2, nrows)

     got = gdf.partition_by_hash(keycols, nparts=nparts)

     # Must return a list
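The rewritten `test_dataframe_hash_partition` above builds its key columns with a dict comprehension and exercises `DataFrame.partition_by_hash`, which splits a frame into `nparts` pieces by hashing the key columns, so equal keys always land in the same piece. A small usage sketch with made-up values:

```python
import numpy as np

import cudf

rng = np.random.default_rng(seed=0)
gdf = cudf.DataFrame(
    {"key0": rng.integers(0, 7, 20), "val1": rng.integers(0, 40, 20)}
)

parts = gdf.partition_by_hash(["key0"], nparts=4)
assert len(parts) == 4                           # one DataFrame per partition
assert sum(len(p) for p in parts) == len(gdf)    # rows are partitioned, not dropped
```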
@@ -1751,8 +1753,9 @@ def test_concat_with_axis():
     assert_eq(concat_cdf_s, concat_s, check_index_type=True)

+    rng = np.random.default_rng(seed=0)
     # concat series and dataframes
-    s3 = pd.Series(np.random.random(5))
+    s3 = pd.Series(rng.random(5))
     cs3 = cudf.Series.from_pandas(s3)

     concat_cdf_all = cudf.concat([cdf1, cs3, cdf2], axis=1)
@@ -1787,13 +1790,14 @@ def test_concat_with_axis():
         check_index_type=True,
     )

+    rng = np.random.default_rng(seed=0)
     # concat groupby multi index
     gdf1 = cudf.DataFrame(
         {
-            "x": np.random.randint(0, 10, 10),
-            "y": np.random.randint(0, 10, 10),
-            "z": np.random.randint(0, 10, 10),
-            "v": np.random.randint(0, 10, 10),
+            "x": rng.integers(0, 10, 10),
+            "y": rng.integers(0, 10, 10),
+            "z": rng.integers(0, 10, 10),
+            "v": rng.integers(0, 10, 10),
         }
     )
     gdf2 = gdf1[5:]
@@ -1833,14 +1837,14 @@ def test_concat_with_axis():
 @pytest.mark.parametrize("nrows", [0, 3, 10, 100, 1000])
 def test_nonmatching_index_setitem(nrows):
-    np.random.seed(0)
+    rng = np.random.default_rng(seed=0)

     gdf = cudf.DataFrame()
-    gdf["a"] = np.random.randint(2147483647, size=nrows)
-    gdf["b"] = np.random.randint(2147483647, size=nrows)
+    gdf["a"] = rng.integers(2147483647, size=nrows)
+    gdf["b"] = rng.integers(2147483647, size=nrows)
     gdf = gdf.set_index("b")

-    test_values = np.random.randint(2147483647, size=nrows)
+    test_values = rng.integers(2147483647, size=nrows)
     gdf["c"] = test_values
     assert len(test_values) == len(gdf["c"])
     gdf_series = cudf.Series(test_values, index=gdf.index, name="c")
@@ -1974,10 +1978,11 @@ def test_index_in_dataframe_constructor():
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_from_arrow(nelem, data_type):
+    rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
         {
-            "a": np.random.randint(0, 1000, nelem).astype(data_type),
-            "b": np.random.randint(0, 1000, nelem).astype(data_type),
+            "a": rng.integers(0, 1000, nelem).astype(data_type),
+            "b": rng.integers(0, 1000, nelem).astype(data_type),
         }
     )
     padf = pa.Table.from_pandas(
"b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) padf = pa.Table.from_pandas( @@ -2012,10 +2017,11 @@ def test_from_arrow_chunked_categories(): @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000]) @pytest.mark.parametrize("data_type", dtypes) def test_to_arrow(nelem, data_type): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -2119,17 +2125,16 @@ def test_to_arrow_missing_categorical(): @pytest.mark.parametrize("data_type", dtypes) def test_from_scalar_typing(data_type): + rng = np.random.default_rng(seed=0) if data_type == "datetime64[ms]": scalar = ( - np.dtype("int64") - .type(np.random.randint(0, 5)) - .astype("datetime64[ms]") + np.dtype("int64").type(rng.integers(0, 5)).astype("datetime64[ms]") ) elif data_type.startswith("datetime64"): scalar = np.datetime64(datetime.date.today()).astype("datetime64[ms]") data_type = "datetime64[ms]" else: - scalar = np.dtype(data_type).type(np.random.randint(0, 5)) + scalar = np.dtype(data_type).type(rng.integers(0, 5)) gdf = cudf.DataFrame() gdf["a"] = [1, 2, 3, 4, 5] @@ -2140,7 +2145,8 @@ def test_from_scalar_typing(data_type): @pytest.mark.parametrize("data_type", NUMERIC_TYPES) def test_from_python_array(data_type): - np_arr = np.random.randint(0, 100, 10).astype(data_type) + rng = np.random.default_rng(seed=0) + np_arr = rng.integers(0, 100, 10).astype(data_type) data = memoryview(np_arr) data = arr.array(data.format, data) @@ -2220,7 +2226,7 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): # against pandas nullable types as they are the ones that closely # resemble `cudf` dtypes behavior. 
pdf = pd.DataFrame() - + rng = np.random.default_rng(seed=0) null_rep = np.nan if dtype in ["float32", "float64"] else None np_dtype = dtype dtype = np.dtype(dtype) @@ -2228,13 +2234,11 @@ def test_dataframe_transpose(nulls, num_cols, num_rows, dtype): for i in range(num_cols): colname = string.ascii_lowercase[i] data = pd.Series( - np.random.randint(0, 26, num_rows).astype(np_dtype), + rng.integers(0, 26, num_rows).astype(np_dtype), dtype=dtype, ) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) if len(idx): data[idx] = null_rep elif nulls == "all": @@ -2652,8 +2656,8 @@ def test_unaryops_df(pdf, unaryop, col_name, assign_col_name): def test_df_abs(pdf): - np.random.seed(0) - disturbance = pd.Series(np.random.rand(10)) + rng = np.random.default_rng(seed=0) + disturbance = pd.Series(rng.random(10)) pdf = pdf - 5 + disturbance d = pdf.apply(np.abs) g = cudf.from_pandas(pdf).abs() @@ -2706,8 +2710,9 @@ def test_iteritems(gdf): def test_quantile(q, numeric_only): ts = pd.date_range("2018-08-24", periods=5, freq="D") td = pd.to_timedelta(np.arange(5), unit="h") + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {"date": ts, "delta": td, "val": np.random.randn(len(ts))} + {"date": ts, "delta": td, "val": rng.standard_normal(len(ts))} ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -2729,9 +2734,10 @@ def test_quantile(q, numeric_only): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_decimal_quantile(q, interpolation, decimal_type): + rng = np.random.default_rng(seed=0) data = ["244.8", "32.24", "2.22", "98.14", "453.23", "5.45"] gdf = cudf.DataFrame( - {"id": np.random.randint(0, 10, size=len(data)), "val": data} + {"id": rng.integers(0, 10, size=len(data)), "val": data} ) gdf["id"] = gdf["id"].astype("float64") gdf["val"] = gdf["val"].astype(decimal_type(7, 2)) @@ -2843,9 +2849,9 @@ def test_cuda_array_interface(dtype): @pytest.mark.parametrize("nchunks", [1, 2, 5, 10]) @pytest.mark.parametrize("data_type", dtypes) def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): + rng = np.random.default_rng(seed=0) np_list_data = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array = pa.chunked_array(np_list_data) @@ -2855,8 +2861,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type): assert_eq(expect, got) np_list_data2 = [ - np.random.randint(0, 100, nelem).astype(data_type) - for i in range(nchunks) + rng.integers(0, 100, nelem).astype(data_type) for i in range(nchunks) ] pa_chunk_array2 = pa.chunked_array(np_list_data2) pa_table = pa.Table.from_arrays( @@ -2881,11 +2886,13 @@ def query_GPU_memory(note=""): cuda.current_context().deallocations.clear() nRows = int(1e8) nCols = 2 - dataNumpy = np.asfortranarray(np.random.rand(nRows, nCols)) + rng = np.random.default_rng(seed=0) + dataNumpy = np.asfortranarray(rng.random(size=(nRows, nCols))) colNames = ["col" + str(iCol) for iCol in range(nCols)] pandasDF = pd.DataFrame(data=dataNumpy, columns=colNames, dtype=np.float32) cudaDF = cudf.core.DataFrame.from_pandas(pandasDF) - boolmask = cudf.Series(np.random.randint(1, 2, len(cudaDF)).astype("bool")) + rng = np.random.default_rng(seed=0) + boolmask = cudf.Series(rng.integers(1, 2, len(cudaDF)).astype("bool")) memory_used = query_GPU_memory() cudaDF = cudaDF[boolmask] @@ -2903,7 +2910,8 @@ def
test_boolmask(pdf, gdf): - boolmask = np.random.randint(0, 2, len(pdf)) > 0 + rng = np.random.default_rng(seed=0) + boolmask = rng.integers(0, 2, len(pdf)) > 0 gdf = gdf[boolmask] pdf = pdf[boolmask] assert_eq(pdf, gdf) @@ -2922,12 +2930,11 @@ def test_boolmask(pdf, gdf): ], ) def test_dataframe_boolmask(mask_shape): - pdf = pd.DataFrame() - for col in "abc": - pdf[col] = np.random.randint(0, 10, 3) - pdf_mask = pd.DataFrame() - for col in mask_shape[1]: - pdf_mask[col] = np.random.randint(0, 2, mask_shape[0]) > 0 + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame({col: rng.integers(0, 10, 3) for col in "abc"}) + pdf_mask = pd.DataFrame( + {col: rng.integers(0, 2, mask_shape[0]) > 0 for col in mask_shape[1]} + ) gdf = cudf.DataFrame.from_pandas(pdf) gdf_mask = cudf.DataFrame.from_pandas(pdf_mask) gdf = gdf[gdf_mask] @@ -2992,7 +2999,8 @@ def test_arrow_handle_no_index_name(pdf, gdf): def test_pandas_non_contiguious(): - arr1 = np.random.sample([5000, 10]) + rng = np.random.default_rng(seed=0) + arr1 = rng.random(size=(5000, 10)) assert arr1.flags["C_CONTIGUOUS"] is True df = pd.DataFrame(arr1) for col in df.columns: @@ -3052,10 +3060,11 @@ def test_series_rename(): @pytest.mark.parametrize("data_type", dtypes) @pytest.mark.parametrize("nelem", [0, 100]) def test_head_tail(nelem, data_type): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 1000, nelem).astype(data_type), - "b": np.random.randint(0, 1000, nelem).astype(data_type), + "a": rng.integers(0, 1000, nelem).astype(data_type), + "b": rng.integers(0, 1000, nelem).astype(data_type), } ) gdf = cudf.from_pandas(pdf) @@ -3308,15 +3317,15 @@ def test_set_index_verify_integrity(data, index, verify_integrity): @pytest.mark.parametrize("drop", [True, False]) @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_multi(drop, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) a = np.arange(nelem) - np.random.shuffle(a) + rng.shuffle(a) df = pd.DataFrame( { "a": a, - "b": np.random.randint(0, 4, size=nelem), - "c": np.random.uniform(low=0, high=4, size=nelem), - "d": np.random.choice(["green", "black", "white"], nelem), + "b": rng.integers(0, 4, size=nelem), + "c": rng.uniform(low=0, high=4, size=nelem), + "d": rng.choice(["green", "black", "white"], nelem), } ) df["e"] = df["d"].astype("category") @@ -3894,13 +3903,13 @@ def test_select_dtype_datetime_with_frequency(): def test_dataframe_describe_exclude(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(exclude=["float"]) @@ -3910,13 +3919,13 @@ def test_dataframe_describe_exclude(): def test_dataframe_describe_include(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(include=["int"]) pdf_results = pdf.describe(include=["int"]) @@ -3925,12 +3934,12 @@ def test_dataframe_describe_include(): def test_dataframe_describe_default(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) 
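+    # NOTE: unlike np.random.seed, default_rng returns a local Generator and leaves NumPy's global RNG state untouched.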
data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe() pdf_results = pdf.describe() @@ -3939,14 +3948,14 @@ def test_dataframe_describe_default(): def test_series_describe_include_all(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) df["x"] = df.x.astype("int64") - df["y"] = np.random.normal(10, 1, data_length) - df["animal"] = np.random.choice(["dog", "cat", "bird"], data_length) + df["y"] = rng.normal(10, 1, data_length) + df["animal"] = rng.choice(["dog", "cat", "bird"], data_length) pdf = df.to_pandas() gdf_results = df.describe(include="all") @@ -3962,13 +3971,13 @@ def test_series_describe_include_all(): def test_dataframe_describe_percentiles(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) data_length = 10000 sample_percentiles = [0.0, 0.1, 0.33, 0.84, 0.4, 0.99] df = cudf.DataFrame() - df["x"] = np.random.normal(10, 1, data_length) - df["y"] = np.random.normal(10, 1, data_length) + df["x"] = rng.normal(10, 1, data_length) + df["y"] = rng.normal(10, 1, data_length) pdf = df.to_pandas() gdf_results = df.describe(percentiles=sample_percentiles) pdf_results = pdf.describe(percentiles=sample_percentiles) @@ -4098,10 +4107,11 @@ def test_ndim(): ], ) def test_dataframe_round(decimals): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "floats": np.arange(0.5, 10.5, 1), - "ints": np.random.normal(-100, 100, 10), + "ints": rng.normal(-100, 100, 10), "floats_with_na": np.array( [ 14.123, @@ -4117,9 +4127,9 @@ def test_dataframe_round(decimals): ] ), "floats_same": np.repeat([-0.6459412758761901], 10), - "bools": np.random.choice([True, None, False], 10), - "strings": np.random.choice(["abc", "xyz", None], 10), - "struct": np.random.choice([{"abc": 1}, {"xyz": 2}, None], 10), + "bools": rng.choice([True, None, False], 10), + "strings": rng.choice(["abc", "xyz", None], 10), + "struct": rng.choice([{"abc": 1}, {"xyz": 2}, None], 10), "list": [[1], [2], None, [4], [3]] * 2, } ) @@ -5811,10 +5821,11 @@ def test_memory_usage(deep, index, set_index): @pytest_xfail def test_memory_usage_string(): rows = int(100) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(["apple", "banana", "orange"], rows), + "B": rng.choice(["apple", "banana", "orange"], rows), } ) gdf = cudf.from_pandas(df) @@ -5837,10 +5848,11 @@ def test_memory_usage_string(): def test_memory_usage_cat(): rows = int(100) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice(["apple", "banana", "orange"], rows), + "B": rng.choice(["apple", "banana", "orange"], rows), } ) df["B"] = df.B.astype("category") @@ -5870,13 +5882,14 @@ def test_memory_usage_list(): def test_memory_usage_multi(rows): # We need to sample without replacement to guarantee that the size of the # levels are always the same. 
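# (rng.choice with replace=False draws each candidate at most once, so every level stays fully distinct.)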
+ rng = np.random.default_rng(seed=0) df = pd.DataFrame( { "A": np.arange(rows, dtype="int32"), - "B": np.random.choice( + "B": rng.choice( np.arange(rows, dtype="int64"), rows, replace=False ), - "C": np.random.choice( + "C": rng.choice( np.arange(rows, dtype="float64"), rows, replace=False ), } @@ -6698,8 +6711,16 @@ def test_dataframe_init_1d_list(data, columns): (cupy.array([11, 123, -2342, 232]), ["z"], [0, 1, 1, 0]), (cupy.array([11, 123, -2342, 232]), ["z"], [1, 2, 3, 4]), (cupy.array([11, 123, -2342, 232]), ["z"], ["a", "z", "d", "e"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), - (np.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), + ( + np.random.default_rng(seed=0).standard_normal(size=(2, 4)), + ["a", "b", "c", "d"], + ["a", "b"], + ), + ( + np.random.default_rng(seed=0).standard_normal(size=(2, 4)), + ["a", "b", "c", "d"], + [1, 0], + ), (cupy.random.randn(2, 4), ["a", "b", "c", "d"], ["a", "b"]), (cupy.random.randn(2, 4), ["a", "b", "c", "d"], [1, 0]), ], @@ -6873,8 +6894,9 @@ def test_dataframe_info_basic(): memory usage: 859.0+ bytes """ ) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.randn(10, 10), + rng.standard_normal(size=(10, 10)), index=["a", "2", "3", "4", "5", "6", "7", "8", "100", "1111"], ) cudf.from_pandas(df).info(buf=buffer, verbose=True) @@ -9374,8 +9396,8 @@ def test_dataframe_roundtrip_arrow_struct_dtype(gdf): def test_dataframe_setitem_cupy_array(): - np.random.seed(0) - pdf = pd.DataFrame(np.random.randn(10, 2)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.standard_normal(size=(10, 2))) gdf = cudf.from_pandas(pdf) gpu_array = cupy.array([True, False] * 5) @@ -10161,7 +10183,7 @@ def df_eval(request): } ) int_max = 10 - rng = cupy.random.default_rng(0) + rng = cupy.random.default_rng(seed=0) return cudf.DataFrame( { "a": rng.integers(N, size=int_max), @@ -10529,11 +10551,12 @@ def test_dataframe_init_length_error(data, index): def test_dataframe_binop_with_mixed_date_types(): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.rand(2, 2), + rng.random(size=(2, 2)), columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), ) - ser = pd.Series(np.random.rand(3), index=[0, 1, 2]) + ser = pd.Series(rng.random(size=3), index=[0, 1, 2]) gdf = cudf.from_pandas(df) gser = cudf.from_pandas(ser) expected = df - ser @@ -10542,9 +10565,10 @@ def test_dataframe_binop_with_mixed_date_types(): def test_dataframe_binop_with_mixed_string_types(): - df1 = pd.DataFrame(np.random.rand(3, 3), columns=pd.Index([0, 1, 2])) + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame(rng.random(size=(3, 3)), columns=pd.Index([0, 1, 2])) df2 = pd.DataFrame( - np.random.rand(6, 6), + rng.random(size=(6, 6)), columns=pd.Index([0, 1, 2, "VhDoHxRaqt", "X0NNHBIPfA", "5FbhPtS0D1"]), ) gdf1 = cudf.from_pandas(df1) @@ -10557,7 +10581,8 @@ def test_dataframe_binop_with_mixed_string_types(): def test_dataframe_binop_and_where(): - df = pd.DataFrame(np.random.rand(2, 2), columns=pd.Index([True, False])) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.random(size=(2, 2)), columns=pd.Index([True, False])) gdf = cudf.from_pandas(df) expected = df > 1 @@ -10572,12 +10597,13 @@ def test_dataframe_binop_and_where(): def test_dataframe_binop_with_datetime_index(): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - np.random.rand(2, 2), + rng.random(size=(2, 2)), columns=pd.Index(["2000-01-03", "2000-01-04"], dtype="datetime64[ns]"), ) ser = pd.Series( - np.random.rand(2), + rng.random(2), 
index=pd.Index( [ "2000-01-04", @@ -10615,8 +10641,8 @@ def test_dataframe_dict_like_with_columns(columns, index): def test_dataframe_init_columns_named_multiindex(): - np.random.seed(0) - data = np.random.randn(2, 2) + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) columns = cudf.MultiIndex.from_tuples( [("A", "one"), ("A", "two")], names=["y", "z"] ) @@ -10627,8 +10653,8 @@ def test_dataframe_init_columns_named_multiindex(): def test_dataframe_init_columns_named_index(): - np.random.seed(0) - data = np.random.randn(2, 2) + rng = np.random.default_rng(seed=0) + data = rng.standard_normal(size=(2, 2)) columns = pd.Index(["a", "b"], name="custom_name") gdf = cudf.DataFrame(data, columns=columns) pdf = pd.DataFrame(data, columns=columns) @@ -11146,3 +11172,12 @@ def test_from_pandas_preserve_column_dtype(): df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8")) result = cudf.DataFrame.from_pandas(df) pd.testing.assert_index_equal(result.columns, df.columns, exact=True) + + +def test_dataframe_init_column(): + s = cudf.Series([1, 2, 3]) + with pytest.raises(TypeError): + cudf.DataFrame(s._column) + expect = cudf.DataFrame({"a": s}) + actual = cudf.DataFrame._from_arrays(s._column, columns=["a"]) + assert_eq(expect, actual) diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py index 45bd31ef58e..3aedbf8365b 100644 --- a/python/cudf/cudf/tests/test_dataframe_copy.py +++ b/python/cudf/cudf/tests/test_dataframe_copy.py @@ -93,11 +93,15 @@ def test_dataframe_deep_copy_and_insert(copy_parameters): @pytest.mark.parametrize("ncols", [0, 1, 10]) @pytest.mark.parametrize("data_type", ALL_TYPES) def test_cudf_dataframe_copy(copy_fn, ncols, data_type): - pdf = pd.DataFrame() - for i in range(ncols): - pdf[chr(i + ord("a"))] = pd.Series( - np.random.randint(0, 1000, 20) - ).astype(data_type) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype( + data_type + ) + for i in range(ncols) + } + ) df = DataFrame.from_pandas(pdf) copy_df = copy_fn(df) assert_eq(df, copy_df) @@ -116,18 +120,20 @@ def test_cudf_dataframe_copy(copy_fn, ncols, data_type): @pytest.mark.parametrize("ncols", [0, 1, 10]) @pytest.mark.parametrize("data_type", ALL_TYPES) def test_cudf_dataframe_copy_then_insert(copy_fn, ncols, data_type): - pdf = pd.DataFrame() - for i in range(ncols): - pdf[chr(i + ord("a"))] = pd.Series( - np.random.randint(0, 1000, 20) - ).astype(data_type) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( + { + chr(i + ord("a")): pd.Series(rng.integers(0, 1000, 20)).astype( + data_type + ) + for i in range(ncols) + } + ) df = DataFrame.from_pandas(pdf) copy_df = copy_fn(df) copy_pdf = copy_fn(pdf) - copy_df["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype(data_type) - copy_pdf["aa"] = pd.Series(np.random.randint(0, 1000, 20)).astype( - data_type - ) + copy_df["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type) + copy_pdf["aa"] = pd.Series(rng.integers(0, 1000, 20)).astype(data_type) assert not copy_pdf.to_string().split() == pdf.to_string().split() assert not copy_df.to_string().split() == df.to_string().split() diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 4a2345fc009..b7403c12bcd 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -216,17 +216,21 @@ def test_setitem_datetime(): def test_sort_datetime(): - df = 
pd.DataFrame() - df["date"] = np.array( - [ - np.datetime64("2016-11-20"), - np.datetime64("2020-11-20"), - np.datetime64("2019-11-20"), - np.datetime64("1918-11-20"), - np.datetime64("2118-11-20"), - ] + rng = np.random.default_rng(seed=0) + df = pd.DataFrame( + { + "date": np.array( + [ + np.datetime64("2016-11-20"), + np.datetime64("2020-11-20"), + np.datetime64("2019-11-20"), + np.datetime64("1918-11-20"), + np.datetime64("2118-11-20"), + ] + ), + "vals": rng.random(5), + } ) - df["vals"] = np.random.sample(len(df["date"])) gdf = cudf.from_pandas(df) @@ -432,11 +436,12 @@ def test_datetime_to_arrow(dtype): ) @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_unique(data, nulls): + rng = np.random.default_rng(seed=0) psr = data.copy() if len(data) > 0: if nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -461,10 +466,11 @@ def test_datetime_unique(data, nulls): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_datetime_nunique(data, nulls): psr = data.copy() + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "some": - p = np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -2525,23 +2531,7 @@ def test_dti_asi8(): @pytest.mark.parametrize( "method, kwargs", - [ - ["mean", {}], - pytest.param( - "std", - {}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - pytest.param( - "std", - {"ddof": 0}, - marks=pytest.mark.xfail( - reason="https://github.com/rapidsai/cudf/issues/16444" - ), - ), - ], + [["mean", {}], ["std", {}], ["std", {"ddof": 0}]], ) def test_dti_reduction(method, kwargs): pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo") diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index ebcc35784ee..20c24bd7564 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -42,9 +42,10 @@ def data_1d(request): nelems = request.param[0] dtype = request.param[1] nulls = request.param[2] - a = np.random.randint(10, size=nelems).astype(dtype) + rng = np.random.default_rng(seed=0) + a = rng.integers(10, size=nelems).astype(dtype) if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) + idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False) a[idx] = np.nan return a @@ -55,9 +56,10 @@ def data_2d(request): nrows = request.param[1] dtype = request.param[2] nulls = request.param[3] - a = np.random.randint(10, size=(nrows, ncols)).astype(dtype) + rng = np.random.default_rng(seed=0) + a = rng.integers(10, size=(nrows, ncols)).astype(dtype) if nulls == "some" and a.size != 0 and np.issubdtype(dtype, np.floating): - idx = np.random.choice(a.size, size=int(a.size * 0.2), replace=False) + idx = rng.choice(a.size, size=int(a.size * 0.2), replace=False) a.ravel()[idx] = np.nan return np.ascontiguousarray(a) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py index 5b1ee0ffac6..eeac78dbebc 100644 --- a/python/cudf/cudf/tests/test_dropna.py +++ b/python/cudf/cudf/tests/test_dropna.py @@ -22,13 +22,13 @@ @pytest.mark.parametrize("inplace", [True, False]) def test_dropna_series(data, nulls, inplace): psr = pd.Series(data) - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = 
rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py index 0b4ed52ba96..67dd7a8388b 100644 --- a/python/cudf/cudf/tests/test_duplicates.py +++ b/python/cudf/cudf/tests/test_duplicates.py @@ -368,9 +368,13 @@ def test_dataframe_drop_duplicates_method(): def test_datetime_drop_duplicates(): - date_df = cudf.DataFrame() - date_df["date"] = pd.date_range("11/20/2018", periods=6, freq="D") - date_df["value"] = np.random.sample(len(date_df)) + rng = np.random.default_rng(seed=0) + date_df = cudf.DataFrame( + { + "date": pd.date_range("11/20/2018", periods=6, freq="D"), + "value": rng.random(6), + } + ) df = concat([date_df, date_df[:4]]) assert_eq(df[:-4], df.drop_duplicates()) @@ -585,7 +589,8 @@ def test_drop_duplicates_multi_index(): ] idx = pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["a", "b"]) - pdf = pd.DataFrame(np.random.randint(0, 2, (8, 4)), index=idx) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.integers(0, 2, (8, 4)), index=idx) gdf = cudf.DataFrame.from_pandas(pdf) expected = pdf.drop_duplicates() diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py index 47f9180dcb1..cfb4ae2c0f8 100644 --- a/python/cudf/cudf/tests/test_factorize.py +++ b/python/cudf/cudf/tests/test_factorize.py @@ -13,13 +13,16 @@ @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_series_obj(ncats, nelem): df = DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) uvals, labels = df["cats"].factorize() - np.testing.assert_array_equal(labels.to_numpy(), sorted(set(arr))) + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.to_numpy(), expected_values) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) @@ -31,14 +34,17 @@ def test_factorize_series_obj(ncats, nelem): @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)]) def test_factorize_index_obj(ncats, nelem): df = DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) # initialize data frame - df["cats"] = arr = np.random.randint(2, size=10, dtype=np.int32) + df["cats"] = arr = rng.integers(2, size=10, dtype=np.int32) df = df.set_index("cats") uvals, labels = df.index.factorize() - np.testing.assert_array_equal(labels.values.get(), sorted(set(arr))) + unique_values, indices = np.unique(arr, return_index=True) + expected_values = unique_values[np.argsort(indices)] + + np.testing.assert_array_equal(labels.values.get(), expected_values) assert isinstance(uvals, cp.ndarray) assert isinstance(labels, Index) diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py index 7e5523bb8c7..f93bd2c5d32 100644 --- a/python/cudf/cudf/tests/test_feather.py +++ b/python/cudf/cudf/tests/test_feather.py @@ -15,13 +15,14 @@ @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, 
nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) @@ -30,7 +31,7 @@ def pdf(request): test_pdf.index.name = "index" # Create non-numeric categorical data otherwise may get typecasted - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Feather can't handle indexes properly diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 14ba9894fd3..6b222841622 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -77,21 +77,21 @@ def make_frame( extra_vals=(), with_datetime=False, ): - np.random.seed(seed) + rng = np.random.default_rng(seed=seed) df = dataframe_class() - df["x"] = np.random.randint(0, 5, nelem) - df["y"] = np.random.randint(0, 3, nelem) + df["x"] = rng.integers(0, 5, nelem) + df["y"] = rng.integers(0, 3, nelem) for lvl in extra_levels: - df[lvl] = np.random.randint(0, 2, nelem) + df[lvl] = rng.integers(0, 2, nelem) - df["val"] = np.random.random(nelem) + df["val"] = rng.random(nelem) for val in extra_vals: - df[val] = np.random.random(nelem) + df[val] = rng.random(nelem) if with_datetime: - df["datetime"] = np.random.randint( + df["datetime"] = rng.integers( _now, _tomorrow, nelem, dtype=np.int64 ).astype("datetime64[ns]") @@ -266,9 +266,10 @@ def test_groupby_getitem_getattr(as_index): def test_groupby_cats(): - df = DataFrame() - df["cats"] = pd.Categorical(list("aabaacaab")) - df["vals"] = np.random.random(len(df)) + rng = np.random.default_rng(seed=0) + df = DataFrame( + {"cats": pd.Categorical(list("aabaacaab")), "vals": rng.random(9)} + ) cats = df["cats"].values_host vals = df["vals"].to_numpy() @@ -285,13 +286,16 @@ def test_groupby_cats(): def test_groupby_iterate_groups(): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) def assert_values_equal(arr): np.testing.assert_array_equal(arr[0], arr) @@ -307,13 +311,16 @@ def assert_values_equal(arr): reason="Fails in older versions of pandas", ) def test_groupby_apply(): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + "val2": rng.random(nelem), + } + ) expect_grpby = df.to_pandas().groupby( ["key1", "key2"], as_index=False, group_keys=False @@ -351,13 +358,16 @@ def f3(df, k, L, m): reason="Fails in older versions of pandas", ) def test_groupby_apply_args(func, args): - np.random.seed(0) - df = DataFrame() + rng = np.random.default_rng(seed=0) nelem = 20 - df["key1"] = np.random.randint(0, 3, nelem) - df["key2"] = np.random.randint(0, 2, nelem) - df["val1"] = np.random.random(nelem) - df["val2"] = np.random.random(nelem) + df = DataFrame( + { + "key1": rng.integers(0, 3, nelem), + "key2": rng.integers(0, 2, nelem), + "val1": rng.random(nelem), + 
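+            # val1/val2 are successive draws from the same seeded stream, so the whole frame is reproducible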
"val2": rng.random(nelem), + } + ) expect_grpby = df.to_pandas().groupby( ["key1", "key2"], as_index=False, group_keys=False @@ -369,7 +379,6 @@ def test_groupby_apply_args(func, args): def test_groupby_apply_grouped(): - np.random.seed(0) df = DataFrame() nelem = 20 df["key1"] = range(nelem) @@ -1010,6 +1019,7 @@ def test_groupby_2keys_agg(nelem, func): # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], ) def test_groupby_agg_decimal(num_groups, nelem_per_group, func): + rng = np.random.default_rng(seed=0) # The number of digits after the decimal to use. decimal_digits = 2 # The number of digits before the decimal to use. @@ -1026,8 +1036,8 @@ def test_groupby_agg_decimal(num_groups, nelem_per_group, func): # https://github.com/pandas-dev/pandas/issues/40685). However, if that is # ever enabled, then this issue will crop up again so we may as well have # it fixed now. - x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) - y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + x = np.unique((rng.random(nelem) * scale).round(decimal_digits)) + y = np.unique((rng.random(nelem) * scale).round(decimal_digits)) if x.size < y.size: total_elements = x.size @@ -1313,9 +1323,9 @@ def test_empty_groupby(func): def test_groupby_unsupported_columns(): - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( - pd.Series(np.random.choice(["a", "b", 1], 3), dtype="category") + pd.Series(rng.choice(["a", "b", 1], 3), dtype="category") ) pdf = pd.DataFrame( { @@ -1421,10 +1431,11 @@ def test_groupby_apply_basic_agg_single_column(): def test_groupby_multi_agg_single_groupby_series(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) gdf = cudf.from_pandas(pdf) @@ -1435,12 +1446,13 @@ def test_groupby_multi_agg_single_groupby_series(): def test_groupby_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 5, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "a": rng.integers(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) @@ -1450,6 +1462,7 @@ def test_groupby_multi_agg_multi_groupby(): def test_groupby_datetime_multi_agg_multi_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": pd.date_range( @@ -1457,9 +1470,9 @@ def test_groupby_datetime_multi_agg_multi_groupby(): datetime.datetime.now() + datetime.timedelta(9), freq="D", ), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "d": np.random.randint(0, 5, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "d": rng.integers(0, 5, 10), } ) gdf = cudf.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3f483219423..24d42d9eb4c 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2645,21 +2645,20 @@ def test_isin_multiindex(data, values, level, err): ) -range_data = [ - range(np.random.randint(0, 100)), - range(9, 12, 2), - range(20, 30), - range(100, 1000, 10), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), -] - - -@pytest.fixture(params=range_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + 
range(9, 12, 2), + range(20, 30), + range(100, 1000, 10), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + ] +) def rangeindex(request): """Create a cudf RangeIndex of different `nrows`""" - return RangeIndex(request.param) + return cudf.RangeIndex(request.param) @pytest.mark.parametrize( @@ -2830,21 +2829,20 @@ def test_rangeindex_append_return_rangeindex(): assert_eq(result, expected) -index_data = [ - range(np.random.randint(0, 100)), - range(0, 10, -2), - range(0, -10, 2), - range(0, -10, -2), - range(0, 1), - [1, 2, 3, 1, None, None], - [None, None, 3.2, 1, None, None], - [None, "a", "3.2", "z", None, None], - pd.Series(["a", "b", None], dtype="category"), - np.array([1, 2, 3, None], dtype="datetime64[s]"), -] - - -@pytest.fixture(params=index_data) +@pytest.fixture( + params=[ + range(np.random.default_rng(seed=0).integers(0, 100)), + range(0, 10, -2), + range(0, -10, 2), + range(0, -10, -2), + range(0, 1), + [1, 2, 3, 1, None, None], + [None, None, 3.2, 1, None, None], + [None, "a", "3.2", "z", None, None], + pd.Series(["a", "b", None], dtype="category"), + np.array([1, 2, 3, None], dtype="datetime64[s]"), + ] +) def index(request): """Create a cudf Index of different dtypes""" return cudf.Index(request.param) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 00ae99466bb..421bc0c298b 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -32,7 +32,8 @@ def pdf_gdf(): @pytest.fixture def pdf_gdf_multi(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdfIndex = pd.MultiIndex( [ ["a", "b", "c"], @@ -212,12 +213,17 @@ def test_dataframe_column_name_indexing(): df[1].to_numpy(), np.asarray(range(10), dtype=np.int32) ) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() nelem = 10 - pdf["key1"] = np.random.randint(0, 5, nelem) - pdf["key2"] = np.random.randint(0, 3, nelem) - pdf[1] = np.arange(1, 1 + nelem) - pdf[2] = np.random.random(nelem) + pdf = pd.DataFrame( + { + "key1": rng.integers(0, 5, nelem), + "key2": rng.integers(0, 3, nelem), + 1: np.arange(1, 1 + nelem), + 2: rng.random(nelem), + } + ) df = cudf.from_pandas(pdf) assert_eq(df[df.columns], df) @@ -239,16 +245,13 @@ def test_dataframe_column_name_indexing(): def test_dataframe_slicing(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 123 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) - df["c"] = hc = np.random.randint(low=0, high=100, size=size).astype( - np.int64 - ) - df["d"] = hd = np.random.random(size).astype(np.float64) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) + df["c"] = hc = rng.integers(low=0, high=100, size=size).astype(np.int64) + df["d"] = hd = rng.random(size).astype(np.float64) # Row slice first 10 first_10 = df[:10] @@ -287,12 +290,13 @@ def test_dataframe_slicing(): @pytest.mark.parametrize("scalar", [0, 20, 100]) def test_dataframe_loc(scalar, step): size = 123 + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(low=0, high=100, size=size), - "b": np.random.random(size).astype(np.float32), - "c": np.random.random(size).astype(np.float64), - "d": np.random.random(size).astype(np.float64), + "a": rng.integers(low=0, high=100, size=size), + "b": rng.random(size).astype(np.float32), + 
"c": rng.random(size).astype(np.float64), + "d": rng.random(size).astype(np.float64), } ) pdf.index.name = "index" @@ -392,12 +396,11 @@ def test_dataframe_loc_mask(mask, arg): def test_dataframe_loc_outbound(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame() size = 10 - df["a"] = ha = np.random.randint(low=0, high=100, size=size).astype( - np.int32 - ) - df["b"] = hb = np.random.random(size).astype(np.float32) + df["a"] = ha = rng.integers(low=0, high=100, size=size).astype(np.int32) + df["b"] = hb = rng.random(size).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -590,8 +593,8 @@ def test_dataframe_series_loc_multiindex(obj): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_series_iloc(nelem): # create random cudf.Series - np.random.seed(12) - ps = pd.Series(np.random.sample(nelem)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(nelem)) # gpu cudf.Series gs = cudf.Series(ps) @@ -625,12 +628,11 @@ def test_series_iloc(nelem): @pytest.mark.parametrize("nelem", [2, 5, 20, 100]) def test_dataframe_iloc(nelem): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -679,12 +681,11 @@ def test_dataframe_iloc(nelem): def test_dataframe_iloc_tuple(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -695,12 +696,11 @@ def test_dataframe_iloc_tuple(): def test_dataframe_iloc_index_error(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame() nelem = 123 - gdf["a"] = ha = np.random.randint(low=0, high=100, size=nelem).astype( - np.int32 - ) - gdf["b"] = hb = np.random.random(nelem).astype(np.float32) + gdf["a"] = ha = rng.integers(low=0, high=100, size=nelem).astype(np.int32) + gdf["b"] = hb = rng.random(nelem).astype(np.float32) pdf = pd.DataFrame() pdf["a"] = ha @@ -714,14 +714,16 @@ def test_dataframe_iloc_index_error(): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_dataframe_take(ntake): - np.random.seed(0) - df = cudf.DataFrame() - + rng = np.random.default_rng(seed=0) nelem = 123 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df = cudf.DataFrame( + { + "ii": rng.integers(0, 20, nelem), + "ff": rng.random(nelem), + } + ) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -733,7 +735,7 @@ def test_dataframe_take(ntake): @pytest.mark.parametrize("ntake", [1, 2, 8, 9]) def test_dataframe_take_with_multiindex(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( index=cudf.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], @@ -742,10 +744,10 @@ def test_dataframe_take_with_multiindex(ntake): ) nelem = 9 - df["ii"] = np.random.randint(0, 20, nelem) - df["ff"] = np.random.random(nelem) + df["ii"] = rng.integers(0, 20, nelem) + df["ff"] = 
rng.random(nelem) - take_indices = np.random.randint(0, len(df), ntake) + take_indices = rng.integers(0, len(df), ntake) actual = df.take(take_indices) expected = df.to_pandas().take(take_indices) @@ -755,13 +757,13 @@ def test_dataframe_take_with_multiindex(ntake): @pytest.mark.parametrize("ntake", [0, 1, 10, 123, 122, 200]) def test_series_take(ntake): - np.random.seed(0) + rng = np.random.default_rng(seed=0) nelem = 123 - psr = pd.Series(np.random.randint(0, 20, nelem)) + psr = pd.Series(rng.integers(0, 20, nelem)) gsr = cudf.Series(psr) - take_indices = np.random.randint(0, len(gsr), ntake) + take_indices = rng.integers(0, len(gsr), ntake) actual = gsr.take(take_indices) expected = psr.take(take_indices) @@ -841,14 +843,15 @@ def test_empty_boolean_mask(dtype): ) @pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) def test_series_apply_boolean_mask(data, mask, nulls): + rng = np.random.default_rng(seed=0) psr = pd.Series(data) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, 4) + p = rng.integers(0, 4) psr[p] = None elif nulls == "some": - p1, p2 = np.random.randint(0, 4, (2,)) + p1, p2 = rng.integers(0, 4, (2,)) psr[p1] = None psr[p2] = None elif nulls == "all": @@ -1810,13 +1813,14 @@ def test_boolean_mask_columns_iloc_series(): @pytest.mark.parametrize("index_type", ["single", "slice"]) def test_loc_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") ) end = pd.Timestamp(datetime.strptime("2021-03-12 11:00", "%Y-%m-%d %H:%M")) timestamps = pd.date_range(start, end, periods=12) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=timestamps, columns=["value"]) cdf = cudf.from_pandas(df) if index_type == "single": @@ -1851,6 +1855,7 @@ def test_loc_timestamp_issue_8585(index_type): ], ) def test_loc_multiindex_timestamp_issue_8585(index_type): + rng = np.random.default_rng(seed=0) # https://github.com/rapidsai/cudf/issues/8585 start = pd.Timestamp( datetime.strptime("2021-03-12 00:00", "%Y-%m-%d %H:%M") @@ -1861,7 +1866,7 @@ def test_loc_multiindex_timestamp_issue_8585(index_type): index = pd.MultiIndex.from_product( [timestamps, labels], names=["timestamp", "label"] ) - value = np.random.normal(size=12) + value = rng.normal(size=12) df = pd.DataFrame(value, index=index, columns=["value"]) cdf = cudf.from_pandas(df) start = pd.Timestamp( diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py index b1ce69e58ef..f6941ce7fae 100644 --- a/python/cudf/cudf/tests/test_joining.py +++ b/python/cudf/cudf/tests/test_joining.py @@ -22,7 +22,7 @@ def make_params(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) hows = _JOIN_TYPES @@ -39,14 +39,14 @@ def make_params(): yield (aa, bb, how) # Test large random integer inputs - aa = np.random.randint(0, 50, 100) - bb = np.random.randint(0, 50, 100) + aa = rng.integers(0, 50, 100) + bb = rng.integers(0, 50, 100) for how in hows: yield (aa, bb, how) # Test floating point inputs - aa = np.random.random(50) - bb = np.random.random(50) + aa = rng.random(50) + bb = rng.random(50) for how in hows: yield (aa, bb, how) @@ -162,9 +162,9 @@ def _check_series(expect, got): reason="bug in older version of pandas", ) def test_dataframe_join_suffix(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) - df = cudf.DataFrame(np.random.randint(0, 5, (5, 3)), columns=list("abc")) + df = 
cudf.DataFrame(rng.integers(0, 5, (5, 3)), columns=list("abc")) left = df.set_index("a") right = df.set_index("c") @@ -281,19 +281,19 @@ def test_dataframe_join_mismatch_cats(how): @pytest.mark.parametrize("on", ["key1", ["key1", "key2"], None]) def test_dataframe_merge_on(on): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) # Make pandas DF @@ -347,19 +347,19 @@ def test_dataframe_merge_on(on): def test_dataframe_merge_on_unknown_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key1"] = np.random.randint(0, 30, nelem) - df_right["key2"] = np.random.randint(0, 50, nelem) + df_right["key1"] = rng.integers(0, 30, nelem) + df_right["key2"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(KeyError) as raises: @@ -368,19 +368,19 @@ def test_dataframe_merge_on_unknown_column(): def test_dataframe_merge_no_common_column(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Make cuDF df_left = cudf.DataFrame() nelem = 500 - df_left["key1"] = np.random.randint(0, 40, nelem) - df_left["key2"] = np.random.randint(0, 50, nelem) + df_left["key1"] = rng.integers(0, 40, nelem) + df_left["key2"] = rng.integers(0, 50, nelem) df_left["left_val"] = np.arange(nelem) df_right = cudf.DataFrame() nelem = 500 - df_right["key3"] = np.random.randint(0, 30, nelem) - df_right["key4"] = np.random.randint(0, 50, nelem) + df_right["key3"] = rng.integers(0, 30, nelem) + df_right["key4"] = rng.integers(0, 50, nelem) df_right["right_val"] = np.arange(nelem) with pytest.raises(ValueError) as raises: @@ -460,14 +460,14 @@ def test_dataframe_merge_order(): @pytest.mark.parametrize("rows", [1, 5, 100]) @pytest.mark.parametrize("how", ["left", "inner", "outer"]) def test_dataframe_pairs_of_triples(pairs, max, rows, how): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = np.random.randint(0, max, rows) + pdf_left[left_column] = rng.integers(0, max, rows) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, max, rows) + pdf_right[right_column] = rng.integers(0, max, rows) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) if not set(pdf_left.columns).intersection(pdf_right.columns): @@ -504,15 +504,15 @@ def test_dataframe_pairs_of_triples(pairs, max, rows, how): def test_safe_merging_with_left_empty(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) pairs = ("bcd", "b") pdf_left = pd.DataFrame() pdf_right = pd.DataFrame() for left_column in pairs[0]: - pdf_left[left_column] = 
np.random.randint(0, 10, 0) + pdf_left[left_column] = rng.integers(0, 10, 0) for right_column in pairs[1]: - pdf_right[right_column] = np.random.randint(0, 10, 5) + pdf_right[right_column] = rng.integers(0, 10, 5) gdf_left = cudf.from_pandas(pdf_left) gdf_right = cudf.from_pandas(pdf_right) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index c81c2d1d94b..47976fc4bac 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -32,13 +32,14 @@ def make_numeric_dataframe(nrows, dtype): @pytest.fixture(params=[0, 1, 10, 100]) def pdf(request): + rng = np.random.default_rng(seed=0) types = NUMERIC_TYPES + DATETIME_TYPES + ["bool"] nrows = request.param # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types } ) diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py index 790e84559a9..a34c89f55d3 100644 --- a/python/cudf/cudf/tests/test_monotonic.py +++ b/python/cudf/cudf/tests/test_monotonic.py @@ -164,7 +164,8 @@ def test_series(testlist): def test_multiindex(): - pdf = pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(7, 5))) pdf.index = pd.MultiIndex( [ ["a", "b", "c"], diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index c41be3e4428..ad0e0858c43 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -153,7 +153,8 @@ def test_multiindex_swaplevel(): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.from_pandas(pdf) stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -176,7 +177,8 @@ def test_string_index(): def test_multiindex_row_shape(): - pdf = pd.DataFrame(np.random.rand(0, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(0, 5))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -193,7 +195,8 @@ def test_multiindex_row_shape(): @pytest.fixture def pdf(): - return pd.DataFrame(np.random.rand(7, 5)) + rng = np.random.default_rng(seed=0) + return pd.DataFrame(rng.random(size=(7, 5))) @pytest.fixture @@ -271,7 +274,8 @@ def test_from_pandas_series(): def test_series_multiindex(pdfIndex): - ps = pd.Series(np.random.rand(7)) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.random(7)) gs = cudf.from_pandas(ps) ps.index = pdfIndex gs.index = cudf.from_pandas(pdfIndex) @@ -439,7 +443,8 @@ def test_multiindex_loc_rows_1_1_key(pdf, gdf, pdfIndex): def test_multiindex_column_shape(): - pdf = pd.DataFrame(np.random.rand(5, 0)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 0))) gdf = cudf.from_pandas(pdf) pdfIndex = pd.MultiIndex([["a", "b", "c"]], [[0]]) pdfIndex.names = ["alpha"] @@ -522,9 +527,13 @@ def test_multiindex_from_product(arrays): def test_multiindex_index_and_columns(): - gdf = cudf.DataFrame() - gdf["x"] = np.random.randint(0, 5, 5) - gdf["y"] = np.random.randint(0, 5, 5) + rng = np.random.default_rng(seed=0) + gdf = cudf.DataFrame( + { + "x": rng.integers(0, 5, 5), + "y": rng.integers(0, 5, 5), + } + ) pdf = gdf.to_pandas() mi = cudf.MultiIndex( levels=[[0, 1, 2], [3, 4]], @@ -542,11 
+551,12 @@ def test_multiindex_index_and_columns(): def test_multiindex_multiple_groupby(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -566,11 +576,12 @@ def test_multiindex_multiple_groupby(): ], ) def test_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=1000), - "y": np.random.randint(0, 10, size=1000), - "z": np.random.normal(size=1000), + "x": rng.integers(0, 5, size=1000), + "y": rng.integers(0, 10, size=1000), + "z": rng.normal(size=1000), } ) gdf = cudf.DataFrame.from_pandas(pdf) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 1dd732c7191..41c1c3ccb20 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -681,7 +681,6 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): from pyarrow import orc - np.random.seed(0) supported_stat_types = supported_numpy_dtypes + ["str"] # Writing bool columns to multiple row groups is disabled # until #6763 is fixed @@ -704,6 +703,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): has_nulls=True, low=0, high=max_char_length, + seed=0, ) for dtype in supported_stat_types } @@ -845,7 +845,6 @@ def test_orc_reader_gmt_timestamps(datadir): def test_orc_bool_encode_fail(): - np.random.seed(0) buffer = BytesIO() # Generate a boolean column longer than a single row group @@ -927,7 +926,6 @@ def test_empty_string_columns(data): [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], ) def test_orc_writer_decimal(tmpdir, scale, decimal_type): - np.random.seed(0) fname = tmpdir / "decimal.orc" expected = cudf.DataFrame({"dec_val": gen_rand_series("i", 100)}) @@ -988,7 +986,7 @@ def test_orc_string_stream_offset_issue(): def generate_list_struct_buff(size=100_000): rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -999,12 +997,12 @@ def generate_list_struct_buff(size=100_000): [ [ [ - rd.choice([None, np.random.randint(1, 3)]) - for _ in range(np.random.randint(1, 3)) + rd.choice([None, rng.integers(1, 3)]) + for _ in range(rng.integers(1, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ] - for _ in range(np.random.randint(0, 3)) + for _ in range(rng.integers(0, 3)) ], ] ) @@ -1012,8 +1010,8 @@ def generate_list_struct_buff(size=100_000): ] lvl1_list = [ [ - rd.choice([None, np.random.randint(0, 3)]) - for _ in range(np.random.randint(1, 4)) + rd.choice([None, rng.integers(0, 3)]) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1021,7 +1019,7 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": np.random.randint(0, 3), "b": np.random.randint(0, 3)}, + {"a": rng.integers(0, 3), "b": rng.integers(0, 3)}, ] ) for _ in range(size) @@ -1030,11 +1028,11 @@ def generate_list_struct_buff(size=100_000): rd.choice( [ None, - {"a": rd.choice([None, np.random.randint(0, 3)])}, + {"a": rd.choice([None, rng.integers(0, 3)])}, { "lvl1_struct": { - "c": rd.choice([None, np.random.randint(0, 3)]), - "d": np.random.randint(0, 3), + "c": rd.choice([None, rng.integers(0, 3)]), + "d": rng.integers(0, 3), }, }, ] @@ -1044,7 +1042,7 @@ def generate_list_struct_buff(size=100_000): list_nests_struct = [ [ {"a": 
rd.choice(lvl1_struct), "b": rd.choice(lvl1_struct)} - for _ in range(np.random.randint(1, 4)) + for _ in range(rng.integers(1, 4)) ] for _ in range(size) ] @@ -1135,7 +1133,7 @@ def gen_map_buff(size): from pyarrow import orc rd = random.Random(1) - np.random.seed(seed=1) + rng = np.random.default_rng(seed=1) buff = BytesIO() @@ -1146,7 +1144,7 @@ def gen_map_buff(size): None, { rd.choice(al): rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] @@ -1167,7 +1165,7 @@ def gen_map_buff(size): None, [ rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ) for _ in range(5) ], @@ -1194,10 +1192,10 @@ def gen_map_buff(size): None, { "a": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), "b": rd.choice( - [None, np.random.randint(1, 1500)] + [None, rng.integers(1, 1500)] ), }, ] diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py index ad78621c5fa..b474bbe9bd8 100644 --- a/python/cudf/cudf/tests/test_pack.py +++ b/python/cudf/cudf/tests/test_pack.py @@ -24,11 +24,11 @@ def test_sizeof_packed_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) packed = pack(df) nbytes = hkeys.nbytes + hvals.nbytes @@ -67,46 +67,46 @@ def assert_packed_frame_equality(df): def test_packed_dataframe_equality_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_equality(df) def test_packed_dataframe_equality_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) def test_packed_dataframe_equality_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_equality(df) @@ -135,46 +135,46 @@ def assert_packed_frame_unique_pointers(df): def test_packed_dataframe_unique_pointers_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = 
np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) def test_packed_dataframe_unique_pointers_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_unique_pointers(df) @@ -208,46 +208,46 @@ def assert_packed_frame_picklable(df): def test_pickle_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) def test_pickle_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_pickled_equality(df) @@ -273,45 +273,45 @@ def assert_packed_frame_serializable(df): def test_serialize_packed_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_list(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10))) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) def test_serialize_packed_dataframe_struct(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = Series( list({"0": i, "1": i + 1, "2": i + 2} for i in range(10)) ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_packed_serialized_equality(df) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 7f1b0b1cd46..c9ce24d2a5b 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -53,6 +53,7 @@ def datadir(datadir): @pytest.fixture(params=[1, 5, 10, 100000]) def simple_pdf(request): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -72,7 +73,7 @@ def simple_pdf(request): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + 
f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -92,6 +93,7 @@ def simple_gdf(simple_pdf): def build_pdf(num_columns, day_resolution_timestamps): + rng = np.random.default_rng(seed=0) types = [ "bool", "int8", @@ -114,7 +116,7 @@ def build_pdf(num_columns, day_resolution_timestamps): # Create a pandas dataframe with random data of mixed types test_pdf = pd.DataFrame( { - f"col_{typ}": np.random.randint(0, nrows, nrows).astype(typ) + f"col_{typ}": rng.integers(0, nrows, nrows).astype(typ) for typ in types }, # Need to ensure that this index is not a RangeIndex to get the @@ -142,7 +144,7 @@ def build_pdf(num_columns, day_resolution_timestamps): }, ]: data = [ - np.random.randint(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) + rng.integers(0, (0x7FFFFFFFFFFFFFFF / t["nsDivisor"])) for i in range(nrows) ] if day_resolution_timestamps: @@ -152,11 +154,11 @@ def build_pdf(num_columns, day_resolution_timestamps): ) # Create non-numeric categorical data otherwise parquet may typecast it - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_category"] = pd.Series(data, dtype="category") # Create non-numeric str data - data = [ascii_letters[np.random.randint(0, 52)] for i in range(nrows)] + data = [ascii_letters[rng.integers(0, 52)] for i in range(nrows)] test_pdf["col_str"] = pd.Series(data, dtype="str") return test_pdf @@ -453,7 +455,9 @@ def test_parquet_read_filtered(tmpdir, rdg_seed): dg.ColumnParameters( 40, 0.2, - lambda: np.random.default_rng().integers(0, 100, size=40), + lambda: np.random.default_rng(seed=None).integers( + 0, 100, size=40 + ), True, ), ], @@ -1909,6 +1913,7 @@ def test_parquet_writer_dictionary_setting(use_dict, max_dict_size): @pytest.mark.parametrize("filename", ["myfile.parquet", None]) @pytest.mark.parametrize("cols", [["b"], ["c", "b"]]) def test_parquet_partitioned(tmpdir_factory, cols, filename): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected gdf_dir = str(tmpdir_factory.mktemp("gdf_dir")) @@ -1917,8 +1922,8 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=cols) @@ -1954,6 +1959,7 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename): @pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}]) def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): + rng = np.random.default_rng(seed=0) # Checks that write_to_dataset is wrapping to_parquet # as expected pdf_dir = str(tmpdir_factory.mktemp("pdf_dir")) @@ -1961,8 +1967,8 @@ def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs): pdf = pd.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"]) @@ -2127,6 +2133,7 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory): @pytest.mark.parametrize("cols", [None, ["b"]]) 
@pytest.mark.parametrize("store_schema", [True, False]) def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): + rng = np.random.default_rng(seed=0) dir1 = tmpdir_factory.mktemp("dir1") dir2 = tmpdir_factory.mktemp("dir2") if cols is None: @@ -2139,7 +2146,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema): gdf = cudf.DataFrame( { "a": np.arange(0, stop=size), - "b": np.random.choice(np.arange(4), size=size), + "b": rng.choice(np.arange(4), size=size), } ) gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema) @@ -3214,11 +3221,12 @@ def test_parquet_nested_struct_list(): def test_parquet_writer_zstd(): size = 12345 + rng = np.random.default_rng(seed=0) expected = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="float64"), - "b": np.random.choice(list("abcd"), size=size), - "c": np.random.choice(np.arange(4), size=size), + "b": rng.choice(list("abcd"), size=size), + "c": rng.choice(np.arange(4), size=size), } ) diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py index 0f13a9e173a..2f10a5dfd74 100644 --- a/python/cudf/cudf/tests/test_pickling.py +++ b/python/cudf/cudf/tests/test_pickling.py @@ -40,33 +40,33 @@ def assert_frame_picklable(df): def test_pickle_dataframe_numeric(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 10 df["keys"] = np.arange(nelem, dtype=np.float64) - df["vals"] = np.random.random(nelem) + df["vals"] = rng.random(nelem) check_serialization(df) def test_pickle_dataframe_categorical(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() df["keys"] = pd.Categorical( ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"] ) - df["vals"] = np.random.random(len(df)) + df["vals"] = rng.random(len(df)) check_serialization(df) def test_memory_usage_dataframe(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() nelem = 1000 df["keys"] = hkeys = np.arange(nelem, dtype=np.float64) - df["vals"] = hvals = np.random.random(nelem) + df["vals"] = hvals = rng.random(nelem) nbytes = hkeys.nbytes + hvals.nbytes sizeof = df.memory_usage().sum() @@ -98,11 +98,11 @@ def test_pickle_buffer(): @pytest.mark.parametrize("named", [True, False]) def test_pickle_series(named): - np.random.seed(0) + rng = np.random.default_rng(seed=0) if named: - ser = Series(np.random.random(10), name="a") + ser = Series(rng.random(10), name="a") else: - ser = Series(np.random.random(10)) + ser = Series(rng.random(10)) pickled = pickle.dumps(ser) out = pickle.loads(pickled) diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py index b12209fd3b9..7685d09203e 100644 --- a/python/cudf/cudf/tests/test_query.py +++ b/python/cudf/cudf/tests/test_query.py @@ -45,10 +45,10 @@ def test_query(data, fn, nulls): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame() pdf["a"] = np.arange(nelem) - pdf["b"] = np.random.random(nelem) * nelem + pdf["b"] = rng.random(nelem) * nelem if nulls: pdf.loc[::2, "a"] = None gdf = cudf.from_pandas(pdf) @@ -71,10 +71,10 @@ def test_query_ref_env(data, fn): # prepare nelem, seed = data expect_fn, query_expr = fn - np.random.seed(seed) + rng = np.random.default_rng(seed=0) df = DataFrame() df["a"] = aa = np.arange(nelem) - df["b"] = bb = np.random.random(nelem) * nelem + df["b"] = bb = rng.random(nelem) * nelem c = 2.3 d = 1.2 # udt @@ -121,9 +121,9 @@ def test_query_local_dict(): def 
test_query_splitted_combine(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = DataFrame.from_pandas(df) diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py index 4c1d8ce92ae..1d9c6690f14 100644 --- a/python/cudf/cudf/tests/test_rank.py +++ b/python/cudf/cudf/tests/test_rank.py @@ -125,32 +125,28 @@ def test_rank_error_arguments(pdf): ) -sort_group_args = [ - np.full((3,), np.nan), - 100 * np.random.random(10), - np.full((3,), np.inf), - np.full((3,), -np.inf), -] -sort_dtype_args = [np.int32, np.int64, np.float32, np.float64] - - @pytest.mark.filterwarnings("ignore:invalid value encountered in cast") @pytest.mark.parametrize( "elem,dtype", list( product( - combinations_with_replacement(sort_group_args, 4), - sort_dtype_args, + combinations_with_replacement( + [ + np.full((3,), np.nan), + 100 * np.random.default_rng(seed=0).random(10), + np.full((3,), np.inf), + np.full((3,), -np.inf), + ], + 4, + ), + [np.int32, np.int64, np.float32, np.float64], ) ), ) def test_series_rank_combinations(elem, dtype): - np.random.seed(0) aa = np.fromiter(chain.from_iterable(elem), np.float64).astype(dtype) - gdf = DataFrame() - df = pd.DataFrame() - gdf["a"] = aa - df["a"] = aa + gdf = DataFrame({"a": aa}) + df = pd.DataFrame({"a": aa}) ranked_gs = gdf["a"].rank(method="first") ranked_ps = df["a"].rank(method="first") # Check diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index f276f394cd0..e0bc8f32c9b 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -62,8 +62,7 @@ def test_sum_string(): ) @pytest.mark.parametrize("nelem", params_sizes) def test_sum_decimal(dtype, nelem): - np.random.seed(0) - data = [str(x) for x in gen_rand("int64", nelem) / 100] + data = [str(x) for x in gen_rand("int64", nelem, seed=0) / 100] expected = pd.Series([Decimal(x) for x in data]).sum() got = cudf.Series(data).astype(dtype).sum() @@ -73,15 +72,13 @@ def test_sum_decimal(dtype, nelem): @pytest.mark.parametrize("dtype,nelem", params) def test_product(dtype, nelem): - np.random.seed(0) + rng = np.random.default_rng(seed=0) dtype = cudf.dtype(dtype).type if cudf.dtype(dtype).kind in {"u", "i"}: data = np.ones(nelem, dtype=dtype) # Set at most 30 items to [0..2) to keep the value within 2^32 for _ in range(30): - data[np.random.randint(low=0, high=nelem, size=1)] = ( - np.random.uniform() * 2 - ) + data[rng.integers(low=0, high=nelem, size=1)] = rng.uniform() * 2 else: data = gen_rand(dtype, nelem) @@ -104,7 +101,6 @@ def test_product(dtype, nelem): ], ) def test_product_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).product() @@ -153,7 +149,6 @@ def test_sum_of_squares(dtype, nelem): ], ) def test_sum_of_squares_decimal(dtype): - np.random.seed(0) data = [str(x) for x in gen_rand("int8", 3) / 10] expected = pd.Series([Decimal(x) for x in data]).pow(2).sum() @@ -186,7 +181,6 @@ def test_min(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def test_min_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).min() @@ -219,7 +213,6 @@ def test_max(dtype, nelem): ) @pytest.mark.parametrize("nelem", params_sizes) def 
test_max_decimal(dtype, nelem): - np.random.seed(0) data = [str(x) for x in gen_rand("int64", nelem) / 100] expected = pd.Series([Decimal(x) for x in data]).max() @@ -256,7 +249,8 @@ def test_sum_boolean(): def test_date_minmax(): - np_data = np.random.normal(size=10**3) + rng = np.random.default_rng(seed=0) + np_data = rng.normal(size=10**3) gdf_data = Series(np_data) np_casted = np_data.astype("datetime64[ms]") diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index 95e19fae501..bf0c97adb00 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -25,9 +25,10 @@ @pytest.mark.parametrize("dtype", repr_categories) @pytest.mark.parametrize("nrows", [0, 5, 10]) def test_null_series(nrows, dtype): + rng = np.random.default_rng(seed=0) size = 5 - sr = cudf.Series(np.random.randint(1, 9, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(1, 9, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None if dtype != "category" and cudf.dtype(dtype).kind in {"u", "i"}: ps = pd.Series( sr._column.data_array_view(mode="read").copy_to_host(), @@ -60,11 +61,12 @@ def test_null_series(nrows, dtype): @pytest.mark.parametrize("ncols", [1, 2, 3, 4, 5, 10]) def test_null_dataframe(ncols): + rng = np.random.default_rng(seed=0) size = 20 gdf = cudf.DataFrame() for idx, dtype in enumerate(dtype_categories): - sr = cudf.Series(np.random.randint(0, 128, size)).astype(dtype) - sr[np.random.choice([False, True], size=size)] = None + sr = cudf.Series(rng.integers(0, 128, size)).astype(dtype) + sr[rng.choice([False, True], size=size)] = None gdf[dtype] = sr pdf = gdf.to_pandas() pd.options.display.max_columns = int(ncols) @@ -77,7 +79,8 @@ def test_null_dataframe(ncols): @pytest.mark.parametrize("nrows", [None, 0, 1, 2, 9, 10, 11, 19, 20, 21]) def test_full_series(nrows, dtype): size = 20 - ps = pd.Series(np.random.randint(0, 100, size)).astype(dtype) + rng = np.random.default_rng(seed=0) + ps = pd.Series(rng.integers(0, 100, size)).astype(dtype) sr = cudf.from_pandas(ps) pd.options.display.max_rows = nrows assert repr(ps) == repr(sr) @@ -89,8 +92,9 @@ def test_full_series(nrows, dtype): @pytest.mark.parametrize("size", [20, 21]) @pytest.mark.parametrize("dtype", repr_categories) def test_full_dataframe_20(dtype, size, nrows, ncols): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - {idx: np.random.randint(0, 100, size) for idx in range(size)} + {idx: rng.integers(0, 100, size) for idx in range(size)} ).astype(dtype) gdf = cudf.from_pandas(pdf) @@ -178,11 +182,12 @@ def test_mixed_series(mixed_pdf, mixed_gdf): def test_MI(): + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { - "a": np.random.randint(0, 4, 10), - "b": np.random.randint(0, 4, 10), - "c": np.random.randint(0, 4, 10), + "a": rng.integers(0, 4, 10), + "b": rng.integers(0, 4, 10), + "c": rng.integers(0, 4, 10), } ) levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]] @@ -223,9 +228,10 @@ def test_groupby_MI(nrows, ncols): @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) @pytest.mark.parametrize("length", [0, 1, 10, 100, 1000]) def test_generic_index(length, dtype): + rng = np.random.default_rng(seed=0) psr = pd.Series( range(length), - index=np.random.randint(0, high=100, size=length).astype(dtype), + index=rng.integers(0, high=100, size=length).astype(dtype), dtype="float64" if length == 0 else None, ) gsr = cudf.Series.from_pandas(psr) diff --git 
a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py index a61477981f8..5ff0098bcf4 100644 --- a/python/cudf/cudf/tests/test_resampling.py +++ b/python/cudf/cudf/tests/test_resampling.py @@ -50,8 +50,9 @@ def test_series_upsample_simple(): @pytest.mark.parametrize("rule", ["2s", "10s"]) def test_series_resample_ffill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).ffill(), gts.resample(rule).ffill() @@ -60,8 +61,9 @@ def test_series_resample_bfill(rule): - rng = pd.date_range("1/1/2012", periods=10, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=10, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).bfill(), gts.resample(rule).bfill() @@ -70,8 +72,9 @@ def test_series_resample_asfreq(rule): - rng = pd.date_range("1/1/2012", periods=100, freq="5s") - ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) + date_idx = pd.date_range("1/1/2012", periods=100, freq="5s") + rng = np.random.default_rng(seed=0) + ts = pd.Series(rng.integers(0, 500, len(date_idx)), index=date_idx) gts = cudf.from_pandas(ts) assert_resample_results_equal( ts.resample(rule).asfreq(), gts.resample(rule).asfreq() @@ -79,8 +82,9 @@ def test_dataframe_resample_aggregation_simple(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -91,8 +95,9 @@ def test_dataframe_resample_multiagg(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( - np.random.randn(1000, 3), + rng.standard_normal(size=(1000, 3)), index=pd.date_range("1/1/2012", freq="s", periods=1000), columns=["A", "B", "C"], ) @@ -104,10 +109,11 @@ def test_dataframe_resample_on(): + rng = np.random.default_rng(seed=0) # test resampling on a specified column pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=1000), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) @@ -119,15 +125,16 @@ def test_dataframe_resample_level(): + rng = np.random.default_rng(seed=0) # test resampling on a specific level of a MultIndex pdf = pd.DataFrame( { - "x": np.random.randn(1000), + "x": rng.standard_normal(size=1000), "y": pd.date_range("1/1/2012", freq="s", periods=1000), } ) pdi = pd.MultiIndex.from_frame(pdf) - pdf = pd.DataFrame({"a": np.random.randn(1000)}, index=pdi) + pdf = pd.DataFrame({"a": rng.standard_normal(size=1000)}, index=pdi) gdf = cudf.from_pandas(pdf) assert_resample_results_equal( pdf.resample("3min", level="y").mean(), @@ -153,11 +160,12 @@ def test_dataframe_resample_level(): reason="Fails in older versions of pandas", ) def
test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): + rng = np.random.default_rng(seed=0) # test that we cast to the appropriate frequency # when resampling: pdf = pd.DataFrame( { - "x": np.random.randn(100), + "x": rng.standard_normal(size=100), "y": pd.date_range("1/1/2012", freq=in_freq, periods=100), } ) diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 4235affd4d1..26386abb05d 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ b/python/cudf/cudf/tests/test_reshape.py @@ -46,13 +46,12 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): pdf = pd.DataFrame() id_vars = [] + rng = np.random.default_rng(seed=0) for i in range(num_id_vars): colname = "id" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -62,11 +61,9 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype): value_vars = [] for i in range(num_value_vars): colname = "val" + str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan elif nulls == "all": data[:] = np.nan @@ -119,6 +116,15 @@ def test_melt_str_scalar_id_var(): assert_eq(result, expected) +def test_melt_falsy_var_name(): + df = cudf.DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 5], "C": [2, 4, 6]}) + result = cudf.melt(df, id_vars=["A"], value_vars=["B"], var_name="") + expected = pd.melt( + df.to_pandas(), id_vars=["A"], value_vars=["B"], var_name="" + ) + assert_eq(result, expected) + + @pytest.mark.parametrize("num_cols", [1, 2, 10]) @pytest.mark.parametrize("num_rows", [1, 2, 1000]) @pytest.mark.parametrize( @@ -130,13 +136,12 @@ def test_df_stack(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame() + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -271,8 +276,8 @@ def test_df_stack_multiindex_column_axis_pd_example(level): ], names=["exp", "animal", "hair_length"], ) - - df = pd.DataFrame(np.random.randn(4, 4), columns=columns) + rng = np.random.default_rng(seed=0) + df = pd.DataFrame(rng.standard_normal(size=(4, 4)), columns=columns) with expect_warning_if(PANDAS_GE_220, FutureWarning): expect = df.stack(level=level, future_stack=False) @@ -299,14 +304,13 @@ def test_interleave_columns(nulls, num_cols, num_rows, dtype): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(0, 26, num_rows)).astype(dtype) + data = pd.Series(rng.integers(0, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), 
replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -335,16 +339,13 @@ def test_tile(nulls, num_cols, num_rows, dtype, count): pytest.skip(reason="nulls not supported in dtype: " + dtype) pdf = pd.DataFrame(dtype=dtype) + rng = np.random.default_rng(seed=0) for i in range(num_cols): colname = str(i) - data = pd.Series(np.random.randint(num_cols, 26, num_rows)).astype( - dtype - ) + data = pd.Series(rng.integers(num_cols, 26, num_rows)).astype(dtype) if nulls == "some": - idx = np.random.choice( - num_rows, size=int(num_rows / 2), replace=False - ) + idx = rng.choice(num_rows, size=int(num_rows / 2), replace=False) data[idx] = np.nan pdf[colname] = data @@ -715,23 +716,20 @@ def test_pivot_duplicate_error(): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pd.pivot_table( pdf, values=["D", "E"], @@ -740,7 +738,7 @@ def test_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cudf.pivot_table( cdf, values=["D", "E"], @@ -753,23 +751,20 @@ def test_pivot_table_simple(data, aggfunc, fill_value): @pytest.mark.parametrize( - "data", - [ + "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] +) +@pytest.mark.parametrize("fill_value", [0]) +def test_dataframe_pivot_table_simple(aggfunc, fill_value): + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame( { "A": ["one", "one", "two", "three"] * 6, "B": ["A", "B", "C"] * 8, "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, - "D": np.random.randn(24), - "E": np.random.randn(24), + "D": rng.standard_normal(size=24), + "E": rng.standard_normal(size=24), } - ], -) -@pytest.mark.parametrize( - "aggfunc", ["mean", "count", {"D": "sum", "E": "count"}] -) -@pytest.mark.parametrize("fill_value", [0]) -def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): - pdf = pd.DataFrame(data) + ) expected = pdf.pivot_table( values=["D", "E"], index=["A", "B"], @@ -777,7 +772,7 @@ def test_dataframe_pivot_table_simple(data, aggfunc, fill_value): aggfunc=aggfunc, fill_value=fill_value, ) - cdf = cudf.DataFrame(data) + cdf = cudf.DataFrame.from_pandas(pdf) actual = cdf.pivot_table( values=["D", "E"], index=["A", "B"], diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 0b892a51895..68f2aaf9cab 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -170,11 +170,15 @@ def test_serialize_dataframe(): def test_serialize_dataframe_with_index(): - df = cudf.DataFrame() - df["a"] = np.arange(100) - df["b"] = np.random.random(100) - df["c"] = pd.Categorical( - ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { 
+ "a": np.arange(100), + "b": rng.random(100), + "c": pd.Categorical( + ["a", "b", "c", "_", "_"] * 20, categories=["a", "b", "c"] + ), + } ) df = df.sort_values("b") outdf = cudf.DataFrame.deserialize(*df.serialize()) @@ -200,11 +204,12 @@ def test_serialize_generic_index(): def test_serialize_multi_index(): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { "a": [4, 17, 4, 9, 5], "b": [1, 4, 4, 3, 2], - "x": np.random.normal(size=5), + "x": rng.normal(size=5), } ) gdf = cudf.DataFrame.from_pandas(pdf) @@ -218,7 +223,8 @@ def test_serialize_multi_index(): def test_serialize_masked_series(): nelem = 50 - data = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) mask = utils.random_bitmask(nelem) bitmask = utils.expand_bits_to_bytes(mask)[:nelem] null_count = utils.count_zero(bitmask) @@ -229,10 +235,14 @@ def test_serialize_masked_series(): def test_serialize_groupby_df(): - df = cudf.DataFrame() - df["key_1"] = np.random.randint(0, 20, 100) - df["key_2"] = np.random.randint(0, 20, 100) - df["val"] = np.arange(100, dtype=np.float32) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "key_1": rng.integers(0, 20, 100), + "key_2": rng.integers(0, 20, 100), + "val": np.arange(100, dtype=np.float32), + } + ) gb = df.groupby(["key_1", "key_2"], sort=True) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() @@ -241,9 +251,9 @@ def test_serialize_groupby_df(): def test_serialize_groupby_external(): - df = cudf.DataFrame() - df["val"] = np.arange(100, dtype=np.float32) - gb = df.groupby(cudf.Series(np.random.randint(0, 20, 100))) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame({"val": np.arange(100, dtype=np.float32)}) + gb = df.groupby(cudf.Series(rng.integers(0, 20, 100))) outgb = gb.deserialize(*gb.serialize()) expect = gb.mean() got = outgb.mean() @@ -262,7 +272,8 @@ def test_serialize_groupby_level(): def test_serialize_groupby_sr(): - sr = cudf.Series(np.random.randint(0, 20, 100)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 20, 100)) gb = sr.groupby(sr // 2) outgb = gb.deserialize(*gb.serialize()) got = gb.mean() @@ -271,9 +282,10 @@ def test_serialize_groupby_sr(): def test_serialize_datetime(): + rng = np.random.default_rng(seed=0) # Make frame with datetime column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) ts = np.arange(0, len(df), dtype=np.dtype("datetime64[ms]")) df["timestamp"] = ts @@ -285,9 +297,10 @@ def test_serialize_datetime(): def test_serialize_string(): + rng = np.random.default_rng(seed=0) # Make frame with string column df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=5), "y": np.random.normal(size=5)} + {"x": rng.integers(0, 5, size=5), "y": rng.normal(size=5)} ) str_data = ["a", "bc", "def", "ghij", "klmno"] df["timestamp"] = str_data diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index a24002dc38e..7f0a4902ed1 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -519,13 +519,13 @@ def test_series_factorize_sort(data, sort): @pytest.mark.parametrize("nulls", ["none", "some"]) def test_series_datetime_value_counts(data, nulls, normalize, dropna): psr = data.copy() - + rng = np.random.default_rng(seed=0) if len(data) > 0: if nulls == "one": - p = np.random.randint(0, len(data)) + p = rng.integers(0, len(data)) psr[p] = None elif nulls == "some": - p = 
np.random.randint(0, len(data), 2) + p = rng.integers(0, len(data), 2) psr[p] = None gsr = cudf.from_pandas(psr) @@ -546,10 +546,10 @@ def test_series_datetime_value_counts(data, nulls, normalize, dropna): @pytest.mark.parametrize("num_elements", [10, 100, 1000]) def test_categorical_value_counts(dropna, normalize, num_elements): # create categorical series - np.random.seed(12) + rng = np.random.default_rng(seed=12) pd_cat = pd.Categorical( pd.Series( - np.random.choice(list(ascii_letters + digits), num_elements), + rng.choice(list(ascii_letters + digits), num_elements), dtype="category", ) ) @@ -586,8 +586,9 @@ def test_categorical_value_counts(dropna, normalize, num_elements): @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize("normalize", [True, False]) def test_series_value_counts(dropna, normalize): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series.from_masked_array( arr, cudf.Series(mask)._column.as_mask() @@ -714,8 +715,8 @@ def test_series_mode(gs, dropna): @pytest.mark.parametrize( "arr", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat([-0.6459412758761901], 100), np.repeat(np.nan, 100), @@ -731,12 +732,12 @@ def test_series_round(arr, decimals): expected = pser.round(decimals) assert_eq(result, expected) - + rng = np.random.default_rng(seed=0) # with nulls, maintaining existing null mask arr = arr.astype("float64") # for pandas nulls - arr.ravel()[ - np.random.choice(arr.shape[0], arr.shape[0] // 2, replace=False) - ] = np.nan + arr.ravel()[rng.choice(arr.shape[0], arr.shape[0] // 2, replace=False)] = ( + np.nan + ) pser = pd.Series(arr) ser = cudf.Series(arr) @@ -1726,7 +1727,7 @@ def test_series_truncate_datetimeindex(): [], [0, 12, 14], [0, 14, 12, 12, 3, 10, 12, 14], - np.random.randint(-100, 100, 200), + np.random.default_rng(seed=0).integers(-100, 100, 200), pd.Series([0.0, 1.0, None, 10.0]), [None, None, None, None], [np.nan, None, -1, 2, 3], @@ -1735,7 +1736,7 @@ def test_series_truncate_datetimeindex(): @pytest.mark.parametrize( "values", [ - np.random.randint(-100, 100, 10), + np.random.default_rng(seed=0).integers(-100, 100, 10), [], [np.nan, None, -1, 2, 3], [1.0, 12.0, None, None, 120], @@ -1746,7 +1747,8 @@ def test_series_truncate_datetimeindex(): ], ) def test_isin_numeric(data, values): - index = np.random.randint(0, 100, len(data)) + rng = np.random.default_rng(seed=0) + index = rng.integers(0, 100, len(data)) psr = pd.Series(data, index=index) gsr = cudf.Series.from_pandas(psr, nan_as_null=False) @@ -1943,8 +1945,9 @@ def test_diff_many_dtypes(data): @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"]) @pytest.mark.parametrize("series_bins", [True, False]) def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): - data = np.random.randint(0, 100, num_rows).astype(dtype) - bins = np.unique(np.sort(np.random.randint(2, 95, num_bins).astype(dtype))) + rng = np.random.default_rng(seed=0) + data = rng.integers(0, 100, num_rows).astype(dtype) + bins = np.unique(np.sort(rng.integers(2, 95, num_bins).astype(dtype))) s = cudf.Series(data) if series_bins: s_bins = cudf.Series(bins) @@ -1957,7 +1960,8 @@ def test_series_digitize(num_rows, num_bins, right, dtype, series_bins): def 
test_series_digitize_invalid_bins(): - s = cudf.Series(np.random.randint(0, 30, 80), dtype="int32") + rng = np.random.default_rng(seed=0) + s = cudf.Series(rng.integers(0, 30, 80), dtype="int32") bins = cudf.Series([2, None, None, 50, 90], dtype="int32") with pytest.raises( @@ -2038,7 +2042,8 @@ def test_default_float_bitwidth_construction(default_float_bitwidth, data): def test_series_ordered_dedup(): # part of https://github.com/rapidsai/cudf/issues/11486 - sr = cudf.Series(np.random.randint(0, 100, 1000)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(0, 100, 1000)) # pandas unique() preserves order expect = pd.Series(sr.to_pandas().unique()) got = cudf.Series._from_column(sr._column.unique()) diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py index 3d8b6a79d2a..db1de7d0cf4 100644 --- a/python/cudf/cudf/tests/test_seriesmap.py +++ b/python/cudf/cudf/tests/test_seriesmap.py @@ -47,8 +47,8 @@ def test_series_map_callable_numeric_basic(): @pytest.mark.parametrize("nelem", list(product([2, 10, 100, 1000]))) def test_series_map_callable_numeric_random(nelem): # Generate data - np.random.seed(0) - data = np.random.random(nelem) * 100 + rng = np.random.default_rng(seed=0) + data = rng.random(nelem) * 100 sr = Series(data) pdsr = pd.Series(data) diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 2cf2259d9ec..7e5ce713c7e 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -34,10 +34,10 @@ "nelem,dtype", list(product(sort_nelem_args, sort_dtype_args)) ) def test_dataframe_sort_values(nelem, dtype): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) sorted_df = df.sort_values(by="a") # Check sorted_index = np.argsort(aa, kind="mergesort") @@ -85,9 +85,9 @@ def test_series_sort_values_ignore_index(ignore_index): "nelem,sliceobj", list(product([10, 100], sort_slice_args)) ) def test_dataframe_sort_values_sliced(nelem, sliceobj): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) + df["a"] = rng.random(nelem) expect = df[sliceobj]["a"].sort_values() gdf = DataFrame.from_pandas(df) @@ -100,8 +100,8 @@ def test_dataframe_sort_values_sliced(nelem, sliceobj): list(product(sort_nelem_args, sort_dtype_args, [True, False])), ) def test_series_argsort(nelem, dtype, asc): - np.random.seed(0) - sr = Series((100 * np.random.random(nelem)).astype(dtype)) + rng = np.random.default_rng(seed=0) + sr = Series((100 * rng.random(nelem)).astype(dtype)) res = sr.argsort(ascending=asc) if asc: @@ -116,8 +116,8 @@ def test_series_argsort(nelem, dtype, asc): "nelem,asc", list(product(sort_nelem_args, [True, False])) ) def test_series_sort_index(nelem, asc): - np.random.seed(0) - sr = Series(100 * np.random.random(nelem)) + rng = np.random.default_rng(seed=0) + sr = Series(100 * rng.random(nelem)) psr = sr.to_pandas() expected = psr.sort_index(ascending=asc) @@ -167,9 +167,9 @@ def test_series_nsmallest(data, n): @pytest.mark.parametrize("op", ["nsmallest", "nlargest"]) @pytest.mark.parametrize("columns", ["a", ["b", "a"]]) def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): - np.random.seed(0) - aa = np.random.random(nelem) - bb = 
np.random.random(nelem) + rng = np.random.default_rng(seed=0) + aa = rng.random(nelem) + bb = rng.random(nelem) df = DataFrame({"a": aa, "b": bb}) pdf = df.to_pandas() @@ -181,10 +181,10 @@ def test_dataframe_nlargest_nsmallest(nelem, n, op, columns): ) def test_dataframe_nlargest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nlargest(n, "a") gdf = DataFrame.from_pandas(df) @@ -197,10 +197,10 @@ def test_dataframe_nlargest_sliced(counts, sliceobj): ) def test_dataframe_nsmallest_sliced(counts, sliceobj): nelem, n = counts - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame() - df["a"] = np.random.random(nelem) - df["b"] = np.random.random(nelem) + df["a"] = rng.random(nelem) + df["b"] = rng.random(nelem) expect = df[sliceobj].nsmallest(n, "a") gdf = DataFrame.from_pandas(df) @@ -216,13 +216,13 @@ def test_dataframe_nsmallest_sliced(counts, sliceobj): def test_dataframe_multi_column( num_cols, num_rows, dtype, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(5): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) pdf[colname] = data gdf = DataFrame.from_pandas(pdf) @@ -244,17 +244,17 @@ def test_dataframe_multi_column( def test_dataframe_multi_column_nulls( num_cols, num_rows, dtype, nulls, ascending, na_position ): - np.random.seed(0) + rng = np.random.default_rng(seed=0) by = list(string.ascii_lowercase[:num_cols]) pdf = pd.DataFrame() for i in range(3): colname = string.ascii_lowercase[i] - data = np.random.randint(0, 26, num_rows).astype(dtype) + data = rng.integers(0, 26, num_rows).astype(dtype) if nulls == "some": idx = np.array([], dtype="int64") if num_rows > 0: - idx = np.random.choice( + idx = rng.choice( num_rows, size=int(num_rows / 4), replace=False ) data[idx] = np.nan @@ -295,8 +295,8 @@ def test_dataframe_multi_column_nulls_multiple_ascending( @pytest.mark.parametrize("nelem", [1, 100]) def test_series_nlargest_nelem(nelem): - np.random.seed(0) - elems = np.random.random(nelem) + rng = np.random.default_rng(seed=0) + elems = rng.random(nelem) gds = Series(elems).nlargest(nelem) pds = pd.Series(elems).nlargest(nelem) @@ -308,11 +308,14 @@ def test_series_nlargest_nelem(nelem): @pytest.mark.parametrize("keep", [True, False]) def test_dataframe_scatter_by_map(map_size, nelem, keep): strlist = ["dog", "cat", "fish", "bird", "pig", "fox", "cow", "goat"] - np.random.seed(0) - df = DataFrame() - df["a"] = np.random.choice(strlist[:map_size], nelem) - df["b"] = np.random.uniform(low=0, high=map_size, size=nelem) - df["c"] = np.random.randint(map_size, size=nelem) + rng = np.random.default_rng(seed=0) + df = DataFrame( + { + "a": rng.choice(strlist[:map_size], nelem), + "b": rng.uniform(low=0, high=map_size, size=nelem), + "c": rng.integers(map_size, size=nelem), + } + ) df["d"] = df["a"].astype("category") def _check_scatter_by_map(dfs, col): @@ -381,10 +384,10 @@ def _check_scatter_by_map(dfs, col): "kind", ["quicksort", "mergesort", "heapsort", "stable"] ) def test_dataframe_sort_values_kind(nelem, dtype, kind): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = DataFrame() - df["a"] = aa = (100 * 
np.random.random(nelem)).astype(dtype) - df["b"] = bb = (100 * np.random.random(nelem)).astype(dtype) + df["a"] = aa = (100 * rng.random(nelem)).astype(dtype) + df["b"] = bb = (100 * rng.random(nelem)).astype(dtype) with expect_warning_if(kind != "quicksort", UserWarning): sorted_df = df.sort_values(by="a", kind=kind) # Check diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index 3248e7f72c0..8b68ae6480b 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import numpy as np @@ -6,7 +6,8 @@ def test_to_dense_array(): - data = np.random.random(8) + rng = np.random.default_rng(seed=0) + data = rng.random(8) mask = np.asarray([0b11010110]).astype(np.byte) sr = Series.from_masked_array(data=data, mask=mask, null_count=3) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index f952cea07f8..27de0ed42e5 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -24,8 +24,8 @@ @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("skipna", [True, False]) def test_series_reductions(method, dtype, skipna): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = arr > 10 @@ -56,8 +56,8 @@ def call_test(sr, skipna): def test_series_reductions_concurrency(method): e = ThreadPoolExecutor(10) - np.random.seed(0) - srs = [cudf.Series(np.random.random(10000)) for _ in range(1)] + rng = np.random.default_rng(seed=0) + srs = [cudf.Series(rng.random(10000)) for _ in range(1)] def call_test(sr): fn = getattr(sr, method) @@ -74,8 +74,8 @@ def f(sr): @pytest.mark.parametrize("ddof", range(3)) def test_series_std(ddof): - np.random.seed(0) - arr = np.random.random(100) - 0.5 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) - 0.5 sr = cudf.Series(arr) pd = sr.to_pandas() got = sr.std(ddof=ddof) @@ -84,8 +84,9 @@ def test_series_std(ddof): def test_series_unique(): + rng = np.random.default_rng(seed=0) for size in [10**x for x in range(5)]: - arr = np.random.randint(low=-1, high=10, size=size) + arr = rng.integers(low=-1, high=10, size=size) mask = arr != -1 sr = cudf.Series(arr) sr[~mask] = None @@ -129,7 +130,8 @@ def test_series_nunique(nan_as_null, dropna): def test_series_scale(): - arr = pd.Series(np.random.randint(low=-10, high=10, size=100)) + rng = np.random.default_rng(seed=0) + arr = pd.Series(rng.integers(low=-10, high=10, size=100)) sr = cudf.Series(arr) vmin = arr.min() @@ -229,8 +231,8 @@ def test_misc_quantiles(data, q): @pytest.mark.parametrize( "data", [ - {"data": np.random.normal(-100, 100, 1000)}, - {"data": np.random.randint(-50, 50, 1000)}, + {"data": np.random.default_rng(seed=0).normal(-100, 100, 1000)}, + {"data": np.random.default_rng(seed=0).integers(-50, 50, 1000)}, {"data": (np.zeros(100))}, {"data": np.repeat(np.nan, 100)}, {"data": np.array([1.123, 2.343, np.nan, 0.0])}, @@ -280,8 +282,8 @@ def test_kurt_skew_error(op): @pytest.mark.parametrize( "data", [ - cudf.Series(np.random.normal(-100, 100, 1000)), - cudf.Series(np.random.randint(-50, 50, 1000)), + cudf.Series(np.random.default_rng(seed=0).normal(-100, 100, 1000)), + cudf.Series(np.random.default_rng(seed=0).integers(-50, 50, 1000)), cudf.Series(np.zeros(100)), cudf.Series(np.repeat(np.nan, 100)), 
cudf.Series(np.array([1.123, 2.343, np.nan, 0.0])), @@ -311,8 +313,8 @@ def test_skew_series(data, null_flag, numeric_only): @pytest.mark.parametrize("dtype", params_dtypes) @pytest.mark.parametrize("num_na", [0, 1, 50, 99, 100]) def test_series_median(dtype, num_na): - np.random.seed(0) - arr = np.random.random(100) + rng = np.random.default_rng(seed=0) + arr = rng.random(100) if np.issubdtype(dtype, np.integer): arr *= 100 mask = np.arange(100) >= num_na @@ -344,8 +346,8 @@ def test_series_median(dtype, num_na): @pytest.mark.parametrize( "data", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.array([1.123, 2.343, np.nan, 0.0]), np.array([-2, 3.75, 6, None, None, None, -8.5, None, 4.2]), @@ -379,8 +381,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -393,8 +395,8 @@ def test_series_pct_change(data, periods, fill_method): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -423,8 +425,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data1", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), @@ -437,8 +439,8 @@ def test_cov1d(data1, data2): @pytest.mark.parametrize( "data2", [ - np.random.normal(-100, 100, 1000), - np.random.randint(-50, 50, 1000), + np.random.default_rng(seed=0).normal(-100, 100, 1000), + np.random.default_rng(seed=0).integers(-50, 50, 1000), np.zeros(100), np.repeat(np.nan, 100), np.array([1.123, 2.343, np.nan, 0.0]), diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index cc88cc79769..e25f99d7bee 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -36,6 +36,7 @@ idx_list = [None, [10, 11, 12, 13, 14]] idx_id_list = ["None_index", "Set_index"] +rng = np.random.default_rng(seed=0) def raise_builder(flags, exceptions): @@ -132,9 +133,14 @@ def test_string_get_item(ps_gs, item): np.array([False] * 5), cupy.asarray(np.array([True] * 5)), cupy.asarray(np.array([False] * 5)), - np.random.randint(0, 2, 5).astype("bool").tolist(), - np.random.randint(0, 2, 5).astype("bool"), - cupy.asarray(np.random.randint(0, 2, 5).astype("bool")), + np.random.default_rng(seed=0) + .integers(0, 2, 5) + .astype("bool") + .tolist(), + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool"), + cupy.asarray( + np.random.default_rng(seed=0).integers(0, 2, 5).astype("bool") + ), ], ) def test_string_bool_mask(ps_gs, item): @@ -1078,7 +1084,8 @@ def test_string_set_scalar(scalar): def test_string_index(): - pdf = pd.DataFrame(np.random.rand(5, 5)) + rng = np.random.default_rng(seed=0) + pdf = pd.DataFrame(rng.random(size=(5, 5))) gdf = cudf.DataFrame.from_pandas(pdf) 
stringIndex = ["a", "b", "c", "d", "e"] pdf.index = stringIndex @@ -1899,6 +1906,26 @@ def test_string_findall(pat, flags): assert_eq(expected, actual) +@pytest.mark.parametrize( + "pat, flags, pos", + [ + ("Monkey", 0, [-1, 0, -1, -1]), + ("on", 0, [2, 1, -1, 1]), + ("bit", 0, [-1, -1, 3, -1]), + ("on$", 0, [2, -1, -1, -1]), + ("on$", re.MULTILINE, [2, -1, -1, 1]), + ("o.*k", re.DOTALL, [-1, 1, -1, 1]), + ], +) +def test_string_find_re(pat, flags, pos): + test_data = ["Lion", "Monkey", "Rabbit", "Don\nkey"] + gs = cudf.Series(test_data) + + expected = pd.Series(pos, dtype=np.int32) + actual = gs.str.find_re(pat, flags) + assert_eq(expected, actual) + + def test_string_replace_multi(): ps = pd.Series(["hello", "goodbye"]) gs = cudf.Series(["hello", "goodbye"]) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index e91edc9eec6..899d78c999b 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -50,10 +50,14 @@ def test_struct_for_field(key, expect): assert_eq(expect, got) -@pytest.mark.parametrize("input_obj", [[{"a": 1, "b": cudf.NA, "c": 3}]]) -def test_series_construction_with_nulls(input_obj): - expect = pa.array(input_obj, from_pandas=True) - got = cudf.Series(input_obj).to_arrow() +def test_series_construction_with_nulls(): + fields = [ + pa.array([1], type=pa.int64()), + pa.array([None], type=pa.int64()), + pa.array([3], type=pa.int64()), + ] + expect = pa.StructArray.from_arrays(fields, ["a", "b", "c"]) + got = cudf.Series(expect).to_arrow() assert expect == got diff --git a/python/cudf/cudf/tests/test_transform.py b/python/cudf/cudf/tests/test_transform.py index 88938457545..1305022d7fa 100644 --- a/python/cudf/cudf/tests/test_transform.py +++ b/python/cudf/cudf/tests/test_transform.py @@ -24,8 +24,8 @@ def _generic_function(a): ) def test_apply_python_lambda(dtype, udf, testfunc): size = 500 - - lhs_arr = np.random.random(size).astype(dtype) + rng = np.random.default_rng(seed=0) + lhs_arr = rng.random(size).astype(dtype) lhs_ser = Series(lhs_arr) out_ser = lhs_ser.apply(udf) diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py index 5f5d79c1dce..b714beb0069 100644 --- a/python/cudf/cudf/tests/test_unaops.py +++ b/python/cudf/cudf/tests/test_unaops.py @@ -17,7 +17,8 @@ @pytest.mark.parametrize("dtype", utils.NUMERIC_TYPES) def test_series_abs(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal(sr.abs().to_numpy(), np.abs(arr)) np.testing.assert_equal(abs(sr).to_numpy(), abs(arr)) @@ -25,22 +26,24 @@ def test_series_abs(dtype): @pytest.mark.parametrize("dtype", utils.INTEGER_TYPES) def test_series_invert(dtype): - arr = (np.random.random(1000) * 100).astype(dtype) + rng = np.random.default_rng(seed=0) + arr = (rng.random(1000) * 100).astype(dtype) sr = Series(arr) np.testing.assert_equal((~sr).to_numpy(), np.invert(arr)) np.testing.assert_equal((~sr).to_numpy(), ~arr) def test_series_neg(): - arr = np.random.random(100) * 100 + rng = np.random.default_rng(seed=0) + arr = rng.random(100) * 100 sr = Series(arr) np.testing.assert_equal((-sr).to_numpy(), -arr) @pytest.mark.parametrize("mth", ["min", "max", "sum", "product"]) def test_series_pandas_methods(mth): - np.random.seed(0) - arr = (1 + np.random.random(5) * 100).astype(np.int64) + rng = np.random.default_rng(seed=0) + arr = (1 + rng.random(5) * 100).astype(np.int64) sr = 
Series(arr) psr = pd.Series(arr) np.testing.assert_equal(getattr(sr, mth)(), getattr(psr, mth)()) diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py index 699b3340521..9a1c3b213b8 100644 --- a/python/cudf/cudf/tests/test_unique.py +++ b/python/cudf/cudf/tests/test_unique.py @@ -12,9 +12,9 @@ @pytest.fixture def df(): df = cudf.DataFrame() - np.random.seed(0) + rng = np.random.default_rng(seed=0) - arr = np.random.randint(2, size=10, dtype=np.int64) + arr = rng.integers(2, size=10, dtype=np.int64) df["foo"] = arr df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr]) diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py index babe4be2715..896a3809c67 100644 --- a/python/cudf/cudf/utils/hash_vocab_utils.py +++ b/python/cudf/cudf/utils/hash_vocab_utils.py @@ -69,10 +69,10 @@ def _get_space_util(bins, init_bins): return sum(_new_bin_length(len(b)) for b in bins) + 2 * init_bins -def _pick_initial_a_b(data, max_constant, init_bins): +def _pick_initial_a_b(data, max_constant, init_bins, rng): while True: - a = np.random.randint(2**12, 2**15) - b = np.random.randint(2**12, 2**15) + a = rng.integers(2**12, 2**15) + b = rng.integers(2**12, 2**15) bins = _make_bins(data, init_bins, a, b) score = _get_space_util(bins, init_bins) / len(data) @@ -86,18 +86,18 @@ def _pick_initial_a_b(data, max_constant, init_bins): return bins, a, b -def _find_hash_for_internal(hash_bin): +def _find_hash_for_internal(hash_bin, rng): if not hash_bin: return [[], 0, 0] new_length = _new_bin_length(len(hash_bin)) while True: - a = np.random.randint( + a = rng.integers( A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH, ) - b = np.random.randint( + b = rng.integers( B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH ) bins = _make_bins(hash_bin, new_length, a, b) @@ -108,11 +108,11 @@ def _find_hash_for_internal(hash_bin): return bins, a, b -def _perfect_hash(integers, max_constant): +def _perfect_hash(integers, max_constant, rng): num_top_level_bins = len(integers) // 4 init_bins, init_a, init_b = _pick_initial_a_b( - integers, max_constant, num_top_level_bins + integers, max_constant, num_top_level_bins, rng ) flattened_bins = [] @@ -127,7 +127,7 @@ def _perfect_hash(integers, max_constant): for i, b in enumerate(init_bins): if i % 500 == 0: print(f"Processing bin {i} / {len(init_bins)} of size = {len(b)}") - internal_table, coeff_a, coeff_b = _find_hash_for_internal(b) + internal_table, coeff_a, coeff_b = _find_hash_for_internal(b, rng) bin_length = len(internal_table) max_bin_length = max(bin_length, max_bin_length) internal_table_coeffs[i] = ( @@ -245,7 +245,7 @@ def hash_vocab( """ Write the vocab vocabulary hashtable to the output_path """ - np.random.seed(1243342) + rng = np.random.default_rng(seed=1243342) vocab = _load_vocab_dict(vocab_path) keys = list(map(_sdbm_hash, vocab.keys())) @@ -264,7 +264,7 @@ def hash_vocab( hash_table, inner_table_coeffs, offsets_into_ht, - ) = _perfect_hash(keys, 10) + ) = _perfect_hash(keys, 10, rng) _pack_keys_and_values(hash_table, hashed_vocab) _store_func( diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb index c7d39b78810..94904fd83d4 100644 --- a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb +++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb @@ -18,13 +18,13 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "np.random.seed(0)\n", + "rng = 
np.random.default_rng(seed=0)\n", "\n", "num_rows = 25_000_000\n", "num_columns = 12\n", "\n", "# Create a DataFrame with random data\n", - "df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, num_columns)),\n", + "df = pd.DataFrame(rng.integers(0, 100, size=(num_rows, num_columns)),\n", " columns=[f'Column_{i}' for i in range(1, num_columns + 1)])" ] }, diff --git a/python/cudf/cudf_pandas_tests/pytest.ini b/python/cudf/cudf_pandas_tests/pytest.ini new file mode 100644 index 00000000000..46e2448ea24 --- /dev/null +++ b/python/cudf/cudf_pandas_tests/pytest.ini @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +# Note: this config file overrides the default "cudf" test config in +# ../pyproject.toml. We do so deliberately because we have different +# treatment of markers and warnings. +[pytest] +addopts = --tb=native --strict-config --strict-markers +empty_parameter_set_mark = fail_at_collect +xfail_strict = true diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 2bbed40e34e..7aefdc386bb 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import collections +import contextlib import copy import datetime import operator @@ -21,10 +22,15 @@ import pyarrow as pa import pytest from nbconvert.preprocessors import ExecutePreprocessor -from numba import NumbaDeprecationWarning, vectorize +from numba import ( + NumbaDeprecationWarning, + __version__ as numba_version, + vectorize, +) +from packaging import version from pytz import utc -from cudf.core._compat import PANDAS_GE_220 +from cudf.core._compat import PANDAS_GE_210, PANDAS_GE_220, PANDAS_VERSION from cudf.pandas import LOADED, Profiler from cudf.pandas.fast_slow_proxy import ( ProxyFallbackError, @@ -52,8 +58,6 @@ get_calendar, ) -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION # Accelerated pandas has the real pandas and cudf modules as attributes pd = xpd._fsproxy_slow cudf = xpd._fsproxy_fast @@ -622,10 +626,6 @@ def test_array_function_series_fallback(series): tm.assert_equal(expect, got) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas", -) def test_timedeltaproperties(series): psr, sr = series psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]") @@ -685,10 +685,6 @@ def test_maintain_container_subclasses(multiindex): assert isinstance(got, xpd.core.indexes.frozen.FrozenList) -@pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, - reason="Fails in older versions of pandas due to unsupported boxcar window type", -) def test_rolling_win_type(): pdf = pd.DataFrame(range(5)) df = xpd.DataFrame(range(5)) result = df[0].rolling(2, win_type="boxcar").mean() expected = pdf[0].rolling(2, win_type="boxcar").mean() tm.assert_equal(result, expected) -@pytest.mark.skip( - reason="Requires Numba 0.59 to fix segfaults on ARM. See https://github.com/numba/llvmlite/pull/1009" +@pytest.mark.skipif( + version.parse(numba_version) < version.parse("0.59"), + reason="Requires Numba 0.59 to fix segfaults on ARM. 
@@ -1135,8 +1142,8 @@ def test_private_method_result_wrapped(): def test_numpy_var(): - np.random.seed(42) - data = np.random.rand(1000) + rng = np.random.default_rng(seed=42) + data = rng.random(1000) psr = pd.Series(data) sr = xpd.Series(data) @@ -1305,7 +1312,7 @@ def max_times_two(self): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0", ) def test_floordiv_array_vs_df(): @@ -1580,7 +1587,7 @@ def test_numpy_cupy_flatiter(series): @pytest.mark.xfail( - PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION, + PANDAS_VERSION < version.parse("2.1"), reason="pyarrow_numpy storage type was not supported in pandas-2.0.0", ) def test_arrow_string_arrays(): diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index a75a20a4681..63fd9601fc1 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -387,7 +387,8 @@ def test_dir_bound_method( ): """This test will fail because dir for bound methods is currently incorrect, but we have no way to fix it without materializing the slow - type, which is unnecessarily expensive.
+ """ Fast, FastIntermediate = fast_and_intermediate_with_doc Slow, SlowIntermediate = slow_and_intermediate_with_doc diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py index 5b7bde06d1d..a5c29bd93a2 100644 --- a/python/cudf/cudf_pandas_tests/test_profiler.py +++ b/python/cudf/cudf_pandas_tests/test_profiler.py @@ -23,12 +23,12 @@ reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas", ) def test_profiler(): - np.random.seed(42) + rng = np.random.default_rng(seed=42) with Profiler() as profiler: df = pd.DataFrame( { - "idx": np.random.randint(0, 10, 1000), - "data": np.random.rand(1000), + "idx": rng.integers(0, 10, 1000), + "data": rng.random(1000), } ) sums = df.groupby("idx").sum() @@ -58,7 +58,7 @@ def test_profiler(): calls = [ "pd.DataFrame", "", - "np.random.randint", + "rng.integers", "np.random.rand", 'df.groupby("idx").sum', 'df.sum()["data"]', diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py index 892d0886596..27eaff87ba0 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py @@ -102,7 +102,7 @@ def test_random_forest(binary_classification_data): def test_clustering(): rng = np.random.default_rng(42) nsamps = 300 - X = rng.random((nsamps, 2)) + X = rng.random(size=(nsamps, 2)) data = pd.DataFrame(X, columns=["x", "y"]) kmeans = KMeans(n_clusters=3, random_state=42) diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py index 37e3cc34856..0777d982ac2 100644 --- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py +++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py @@ -31,17 +31,17 @@ def dask_client(): def test_1d_distributed(dask_client): - np.random.seed(42) - ts = pd.Series(np.random.rand(100)) + rng = np.random.default_rng(seed=42) + ts = pd.Series(rng.random(100)) m = 10 return stumpy.stumped(dask_client, ts, m) def test_multidimensional_distributed_timeseries(dask_client): - np.random.seed(42) + rng = np.random.default_rng(seed=42) # Each row represents data from a different dimension while each column represents # data from the same dimension - your_time_series = np.random.rand(3, 1000) + your_time_series = rng.random(3, 1000) # Approximately, how many data points might be found in a pattern window_size = 50 diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,9 +28,10 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "pylibcudf==24.12.*,>=0.0.0a0", "rich", "rmm==24.12.*,>=0.0.0a0", @@ -80,49 +81,26 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 605f9be5a49..80201dd84db 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -28,9 +28,10 @@ dependencies = [ "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "pylibcudf==24.12.*,>=0.0.0a0", "rich", "rmm==24.12.*,>=0.0.0a0", @@ -80,49 +81,26 @@ cudf-pandas-tests = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", +[tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*", + # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow() + "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", + # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ + "ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning", + # PerformanceWarning from cupy warming up the JIT cache + "ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning", + # Ignore numba PEP 456 warning specific to arm machines + "ignore:FNV hashing is not implemented in Numba.*:UserWarning" ] -known_rapids = [ - "rmm", - "pylibcudf" -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +markers = [ + "spilling: mark benchmark a good candidate to run with `CUDF_SPILL=ON`" ] +xfail_strict = true [tool.rapids-build-backend] build-backend = "scikit_build_core.build" @@ -152,3 +130,18 @@ wheel.packages = ["cudf"] provider = "scikit_build_core.metadata.regex" input = "cudf/VERSION" regex = "(?P<value>.*)" + +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "pylibcudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"]
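
For reference, the section-order configured above reproduces the old grouping with two custom sections sitting between third-party and first-party imports. In a hypothetical cudf module, the sorted import blocks would come out in this order:

from __future__ import annotations  # "future"

import os  # "standard-library"

import numpy as np  # "third-party"

import dask  # "dask" (custom section)

import rmm  # "rapids" (custom section)

import cudf  # "first-party"
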
--strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error" ] +xfail_strict = true [tool.scikit-build] build-dir = "build/{wheel_tag}" diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py index 06bb08953f1..3b1eff4a0d0 100644 --- a/python/cudf_polars/cudf_polars/containers/__init__.py +++ b/python/cudf_polars/cudf_polars/containers/__init__.py @@ -5,7 +5,7 @@ from __future__ import annotations -__all__: list[str] = ["DataFrame", "Column", "NamedColumn"] +__all__: list[str] = ["DataFrame", "Column"] -from cudf_polars.containers.column import Column, NamedColumn +from cudf_polars.containers.column import Column from cudf_polars.containers.dataframe import DataFrame diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py index 3fe3e5557cb..00186098e54 100644 --- a/python/cudf_polars/cudf_polars/containers/column.py +++ b/python/cudf_polars/cudf_polars/containers/column.py @@ -15,7 +15,7 @@ import polars as pl -__all__: list[str] = ["Column", "NamedColumn"] +__all__: list[str] = ["Column"] class Column: @@ -26,6 +26,9 @@ class Column: order: plc.types.Order null_order: plc.types.NullOrder is_scalar: bool + # Optional name, only ever set by evaluation of NamedExpr nodes + # The internal evaluation should not care about the name. + name: str | None def __init__( self, @@ -34,14 +37,12 @@ def __init__( is_sorted: plc.types.Sorted = plc.types.Sorted.NO, order: plc.types.Order = plc.types.Order.ASCENDING, null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, + name: str | None = None, ): self.obj = column self.is_scalar = self.obj.size() == 1 - if self.obj.size() <= 1: - is_sorted = plc.types.Sorted.YES - self.is_sorted = is_sorted - self.order = order - self.null_order = null_order + self.name = name + self.set_sorted(is_sorted=is_sorted, order=order, null_order=null_order) @functools.cached_property def obj_scalar(self) -> plc.Scalar: @@ -63,9 +64,26 @@ def obj_scalar(self) -> plc.Scalar: ) return plc.copying.get_element(self.obj, 0) + def rename(self, name: str | None, /) -> Self: + """ + Return a shallow copy with a new name. + + Parameters + ---------- + name + New name + + Returns + ------- + Shallow copy of self with new name set. + """ + new = self.copy() + new.name = name + return new + def sorted_like(self, like: Column, /) -> Self: """ - Copy sortedness properties from a column onto self. + Return a shallow copy with sortedness from like. Parameters ---------- @@ -74,20 +92,23 @@ def sorted_like(self, like: Column, /) -> Self: Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. See Also -------- set_sorted, copy_metadata """ - return self.set_sorted( - is_sorted=like.is_sorted, order=like.order, null_order=like.null_order + return type(self)( + self.obj, + name=self.name, + is_sorted=like.is_sorted, + order=like.order, + null_order=like.null_order, ) - # TODO: Return Column once #16272 is fixed. - def astype(self, dtype: plc.DataType) -> plc.Column: + def astype(self, dtype: plc.DataType) -> Column: """ - Return the backing column as the requested dtype. + Cast the column to as the requested dtype. Parameters ---------- @@ -109,8 +130,10 @@ def astype(self, dtype: plc.DataType) -> plc.Column: the current one. 
""" if self.obj.type() != dtype: - return plc.unary.cast(self.obj, dtype) - return self.obj + return Column(plc.unary.cast(self.obj, dtype), name=self.name).sorted_like( + self + ) + return self def copy_metadata(self, from_: pl.Series, /) -> Self: """ @@ -129,6 +152,7 @@ def copy_metadata(self, from_: pl.Series, /) -> Self: -------- set_sorted, sorted_like """ + self.name = from_.name if len(from_) <= 1: return self ascending = from_.flags["SORTED_ASC"] @@ -192,6 +216,7 @@ def copy(self) -> Self: is_sorted=self.is_sorted, order=self.order, null_order=self.null_order, + name=self.name, ) def mask_nans(self) -> Self: @@ -217,58 +242,3 @@ def nan_count(self) -> int: ) ).as_py() return 0 - - -class NamedColumn(Column): - """A column with a name.""" - - name: str - - def __init__( - self, - column: plc.Column, - name: str, - *, - is_sorted: plc.types.Sorted = plc.types.Sorted.NO, - order: plc.types.Order = plc.types.Order.ASCENDING, - null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE, - ) -> None: - super().__init__( - column, is_sorted=is_sorted, order=order, null_order=null_order - ) - self.name = name - - def copy(self, *, new_name: str | None = None) -> Self: - """ - A shallow copy of the column. - - Parameters - ---------- - new_name - Optional new name for the copied column. - - Returns - ------- - New column sharing data with self. - """ - return type(self)( - self.obj, - self.name if new_name is None else new_name, - is_sorted=self.is_sorted, - order=self.order, - null_order=self.null_order, - ) - - def mask_nans(self) -> Self: - """Return a shallow copy of self with nans masked out.""" - # Annoying, the inheritance is not right (can't call the - # super-type mask_nans), but will sort that by refactoring - # later. - if plc.traits.is_floating_point(self.obj.type()): - old_count = self.obj.null_count() - mask, new_count = plc.transform.nans_to_nulls(self.obj) - result = type(self)(self.obj.with_mask(mask, new_count), self.name) - if old_count == new_count: - return result.sorted_like(self) - return result - return self.copy() diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index f3e3862d0cc..2c195f6637c 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -5,43 +5,50 @@ from __future__ import annotations -import itertools from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import pyarrow as pa import pylibcudf as plc import polars as pl -from cudf_polars.containers.column import NamedColumn +from cudf_polars.containers import Column from cudf_polars.utils import dtypes if TYPE_CHECKING: - from collections.abc import Mapping, Sequence, Set + from collections.abc import Iterable, Mapping, Sequence, Set from typing_extensions import Self - from cudf_polars.containers import Column - __all__: list[str] = ["DataFrame"] +# Pacify the type checker. DataFrame init asserts that all the columns +# have a string name, so let's narrow the type. 
+class NamedColumn(Column): + name: str + + class DataFrame: """A representation of a dataframe.""" - columns: list[NamedColumn] + column_map: dict[str, Column] table: plc.Table + columns: list[NamedColumn] - def __init__(self, columns: Sequence[NamedColumn]) -> None: - self.columns = list(columns) - self._column_map = {c.name: c for c in self.columns} - self.table = plc.Table([c.obj for c in columns]) + def __init__(self, columns: Iterable[Column]) -> None: + columns = list(columns) + if any(c.name is None for c in columns): + raise ValueError("All columns must have a name") + self.columns = [cast(NamedColumn, c) for c in columns] + self.column_map = {c.name: c for c in self.columns} + self.table = plc.Table([c.obj for c in self.columns]) def copy(self) -> Self: """Return a shallow copy of self.""" - return type(self)([c.copy() for c in self.columns]) + return type(self)(c.copy() for c in self.columns) def to_polars(self) -> pl.DataFrame: """Convert to a polars DataFrame.""" @@ -51,42 +58,38 @@ def to_polars(self) -> pl.DataFrame: # https://github.com/pola-rs/polars/issues/11632 # To guarantee we produce correct names, we therefore # serialise with names we control and rename with that map. - name_map = {f"column_{i}": c.name for i, c in enumerate(self.columns)} + name_map = {f"column_{i}": name for i, name in enumerate(self.column_map)} table: pa.Table = plc.interop.to_arrow( self.table, [plc.interop.ColumnMetadata(name=name) for name in name_map], ) df: pl.DataFrame = pl.from_arrow(table) return df.rename(name_map).with_columns( - *( - pl.col(c.name).set_sorted( - descending=c.order == plc.types.Order.DESCENDING - ) - if c.is_sorted - else pl.col(c.name) - for c in self.columns - ) + pl.col(c.name).set_sorted(descending=c.order == plc.types.Order.DESCENDING) + if c.is_sorted + else pl.col(c.name) + for c in self.columns ) @cached_property def column_names_set(self) -> frozenset[str]: """Return the column names as a set.""" - return frozenset(c.name for c in self.columns) + return frozenset(self.column_map) @cached_property def column_names(self) -> list[str]: """Return a list of the column names.""" - return [c.name for c in self.columns] + return list(self.column_map) @cached_property def num_columns(self) -> int: """Number of columns.""" - return len(self.columns) + return len(self.column_map) @cached_property def num_rows(self) -> int: """Number of rows.""" - return 0 if len(self.columns) == 0 else self.table.num_rows() + return self.table.num_rows() if self.column_map else 0 @classmethod def from_polars(cls, df: pl.DataFrame) -> Self: @@ -111,12 +114,8 @@ def from_polars(cls, df: pl.DataFrame) -> Self: # No-op if the schema is unchanged. d_table = plc.interop.from_arrow(table.cast(schema)) return cls( - [ - NamedColumn(column, h_col.name).copy_metadata(h_col) - for column, h_col in zip( - d_table.columns(), df.iter_columns(), strict=True - ) - ] + Column(column).copy_metadata(h_col) + for column, h_col in zip(d_table.columns(), df.iter_columns(), strict=True) ) @classmethod @@ -144,17 +143,14 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self: if table.num_columns() != len(names): raise ValueError("Mismatching name and table length.") return cls( - [ - NamedColumn(c, name) - for c, name in zip(table.columns(), names, strict=True) - ] + Column(c, name=name) for c, name in zip(table.columns(), names, strict=True) ) def sorted_like( self, like: DataFrame, /, *, subset: Set[str] | None = None ) -> Self: """ - Copy sortedness from a dataframe onto self. 
+ Return a shallow copy with sortedness copied from like. Parameters ---------- @@ -165,7 +161,7 @@ def sorted_like( Returns ------- - Self with metadata set. + Shallow copy of self with metadata set. Raises ------ @@ -175,13 +171,12 @@ def sorted_like( if like.column_names != self.column_names: raise ValueError("Can only copy from identically named frame") subset = self.column_names_set if subset is None else subset - self.columns = [ + return type(self)( c.sorted_like(other) if c.name in subset else c for c, other in zip(self.columns, like.columns, strict=True) - ] - return self + ) - def with_columns(self, columns: Sequence[NamedColumn]) -> Self: + def with_columns(self, columns: Iterable[Column], *, replace_only=False) -> Self: """ Return a new dataframe with extra columns. @@ -189,6 +184,8 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: ---------- columns Columns to add + replace_only + If true, then only replacements are allowed (matching by name). Returns ------- @@ -196,36 +193,30 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self: Notes ----- - If column names overlap, newer names replace older ones. + If column names overlap, newer names replace older ones, and + appear in the same order as the original frame. """ - columns = list( - {c.name: c for c in itertools.chain(self.columns, columns)}.values() - ) - return type(self)(columns) + new = {c.name: c for c in columns} + if replace_only and not self.column_names_set.issuperset(new.keys()): + raise ValueError("Cannot replace with non-existing names") + return type(self)((self.column_map | new).values()) def discard_columns(self, names: Set[str]) -> Self: """Drop columns by name.""" - return type(self)([c for c in self.columns if c.name not in names]) + return type(self)(column for column in self.columns if column.name not in names) def select(self, names: Sequence[str]) -> Self: """Select columns by name returning DataFrame.""" - want = set(names) - if not want.issubset(self.column_names_set): - raise ValueError("Can't select missing names") - return type(self)([self._column_map[name] for name in names]) - - def replace_columns(self, *columns: NamedColumn) -> Self: - """Return a new dataframe with columns replaced by name.""" - new = {c.name: c for c in columns} - if not set(new).issubset(self.column_names_set): - raise ValueError("Cannot replace with non-existing names") - return type(self)([new.get(c.name, c) for c in self.columns]) + try: + return type(self)(self.column_map[name] for name in names) + except KeyError as e: + raise ValueError("Can't select missing names") from e def rename_columns(self, mapping: Mapping[str, str]) -> Self: """Rename some columns.""" - return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns]) + return type(self)(c.rename(mapping.get(c.name, c.name)) for c in self.columns) - def select_columns(self, names: Set[str]) -> list[NamedColumn]: + def select_columns(self, names: Set[str]) -> list[Column]: """Select columns by name.""" return [c for c in self.columns if c.name in names] diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index c401e5a2f17..e748ec16f14 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,33 +15,30 @@ from __future__ import annotations -import enum -from enum import IntEnum -from functools import partial, reduce -from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple - -import pyarrow as pa -import 
pyarrow.compute as pc -import pylibcudf as plc - -from polars.exceptions import InvalidOperationError -from polars.polars import _expr_nodes as pl_expr - -from cudf_polars.containers import Column, NamedColumn -from cudf_polars.utils import dtypes, sorting - -if TYPE_CHECKING: - from collections.abc import Mapping, Sequence - - import polars as pl - import polars.type_aliases as pl_types - - from cudf_polars.containers import DataFrame +from cudf_polars.dsl.expressions.aggregation import Agg +from cudf_polars.dsl.expressions.base import ( + AggInfo, + Col, + Expr, + NamedExpr, +) +from cudf_polars.dsl.expressions.binaryop import BinOp +from cudf_polars.dsl.expressions.boolean import BooleanFunction +from cudf_polars.dsl.expressions.datetime import TemporalFunction +from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn +from cudf_polars.dsl.expressions.rolling import GroupedRollingWindow, RollingWindow +from cudf_polars.dsl.expressions.selection import Filter, Gather +from cudf_polars.dsl.expressions.sorting import Sort, SortBy +from cudf_polars.dsl.expressions.string import StringFunction +from cudf_polars.dsl.expressions.ternary import Ternary +from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction __all__ = [ "Expr", "NamedExpr", "Literal", + "LiteralColumn", + "Len", "Col", "BooleanFunction", "StringFunction", @@ -54,1782 +51,8 @@ "GroupedRollingWindow", "Cast", "Agg", + "AggInfo", "Ternary", "BinOp", + "UnaryFunction", ] - - -class ExecutionContext(IntEnum): - FRAME = enum.auto() - GROUPBY = enum.auto() - ROLLING = enum.auto() - - -class AggInfo(NamedTuple): - requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] - - -class Expr: - """ - An abstract expression object. - - This contains a (potentially empty) tuple of child expressions, - along with non-child data. For uniform reconstruction and - implementation of hashing and equality schemes, child classes need - to provide a certain amount of metadata when they are defined. - Specifically, the ``_non_child`` attribute must list, in-order, - the names of the slots that are passed to the constructor. The - constructor must take arguments in the order ``(*_non_child, - *children).`` - """ - - __slots__ = ("dtype", "_hash_value", "_repr_value") - dtype: plc.DataType - """Data type of the expression.""" - _hash_value: int - """Caching slot for the hash of the expression.""" - _repr_value: str - """Caching slot for repr of the expression.""" - children: tuple[Expr, ...] = () - """Children of the expression.""" - _non_child: ClassVar[tuple[str, ...]] = ("dtype",) - """Names of non-child data (not Exprs) for reconstruction.""" - - # Constructor must take arguments in order (*_non_child, *children) - def __init__(self, dtype: plc.DataType) -> None: - self.dtype = dtype - - def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence: - return (*(getattr(self, attr) for attr in self._non_child), *children) - - def get_hash(self) -> int: - """ - Return the hash of this expr. - - Override this in subclasses, rather than __hash__. - - Returns - ------- - The integer hash value. - """ - return hash((type(self), self._ctor_arguments(self.children))) - - def __hash__(self) -> int: - """Hash of an expression with caching.""" - try: - return self._hash_value - except AttributeError: - self._hash_value = self.get_hash() - return self._hash_value - - def is_equal(self, other: Any) -> bool: - """ - Equality of two expressions. - - Override this in subclasses, rather than __eq__. 
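
The expression machinery being removed here (it now lives in the cudf_polars.dsl.expressions modules imported above) caches each node's hash and funnels equality through the get_hash/is_equal hooks so that repeated dictionary lookups over expression trees stay cheap. A stripped-down sketch of that caching scheme, using generic names rather than the real classes:

from typing import Any, ClassVar


class Node:
    """Minimal analogue of the hash-caching scheme (illustrative only)."""

    __slots__ = ("dtype", "children", "_hash_value")
    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)

    def __init__(self, dtype: str, *children: "Node") -> None:
        self.dtype = dtype
        self.children = children

    def _ctor_arguments(self) -> tuple:
        return (*(getattr(self, a) for a in self._non_child), *self.children)

    def get_hash(self) -> int:
        # Subclasses override this hook, never __hash__ itself.
        return hash((type(self), self._ctor_arguments()))

    def __hash__(self) -> int:
        # Compute once, then serve the cached value from the slot.
        try:
            return self._hash_value
        except AttributeError:
            self._hash_value = self.get_hash()
            return self._hash_value

    def __eq__(self, other: Any) -> bool:
        # Cheap type/hash screen first; structural comparison only on a match.
        if type(self) is not type(other) or hash(self) != hash(other):
            return False
        return self._ctor_arguments() == other._ctor_arguments()


a = Node("int64", Node("int64"))
b = Node("int64", Node("int64"))
assert a == b and hash(a) == hash(b)  # structural equality, cached hashes

Because the cached hash is consulted before any structural comparison, deep traversals only happen when two trees genuinely collide.
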
- - Parameter - --------- - other - object to compare to - - Returns - ------- - True if the two expressions are equal, false otherwise. - """ - if type(self) is not type(other): - return False # pragma: no cover; __eq__ trips first - return self._ctor_arguments(self.children) == other._ctor_arguments( - other.children - ) - - def __eq__(self, other: Any) -> bool: - """Equality of expressions.""" - if type(self) is not type(other) or hash(self) != hash(other): - return False - else: - return self.is_equal(other) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def __repr__(self) -> str: - """String representation of an expression with caching.""" - try: - return self._repr_value - except AttributeError: - args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children)) - self._repr_value = f"{type(self).__name__}({args})" - return self._repr_value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Do not call this function directly, but rather - :meth:`evaluate` which handles the mapping lookups. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - raise NotImplementedError( - f"Evaluation of expression {type(self).__name__}" - ) # pragma: no cover; translation of unimplemented nodes trips first - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame that will provide columns. - context - What context are we performing this evaluation in? - mapping - Substitution mapping from expressions to Columns, used to - override the evaluation of a given expression if we're - performing a simple rewritten evaluation. - - Notes - ----- - Individual subclasses should implement :meth:`do_evaluate`, - this method provides logic to handle lookups in the - substitution mapping. - - Returns - ------- - Column representing the evaluation of the expression. - - Raises - ------ - NotImplementedError - If we couldn't evaluate the expression. Ideally all these - are returned during translation to the IR, but for now we - are not perfect. - """ - if mapping is None: - return self.do_evaluate(df, context=context, mapping=mapping) - try: - return mapping[self] - except KeyError: - return self.do_evaluate(df, context=context, mapping=mapping) - - def collect_agg(self, *, depth: int) -> AggInfo: - """ - Collect information about aggregations in groupbys. - - Parameters - ---------- - depth - The depth of aggregating (reduction or sampling) - expressions we are currently at. - - Returns - ------- - Aggregation info describing the expression to aggregate in the - groupby. 
- - Raises - ------ - NotImplementedError - If we can't currently perform the aggregation request, for - example nested aggregations like ``a.max().min()``. - """ - raise NotImplementedError( - f"Collecting aggregation info for {type(self).__name__}" - ) # pragma: no cover; check_agg trips first - - -class NamedExpr: - # NamedExpr does not inherit from Expr since it does not appear - # when evaluating expressions themselves, only when constructing - # named return values in dataframe (IR) nodes. - __slots__ = ("name", "value") - value: Expr - name: str - - def __init__(self, name: str, value: Expr) -> None: - self.name = name - self.value = value - - def __hash__(self) -> int: - """Hash of the expression.""" - return hash((type(self), self.name, self.value)) - - def __repr__(self) -> str: - """Repr of the expression.""" - return f"NamedExpr({self.name}, {self.value})" - - def __eq__(self, other: Any) -> bool: - """Equality of two expressions.""" - return ( - type(self) is type(other) - and self.name == other.name - and self.value == other.value - ) - - def __ne__(self, other: Any) -> bool: - """Inequality of expressions.""" - return not self.__eq__(other) - - def evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> NamedColumn: - """ - Evaluate this expression given a dataframe for context. - - Parameters - ---------- - df - DataFrame providing context - context - Execution context - mapping - Substitution mapping - - Returns - ------- - NamedColumn attaching a name to an evaluated Column - - See Also - -------- - :meth:`Expr.evaluate` for details, this function just adds the - name to a column produced from an expression. - """ - obj = self.value.evaluate(df, context=context, mapping=mapping) - return NamedColumn( - obj.obj, - self.name, - is_sorted=obj.is_sorted, - order=obj.order, - null_order=obj.null_order, - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return self.value.collect_agg(depth=depth) - - -class Literal(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Scalar[Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: - super().__init__(dtype) - assert value.type == plc.interop.to_arrow(dtype) - self.value = value - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow scalar is correct by construction. - return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class LiteralColumn(Expr): - __slots__ = ("value",) - _non_child = ("dtype", "value") - value: pa.Array[Any, Any] - children: tuple[()] - - def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: - super().__init__(dtype) - data = value.to_arrow() - self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) - - def get_hash(self) -> int: - """Compute a hash of the column.""" - # This is stricter than necessary, but we only need this hash - # for identity in groupby replacements so it's OK. And this - # way we avoid doing potentially expensive compute. 
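
That comment is the interesting part: hashing the literal's backing array by id() gives up hash stability between equal-but-distinct arrays in exchange for never touching the data. A small illustration of the trade-off (the CachedLiteral class is a made-up stand-in, not the patch's code):

import pyarrow as pa


class CachedLiteral:
    """Hash by object identity: O(1) and data-free, but not structural."""

    def __init__(self, value: pa.Array) -> None:
        self.value = value

    def __hash__(self) -> int:
        # id(...) is fixed for the object's lifetime; no buffers are read.
        return hash((type(self), id(self.value)))

    def __eq__(self, other: object) -> bool:
        return isinstance(other, CachedLiteral) and self.value is other.value


arr = pa.array([1, 2, 3])
assert hash(CachedLiteral(arr)) == hash(CachedLiteral(arr))  # same object
assert CachedLiteral(arr) != CachedLiteral(pa.array([1, 2, 3]))  # equal data, different identity
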
- return hash((type(self), self.dtype, id(self.value))) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - # datatype of pyarrow array is correct by construction. - return Column(plc.interop.from_arrow(self.value)) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([]) - - -class Col(Expr): - __slots__ = ("name",) - _non_child = ("dtype", "name") - name: str - children: tuple[()] - - def __init__(self, dtype: plc.DataType, name: str) -> None: - self.dtype = dtype - self.name = name - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return df._column_map[self.name] - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - - -class Len(Expr): - children: tuple[()] - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: polars returns a uint, not an int for count - return AggInfo( - [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)] - ) - - -class BooleanFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.BooleanFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name == pl_expr.BooleanFunction.IsIn and not all( - c.dtype == self.children[0].dtype for c in self.children - ): - # TODO: If polars IR doesn't put the casts in, we need to - # mimic the supertype promotion rules. 
- raise NotImplementedError("IsIn doesn't support supertype casting") - - @staticmethod - def _distinct( - column: Column, - *, - keep: plc.stream_compaction.DuplicateKeepOption, - source_value: plc.Scalar, - target_value: plc.Scalar, - ) -> Column: - table = plc.Table([column.obj]) - indices = plc.stream_compaction.distinct_indices( - table, - keep, - # TODO: polars doesn't expose options for these - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - return Column( - plc.copying.scatter( - [source_value], - indices, - plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), - ).columns()[0] - ) - - _BETWEEN_OPS: ClassVar[ - dict[ - pl_types.ClosedInterval, - tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], - ] - ] = { - "none": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS, - ), - "left": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS, - ), - "right": ( - plc.binaryop.BinaryOperator.GREATER, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - "both": ( - plc.binaryop.BinaryOperator.GREATER_EQUAL, - plc.binaryop.BinaryOperator.LESS_EQUAL, - ), - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name in ( - pl_expr.BooleanFunction.IsFinite, - pl_expr.BooleanFunction.IsInfinite, - ): - # Avoid evaluating the child if the dtype tells us it's unnecessary. - (child,) = self.children - is_finite = self.name == pl_expr.BooleanFunction.IsFinite - if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): - value = plc.interop.from_arrow( - pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) - ) - return Column(plc.Column.from_scalar(value, df.num_rows)) - needles = child.evaluate(df, context=context, mapping=mapping) - to_search = [-float("inf"), float("inf")] - if is_finite: - # NaN is neither finite not infinite - to_search.append(float("nan")) - haystack = plc.interop.from_arrow( - pa.array( - to_search, - type=plc.interop.to_arrow(needles.obj.type()), - ) - ) - result = plc.search.contains(haystack, needles.obj) - if is_finite: - result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) - return Column(result) - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - # Kleene logic for Any (OR) and All (AND) if ignore_nulls is - # False - if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): - (ignore_nulls,) = self.options - (column,) = columns - is_any = self.name == pl_expr.BooleanFunction.Any - agg = plc.aggregation.any() if is_any else plc.aggregation.all() - result = plc.reduce.reduce(column.obj, agg, self.dtype) - if not ignore_nulls and column.obj.null_count() > 0: - # Truth tables - # Any All - # | F U T | F U T - # --+------ --+------ - # F | F U T F | F F F - # U | U U T U | F U U - # T | T T T T | F U T - # - # If the input null count was non-zero, we must - # post-process the result to insert the correct value. 
- h_result = plc.interop.to_arrow(result).as_py() - if is_any and not h_result or not is_any and h_result: - # Any All - # False || Null => Null True && Null => Null - return Column(plc.Column.all_null_like(column.obj, 1)) - return Column(plc.Column.from_scalar(result, 1)) - if self.name == pl_expr.BooleanFunction.IsNull: - (column,) = columns - return Column(plc.unary.is_null(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNotNull: - (column,) = columns - return Column(plc.unary.is_valid(column.obj)) - elif self.name == pl_expr.BooleanFunction.IsNan: - (column,) = columns - return Column( - plc.unary.is_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsNotNan: - (column,) = columns - return Column( - plc.unary.is_not_nan(column.obj).with_mask( - column.obj.null_mask(), column.obj.null_count() - ) - ) - elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsLastDistinct: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsUnique: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.IsDuplicated: - (column,) = columns - return self._distinct( - column, - keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, - source_value=plc.interop.from_arrow( - pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) - ), - target_value=plc.interop.from_arrow( - pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) - ), - ) - elif self.name == pl_expr.BooleanFunction.AllHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.AnyHorizontal: - return Column( - reduce( - partial( - plc.binaryop.binary_operation, - op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - output_type=self.dtype, - ), - (c.obj for c in columns), - ) - ) - elif self.name == pl_expr.BooleanFunction.IsIn: - needles, haystack = columns - return Column(plc.search.contains(haystack.obj, needles.obj)) - elif self.name == pl_expr.BooleanFunction.Not: - (column,) = columns - return Column( - plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) - ) - else: - raise NotImplementedError( - f"BooleanFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class StringFunction(Expr): - __slots__ = ("name", "options", "children", "_regex_program") - _non_child = ("dtype", "name", "options") - children: 
tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.StringFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - self._validate_input() - - def _validate_input(self): - if self.name not in ( - pl_expr.StringFunction.Contains, - pl_expr.StringFunction.EndsWith, - pl_expr.StringFunction.Lowercase, - pl_expr.StringFunction.Replace, - pl_expr.StringFunction.ReplaceMany, - pl_expr.StringFunction.Slice, - pl_expr.StringFunction.Strptime, - pl_expr.StringFunction.StartsWith, - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - pl_expr.StringFunction.Uppercase, - ): - raise NotImplementedError(f"String function {self.name}") - if self.name == pl_expr.StringFunction.Contains: - literal, strict = self.options - if not literal: - if not strict: - raise NotImplementedError( - "f{strict=} is not supported for regex contains" - ) - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "Regex contains only supports a scalar pattern" - ) - pattern = self.children[1].value.as_py() - try: - self._regex_program = plc.strings.regex_program.RegexProgram.create( - pattern, - flags=plc.strings.regex_flags.RegexFlags.DEFAULT, - ) - except RuntimeError as e: - raise NotImplementedError( - f"Unsupported regex {pattern} for GPU engine." - ) from e - elif self.name == pl_expr.StringFunction.Replace: - _, literal = self.options - if not literal: - raise NotImplementedError("literal=False is not supported for replace") - if not all(isinstance(expr, Literal) for expr in self.children[1:]): - raise NotImplementedError("replace only supports scalar target") - target = self.children[1] - if target.value == pa.scalar("", type=pa.string()): - raise NotImplementedError( - "libcudf replace does not support empty strings" - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - (ascii_case_insensitive,) = self.options - if ascii_case_insensitive: - raise NotImplementedError( - "ascii_case_insensitive not implemented for replace_many" - ) - if not all( - isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] - ): - raise NotImplementedError("replace_many only supports literal inputs") - target = self.children[1] - if pc.any(pc.equal(target.value, "")).as_py(): - raise NotImplementedError( - "libcudf replace_many is implemented differently from polars " - "for empty strings" - ) - elif self.name == pl_expr.StringFunction.Slice: - if not all(isinstance(child, Literal) for child in self.children[1:]): - raise NotImplementedError( - "Slice only supports literal start and stop values" - ) - elif self.name == pl_expr.StringFunction.Strptime: - format, _, exact, cache = self.options - if cache: - raise NotImplementedError("Strptime cache is a CPU feature") - if format is None: - raise NotImplementedError("Strptime format is required") - if not exact: - raise NotImplementedError("Strptime does not support exact=False") - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - if not isinstance(self.children[1], Literal): - raise NotImplementedError( - "strip operations only support scalar patterns" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this 
expression given a dataframe for context.""" - if self.name == pl_expr.StringFunction.Contains: - child, arg = self.children - column = child.evaluate(df, context=context, mapping=mapping) - - literal, _ = self.options - if literal: - pat = arg.evaluate(df, context=context, mapping=mapping) - pattern = ( - pat.obj_scalar - if pat.is_scalar and pat.obj.size() != column.obj.size() - else pat.obj - ) - return Column(plc.strings.find.contains(column.obj, pattern)) - else: - return Column( - plc.strings.contains.contains_re(column.obj, self._regex_program) - ) - elif self.name == pl_expr.StringFunction.Slice: - child, expr_offset, expr_length = self.children - assert isinstance(expr_offset, Literal) - assert isinstance(expr_length, Literal) - - column = child.evaluate(df, context=context, mapping=mapping) - # libcudf slices via [start,stop). - # polars slices with offset + length where start == offset - # stop = start + length. Negative values for start look backward - # from the last element of the string. If the end index would be - # below zero, an empty string is returned. - # Do this maths on the host - start = expr_offset.value.as_py() - length = expr_length.value.as_py() - - if length == 0: - stop = start - else: - # No length indicates a scan to the end - # The libcudf equivalent is a null stop - stop = start + length if length else None - if length and start < 0 and length >= -start: - stop = None - return Column( - plc.strings.slice.slice_strings( - column.obj, - plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), - plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), - ) - ) - elif self.name in { - pl_expr.StringFunction.StripChars, - pl_expr.StringFunction.StripCharsStart, - pl_expr.StringFunction.StripCharsEnd, - }: - column, chars = ( - c.evaluate(df, context=context, mapping=mapping) for c in self.children - ) - if self.name == pl_expr.StringFunction.StripCharsStart: - side = plc.strings.SideType.LEFT - elif self.name == pl_expr.StringFunction.StripCharsEnd: - side = plc.strings.SideType.RIGHT - else: - side = plc.strings.SideType.BOTH - return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) - - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - if self.name == pl_expr.StringFunction.Lowercase: - (column,) = columns - return Column(plc.strings.case.to_lower(column.obj)) - elif self.name == pl_expr.StringFunction.Uppercase: - (column,) = columns - return Column(plc.strings.case.to_upper(column.obj)) - elif self.name == pl_expr.StringFunction.EndsWith: - column, suffix = columns - return Column( - plc.strings.find.ends_with( - column.obj, - suffix.obj_scalar - if column.obj.size() != suffix.obj.size() and suffix.is_scalar - else suffix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.StartsWith: - column, prefix = columns - return Column( - plc.strings.find.starts_with( - column.obj, - prefix.obj_scalar - if column.obj.size() != prefix.obj.size() and prefix.is_scalar - else prefix.obj, - ) - ) - elif self.name == pl_expr.StringFunction.Strptime: - # TODO: ignores ambiguous - format, strict, exact, cache = self.options - col = self.children[0].evaluate(df, context=context, mapping=mapping) - - is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( - col.obj, format.encode() - ) - - if strict: - if not plc.interop.to_arrow( - plc.reduce.reduce( - is_timestamps, - plc.aggregation.all(), - plc.DataType(plc.TypeId.BOOL8), - ) - ).as_py(): - raise InvalidOperationError("conversion from 
`str` failed.") - else: - not_timestamps = plc.unary.unary_operation( - is_timestamps, plc.unary.UnaryOperator.NOT - ) - - null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) - res = plc.copying.boolean_mask_scatter( - [null], plc.Table([col.obj]), not_timestamps - ) - return Column( - plc.strings.convert.convert_datetime.to_timestamps( - res.columns()[0], self.dtype, format.encode() - ) - ) - elif self.name == pl_expr.StringFunction.Replace: - column, target, repl = columns - n, _ = self.options - return Column( - plc.strings.replace.replace( - column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n - ) - ) - elif self.name == pl_expr.StringFunction.ReplaceMany: - column, target, repl = columns - return Column( - plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) - ) - raise NotImplementedError( - f"StringFunction {self.name}" - ) # pragma: no cover; handled by init raising - - -class TemporalFunction(Expr): - __slots__ = ("name", "options", "children") - _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, str]] = { - pl_expr.TemporalFunction.Year: "year", - pl_expr.TemporalFunction.Month: "month", - pl_expr.TemporalFunction.Day: "day", - pl_expr.TemporalFunction.WeekDay: "weekday", - pl_expr.TemporalFunction.Hour: "hour", - pl_expr.TemporalFunction.Minute: "minute", - pl_expr.TemporalFunction.Second: "second", - pl_expr.TemporalFunction.Millisecond: "millisecond", - pl_expr.TemporalFunction.Microsecond: "microsecond", - pl_expr.TemporalFunction.Nanosecond: "nanosecond", - } - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, - dtype: plc.DataType, - name: pl_expr.TemporalFunction, - options: tuple[Any, ...], - *children: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.name = name - self.children = children - if self.name not in self._COMPONENT_MAP: - raise NotImplementedError(f"Temporal function {self.name}") - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - columns = [ - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ] - (column,) = columns - if self.name == pl_expr.TemporalFunction.Microsecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - millis_as_micros = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.DataType(plc.TypeId.INT32), - ) - total_micros = plc.binaryop.binary_operation( - micros, - millis_as_micros, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_micros) - elif self.name == pl_expr.TemporalFunction.Nanosecond: - millis = plc.datetime.extract_datetime_component(column.obj, "millisecond") - micros = plc.datetime.extract_datetime_component(column.obj, "microsecond") - nanos = plc.datetime.extract_datetime_component(column.obj, "nanosecond") - millis_as_nanos = plc.binaryop.binary_operation( - millis, - plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), - plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - micros_as_nanos = plc.binaryop.binary_operation( - micros, - plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), - 
plc.binaryop.BinaryOperator.MUL, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - nanos, - millis_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - total_nanos = plc.binaryop.binary_operation( - total_nanos, - micros_as_nanos, - plc.binaryop.BinaryOperator.ADD, - plc.types.DataType(plc.types.TypeId.INT32), - ) - return Column(total_nanos) - - return Column( - plc.datetime.extract_datetime_component( - column.obj, - self._COMPONENT_MAP[self.name], - ) - ) - - -class UnaryFunction(Expr): - __slots__ = ("name", "options", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - # Note: log, and pow are handled via translation to binops - _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = { - "sin": plc.unary.UnaryOperator.SIN, - "cos": plc.unary.UnaryOperator.COS, - "tan": plc.unary.UnaryOperator.TAN, - "arcsin": plc.unary.UnaryOperator.ARCSIN, - "arccos": plc.unary.UnaryOperator.ARCCOS, - "arctan": plc.unary.UnaryOperator.ARCTAN, - "sinh": plc.unary.UnaryOperator.SINH, - "cosh": plc.unary.UnaryOperator.COSH, - "tanh": plc.unary.UnaryOperator.TANH, - "arcsinh": plc.unary.UnaryOperator.ARCSINH, - "arccosh": plc.unary.UnaryOperator.ARCCOSH, - "arctanh": plc.unary.UnaryOperator.ARCTANH, - "exp": plc.unary.UnaryOperator.EXP, - "sqrt": plc.unary.UnaryOperator.SQRT, - "cbrt": plc.unary.UnaryOperator.CBRT, - "ceil": plc.unary.UnaryOperator.CEIL, - "floor": plc.unary.UnaryOperator.FLOOR, - "abs": plc.unary.UnaryOperator.ABS, - "bit_invert": plc.unary.UnaryOperator.BIT_INVERT, - "not": plc.unary.UnaryOperator.NOT, - } - _supported_misc_fns = frozenset( - { - "drop_nulls", - "fill_null", - "mask_nans", - "round", - "set_sorted", - "unique", - } - ) - _supported_cum_aggs = frozenset( - { - "cum_min", - "cum_max", - "cum_prod", - "cum_sum", - } - ) - _supported_fns = frozenset().union( - _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() - ) - - def __init__( - self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - - if self.name not in UnaryFunction._supported_fns: - raise NotImplementedError(f"Unary function {name=}") - if self.name in UnaryFunction._supported_cum_aggs: - (reverse,) = self.options - if reverse: - raise NotImplementedError( - "reverse=True is not supported for cumulative aggregations" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if self.name == "mask_nans": - (child,) = self.children - return child.evaluate(df, context=context, mapping=mapping).mask_nans() - if self.name == "round": - (decimal_places,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.round.round( - values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP - ) - ).sorted_like(values) - elif self.name == "unique": - (maintain_order,) = self.options - (values,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - # Only one column, so keep_any is the same as keep_first - # for stable distinct - keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY - if values.is_sorted: - maintain_order = True - result = 
plc.stream_compaction.unique( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - ) - else: - distinct = ( - plc.stream_compaction.stable_distinct - if maintain_order - else plc.stream_compaction.distinct - ) - result = distinct( - plc.Table([values.obj]), - [0], - keep, - plc.types.NullEquality.EQUAL, - plc.types.NanEquality.ALL_EQUAL, - ) - (column,) = result.columns() - if maintain_order: - return Column(column).sorted_like(values) - return Column(column) - elif self.name == "set_sorted": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (asc,) = self.options - order = ( - plc.types.Order.ASCENDING - if asc == "ascending" - else plc.types.Order.DESCENDING - ) - null_order = plc.types.NullOrder.BEFORE - if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: - # PERF: This invokes four stream synchronisations! - has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() - has_nulls_last = not plc.copying.get_element( - column.obj, n - 1 - ).is_valid() - if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( - order == plc.types.Order.ASCENDING and has_nulls_last - ): - null_order = plc.types.NullOrder.AFTER - return column.set_sorted( - is_sorted=plc.types.Sorted.YES, - order=order, - null_order=null_order, - ) - elif self.name == "drop_nulls": - (column,) = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - return Column( - plc.stream_compaction.drop_nulls( - plc.Table([column.obj]), [0], 1 - ).columns()[0] - ) - elif self.name == "fill_null": - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if isinstance(self.children[1], Literal): - arg = plc.interop.from_arrow(self.children[1].value) - else: - evaluated = self.children[1].evaluate( - df, context=context, mapping=mapping - ) - arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj - return Column(plc.replace.replace_nulls(column.obj, arg)) - elif self.name in self._OP_MAPPING: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - if column.obj.type().id() != self.dtype.id(): - arg = plc.unary.cast(column.obj, self.dtype) - else: - arg = column.obj - return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) - elif self.name in UnaryFunction._supported_cum_aggs: - column = self.children[0].evaluate(df, context=context, mapping=mapping) - plc_col = column.obj - col_type = column.obj.type() - # cum_sum casts - # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention - # Bool -> UInt32 - # cum_prod casts integer dtypes < int64 and bool to int64 - # See: - # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs - if ( - self.name == "cum_sum" - and col_type.id() - in { - plc.types.TypeId.INT8, - plc.types.TypeId.UINT8, - plc.types.TypeId.INT16, - plc.types.TypeId.UINT16, - } - ) or ( - self.name == "cum_prod" - and plc.traits.is_integral(col_type) - and plc.types.size_of(col_type) <= 4 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.INT64) - ) - elif ( - self.name == "cum_sum" - and column.obj.type().id() == plc.types.TypeId.BOOL8 - ): - plc_col = plc.unary.cast( - plc_col, plc.types.DataType(plc.types.TypeId.UINT32) - ) - if self.name == "cum_sum": - agg = plc.aggregation.sum() - elif self.name == "cum_prod": - agg = plc.aggregation.product() - elif self.name == "cum_min": - agg = plc.aggregation.min() - elif self.name == "cum_max": - agg 
= plc.aggregation.max() - - return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) - raise NotImplementedError( - f"Unimplemented unary function {self.name=}" - ) # pragma: no cover; init trips first - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: - raise NotImplementedError(f"{self.name} in groupby") - if depth == 1: - # inside aggregation, need to pre-evaluate, groupby - # construction has checked that we don't have nested aggs, - # so stop the recursion and return ourselves for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Sort(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__( - self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column,) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - [descending], nulls_last=[nulls_last], num_keys=1 - ) - do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort - table = do_sort(plc.Table([column.obj]), order, null_order) - return Column( - table.columns()[0], - is_sorted=plc.types.Sorted.YES, - order=order[0], - null_order=null_order[0], - ) - - -class SortBy(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
- - def __init__( - self, - dtype: plc.DataType, - options: tuple[bool, tuple[bool], tuple[bool]], - column: Expr, - *by: Expr, - ) -> None: - super().__init__(dtype) - self.options = options - self.children = (column, *by) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - column, *by = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - (stable, nulls_last, descending) = self.options - order, null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - table = do_sort( - plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order - ) - return Column(table.columns()[0]) - - -class Gather(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, indices = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lo, hi = plc.reduce.minmax(indices.obj) - lo = plc.interop.to_arrow(lo).as_py() - hi = plc.interop.to_arrow(hi).as_py() - n = df.num_rows - if hi >= n or lo < -n: - raise ValueError("gather indices are out of bounds") - if indices.obj.null_count(): - bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY - obj = plc.replace.replace_nulls( - indices.obj, - plc.interop.from_arrow( - pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) - ), - ) - else: - bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK - obj = indices.obj - table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) - return Column(table.columns()[0]) - - -class Filter(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr] - - def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): - super().__init__(dtype) - self.children = (values, indices) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - values, mask = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - table = plc.stream_compaction.apply_boolean_mask( - plc.Table([values.obj]), mask.obj - ) - return Column(table.columns()[0]).sorted_like(values) - - -class RollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg,) - raise NotImplementedError("Rolling window not implemented") - - -class GroupedRollingWindow(Expr): - __slots__ = ("options", "children") - _non_child = ("dtype", "options") - children: tuple[Expr, ...] 
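As an orientation aid for the Gather node above: indices are range-checked against the frame length, and null indices are serviced by replacing them with an out-of-range value and gathering under the NULLIFY policy. A minimal sketch using the same pylibcudf entry points the node uses (hypothetical data; assumes a CUDA-capable environment with pylibcudf installed):

import pyarrow as pa
import pylibcudf as plc

values = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int32()))
indices = plc.interop.from_arrow(pa.array([0, 2, None], type=pa.int32()))
lo, hi = (plc.interop.to_arrow(s).as_py() for s in plc.reduce.minmax(indices))
n = values.size()
assert -n <= lo and hi < n  # otherwise Gather raises ValueError
# Null indices become n (out of range); NULLIFY turns them into null outputs.
filled = plc.replace.replace_nulls(
    indices, plc.interop.from_arrow(pa.scalar(n, type=pa.int32()))
)
result = plc.copying.gather(
    plc.Table([values]), filled, plc.copying.OutOfBoundsPolicy.NULLIFY
)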
- - def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: - super().__init__(dtype) - self.options = options - self.children = (agg, *by) - raise NotImplementedError("Grouped rolling window not implemented") - - -class Cast(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr] - - def __init__(self, dtype: plc.DataType, value: Expr) -> None: - super().__init__(dtype) - self.children = (value,) - if not dtypes.can_cast(value.dtype, self.dtype): - raise NotImplementedError( - f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" - ) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - (child,) = self.children - column = child.evaluate(df, context=context, mapping=mapping) - return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - # TODO: Could do with sort-based groupby and segmented filter - (child,) = self.children - return child.collect_agg(depth=depth) - - -class Agg(Expr): - __slots__ = ("name", "options", "op", "request", "children") - _non_child = ("dtype", "name", "options") - children: tuple[Expr, ...] - - def __init__( - self, dtype: plc.DataType, name: str, options: Any, *children: Expr - ) -> None: - super().__init__(dtype) - self.name = name - self.options = options - self.children = children - if name not in Agg._SUPPORTED: - raise NotImplementedError( - f"Unsupported aggregation {name=}" - ) # pragma: no cover; all valid aggs are supported - # TODO: nan handling in groupby case - if name == "min": - req = plc.aggregation.min() - elif name == "max": - req = plc.aggregation.max() - elif name == "median": - req = plc.aggregation.median() - elif name == "n_unique": - # TODO: datatype of result - req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) - elif name == "first" or name == "last": - req = None - elif name == "mean": - req = plc.aggregation.mean() - elif name == "sum": - req = plc.aggregation.sum() - elif name == "std": - # TODO: handle nans - req = plc.aggregation.std(ddof=options) - elif name == "var": - # TODO: handle nans - req = plc.aggregation.variance(ddof=options) - elif name == "count": - req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) - elif name == "quantile": - _, quantile = self.children - if not isinstance(quantile, Literal): - raise NotImplementedError("Only support literal quantile values") - req = plc.aggregation.quantile( - quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] - ) - else: - raise NotImplementedError( - f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" - ) # pragma: no cover - self.request = req - op = getattr(self, f"_{name}", None) - if op is None: - op = partial(self._reduce, request=req) - elif name in {"min", "max"}: - op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: - pass - else: - raise NotImplementedError( - f"Unreachable, supported agg {name=} has no implementation" - ) # pragma: no cover - self.op = op - - _SUPPORTED: ClassVar[frozenset[str]] = frozenset( - [ - "min", - "max", - "median", - "n_unique", - "first", - "last", - "mean", - "sum", - "count", - "std", - "var", - "quantile", - ] - ) - - interp_mapping: ClassVar[dict[str, 
plc.types.Interpolation]] = { - "nearest": plc.types.Interpolation.NEAREST, - "higher": plc.types.Interpolation.HIGHER, - "lower": plc.types.Interpolation.LOWER, - "midpoint": plc.types.Interpolation.MIDPOINT, - "linear": plc.types.Interpolation.LINEAR, - } - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth >= 1: - raise NotImplementedError( - "Nested aggregations in groupby" - ) # pragma: no cover; check_agg trips first - if (isminmax := self.name in {"min", "max"}) and self.options: - raise NotImplementedError("Nan propagation in groupby for min/max") - (child,) = self.children - ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests - request = self.request - # These are handled specially here because we don't set up the - # request for the whole-frame agg because we can avoid a - # reduce for these. - if self.name == "first": - request = plc.aggregation.nth_element( - 0, null_handling=plc.types.NullPolicy.INCLUDE - ) - elif self.name == "last": - request = plc.aggregation.nth_element( - -1, null_handling=plc.types.NullPolicy.INCLUDE - ) - if request is None: - raise NotImplementedError( - f"Aggregation {self.name} in groupby" - ) # pragma: no cover; __init__ trips first - if isminmax and plc.traits.is_floating_point(self.dtype): - assert expr is not None - # Ignore nans in these groupby aggs, do this by masking - # nans in the input - expr = UnaryFunction(self.dtype, "mask_nans", (), expr) - return AggInfo([(expr, request, self)]) - - def _reduce( - self, column: Column, *, request: plc.aggregation.Aggregation - ) -> Column: - return Column( - plc.Column.from_scalar( - plc.reduce.reduce(column.obj, request, self.dtype), - 1, - ) - ) - - def _count(self, column: Column) -> Column: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar( - column.obj.size() - column.obj.null_count(), - type=plc.interop.to_arrow(self.dtype), - ), - ), - 1, - ) - ) - - def _min(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.min()) - - def _max(self, column: Column, *, propagate_nans: bool) -> Column: - if propagate_nans and column.nan_count > 0: - return Column( - plc.Column.from_scalar( - plc.interop.from_arrow( - pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) - ), - 1, - ) - ) - if column.nan_count > 0: - column = column.mask_nans() - return self._reduce(column, request=plc.aggregation.max()) - - def _first(self, column: Column) -> Column: - return Column(plc.copying.slice(column.obj, [0, 1])[0]) - - def _last(self, column: Column) -> Column: - n = column.obj.size() - return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - if context is not ExecutionContext.FRAME: - raise NotImplementedError( - f"Agg in context {context}" - ) # pragma: no cover; unreachable - - # Aggregations like quantiles may have additional children that were - # preprocessed into pylibcudf requests. 
- child = self.children[0] - return self.op(child.evaluate(df, context=context, mapping=mapping)) - - -class Ternary(Expr): - __slots__ = ("children",) - _non_child = ("dtype",) - children: tuple[Expr, Expr, Expr] - - def __init__( - self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr - ) -> None: - super().__init__(dtype) - self.children = (when, then, otherwise) - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - when, then, otherwise = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - then_obj = then.obj_scalar if then.is_scalar else then.obj - otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj - return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) - - -class BinOp(Expr): - __slots__ = ("op", "children") - _non_child = ("dtype", "op") - children: tuple[Expr, Expr] - - def __init__( - self, - dtype: plc.DataType, - op: plc.binaryop.BinaryOperator, - left: Expr, - right: Expr, - ) -> None: - super().__init__(dtype) - if plc.traits.is_boolean(self.dtype): - # For boolean output types, bitand and bitor implement - # boolean logic, so translate. bitxor also does, but the - # default behaviour is correct. - op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) - self.op = op - self.children = (left, right) - if not plc.binaryop.is_supported_operation( - self.dtype, left.dtype, right.dtype, op - ): - raise NotImplementedError( - f"Operation {op.name} not supported " - f"for types {left.dtype.id().name} and {right.dtype.id().name} " - f"with output type {self.dtype.id().name}" - ) - - _BOOL_KLEENE_MAPPING: ClassVar[ - dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] - ] = { - plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, - plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, - } - - _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { - pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, - pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, - pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, - pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, - pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, - pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, - pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, - pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, - pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, - pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, - pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, - pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, - pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, - pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, - pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, - pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, - pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, - pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, - pl_expr.Operator.LogicalAnd: 
plc.binaryop.BinaryOperator.LOGICAL_AND, - pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, - } - - def do_evaluate( - self, - df: DataFrame, - *, - context: ExecutionContext = ExecutionContext.FRAME, - mapping: Mapping[Expr, Column] | None = None, - ) -> Column: - """Evaluate this expression given a dataframe for context.""" - left, right = ( - child.evaluate(df, context=context, mapping=mapping) - for child in self.children - ) - lop = left.obj - rop = right.obj - if left.obj.size() != right.obj.size(): - if left.is_scalar: - lop = left.obj_scalar - elif right.is_scalar: - rop = right.obj_scalar - return Column( - plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), - ) - - def collect_agg(self, *, depth: int) -> AggInfo: - """Collect information about aggregations in groupbys.""" - if depth == 1: - # inside aggregation, need to pre-evaluate, - # groupby construction has checked that we don't have - # nested aggs, so stop the recursion and return ourselves - # for pre-eval - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - else: - left_info, right_info = ( - child.collect_agg(depth=depth) for child in self.children - ) - requests = [*left_info.requests, *right_info.requests] - # TODO: Hack, if there were no reductions inside this - # binary expression then we want to pre-evaluate and - # collect ourselves. Otherwise we want to collect the - # aggregations inside and post-evaluate. This is a bad way - # of checking that we are in case 1. - if all( - agg.kind() == plc.aggregation.Kind.COLLECT_LIST - for _, agg, _ in requests - ): - return AggInfo([(self, plc.aggregation.collect_list(), self)]) - return AggInfo( - [*left_info.requests, *right_info.requests], - ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py new file mode 100644 index 00000000000..acbea129088 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/__init__.py @@ -0,0 +1,8 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Implementations of various expressions.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py new file mode 100644 index 00000000000..41b1defab39 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -0,0 +1,228 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for aggregations.""" + +from __future__ import annotations + +from functools import partial +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + AggInfo, + ExecutionContext, + Expr, +) +from cudf_polars.dsl.expressions.literal import Literal +from cudf_polars.dsl.expressions.unary import UnaryFunction + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Agg"] + + +class Agg(Expr): + __slots__ = ("name", "options", "op", "request") + _non_child = ("dtype", "name", "options") + + def __init__( + self, dtype: plc.DataType, name: str, options: Any, *children: Expr + ) -> None: + self.dtype = dtype + self.name = name + self.options = options + self.children = children + if name not in Agg._SUPPORTED: + raise NotImplementedError( + f"Unsupported aggregation {name=}" + ) # pragma: no cover; all valid aggs are supported + # TODO: nan handling in groupby case + if name == "min": + req = plc.aggregation.min() + elif name == "max": + req = plc.aggregation.max() + elif name == "median": + req = plc.aggregation.median() + elif name == "n_unique": + # TODO: datatype of result + req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE) + elif name == "first" or name == "last": + req = None + elif name == "mean": + req = plc.aggregation.mean() + elif name == "sum": + req = plc.aggregation.sum() + elif name == "std": + # TODO: handle nans + req = plc.aggregation.std(ddof=options) + elif name == "var": + # TODO: handle nans + req = plc.aggregation.variance(ddof=options) + elif name == "count": + req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + elif name == "quantile": + _, quantile = self.children + if not isinstance(quantile, Literal): + raise NotImplementedError("Only support literal quantile values") + req = plc.aggregation.quantile( + quantiles=[quantile.value.as_py()], interp=Agg.interp_mapping[options] + ) + else: + raise NotImplementedError( + f"Unreachable, {name=} is incorrectly listed in _SUPPORTED" + ) # pragma: no cover + self.request = req + op = getattr(self, f"_{name}", None) + if op is None: + op = partial(self._reduce, request=req) + elif name in {"min", "max"}: + op = partial(op, propagate_nans=options) + elif name in {"count", "first", "last"}: + pass + else: + raise NotImplementedError( + f"Unreachable, supported agg {name=} has no implementation" + ) # pragma: no cover + self.op = op + + _SUPPORTED: ClassVar[frozenset[str]] = frozenset( + [ + "min", + "max", + "median", + "n_unique", + "first", + "last", + "mean", + "sum", + "count", + "std", + "var", + "quantile", + ] + ) + + interp_mapping: ClassVar[dict[str, plc.types.Interpolation]] = { + "nearest": plc.types.Interpolation.NEAREST, + "higher": plc.types.Interpolation.HIGHER, + "lower": plc.types.Interpolation.LOWER, + "midpoint": plc.types.Interpolation.MIDPOINT, + "linear": plc.types.Interpolation.LINEAR, + } + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth >= 1: + raise NotImplementedError( + "Nested aggregations in groupby" + ) # pragma: no cover; check_agg trips first + if (isminmax := self.name in {"min", "max"}) and self.options: + raise NotImplementedError("Nan propagation in groupby for min/max") + 
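The options flag on min/max is polars' propagate_nans; the whole-frame _min/_max below implement it by short-circuiting to a NaN scalar and otherwise masking NaNs before reducing. A pure-Python analogue of that rule (a sketch only, not the GPU path):

import math

def reduce_min(xs: list[float], *, propagate_nans: bool) -> float:
    # propagate_nans=True: any NaN poisons the result.
    if propagate_nans and any(math.isnan(x) for x in xs):
        return float("nan")
    # Otherwise NaNs are masked out before the reduction.
    return min(x for x in xs if not math.isnan(x))

assert reduce_min([2.0, float("nan"), 1.0], propagate_nans=False) == 1.0
assert math.isnan(reduce_min([2.0, float("nan")], propagate_nans=True))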
(child,) = self.children + ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests + request = self.request + # These are handled specially here because we don't set up the + # request for the whole-frame agg because we can avoid a + # reduce for these. + if self.name == "first": + request = plc.aggregation.nth_element( + 0, null_handling=plc.types.NullPolicy.INCLUDE + ) + elif self.name == "last": + request = plc.aggregation.nth_element( + -1, null_handling=plc.types.NullPolicy.INCLUDE + ) + if request is None: + raise NotImplementedError( + f"Aggregation {self.name} in groupby" + ) # pragma: no cover; __init__ trips first + if isminmax and plc.traits.is_floating_point(self.dtype): + assert expr is not None + # Ignore nans in these groupby aggs, do this by masking + # nans in the input + expr = UnaryFunction(self.dtype, "mask_nans", (), expr) + return AggInfo([(expr, request, self)]) + + def _reduce( + self, column: Column, *, request: plc.aggregation.Aggregation + ) -> Column: + return Column( + plc.Column.from_scalar( + plc.reduce.reduce(column.obj, request, self.dtype), + 1, + ) + ) + + def _count(self, column: Column) -> Column: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar( + column.obj.size() - column.obj.null_count(), + type=plc.interop.to_arrow(self.dtype), + ), + ), + 1, + ) + ) + + def _min(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.min()) + + def _max(self, column: Column, *, propagate_nans: bool) -> Column: + if propagate_nans and column.nan_count > 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + if column.nan_count > 0: + column = column.mask_nans() + return self._reduce(column, request=plc.aggregation.max()) + + def _first(self, column: Column) -> Column: + return Column(plc.copying.slice(column.obj, [0, 1])[0]) + + def _last(self, column: Column) -> Column: + n = column.obj.size() + return Column(plc.copying.slice(column.obj, [n - 1, n])[0]) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if context is not ExecutionContext.FRAME: + raise NotImplementedError( + f"Agg in context {context}" + ) # pragma: no cover; unreachable + + # Aggregations like quantiles may have additional children that were + # preprocessed into pylibcudf requests. + child = self.children[0] + return self.op(child.evaluate(df, context=context, mapping=mapping)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py new file mode 100644 index 00000000000..effe8cb2378 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -0,0 +1,251 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
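Before moving into the base classes: the aggregation file above bakes the quantile value into a pylibcudf aggregation at construction time, which is why only Literal quantiles are accepted. The supported polars-level shape looks like this (hypothetical data; on a polars build with GPU support one could pass engine="gpu" to collect):

import polars as pl

q = pl.LazyFrame({"a": [1.0, 2.0, 4.0]}).select(
    pl.col("a").quantile(0.5, interpolation="linear")  # literal quantile: supported
)
print(q.collect())  # a column-valued quantile would not be supported by this node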
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Base and common classes for expression DSL nodes.""" + +from __future__ import annotations + +import enum +from enum import IntEnum +from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.nodebase import Node + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import Column, DataFrame + +__all__ = ["Expr", "NamedExpr", "Col", "AggInfo", "ExecutionContext"] + + +class AggInfo(NamedTuple): + requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] + + +class ExecutionContext(IntEnum): + FRAME = enum.auto() + GROUPBY = enum.auto() + ROLLING = enum.auto() + + +class Expr(Node["Expr"]): + """An abstract expression object.""" + + __slots__ = ("dtype",) + dtype: plc.DataType + """Data type of the expression.""" + # This annotation is needed because of https://github.com/python/mypy/issues/17981 + _non_child: ClassVar[tuple[str, ...]] = ("dtype",) + """Names of non-child data (not Exprs) for reconstruction.""" + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Do not call this function directly, but rather + :meth:`evaluate` which handles the mapping lookups. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + raise NotImplementedError( + f"Evaluation of expression {type(self).__name__}" + ) # pragma: no cover; translation of unimplemented nodes trips first + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame that will provide columns. + context + What context are we performing this evaluation in? + mapping + Substitution mapping from expressions to Columns, used to + override the evaluation of a given expression if we're + performing a simple rewritten evaluation. + + Notes + ----- + Individual subclasses should implement :meth:`do_evaluate`, + this method provides logic to handle lookups in the + substitution mapping. + + Returns + ------- + Column representing the evaluation of the expression. + + Raises + ------ + NotImplementedError + If we couldn't evaluate the expression. Ideally all these + are returned during translation to the IR, but for now we + are not perfect. + """ + if mapping is None: + return self.do_evaluate(df, context=context, mapping=mapping) + try: + return mapping[self] + except KeyError: + return self.do_evaluate(df, context=context, mapping=mapping) + + def collect_agg(self, *, depth: int) -> AggInfo: + """ + Collect information about aggregations in groupbys. 
+ + Parameters + ---------- + depth + The depth of aggregating (reduction or sampling) + expressions we are currently at. + + Returns + ------- + Aggregation info describing the expression to aggregate in the + groupby. + + Raises + ------ + NotImplementedError + If we can't currently perform the aggregation request, for + example nested aggregations like ``a.max().min()``. + """ + raise NotImplementedError( + f"Collecting aggregation info for {type(self).__name__}" + ) # pragma: no cover; check_agg trips first + + +class NamedExpr: + # NamedExpr does not inherit from Expr since it does not appear + # when evaluating expressions themselves, only when constructing + # named return values in dataframe (IR) nodes. + __slots__ = ("name", "value") + value: Expr + name: str + + def __init__(self, name: str, value: Expr) -> None: + self.name = name + self.value = value + + def __hash__(self) -> int: + """Hash of the expression.""" + return hash((type(self), self.name, self.value)) + + def __repr__(self) -> str: + """Repr of the expression.""" + return f"NamedExpr({self.name}, {self.value})" + + def __eq__(self, other: Any) -> bool: + """Equality of two expressions.""" + return ( + type(self) is type(other) + and self.name == other.name + and self.value == other.value + ) + + def __ne__(self, other: Any) -> bool: + """Inequality of expressions.""" + return not self.__eq__(other) + + def evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """ + Evaluate this expression given a dataframe for context. + + Parameters + ---------- + df + DataFrame providing context + context + Execution context + mapping + Substitution mapping + + Returns + ------- + Evaluated Column with name attached. + + See Also + -------- + :meth:`Expr.evaluate` for details, this function just adds the + name to a column produced from an expression. + """ + return self.value.evaluate(df, context=context, mapping=mapping).rename( + self.name + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return self.value.collect_agg(depth=depth) + + +class Col(Expr): + __slots__ = ("name",) + _non_child = ("dtype", "name") + name: str + + def __init__(self, dtype: plc.DataType, name: str) -> None: + self.dtype = dtype + self.name = name + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # Deliberately remove the name here so that we guarantee + # evaluation of the IR produces names. + return df.column_map[self.name].rename(None) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([(self, plc.aggregation.collect_list(), self)]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py new file mode 100644 index 00000000000..11a47e7ea51 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
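One behaviour from base.py above worth a spot check: Expr.evaluate consults the substitution mapping before recomputing, so rewritten plans can inject pre-computed columns. A self-contained analogue of that lookup (an invented toy class, not the real Expr):

class TinyExpr:
    # Stand-in for Expr: a mapping hit short-circuits do_evaluate.
    def do_evaluate(self) -> str:
        return "computed"

    def evaluate(self, mapping=None) -> str:
        if mapping is None:
            return self.do_evaluate()
        try:
            return mapping[self]
        except KeyError:
            return self.do_evaluate()

e = TinyExpr()
assert e.evaluate() == "computed"
assert e.evaluate(mapping={e: "cached"}) == "cached"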
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""BinaryOp DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, ClassVar + +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["BinOp"] + + +class BinOp(Expr): + __slots__ = ("op",) + _non_child = ("dtype", "op") + + def __init__( + self, + dtype: plc.DataType, + op: plc.binaryop.BinaryOperator, + left: Expr, + right: Expr, + ) -> None: + self.dtype = dtype + if plc.traits.is_boolean(self.dtype): + # For boolean output types, bitand and bitor implement + # boolean logic, so translate. bitxor also does, but the + # default behaviour is correct. + op = BinOp._BOOL_KLEENE_MAPPING.get(op, op) + self.op = op + self.children = (left, right) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op + ): + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) + + _BOOL_KLEENE_MAPPING: ClassVar[ + dict[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator] + ] = { + plc.binaryop.BinaryOperator.BITWISE_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.BITWISE_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + plc.binaryop.BinaryOperator.LOGICAL_AND: plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + plc.binaryop.BinaryOperator.LOGICAL_OR: plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + } + + _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { + pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, + pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS, + pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL, + pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS, + pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS, + pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL, + pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER, + pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL, + pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD, + pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB, + pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL, + pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV, + pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV, + pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV, + pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD, + pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND, + pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR, + pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR, + pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND, + pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR, + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + left, right = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lop = left.obj + rop = right.obj + if 
left.obj.size() != right.obj.size(): + if left.is_scalar: + lop = left.obj_scalar + elif right.is_scalar: + rop = right.obj_scalar + return Column( + plc.binaryop.binary_operation(lop, rop, self.op, self.dtype), + ) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if depth == 1: + # inside aggregation, need to pre-evaluate, + # groupby construction has checked that we don't have + # nested aggs, so stop the recursion and return ourselves + # for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + left_info, right_info = ( + child.collect_agg(depth=depth) for child in self.children + ) + requests = [*left_info.requests, *right_info.requests] + # TODO: Hack, if there were no reductions inside this + # binary expression then we want to pre-evaluate and + # collect ourselves. Otherwise we want to collect the + # aggregations inside and post-evaluate. This is a bad way + # of checking that we are in case 1. + if all( + agg.kind() == plc.aggregation.Kind.COLLECT_LIST + for _, agg, _ in requests + ): + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + return AggInfo( + [*left_info.requests, *right_info.requests], + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py new file mode 100644 index 00000000000..9c14a8386f3 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Boolean DSL nodes.""" + +from __future__ import annotations + +from functools import partial, reduce +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + import polars.type_aliases as pl_types + + from cudf_polars.containers import DataFrame + +__all__ = ["BooleanFunction"] + + +class BooleanFunction(Expr): + __slots__ = ("name", "options") + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.BooleanFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + if self.name == pl_expr.BooleanFunction.IsIn and not all( + c.dtype == self.children[0].dtype for c in self.children + ): + # TODO: If polars IR doesn't put the casts in, we need to + # mimic the supertype promotion rules. 
+ raise NotImplementedError("IsIn doesn't support supertype casting") + + @staticmethod + def _distinct( + column: Column, + *, + keep: plc.stream_compaction.DuplicateKeepOption, + source_value: plc.Scalar, + target_value: plc.Scalar, + ) -> Column: + table = plc.Table([column.obj]) + indices = plc.stream_compaction.distinct_indices( + table, + keep, + # TODO: polars doesn't expose options for these + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + return Column( + plc.copying.scatter( + [source_value], + indices, + plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]), + ).columns()[0] + ) + + _BETWEEN_OPS: ClassVar[ + dict[ + pl_types.ClosedInterval, + tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator], + ] + ] = { + "none": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS, + ), + "left": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS, + ), + "right": ( + plc.binaryop.BinaryOperator.GREATER, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + "both": ( + plc.binaryop.BinaryOperator.GREATER_EQUAL, + plc.binaryop.BinaryOperator.LESS_EQUAL, + ), + } + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name in ( + pl_expr.BooleanFunction.IsFinite, + pl_expr.BooleanFunction.IsInfinite, + ): + # Avoid evaluating the child if the dtype tells us it's unnecessary. + (child,) = self.children + is_finite = self.name == pl_expr.BooleanFunction.IsFinite + if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64): + value = plc.interop.from_arrow( + pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype)) + ) + return Column(plc.Column.from_scalar(value, df.num_rows)) + needles = child.evaluate(df, context=context, mapping=mapping) + to_search = [-float("inf"), float("inf")] + if is_finite: + # NaN is neither finite not infinite + to_search.append(float("nan")) + haystack = plc.interop.from_arrow( + pa.array( + to_search, + type=plc.interop.to_arrow(needles.obj.type()), + ) + ) + result = plc.search.contains(haystack, needles.obj) + if is_finite: + result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT) + return Column(result) + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + # Kleene logic for Any (OR) and All (AND) if ignore_nulls is + # False + if self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All): + (ignore_nulls,) = self.options + (column,) = columns + is_any = self.name == pl_expr.BooleanFunction.Any + agg = plc.aggregation.any() if is_any else plc.aggregation.all() + result = plc.reduce.reduce(column.obj, agg, self.dtype) + if not ignore_nulls and column.obj.null_count() > 0: + # Truth tables + # Any All + # | F U T | F U T + # --+------ --+------ + # F | F U T F | F F F + # U | U U T U | F U U + # T | T T T T | F U T + # + # If the input null count was non-zero, we must + # post-process the result to insert the correct value. 
+ h_result = plc.interop.to_arrow(result).as_py() + if is_any and not h_result or not is_any and h_result: + # Any All + # False || Null => Null True && Null => Null + return Column(plc.Column.all_null_like(column.obj, 1)) + return Column(plc.Column.from_scalar(result, 1)) + if self.name == pl_expr.BooleanFunction.IsNull: + (column,) = columns + return Column(plc.unary.is_null(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNotNull: + (column,) = columns + return Column(plc.unary.is_valid(column.obj)) + elif self.name == pl_expr.BooleanFunction.IsNan: + (column,) = columns + return Column( + plc.unary.is_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsNotNan: + (column,) = columns + return Column( + plc.unary.is_not_nan(column.obj).with_mask( + column.obj.null_mask(), column.obj.null_count() + ) + ) + elif self.name == pl_expr.BooleanFunction.IsFirstDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsLastDistinct: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsUnique: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.IsDuplicated: + (column,) = columns + return self._distinct( + column, + keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE, + source_value=plc.interop.from_arrow( + pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype)) + ), + target_value=plc.interop.from_arrow( + pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype)) + ), + ) + elif self.name == pl_expr.BooleanFunction.AllHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_AND, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.AnyHorizontal: + return Column( + reduce( + partial( + plc.binaryop.binary_operation, + op=plc.binaryop.BinaryOperator.NULL_LOGICAL_OR, + output_type=self.dtype, + ), + (c.obj for c in columns), + ) + ) + elif self.name == pl_expr.BooleanFunction.IsIn: + needles, haystack = columns + return Column(plc.search.contains(haystack.obj, needles.obj)) + elif self.name == pl_expr.BooleanFunction.Not: + (column,) = columns + return Column( + plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT) + ) + else: + raise NotImplementedError( + f"BooleanFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py new file mode 100644 
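Before the datetime nodes: the Kleene truth tables above agree with what polars reports on the CPU. A quick check of the Null row (assumes a recent polars where Series.any/all take an ignore_nulls keyword):

import polars as pl

s = pl.Series([False, None])
assert s.any(ignore_nulls=False) is None  # False || Null => Null
assert s.any() is False                   # nulls ignored by default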
index 00000000000..596e193d8fe --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for datetime operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar + +import pyarrow as pa +import pylibcudf as plc + +from polars.polars import _expr_nodes as pl_expr + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["TemporalFunction"] + + +class TemporalFunction(Expr): + __slots__ = ("name", "options") + _COMPONENT_MAP: ClassVar[dict[pl_expr.TemporalFunction, plc.datetime.DatetimeComponent]] = { + pl_expr.TemporalFunction.Year: plc.datetime.DatetimeComponent.YEAR, + pl_expr.TemporalFunction.Month: plc.datetime.DatetimeComponent.MONTH, + pl_expr.TemporalFunction.Day: plc.datetime.DatetimeComponent.DAY, + pl_expr.TemporalFunction.WeekDay: plc.datetime.DatetimeComponent.WEEKDAY, + pl_expr.TemporalFunction.Hour: plc.datetime.DatetimeComponent.HOUR, + pl_expr.TemporalFunction.Minute: plc.datetime.DatetimeComponent.MINUTE, + pl_expr.TemporalFunction.Second: plc.datetime.DatetimeComponent.SECOND, + pl_expr.TemporalFunction.Millisecond: plc.datetime.DatetimeComponent.MILLISECOND, + pl_expr.TemporalFunction.Microsecond: plc.datetime.DatetimeComponent.MICROSECOND, + pl_expr.TemporalFunction.Nanosecond: plc.datetime.DatetimeComponent.NANOSECOND, + } + _non_child = ("dtype", "name", "options") + + def __init__( + self, + dtype: plc.DataType, + name: pl_expr.TemporalFunction, + options: tuple[Any, ...], + *children: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.name = name + self.children = children + if self.name not in self._COMPONENT_MAP: + raise NotImplementedError(f"Temporal function {self.name}") + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + (column,) = columns + if self.name == pl_expr.TemporalFunction.Microsecond: + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + millis_as_micros = plc.binaryop.binary_operation( + millis, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.DataType(plc.TypeId.INT32), + ) + total_micros = plc.binaryop.binary_operation( + micros, + millis_as_micros, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_micros) + elif self.name == pl_expr.TemporalFunction.Nanosecond: + millis = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MILLISECOND + ) + micros = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.MICROSECOND + ) + nanos = plc.datetime.extract_datetime_component( + column.obj, plc.datetime.DatetimeComponent.NANOSECOND + ) + millis_as_nanos = plc.binaryop.binary_operation( + millis, +
plc.interop.from_arrow(pa.scalar(1_000_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + micros_as_nanos = plc.binaryop.binary_operation( + micros, + plc.interop.from_arrow(pa.scalar(1_000, type=pa.int32())), + plc.binaryop.BinaryOperator.MUL, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + nanos, + millis_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + total_nanos = plc.binaryop.binary_operation( + total_nanos, + micros_as_nanos, + plc.binaryop.BinaryOperator.ADD, + plc.types.DataType(plc.types.TypeId.INT32), + ) + return Column(total_nanos) + + return Column( + plc.datetime.extract_datetime_component( + column.obj, + self._COMPONENT_MAP[self.name], + ) + ) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py new file mode 100644 index 00000000000..c8aa993b994 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Literal DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr +from cudf_polars.utils import dtypes + +if TYPE_CHECKING: + from collections.abc import Hashable, Mapping + + import pyarrow as pa + + import polars as pl + + from cudf_polars.containers import DataFrame + +__all__ = ["Literal", "LiteralColumn"] + + +class Literal(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Scalar[Any] + + def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None: + self.dtype = dtype + assert value.type == plc.interop.to_arrow(dtype) + self.value = value + self.children = () + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow scalar is correct by construction. + return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) + + +class LiteralColumn(Expr): + __slots__ = ("value",) + _non_child = ("dtype", "value") + value: pa.Array[Any, Any] + + def __init__(self, dtype: plc.DataType, value: pl.Series) -> None: + self.dtype = dtype + data = value.to_arrow() + self.value = data.cast(dtypes.downcast_arrow_lists(data.type)) + self.children = () + + def get_hashable(self) -> Hashable: + """Compute a hash of the column.""" + # This is stricter than necessary, but we only need this hash + # for identity in groupby replacements so it's OK. And this + # way we avoid doing potentially expensive compute. + return (type(self), self.dtype, id(self.value)) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + # datatype of pyarrow array is correct by construction. 
+ return Column(plc.interop.from_arrow(self.value)) + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + return AggInfo([]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py new file mode 100644 index 00000000000..fa68bcb9426 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Rolling DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any + +from cudf_polars.dsl.expressions.base import Expr + +if TYPE_CHECKING: + import pylibcudf as plc + +__all__ = ["RollingWindow", "GroupedRollingWindow"] + + +class RollingWindow(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None: + self.dtype = dtype + self.options = options + self.children = (agg,) + raise NotImplementedError("Rolling window not implemented") + + +class GroupedRollingWindow(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None: + self.dtype = dtype + self.options = options + self.children = (agg, *by) + raise NotImplementedError("Grouped rolling window not implemented") diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py new file mode 100644 index 00000000000..0247256e507 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
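Looking back at literal.py above: hashing LiteralColumn on id(self.value) is stricter than value equality, which is safe for its groupby-replacement use; the worst case is a missed cache hit, never a wrong substitution. The distinction, illustrated with pyarrow (hypothetical data):

import pyarrow as pa

a, b = pa.array([1, 2]), pa.array([1, 2])
assert a.equals(b)      # equal payloads...
assert id(a) != id(b)   # ...but distinct objects, so they hash differently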
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for selection operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pyarrow as pa +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Gather", "Filter"] + + +class Gather(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None: + self.dtype = dtype + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, indices = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + lo, hi = plc.reduce.minmax(indices.obj) + lo = plc.interop.to_arrow(lo).as_py() + hi = plc.interop.to_arrow(hi).as_py() + n = df.num_rows + if hi >= n or lo < -n: + raise ValueError("gather indices are out of bounds") + if indices.obj.null_count(): + bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY + obj = plc.replace.replace_nulls( + indices.obj, + plc.interop.from_arrow( + pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type())) + ), + ) + else: + bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK + obj = indices.obj + table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy) + return Column(table.columns()[0]) + + +class Filter(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr): + self.dtype = dtype + self.children = (values, indices) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + values, mask = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + table = plc.stream_compaction.apply_boolean_mask( + plc.Table([values.obj]), mask.obj + ) + return Column(table.columns()[0]).sorted_like(values) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py new file mode 100644 index 00000000000..99512e2ef52 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
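The Filter node just above is a thin wrapper over apply_boolean_mask, keeping rows where the mask is true (on the GPU path a null mask entry also drops its row). A list analogue of the semantics:

values = [10, 20, 30]
mask = [True, False, True]  # a None entry here would likewise drop its row
assert [v for v, keep in zip(values, mask) if keep] == [10, 30]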
+# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""Sorting DSL nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ExecutionContext, Expr +from cudf_polars.utils import sorting + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + +__all__ = ["Sort", "SortBy"] + + +class Sort(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__( + self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr + ) -> None: + self.dtype = dtype + self.options = options + self.children = (column,) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + (child,) = self.children + column = child.evaluate(df, context=context, mapping=mapping) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + [descending], nulls_last=[nulls_last], num_keys=1 + ) + do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort + table = do_sort(plc.Table([column.obj]), order, null_order) + return Column( + table.columns()[0], + is_sorted=plc.types.Sorted.YES, + order=order[0], + null_order=null_order[0], + ) + + +class SortBy(Expr): + __slots__ = ("options",) + _non_child = ("dtype", "options") + + def __init__( + self, + dtype: plc.DataType, + options: tuple[bool, tuple[bool], tuple[bool]], + column: Expr, + *by: Expr, + ) -> None: + self.dtype = dtype + self.options = options + self.children = (column, *by) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + column, *by = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (stable, nulls_last, descending) = self.options + order, null_order = sorting.sort_order( + descending, nulls_last=nulls_last, num_keys=len(by) + ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key + table = do_sort( + plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order + ) + return Column(table.columns()[0]) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py new file mode 100644 index 00000000000..62b54c63a8d --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -0,0 +1,282 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""DSL nodes for string operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
+
+from polars.exceptions import InvalidOperationError
+from polars.polars import _expr_nodes as pl_expr
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal, LiteralColumn
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+ from cudf_polars.containers import DataFrame
+
+__all__ = ["StringFunction"]
+
+
+class StringFunction(Expr):
+ __slots__ = ("name", "options", "_regex_program")
+ _non_child = ("dtype", "name", "options")
+
+ def __init__(
+ self,
+ dtype: plc.DataType,
+ name: pl_expr.StringFunction,
+ options: tuple[Any, ...],
+ *children: Expr,
+ ) -> None:
+ self.dtype = dtype
+ self.options = options
+ self.name = name
+ self.children = children
+ self._validate_input()
+
+ def _validate_input(self):
+ if self.name not in (
+ pl_expr.StringFunction.Contains,
+ pl_expr.StringFunction.EndsWith,
+ pl_expr.StringFunction.Lowercase,
+ pl_expr.StringFunction.Replace,
+ pl_expr.StringFunction.ReplaceMany,
+ pl_expr.StringFunction.Slice,
+ pl_expr.StringFunction.Strptime,
+ pl_expr.StringFunction.StartsWith,
+ pl_expr.StringFunction.StripChars,
+ pl_expr.StringFunction.StripCharsStart,
+ pl_expr.StringFunction.StripCharsEnd,
+ pl_expr.StringFunction.Uppercase,
+ ):
+ raise NotImplementedError(f"String function {self.name}")
+ if self.name == pl_expr.StringFunction.Contains:
+ literal, strict = self.options
+ if not literal:
+ if not strict:
+ raise NotImplementedError(
+ f"{strict=} is not supported for regex contains"
+ )
+ if not isinstance(self.children[1], Literal):
+ raise NotImplementedError(
+ "Regex contains only supports a scalar pattern"
+ )
+ pattern = self.children[1].value.as_py()
+ try:
+ self._regex_program = plc.strings.regex_program.RegexProgram.create(
+ pattern,
+ flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
+ )
+ except RuntimeError as e:
+ raise NotImplementedError(
+ f"Unsupported regex {pattern} for GPU engine."
+ ) from e + elif self.name == pl_expr.StringFunction.Replace: + _, literal = self.options + if not literal: + raise NotImplementedError("literal=False is not supported for replace") + if not all(isinstance(expr, Literal) for expr in self.children[1:]): + raise NotImplementedError("replace only supports scalar target") + target = self.children[1] + if target.value == pa.scalar("", type=pa.string()): + raise NotImplementedError( + "libcudf replace does not support empty strings" + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + (ascii_case_insensitive,) = self.options + if ascii_case_insensitive: + raise NotImplementedError( + "ascii_case_insensitive not implemented for replace_many" + ) + if not all( + isinstance(expr, (LiteralColumn, Literal)) for expr in self.children[1:] + ): + raise NotImplementedError("replace_many only supports literal inputs") + target = self.children[1] + if pc.any(pc.equal(target.value, "")).as_py(): + raise NotImplementedError( + "libcudf replace_many is implemented differently from polars " + "for empty strings" + ) + elif self.name == pl_expr.StringFunction.Slice: + if not all(isinstance(child, Literal) for child in self.children[1:]): + raise NotImplementedError( + "Slice only supports literal start and stop values" + ) + elif self.name == pl_expr.StringFunction.Strptime: + format, _, exact, cache = self.options + if cache: + raise NotImplementedError("Strptime cache is a CPU feature") + if format is None: + raise NotImplementedError("Strptime format is required") + if not exact: + raise NotImplementedError("Strptime does not support exact=False") + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "strip operations only support scalar patterns" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + else: + return Column( + plc.strings.contains.contains_re(column.obj, self._regex_program) + ) + elif self.name == pl_expr.StringFunction.Slice: + child, expr_offset, expr_length = self.children + assert isinstance(expr_offset, Literal) + assert isinstance(expr_length, Literal) + + column = child.evaluate(df, context=context, mapping=mapping) + # libcudf slices via [start,stop). + # polars slices with offset + length where start == offset + # stop = start + length. Negative values for start look backward + # from the last element of the string. If the end index would be + # below zero, an empty string is returned. 
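+ # A few illustrative cases (assumed polars semantics, worked
+ # through the mapping below; not part of the original comment):
+ # slice(-3, 2) on "hello" -> start=-3, stop=-1 -> "ll"
+ # slice(1) on "hello" -> start=1, stop=None -> "ello"
+ # slice(-3, 5) on "hello" -> length overruns the end, stop=None -> "llo"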
+ # Do this maths on the host + start = expr_offset.value.as_py() + length = expr_length.value.as_py() + + if length == 0: + stop = start + else: + # No length indicates a scan to the end + # The libcudf equivalent is a null stop + stop = start + length if length else None + if length and start < 0 and length >= -start: + stop = None + return Column( + plc.strings.slice.slice_strings( + column.obj, + plc.interop.from_arrow(pa.scalar(start, type=pa.int32())), + plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())), + ) + ) + elif self.name in { + pl_expr.StringFunction.StripChars, + pl_expr.StringFunction.StripCharsStart, + pl_expr.StringFunction.StripCharsEnd, + }: + column, chars = ( + c.evaluate(df, context=context, mapping=mapping) for c in self.children + ) + if self.name == pl_expr.StringFunction.StripCharsStart: + side = plc.strings.SideType.LEFT + elif self.name == pl_expr.StringFunction.StripCharsEnd: + side = plc.strings.SideType.RIGHT + else: + side = plc.strings.SideType.BOTH + return Column(plc.strings.strip.strip(column.obj, side, chars.obj_scalar)) + + columns = [ + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ] + if self.name == pl_expr.StringFunction.Lowercase: + (column,) = columns + return Column(plc.strings.case.to_lower(column.obj)) + elif self.name == pl_expr.StringFunction.Uppercase: + (column,) = columns + return Column(plc.strings.case.to_upper(column.obj)) + elif self.name == pl_expr.StringFunction.EndsWith: + column, suffix = columns + return Column( + plc.strings.find.ends_with( + column.obj, + suffix.obj_scalar + if column.obj.size() != suffix.obj.size() and suffix.is_scalar + else suffix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.StartsWith: + column, prefix = columns + return Column( + plc.strings.find.starts_with( + column.obj, + prefix.obj_scalar + if column.obj.size() != prefix.obj.size() and prefix.is_scalar + else prefix.obj, + ) + ) + elif self.name == pl_expr.StringFunction.Strptime: + # TODO: ignores ambiguous + format, strict, exact, cache = self.options + col = self.children[0].evaluate(df, context=context, mapping=mapping) + + is_timestamps = plc.strings.convert.convert_datetime.is_timestamp( + col.obj, format + ) + + if strict: + if not plc.interop.to_arrow( + plc.reduce.reduce( + is_timestamps, + plc.aggregation.all(), + plc.DataType(plc.TypeId.BOOL8), + ) + ).as_py(): + raise InvalidOperationError("conversion from `str` failed.") + else: + not_timestamps = plc.unary.unary_operation( + is_timestamps, plc.unary.UnaryOperator.NOT + ) + + null = plc.interop.from_arrow(pa.scalar(None, type=pa.string())) + res = plc.copying.boolean_mask_scatter( + [null], plc.Table([col.obj]), not_timestamps + ) + return Column( + plc.strings.convert.convert_datetime.to_timestamps( + res.columns()[0], self.dtype, format + ) + ) + elif self.name == pl_expr.StringFunction.Replace: + column, target, repl = columns + n, _ = self.options + return Column( + plc.strings.replace.replace( + column.obj, target.obj_scalar, repl.obj_scalar, maxrepl=n + ) + ) + elif self.name == pl_expr.StringFunction.ReplaceMany: + column, target, repl = columns + return Column( + plc.strings.replace.replace_multiple(column.obj, target.obj, repl.obj) + ) + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py new file mode 100644 index 00000000000..d2b5d6bae29 
--- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +# TODO: remove need for this +# ruff: noqa: D101 +"""DSL nodes for ternary operations.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import pylibcudf as plc + +from cudf_polars.containers import Column +from cudf_polars.dsl.expressions.base import ( + ExecutionContext, + Expr, +) + +if TYPE_CHECKING: + from collections.abc import Mapping + + from cudf_polars.containers import DataFrame + + +__all__ = ["Ternary"] + + +class Ternary(Expr): + __slots__ = () + _non_child = ("dtype",) + + def __init__( + self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr + ) -> None: + self.dtype = dtype + self.children = (when, then, otherwise) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + when, then, otherwise = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + then_obj = then.obj_scalar if then.is_scalar else then.obj + otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj + return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj)) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py new file mode 100644 index 00000000000..53f6ed29239 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py @@ -0,0 +1,328 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+"""DSL nodes for unary operations."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar
+
+import pyarrow as pa
+import pylibcudf as plc
+
+from cudf_polars.containers import Column
+from cudf_polars.dsl.expressions.base import AggInfo, ExecutionContext, Expr
+from cudf_polars.dsl.expressions.literal import Literal
+from cudf_polars.utils import dtypes
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+ from cudf_polars.containers import DataFrame
+
+__all__ = ["Cast", "UnaryFunction", "Len"]
+
+
+class Cast(Expr):
+ """Class representing a cast of an expression."""
+
+ __slots__ = ()
+ _non_child = ("dtype",)
+
+ def __init__(self, dtype: plc.DataType, value: Expr) -> None:
+ self.dtype = dtype
+ self.children = (value,)
+ if not dtypes.can_cast(value.dtype, self.dtype):
+ raise NotImplementedError(
+ f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
+ )
+
+ def do_evaluate(
+ self,
+ df: DataFrame,
+ *,
+ context: ExecutionContext = ExecutionContext.FRAME,
+ mapping: Mapping[Expr, Column] | None = None,
+ ) -> Column:
+ """Evaluate this expression given a dataframe for context."""
+ (child,) = self.children
+ column = child.evaluate(df, context=context, mapping=mapping)
+ return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column)
+
+ def collect_agg(self, *, depth: int) -> AggInfo:
+ """Collect information about aggregations in groupbys."""
+ # TODO: Could do with sort-based groupby and segmented filter
+ (child,) = self.children
+ return child.collect_agg(depth=depth)
+
+
+class Len(Expr):
+ """Class representing the length of an expression."""
+
+ def __init__(self, dtype: plc.DataType) -> None:
+ self.dtype = dtype
+ self.children = ()
+
+ def do_evaluate(
+ self,
+ df: DataFrame,
+ *,
+ context: ExecutionContext = ExecutionContext.FRAME,
+ mapping: Mapping[Expr, Column] | None = None,
+ ) -> Column:
+ """Evaluate this expression given a dataframe for context."""
+ return Column(
+ plc.Column.from_scalar(
+ plc.interop.from_arrow(
+ pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
+ ),
+ 1,
+ )
+ )
+
+ def collect_agg(self, *, depth: int) -> AggInfo:
+ """Collect information about aggregations in groupbys."""
+ # TODO: polars returns a uint, not an int for count
+ return AggInfo(
+ [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
+ )
+
+
+class UnaryFunction(Expr):
+ """Class representing unary functions of an expression."""
+
+ __slots__ = ("name", "options")
+ _non_child = ("dtype", "name", "options")
+
+ # Note: log and pow are handled via translation to binops
+ _OP_MAPPING: ClassVar[dict[str, plc.unary.UnaryOperator]] = {
+ "sin": plc.unary.UnaryOperator.SIN,
+ "cos": plc.unary.UnaryOperator.COS,
+ "tan": plc.unary.UnaryOperator.TAN,
+ "arcsin": plc.unary.UnaryOperator.ARCSIN,
+ "arccos": plc.unary.UnaryOperator.ARCCOS,
+ "arctan": plc.unary.UnaryOperator.ARCTAN,
+ "sinh": plc.unary.UnaryOperator.SINH,
+ "cosh": plc.unary.UnaryOperator.COSH,
+ "tanh": plc.unary.UnaryOperator.TANH,
+ "arcsinh": plc.unary.UnaryOperator.ARCSINH,
+ "arccosh": plc.unary.UnaryOperator.ARCCOSH,
+ "arctanh": plc.unary.UnaryOperator.ARCTANH,
+ "exp": plc.unary.UnaryOperator.EXP,
+ "sqrt": plc.unary.UnaryOperator.SQRT,
+ "cbrt": plc.unary.UnaryOperator.CBRT,
+ "ceil": plc.unary.UnaryOperator.CEIL,
+ "floor": plc.unary.UnaryOperator.FLOOR,
+ "abs": plc.unary.UnaryOperator.ABS,
+ "bit_invert": plc.unary.UnaryOperator.BIT_INVERT,
+ "not": plc.unary.UnaryOperator.NOT, + } + _supported_misc_fns = frozenset( + { + "drop_nulls", + "fill_null", + "mask_nans", + "round", + "set_sorted", + "unique", + } + ) + _supported_cum_aggs = frozenset( + { + "cum_min", + "cum_max", + "cum_prod", + "cum_sum", + } + ) + _supported_fns = frozenset().union( + _supported_misc_fns, _supported_cum_aggs, _OP_MAPPING.keys() + ) + + def __init__( + self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr + ) -> None: + self.dtype = dtype + self.name = name + self.options = options + self.children = children + + if self.name not in UnaryFunction._supported_fns: + raise NotImplementedError(f"Unary function {name=}") + if self.name in UnaryFunction._supported_cum_aggs: + (reverse,) = self.options + if reverse: + raise NotImplementedError( + "reverse=True is not supported for cumulative aggregations" + ) + + def do_evaluate( + self, + df: DataFrame, + *, + context: ExecutionContext = ExecutionContext.FRAME, + mapping: Mapping[Expr, Column] | None = None, + ) -> Column: + """Evaluate this expression given a dataframe for context.""" + if self.name == "mask_nans": + (child,) = self.children + return child.evaluate(df, context=context, mapping=mapping).mask_nans() + if self.name == "round": + (decimal_places,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.round.round( + values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP + ) + ).sorted_like(values) + elif self.name == "unique": + (maintain_order,) = self.options + (values,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + # Only one column, so keep_any is the same as keep_first + # for stable distinct + keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY + if values.is_sorted: + maintain_order = True + result = plc.stream_compaction.unique( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + ) + else: + distinct = ( + plc.stream_compaction.stable_distinct + if maintain_order + else plc.stream_compaction.distinct + ) + result = distinct( + plc.Table([values.obj]), + [0], + keep, + plc.types.NullEquality.EQUAL, + plc.types.NanEquality.ALL_EQUAL, + ) + (column,) = result.columns() + if maintain_order: + return Column(column).sorted_like(values) + return Column(column) + elif self.name == "set_sorted": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + (asc,) = self.options + order = ( + plc.types.Order.ASCENDING + if asc == "ascending" + else plc.types.Order.DESCENDING + ) + null_order = plc.types.NullOrder.BEFORE + if column.obj.null_count() > 0 and (n := column.obj.size()) > 1: + # PERF: This invokes four stream synchronisations! 
+ has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid() + has_nulls_last = not plc.copying.get_element( + column.obj, n - 1 + ).is_valid() + if (order == plc.types.Order.DESCENDING and has_nulls_first) or ( + order == plc.types.Order.ASCENDING and has_nulls_last + ): + null_order = plc.types.NullOrder.AFTER + return column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=order, + null_order=null_order, + ) + elif self.name == "drop_nulls": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + elif self.name in self._OP_MAPPING: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if column.obj.type().id() != self.dtype.id(): + arg = plc.unary.cast(column.obj, self.dtype) + else: + arg = column.obj + return Column(plc.unary.unary_operation(arg, self._OP_MAPPING[self.name])) + elif self.name in UnaryFunction._supported_cum_aggs: + column = self.children[0].evaluate(df, context=context, mapping=mapping) + plc_col = column.obj + col_type = column.obj.type() + # cum_sum casts + # Int8, UInt8, Int16, UInt16 -> Int64 for overflow prevention + # Bool -> UInt32 + # cum_prod casts integer dtypes < int64 and bool to int64 + # See: + # https://github.com/pola-rs/polars/blob/main/crates/polars-ops/src/series/ops/cum_agg.rs + if ( + self.name == "cum_sum" + and col_type.id() + in { + plc.types.TypeId.INT8, + plc.types.TypeId.UINT8, + plc.types.TypeId.INT16, + plc.types.TypeId.UINT16, + } + ) or ( + self.name == "cum_prod" + and plc.traits.is_integral(col_type) + and plc.types.size_of(col_type) <= 4 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.INT64) + ) + elif ( + self.name == "cum_sum" + and column.obj.type().id() == plc.types.TypeId.BOOL8 + ): + plc_col = plc.unary.cast( + plc_col, plc.types.DataType(plc.types.TypeId.UINT32) + ) + if self.name == "cum_sum": + agg = plc.aggregation.sum() + elif self.name == "cum_prod": + agg = plc.aggregation.product() + elif self.name == "cum_min": + agg = plc.aggregation.min() + elif self.name == "cum_max": + agg = plc.aggregation.max() + + return Column(plc.reduce.scan(plc_col, agg, plc.reduce.ScanType.INCLUSIVE)) + raise NotImplementedError( + f"Unimplemented unary function {self.name=}" + ) # pragma: no cover; init trips first + + def collect_agg(self, *, depth: int) -> AggInfo: + """Collect information about aggregations in groupbys.""" + if self.name in {"unique", "drop_nulls"} | self._supported_cum_aggs: + raise NotImplementedError(f"{self.name} in groupby") + if depth == 1: + # inside aggregation, need to pre-evaluate, groupby + # construction has checked that we don't have nested aggs, + # so stop the recursion and return ourselves for pre-eval + return AggInfo([(self, plc.aggregation.collect_list(), self)]) + else: + (child,) = self.children + return child.collect_agg(depth=depth) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 
1c61075be22..f79e229d3f3 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -13,8 +13,8 @@
 from __future__ import annotations
-import dataclasses
 import itertools
+import json
 from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar
@@ -26,11 +26,12 @@
 import polars as pl
 import cudf_polars.dsl.expr as expr
-from cudf_polars.containers import DataFrame, NamedColumn
-from cudf_polars.utils import dtypes, sorting
+from cudf_polars.containers import Column, DataFrame
+from cudf_polars.dsl.nodebase import Node
+from cudf_polars.utils import dtypes
 if TYPE_CHECKING:
- from collections.abc import Callable, MutableMapping
+ from collections.abc import Callable, Hashable, MutableMapping, Sequence
 from typing import Literal
 from cudf_polars.typing import Schema
@@ -57,9 +58,7 @@
 ]
-def broadcast(
- *columns: NamedColumn, target_length: int | None = None
-) -> list[NamedColumn]:
+def broadcast(*columns: Column, target_length: int | None = None) -> list[Column]:
 """
 Broadcast a sequence of columns to a common length.
@@ -112,27 +111,38 @@ def broadcast(
 return [
 column
 if column.obj.size() != 1
- else NamedColumn(
+ else Column(
 plc.Column.from_scalar(column.obj_scalar, nrows),
- column.name,
 is_sorted=plc.types.Sorted.YES,
 order=plc.types.Order.ASCENDING,
 null_order=plc.types.NullOrder.BEFORE,
+ name=column.name,
 )
 for column in columns
 ]
-@dataclasses.dataclass
-class IR:
+class IR(Node["IR"]):
 """Abstract plan node, representing an unevaluated dataframe."""
+ __slots__ = ("schema",)
+ # This annotation is needed because of https://github.com/python/mypy/issues/17981
+ _non_child: ClassVar[tuple[str, ...]] = ("schema",)
 schema: Schema
 """Mapping from column names to their data types."""
- def __post_init__(self):
- """Validate preconditions."""
- pass # noqa: PIE790
+ def get_hashable(self) -> Hashable:
+ """
+ Hashable representation of the node, with special handling of the schema dictionary.
+
+ Since the schema is a dictionary, even though it is morally
+ immutable, it is not hashable. We therefore convert it to
+ tuples for hashing purposes.
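+
+ For example, a schema ``{"a": int64, "b": float64}`` participates
+ in the hash as ``(("a", int64), ("b", float64))`` (types shown
+ schematically here).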
+ """ + # Schema is the first constructor argument + args = self._ctor_arguments(self.children)[1:] + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, args) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ @@ -161,24 +171,50 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ) # pragma: no cover -@dataclasses.dataclass class PythonScan(IR): """Representation of input from a python function.""" + __slots__ = ("options", "predicate") + _non_child = ("schema", "options", "predicate") options: Any """Arbitrary options.""" predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" - def __post_init__(self): - """Validate preconditions.""" + def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | None): + self.schema = schema + self.options = options + self.predicate = predicate + self.children = () raise NotImplementedError("PythonScan not implemented") -@dataclasses.dataclass class Scan(IR): """Input from files.""" + __slots__ = ( + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) + _non_child = ( + "schema", + "typ", + "reader_options", + "cloud_options", + "paths", + "with_columns", + "skip_rows", + "n_rows", + "row_index", + "predicate", + ) typ: str """What type of file are we reading? Parquet, CSV, etc...""" reader_options: dict[str, Any] @@ -187,7 +223,7 @@ class Scan(IR): """Cloud-related authentication options, currently ignored.""" paths: list[str] """List of paths to read from.""" - with_columns: list[str] + with_columns: list[str] | None """Projected columns to return.""" skip_rows: int """Rows to skip at the start when reading.""" @@ -198,9 +234,30 @@ class Scan(IR): predicate: expr.NamedExpr | None """Mask to apply to the read dataframe.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + cloud_options: dict[str, Any] | None, + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.typ = typ + self.reader_options = reader_options + self.cloud_options = cloud_options + self.paths = paths + self.with_columns = with_columns + self.skip_rows = skip_rows + self.n_rows = n_rows + self.row_index = row_index + self.predicate = predicate + self.children = () if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise # on the polars side @@ -260,6 +317,28 @@ def __post_init__(self) -> None: "Reading only parquet metadata to produce row index." ) + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The options dictionaries are serialised for hashing purposes + as json strings. 
+ """ + schema_hash = tuple(self.schema.items()) + return ( + type(self), + schema_hash, + self.typ, + json.dumps(self.reader_options), + json.dumps(self.cloud_options), + tuple(self.paths), + tuple(self.with_columns) if self.with_columns is not None else None, + self.skip_rows, + self.n_rows, + self.row_index, + self.predicate, + ) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" with_columns = self.with_columns @@ -385,15 +464,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: init = plc.interop.from_arrow( pa.scalar(offset, type=plc.interop.to_arrow(dtype)) ) - index = NamedColumn( + index = Column( plc.filling.sequence(df.num_rows, init, step), - name, is_sorted=plc.types.Sorted.YES, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, + name=name, ) df = DataFrame([index, *df.columns]) - assert all(c.obj.type() == self.schema[c.name] for c in df.columns) + assert all( + c.obj.type() == self.schema[name] for name, c in df.column_map.items() + ) if self.predicate is None: return df else: @@ -401,7 +482,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.filter(mask) -@dataclasses.dataclass class Cache(IR): """ Return a cached plan node. @@ -409,20 +489,25 @@ class Cache(IR): Used for CSE at the plan level. """ + __slots__ = ("key",) + _non_child = ("schema", "key") key: int """The cache key.""" - value: IR - """The unevaluated node to cache.""" + + def __init__(self, schema: Schema, key: int, value: IR): + self.schema = schema + self.key = key + self.children = (value,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" try: return cache[self.key] except KeyError: - return cache.setdefault(self.key, self.value.evaluate(cache=cache)) + (value,) = self.children + return cache.setdefault(self.key, value.evaluate(cache=cache)) -@dataclasses.dataclass class DataFrameScan(IR): """ Input from an existing polars DataFrame. @@ -430,13 +515,38 @@ class DataFrameScan(IR): This typically arises from ``q.collect().lazy()`` """ + __slots__ = ("df", "projection", "predicate") + _non_child = ("schema", "df", "projection", "predicate") df: Any """Polars LazyFrame object.""" - projection: list[str] + projection: tuple[str, ...] | None """List of columns to project out.""" predicate: expr.NamedExpr | None """Mask to apply.""" + def __init__( + self, + schema: Schema, + df: Any, + projection: Sequence[str] | None, + predicate: expr.NamedExpr | None, + ): + self.schema = schema + self.df = df + self.projection = tuple(projection) if projection is not None else None + self.predicate = predicate + self.children = () + + def get_hashable(self) -> Hashable: + """ + Hashable representation of the node. + + The (heavy) dataframe object is hashed as its id, so this is + not stable across runs, or repeat instances of the same equal dataframes. 
+ """ + schema_hash = tuple(self.schema.items()) + return (type(self), schema_hash, id(self.df), self.projection, self.predicate) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" pdf = pl.DataFrame._from_pydf(self.df) @@ -454,28 +564,39 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df -@dataclasses.dataclass class Select(IR): """Produce a new dataframe selecting given expressions from an input.""" - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs", "should_broadcast") + _non_child = ("schema", "exprs", "should_broadcast") + exprs: tuple[expr.NamedExpr, ...] """List of expressions to evaluate to form the new dataframe.""" should_broadcast: bool """Should columns be broadcast?""" + def __init__( + self, + schema: Schema, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.exprs = tuple(exprs) + self.should_broadcast = should_broadcast + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # Handle any broadcasting - columns = [e.evaluate(df) for e in self.expr] + columns = [e.evaluate(df) for e in self.exprs] if self.should_broadcast: columns = broadcast(*columns) return DataFrame(columns) -@dataclasses.dataclass class Reduce(IR): """ Produce a new dataframe selecting given expressions from an input. @@ -483,36 +604,73 @@ class Reduce(IR): This is a special case of :class:`Select` where all outputs are a single row. """ - df: IR - """Input dataframe.""" - expr: list[expr.NamedExpr] + __slots__ = ("exprs",) + _non_child = ("schema", "exprs") + exprs: tuple[expr.NamedExpr, ...] """List of expressions to evaluate to form the new dataframe.""" + def __init__( + self, schema: Schema, exprs: Sequence[expr.NamedExpr], df: IR + ): # pragma: no cover; polars doesn't emit this node yet + self.schema = schema + self.exprs = tuple(exprs) + self.children = (df,) + def evaluate( self, *, cache: MutableMapping[int, DataFrame] ) -> DataFrame: # pragma: no cover; polars doesn't emit this node yet """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) - columns = broadcast(*(e.evaluate(df) for e in self.expr)) + (child,) = self.children + df = child.evaluate(cache=cache) + columns = broadcast(*(e.evaluate(df) for e in self.exprs)) assert all(column.obj.size() == 1 for column in columns) return DataFrame(columns) -@dataclasses.dataclass class GroupBy(IR): """Perform a groupby.""" - df: IR - """Input dataframe.""" - agg_requests: list[expr.NamedExpr] - """List of expressions to evaluate groupwise.""" - keys: list[expr.NamedExpr] - """List of expressions forming the keys.""" + __slots__ = ( + "agg_requests", + "keys", + "maintain_order", + "options", + "agg_infos", + ) + _non_child = ("schema", "keys", "agg_requests", "maintain_order", "options") + keys: tuple[expr.NamedExpr, ...] + """Grouping keys.""" + agg_requests: tuple[expr.NamedExpr, ...] 
+ """Aggregation expressions.""" maintain_order: bool - """Should the order of the input dataframe be maintained?""" + """Preserve order in groupby.""" options: Any - """Options controlling style of groupby.""" - agg_infos: list[expr.AggInfo] = dataclasses.field(init=False) + """Arbitrary options.""" + + def __init__( + self, + schema: Schema, + keys: Sequence[expr.NamedExpr], + agg_requests: Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + df: IR, + ): + self.schema = schema + self.keys = tuple(keys) + self.agg_requests = tuple(agg_requests) + self.maintain_order = maintain_order + self.options = options + self.children = (df,) + if self.options.rolling: + raise NotImplementedError( + "rolling window/groupby" + ) # pragma: no cover; rollingwindow constructor has already raised + if self.options.dynamic: + raise NotImplementedError("dynamic group by") + if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): + raise NotImplementedError("Nested aggregations in groupby") + self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -542,22 +700,10 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def __post_init__(self) -> None: - """Check whether all the aggregations are implemented.""" - super().__post_init__() - if self.options.rolling: - raise NotImplementedError( - "rolling window/groupby" - ) # pragma: no cover; rollingwindow constructor has already raised - if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): - raise NotImplementedError("Nested aggregations in groupby") - self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] - if len(self.keys) == 0: - raise NotImplementedError("dynamic groupby") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) keys = broadcast( *(k.evaluate(df) for k in self.keys), target_length=df.num_rows ) @@ -588,15 +734,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: requests.append(plc.groupby.GroupByRequest(col, [req])) replacements.append(rep) group_keys, raw_tables = grouper.aggregate(requests) - # TODO: names - raw_columns: list[NamedColumn] = [] + raw_columns: list[Column] = [] for i, table in enumerate(raw_tables): (column,) = table.columns() - raw_columns.append(NamedColumn(column, f"tmp{i}")) + raw_columns.append(Column(column, name=f"tmp{i}")) mapping = dict(zip(replacements, raw_columns, strict=True)) result_keys = [ - NamedColumn(gk, k.name) - for gk, k in zip(group_keys.columns(), keys, strict=True) + Column(grouped_key, name=key.name) + for key, grouped_key in zip(keys, group_keys.columns(), strict=True) ] result_subs = DataFrame(raw_columns) results = [ @@ -639,31 +784,28 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) broadcasted = [ - NamedColumn(reordered, b.name) - for reordered, b in zip( + Column(reordered, name=old.name) + for reordered, old in zip( ordered_table.columns(), broadcasted, strict=True ) ] return DataFrame(broadcasted).slice(self.options.slice) -@dataclasses.dataclass class Join(IR): """A join of two dataframes.""" - left: IR - """Left frame.""" - right: IR - """Right frame.""" - left_on: list[expr.NamedExpr] + __slots__ = ("left_on", "right_on", "options") + 
_non_child = ("schema", "left_on", "right_on", "options") + left_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the left frame.""" - right_on: list[expr.NamedExpr] + right_on: tuple[expr.NamedExpr, ...] """List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "leftsemi", "leftanti", "cross"], + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], bool, tuple[int, int] | None, - str | None, + str, bool, ] """ @@ -675,9 +817,20 @@ class Join(IR): - coalesce: should key columns be coalesced (only makes sense for outer joins) """ - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__( + self, + schema: Schema, + left_on: Sequence[expr.NamedExpr], + right_on: Sequence[expr.NamedExpr], + options: Any, + left: IR, + right: IR, + ): + self.schema = schema + self.left_on = tuple(left_on) + self.right_on = tuple(right_on) + self.options = options + self.children = (left, right) if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -687,7 +840,7 @@ def __post_init__(self) -> None: @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "leftsemi", "leftanti"], + how: Literal["inner", "left", "right", "full", "semi", "anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: @@ -709,13 +862,13 @@ def _joiners( plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "leftsemi": + elif how == "semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "leftanti": + elif how == "anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, @@ -778,32 +931,30 @@ def _reorder_maps( def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - left = self.left.evaluate(cache=cache) - right = self.right.evaluate(cache=cache) + left, right = (c.evaluate(cache=cache) for c in self.children) how, join_nulls, zlice, suffix, coalesce = self.options - suffix = "_right" if suffix is None else suffix if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() left_cols = [ - NamedColumn(new, old.name).sorted_like(old) + Column(new, name=old.name).sorted_like(old) for new, old in zip( columns[: left.num_columns], left.columns, strict=True ) ] right_cols = [ - NamedColumn( + Column( new, - old.name - if old.name not in left.column_names_set - else f"{old.name}{suffix}", + name=name + if name not in left.column_names_set + else f"{name}{suffix}", ) - for new, old in zip( - columns[left.num_columns :], right.columns, strict=True + for new, name in zip( + columns[left.num_columns :], right.column_names, strict=True ) ] - return DataFrame([*left_cols, *right_cols]) + return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) @@ -838,18 +989,19 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.copying.gather(right.table, rg, right_policy), right.column_names ) if coalesce and how != "inner": - left = left.replace_columns( - *( - 
NamedColumn( + left = left.with_columns( + ( + Column( plc.replace.replace_nulls(left_col.obj, right_col.obj), - left_col.name, + name=left_col.name, ) for left_col, right_col in zip( left.select_columns(left_on.column_names_set), right.select_columns(right_on.column_names_set), strict=True, ) - ) + ), + replace_only=True, ) right = right.discard_columns(right_on.column_names_set) if how == "right": @@ -866,20 +1018,30 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(zlice) -@dataclasses.dataclass class HStack(IR): """Add new columns to a dataframe.""" - df: IR - """Input dataframe.""" - columns: list[expr.NamedExpr] - """List of expressions to produce new columns.""" + __slots__ = ("columns", "should_broadcast") + _non_child = ("schema", "columns", "should_broadcast") should_broadcast: bool - """Should columns be broadcast?""" + """Should the resulting evaluated columns be broadcast to the same length.""" + + def __init__( + self, + schema: Schema, + columns: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.columns = tuple(columns) + self.should_broadcast = should_broadcast + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) columns = [c.evaluate(df) for c in self.columns] if self.should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) @@ -895,20 +1057,36 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return df.with_columns(columns) -@dataclasses.dataclass class Distinct(IR): """Produce a new dataframe with distinct rows.""" - df: IR - """Input dataframe.""" + __slots__ = ("keep", "subset", "zlice", "stable") + _non_child = ("schema", "keep", "subset", "zlice", "stable") keep: plc.stream_compaction.DuplicateKeepOption - """Which rows to keep.""" - subset: set[str] | None - """Which columns to inspect when computing distinct rows.""" + """Which distinct value to keep.""" + subset: frozenset[str] | None + """Which columns should be used to define distinctness. 
If None, + then all columns are used.""" zlice: tuple[int, int] | None - """Optional slice to perform after compaction.""" + """Optional slice to apply to the result.""" stable: bool - """Should order be preserved?""" + """Should the result maintain ordering.""" + + def __init__( + self, + schema: Schema, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: IR, + ): + self.schema = schema + self.keep = keep + self.subset = subset + self.zlice = zlice + self.stable = stable + self.children = (df,) _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, @@ -917,23 +1095,16 @@ class Distinct(IR): "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def __init__(self, schema: Schema, df: IR, options: Any) -> None: - self.schema = schema - self.df = df - (keep, subset, maintain_order, zlice) = options - self.keep = Distinct._KEEP_MAP[keep] - self.subset = set(subset) if subset is not None else None - self.stable = maintain_order - self.zlice = zlice - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) if self.subset is None: indices = list(range(df.num_columns)) + keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.columns[i].is_sorted for i in indices) + keys_sorted = all(df.column_map[name].is_sorted for name in self.subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, @@ -954,10 +1125,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) + # TODO: Is this sortedness setting correct result = DataFrame( [ - NamedColumn(c, old.name).sorted_like(old) - for c, old in zip(table.columns(), df.columns, strict=True) + Column(new, name=old.name).sorted_like(old) + for new, old in zip(table.columns(), df.columns, strict=True) ] ) if keys_sorted or self.stable: @@ -965,136 +1137,151 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: return result.slice(self.zlice) -@dataclasses.dataclass class Sort(IR): """Sort a dataframe.""" - df: IR - """Input.""" - by: list[expr.NamedExpr] - """List of expressions to produce sort keys.""" - do_sort: Callable[..., plc.Table] - """pylibcudf sorting function.""" + __slots__ = ("by", "order", "null_order", "stable", "zlice") + _non_child = ("schema", "by", "order", "null_order", "stable", "zlice") + by: tuple[expr.NamedExpr, ...] + """Sort keys.""" + order: tuple[plc.types.Order, ...] + """Sort order for each sort key.""" + null_order: tuple[plc.types.NullOrder, ...] 
+ """Null sorting location for each sort key.""" + stable: bool + """Should the sort be stable?""" zlice: tuple[int, int] | None - """Optional slice to apply after sorting.""" - order: list[plc.types.Order] - """Order keys should be sorted in.""" - null_order: list[plc.types.NullOrder] - """Where nulls sort to.""" + """Optional slice to apply to the result.""" def __init__( self, schema: Schema, - df: IR, - by: list[expr.NamedExpr], - options: Any, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 zlice: tuple[int, int] | None, - ) -> None: + df: IR, + ): self.schema = schema - self.df = df - self.by = by + self.by = tuple(by) + self.order = tuple(order) + self.null_order = tuple(null_order) + self.stable = stable self.zlice = zlice - stable, nulls_last, descending = options - self.order, self.null_order = sorting.sort_order( - descending, nulls_last=nulls_last, num_keys=len(by) - ) - self.do_sort = ( - plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key - ) + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) sort_keys = broadcast( *(k.evaluate(df) for k in self.by), target_length=df.num_rows ) - names = {c.name: i for i, c in enumerate(df.columns)} # TODO: More robust identification here. - keys_in_result = [ - i - for k in sort_keys - if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj - ] - table = self.do_sort( + keys_in_result = { + k.name: i + for i, k in enumerate(sort_keys) + if k.name in df.column_map and k.obj is df.column_map[k.name].obj + } + do_sort = ( + plc.sorting.stable_sort_by_key if self.stable else plc.sorting.sort_by_key + ) + table = do_sort( df.table, plc.Table([k.obj for k in sort_keys]), - self.order, - self.null_order, + list(self.order), + list(self.null_order), ) - columns = [ - NamedColumn(c, old.name) - for c, old in zip(table.columns(), df.columns, strict=True) - ] - # If a sort key is in the result table, set the sortedness property - for k, i in enumerate(keys_in_result): - columns[i] = columns[i].set_sorted( - is_sorted=plc.types.Sorted.YES, - order=self.order[k], - null_order=self.null_order[k], - ) + columns: list[Column] = [] + for name, c in zip(df.column_map, table.columns(), strict=True): + column = Column(c, name=name) + # If a sort key is in the result table, set the sortedness property + if name in keys_in_result: + i = keys_in_result[name] + column = column.set_sorted( + is_sorted=plc.types.Sorted.YES, + order=self.order[i], + null_order=self.null_order[i], + ) + columns.append(column) return DataFrame(columns).slice(self.zlice) -@dataclasses.dataclass class Slice(IR): """Slice a dataframe.""" - df: IR - """Input.""" + __slots__ = ("offset", "length") + _non_child = ("schema", "offset", "length") offset: int """Start of the slice.""" length: int """Length of the slice.""" + def __init__(self, schema: Schema, offset: int, length: int, df: IR): + self.schema = schema + self.offset = offset + self.length = length + self.children = (df,) + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) return df.slice((self.offset, self.length)) -@dataclasses.dataclass class Filter(IR): 
"""Filter a dataframe with a boolean mask.""" - df: IR - """Input.""" + __slots__ = ("mask",) + _non_child = ("schema", "mask") mask: expr.NamedExpr - """Expression evaluating to a mask.""" + """Expression to produce the filter mask.""" + + def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): + self.schema = schema + self.mask = mask + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) return df.filter(mask) -@dataclasses.dataclass class Projection(IR): """Select a subset of columns from a dataframe.""" - df: IR - """Input.""" + __slots__ = () + _non_child = ("schema",) + + def __init__(self, schema: Schema, df: IR): + self.schema = schema + self.children = (df,) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - df = self.df.evaluate(cache=cache) + (child,) = self.children + df = child.evaluate(cache=cache) # This can reorder things. columns = broadcast( - *df.select(list(self.schema.keys())).columns, target_length=df.num_rows + *(df.column_map[name] for name in self.schema), target_length=df.num_rows ) return DataFrame(columns) -@dataclasses.dataclass class MapFunction(IR): """Apply some function to a dataframe.""" - df: IR - """Input.""" + __slots__ = ("name", "options") + _non_child = ("schema", "name", "options") name: str - """Function name.""" + """Name of the function to apply""" options: Any - """Arbitrary options, interpreted per function.""" + """Arbitrary name-specific options""" _NAMES: ClassVar[frozenset[str]] = frozenset( [ @@ -1109,9 +1296,11 @@ class MapFunction(IR): ] ) - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() + def __init__(self, schema: Schema, name: str, options: Any, df: IR): + self.schema = schema + self.name = name + self.options = options + self.children = (df,) if self.name not in MapFunction._NAMES: raise NotImplementedError(f"Unhandled map function {self.name}") if self.name == "explode": @@ -1125,7 +1314,7 @@ def __post_init__(self) -> None: old, new, _ = self.options # TODO: perhaps polars should validate renaming in the IR? 
if len(new) != len(set(new)) or ( - set(new) & (set(self.df.schema.keys() - set(old))) + set(new) & (set(df.schema.keys()) - set(old)) ): raise NotImplementedError("Duplicate new names in rename.") elif self.name == "unpivot": @@ -1134,31 +1323,31 @@ def __post_init__(self) -> None: variable_name = "variable" if variable_name is None else variable_name if len(pivotees) == 0: index = frozenset(indices) - pivotees = [name for name in self.df.schema if name not in index] + pivotees = [name for name in df.schema if name not in index] if not all( - dtypes.can_cast(self.df.schema[p], self.schema[value_name]) - for p in pivotees + dtypes.can_cast(df.schema[p], self.schema[value_name]) for p in pivotees ): raise NotImplementedError( "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" ) - self.options = (indices, pivotees, variable_name, value_name) + self.options = (tuple(indices), tuple(pivotees), variable_name, value_name) def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + (child,) = self.children if self.name == "rechunk": # No-op in our data model # Don't think this appears in a plan tree from python - return self.df.evaluate(cache=cache) # pragma: no cover + return child.evaluate(cache=cache) # pragma: no cover elif self.name == "rename": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) old, new, _ = self.options return df.rename_columns(dict(zip(old, new, strict=True))) elif self.name == "explode": - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) ((to_explode,),) = self.options index = df.column_names.index(to_explode) subset = df.column_names_set - {to_explode} @@ -1168,9 +1357,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: elif self.name == "unpivot": indices, pivotees, variable_name, value_name = self.options npiv = len(pivotees) - df = self.df.evaluate(cache=cache) + df = child.evaluate(cache=cache) index_columns = [ - NamedColumn(col, name) + Column(col, name=name) for col, name in zip( plc.reshape.tile(df.select(indices).table, npiv).columns(), indices, @@ -1191,50 +1380,56 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [c.astype(self.schema[value_name]) for c in df.select(pivotees).columns] + [ + df.column_map[pivotee].astype(self.schema[value_name]).obj + for pivotee in pivotees + ] ) return DataFrame( [ *index_columns, - NamedColumn(variable_column, variable_name), - NamedColumn(value_column, value_name), + Column(variable_column, name=variable_name), + Column(value_column, name=value_name), ] ) else: raise AssertionError("Should never be reached") # pragma: no cover -@dataclasses.dataclass class Union(IR): """Concatenate dataframes vertically.""" - dfs: list[IR] - """List of inputs.""" + __slots__ = ("zlice",) + _non_child = ("schema", "zlice") zlice: tuple[int, int] | None - """Optional slice to apply after concatenation.""" + """Optional slice to apply to the result.""" - def __post_init__(self) -> None: - """Validate preconditions.""" - super().__post_init__() - schema = self.dfs[0].schema - if not all(s.schema == schema for s in self.dfs[1:]): + def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): + self.schema = schema + self.zlice = zlice + self.children = children + schema = 
self.children[0].schema
+ if not all(s.schema == schema for s in self.children[1:]):
 raise NotImplementedError("Schema mismatch")
 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
 """Evaluate and return a dataframe."""
 # TODO: only evaluate what we need if we have a slice
- dfs = [df.evaluate(cache=cache) for df in self.dfs]
+ dfs = [df.evaluate(cache=cache) for df in self.children]
 return DataFrame.from_table(
 plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names
 ).slice(self.zlice)
-@dataclasses.dataclass
 class HConcat(IR):
 """Concatenate dataframes horizontally."""
- dfs: list[IR]
- """List of inputs."""
+ __slots__ = ()
+ _non_child = ("schema",)
+
+ def __init__(self, schema: Schema, *children: IR):
+ self.schema = schema
+ self.children = children
 @staticmethod
 def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
@@ -1266,7 +1461,7 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
 def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
 """Evaluate and return a dataframe."""
- dfs = [df.evaluate(cache=cache) for df in self.dfs]
+ dfs = [df.evaluate(cache=cache) for df in self.children]
 max_rows = max(df.num_rows for df in dfs)
 # Horizontal concatenation extends shorter tables with nulls
 dfs = [
@@ -1278,6 +1473,4 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
 )
 for df in dfs
 ]
- return DataFrame(
- list(itertools.chain.from_iterable(df.columns for df in dfs)),
- )
+ return DataFrame(itertools.chain.from_iterable(df.columns for df in dfs)) diff --git a/python/cudf_polars/cudf_polars/dsl/nodebase.py b/python/cudf_polars/cudf_polars/dsl/nodebase.py new file mode 100644 index 00000000000..228d300f467
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/nodebase.py
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Base class for IR nodes, and utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar
+
+if TYPE_CHECKING:
+ from collections.abc import Hashable, Sequence
+
+ from typing_extensions import Self
+
+
+__all__: list[str] = ["Node"]
+
+T = TypeVar("T", bound="Node[Any]")
+
+
+class Node(Generic[T]):
+ """
+ An abstract node type.
+
+ Nodes are immutable!
+
+ This contains a (potentially empty) tuple of child nodes,
+ along with non-child data. For uniform reconstruction and
+ implementation of hashing and equality schemes, child classes need
+ to provide a certain amount of metadata when they are defined.
+ Specifically, the ``_non_child`` attribute must list, in-order,
+ the names of the slots that are passed to the constructor. The
+ constructor must take arguments in the order ``(*_non_child,
+ *children)``.
+ """
+
+ __slots__ = ("_hash_value", "_repr_value", "children")
+ _hash_value: int
+ _repr_value: str
+ children: tuple[T, ...]
+ _non_child: ClassVar[tuple[str, ...]] = ()
+
+ def _ctor_arguments(self, children: Sequence[T]) -> Sequence[Any | T]:
+ return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+ def reconstruct(
+ self, children: Sequence[T]
+ ) -> Self: # pragma: no cover; not yet used
+ """
+ Rebuild this node with new children.
+
+ Parameters
+ ----------
+ children
+ New children
+
+ Returns
+ -------
+ New node with new children. Non-child data is shared with the input.
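+
+ For example, ``node.reconstruct(list(node.children))`` produces
+ a copy equal to ``node`` (illustrative usage).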
+        """
+        return type(self)(*self._ctor_arguments(children))
+
+    def get_hashable(self) -> Hashable:
+        """
+        Return a hashable object for the node.
+
+        Returns
+        -------
+        Hashable object.
+
+        Notes
+        -----
+        This method is used by the :meth:`__hash__` implementation
+        (which does caching). If your node type needs special-case
+        handling for some of its attributes, override this method, not
+        :meth:`__hash__`.
+        """
+        return (type(self), self._ctor_arguments(self.children))
+
+    def __hash__(self) -> int:
+        """
+        Hash of a node with caching.
+
+        See Also
+        --------
+        get_hashable
+        """
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = hash(self.get_hashable())
+            return self._hash_value
+
+    def is_equal(self, other: Self) -> bool:
+        """
+        Equality of two nodes of equal type.
+
+        Override this in subclasses, rather than :meth:`__eq__`.
+
+        Parameters
+        ----------
+        other
+            Object of the same type to compare to.
+
+        Notes
+        -----
+        Since nodes are immutable, this does common subexpression
+        elimination when two nodes are determined to be equal.
+
+        :meth:`__eq__` handles the case where the objects being
+        compared are not of the same type, so in this method, we only
+        need to implement equality of equal types.
+
+        Returns
+        -------
+        True if the two nodes are equal, False otherwise.
+        """
+        if self is other:
+            return True
+        result = self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+        # Eager CSE for nodes that match.
+        if result:
+            self.children = other.children
+        return result
+
+    def __eq__(self, other: Any) -> bool:
+        """
+        Equality of nodes.
+
+        See Also
+        --------
+        is_equal
+        """
+        if type(self) is not type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other: Any) -> bool:
+        """Inequality of nodes."""
+        return not self.__eq__(other)
+
+    def __repr__(self) -> str:
+        """String representation of a node with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index a0291037f01..c28f2c2651a 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -5,10 +5,11 @@
 
 from __future__ import annotations
 
+import functools
 import json
 from contextlib import AbstractContextManager, nullcontext
 from functools import singledispatch
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 import pyarrow as pa
 import pylibcudf as plc
@@ -19,8 +20,12 @@
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 from cudf_polars.dsl import expr, ir
+from cudf_polars.dsl.traversal import make_recursive, reuse_if_unchanged
 from cudf_polars.typing import NodeTraverser
-from cudf_polars.utils import dtypes
+from cudf_polars.utils import dtypes, sorting
+
+if TYPE_CHECKING:
+    from cudf_polars.typing import ExprTransformer
 
 __all__ = ["translate_ir", "translate_named_expr"]
 
@@ -148,7 +153,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
-    return ir.Select(schema, inp, exprs, node.should_broadcast)
+    return ir.Select(schema, exprs, node.should_broadcast, inp)
 
 
 @_translate_ir.register
@@ -161,11 +166,11 @@ def _(
     keys = 
[translate_named_expr(visitor, n=e) for e in node.keys] return ir.GroupBy( schema, - inp, - aggs, keys, + aggs, node.maintain_order, node.options, + inp, ) @@ -182,7 +187,71 @@ def _( with set_node(visitor, node.input_right): inp_right = translate_ir(visitor, n=None) right_on = [translate_named_expr(visitor, n=e) for e in node.right_on] - return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options) + if (how := node.options[0]) in { + "inner", + "left", + "right", + "full", + "cross", + "semi", + "anti", + }: + return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) + else: + how, op1, op2 = how + if how != "ie_join": + raise NotImplementedError( + f"Unsupported join type {how}" + ) # pragma: no cover; asof joins not yet exposed + # No exposure of mixed/conditional joins in pylibcudf yet, so in + # the first instance, implement by doing a cross join followed by + # a filter. + _, join_nulls, zlice, suffix, coalesce = node.options + cross = ir.Join( + schema, + [], + [], + ("cross", join_nulls, None, suffix, coalesce), + inp_left, + inp_right, + ) + dtype = plc.DataType(plc.TypeId.BOOL8) + if op2 is None: + ops = [op1] + else: + ops = [op1, op2] + suffix = cross.options[3] + + # Column references in the right table refer to the post-join + # names, so with suffixes. + def _rename(e: expr.Expr, rec: ExprTransformer) -> expr.Expr: + if isinstance(e, expr.Col) and e.name in inp_left.schema: + return type(e)(e.dtype, f"{e.name}{suffix}") + return reuse_if_unchanged(e, rec) + + mapper = make_recursive(_rename) + right_on = [ + expr.NamedExpr( + f"{old.name}{suffix}" if old.name in inp_left.schema else old.name, new + ) + for new, old in zip( + (mapper(e.value) for e in right_on), right_on, strict=True + ) + ] + mask = functools.reduce( + functools.partial( + expr.BinOp, dtype, plc.binaryop.BinaryOperator.LOGICAL_AND + ), + ( + expr.BinOp(dtype, expr.BinOp._MAPPING[op], left.value, right.value) + for op, left, right in zip(ops, left_on, right_on, strict=True) + ), + ) + filtered = ir.Filter(schema, expr.NamedExpr("mask", mask), cross) + if zlice is not None: + offset, length = zlice + return ir.Slice(schema, offset, length, filtered) + return filtered @_translate_ir.register @@ -192,7 +261,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.exprs] - return ir.HStack(schema, inp, exprs, node.should_broadcast) + return ir.HStack(schema, exprs, node.should_broadcast, inp) @_translate_ir.register @@ -202,17 +271,23 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) exprs = [translate_named_expr(visitor, n=e) for e in node.expr] - return ir.Reduce(schema, inp, exprs) + return ir.Reduce(schema, exprs, inp) @_translate_ir.register def _( node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: + (keep, subset, maintain_order, zlice) = node.options + keep = ir.Distinct._KEEP_MAP[keep] + subset = frozenset(subset) if subset is not None else None return ir.Distinct( schema, + keep, + subset, + zlice, + maintain_order, translate_ir(visitor, n=node.input), - node.options, ) @@ -223,14 +298,18 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) by = [translate_named_expr(visitor, n=e) for e in node.by_column] - return ir.Sort(schema, inp, by, node.sort_options, node.slice) + stable, nulls_last, descending = node.sort_options + order, null_order = sorting.sort_order( + descending, 
nulls_last=nulls_last, num_keys=len(by) + ) + return ir.Sort(schema, by, order, null_order, stable, node.slice, inp) @_translate_ir.register def _( node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len) + return ir.Slice(schema, node.offset, node.len, translate_ir(visitor, n=node.input)) @_translate_ir.register @@ -240,7 +319,7 @@ def _( with set_node(visitor, node.input): inp = translate_ir(visitor, n=None) mask = translate_named_expr(visitor, n=node.predicate) - return ir.Filter(schema, inp, mask) + return ir.Filter(schema, mask, inp) @_translate_ir.register @@ -259,10 +338,10 @@ def _( name, *options = node.function return ir.MapFunction( schema, - # TODO: merge_sorted breaks this pattern - translate_ir(visitor, n=node.input), name, options, + # TODO: merge_sorted breaks this pattern + translate_ir(visitor, n=node.input), ) @@ -271,7 +350,7 @@ def _( node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: return ir.Union( - schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options + schema, node.options, *(translate_ir(visitor, n=n) for n in node.inputs) ) @@ -279,7 +358,7 @@ def _( def _( node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType] ) -> ir.IR: - return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs]) + return ir.HConcat(schema, *(translate_ir(visitor, n=n) for n in node.inputs)) def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: @@ -309,8 +388,7 @@ def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - # Polars 1.7 changes definition of the CSV reader options schema name. - if (version := visitor.version()) >= (3, 0): + if (version := visitor.version()) >= (4, 0): raise NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py new file mode 100644 index 00000000000..be8338cb9a9 --- /dev/null +++ b/python/cudf_polars/cudf_polars/dsl/traversal.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Traversal and visitor utilities for nodes.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Generic + +from cudf_polars.typing import U_contra, V_co + +if TYPE_CHECKING: + from collections.abc import Callable, Generator, Mapping, MutableMapping + + from cudf_polars.typing import GenericTransformer, NodeT + + +__all__: list[str] = [ + "traversal", + "reuse_if_unchanged", + "make_recursive", + "CachingVisitor", +] + + +def traversal(node: NodeT) -> Generator[NodeT, None, None]: + """ + Pre-order traversal of nodes in an expression. + + Parameters + ---------- + node + Root of expression to traverse. + + Yields + ------ + Unique nodes in the expression, parent before child, children + in-order from left to right. 
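+
+    Notes
+    -----
+    Each node is yielded at most once: the traversal remembers nodes it
+    has already seen, so shared children in a DAG are visited on first
+    encounter only.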
+    """
+    seen = {node}
+    lifo = [node]
+
+    while lifo:
+        node = lifo.pop()
+        yield node
+        for child in reversed(node.children):
+            if child not in seen:
+                seen.add(child)
+                lifo.append(child)
+
+
+def reuse_if_unchanged(node: NodeT, fn: GenericTransformer[NodeT, NodeT]) -> NodeT:
+    """
+    Recipe for transforming nodes that returns the old object if unchanged.
+
+    Parameters
+    ----------
+    node
+        Node to recurse on.
+    fn
+        Function to transform children.
+
+    Notes
+    -----
+    This can be used as a generic "base case" handler when
+    writing transforms that take nodes and produce new nodes.
+
+    Returns
+    -------
+    The existing node if the transformed children are unchanged,
+    otherwise a reconstructed node with the new children.
+    """
+    new_children = [fn(c) for c in node.children]
+    if all(new == old for new, old in zip(new_children, node.children, strict=True)):
+        return node
+    return node.reconstruct(new_children)
+
+
+def make_recursive(
+    fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+    *,
+    state: Mapping[str, Any] | None = None,
+) -> GenericTransformer[U_contra, V_co]:
+    """
+    No-op wrapper for recursive visitors.
+
+    Facilitates using visitors that don't need caching but are written
+    in the same style.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument a callable from input to output.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Usually, prefer a :class:`CachingVisitor`, but if we know that we
+    don't need caching in a transformation, then this no-op approach
+    is slightly cheaper.
+
+    Returns
+    -------
+    Recursive function without caching.
+
+    See Also
+    --------
+    CachingVisitor
+    """
+
+    def rec(node: U_contra) -> V_co:
+        return fn(node, rec)  # type: ignore[arg-type]
+
+    rec.state = state if state is not None else {}  # type: ignore[attr-defined]
+    return rec  # type: ignore[return-value]
+
+
+class CachingVisitor(Generic[U_contra, V_co]):
+    """
+    Caching wrapper for recursive visitors.
+
+    Facilitates writing visitors where already computed results should
+    be cached and reused. The cache is managed automatically, and is
+    tied to the lifetime of the wrapper.
+
+    Parameters
+    ----------
+    fn
+        Function to transform inputs to outputs. Should take as its
+        second argument the recursive cache manager.
+    state
+        Arbitrary *immutable* state that should be accessible to the
+        visitor through the `state` property.
+
+    Notes
+    -----
+    All transformation functions *must* be free of side-effects.
+
+    Returns
+    -------
+    Recursive function with caching.
+    """
+
+    def __init__(
+        self,
+        fn: Callable[[U_contra, GenericTransformer[U_contra, V_co]], V_co],
+        *,
+        state: Mapping[str, Any] | None = None,
+    ) -> None:
+        self.fn = fn
+        self.cache: MutableMapping[U_contra, V_co] = {}
+        self.state = state if state is not None else {}
+
+    def __call__(self, value: U_contra) -> V_co:
+        """
+        Apply the function to a value.
+
+        Parameters
+        ----------
+        value
+            The value to transform.
+
+        Returns
+        -------
+        A transformed value.
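+
+        Notes
+        -----
+        Results are memoised: if `value` has been transformed before,
+        the cached result is returned and the wrapped function is not
+        called again.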
+ """ + try: + return self.cache[value] + except KeyError: + return self.cache.setdefault(value, self.fn(value, self)) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 05b76d76808..a3607159e01 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -53,12 +53,34 @@ def pytest_configure(config: pytest.Config): "tests/unit/io/test_lazy_parquet.py::test_parquet_is_in_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_statistics": "Debug output on stderr doesn't match", "tests/unit/io/test_lazy_parquet.py::test_parquet_different_schema[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-columns]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-row_groups]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-prefiltered]": "Correctly raises but different error", + "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_arg[False-none]": "Correctly raises but different error", "tests/unit/io/test_lazy_parquet.py::test_parquet_schema_mismatch_panic_17067[False]": "Needs cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[False-False]": "Needs some variant of cudf#16394", + "tests/unit/io/test_lazy_parquet.py::test_scan_parquet_ignores_dtype_mismatch_for_non_projected_columns_19249[True-False]": "Needs some variant of cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_slice_pushdown_non_zero_offset[False]": "Thrift data not handled correctly/slice pushdown wrong?", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read[False]": "Incomplete handling of projected reads with mismatching schemas, cudf#16394", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_dtype_mismatch[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_lazy_parquet.py::test_parquet_unaligned_schema_read_missing_cols_from_first[False]": "Different exception raised, but correctly raises an exception", "tests/unit/io/test_parquet.py::test_read_parquet_only_loads_selected_columns_15098": "Memory usage won't be correct due to GPU", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-False-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-False-columns]": "Mismatching column read cudf#16394", + 
"tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-none]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-prefiltered]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-row_groups]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection0-True-columns]": "Mismatching column read cudf#16394", + "tests/unit/io/test_parquet.py::test_allow_missing_columns[projection1-True-columns]": "Mismatching column read cudf#16394", "tests/unit/io/test_scan.py::test_scan[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_limit[single-csv-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_with_filter[single-csv-async]": "Debug output on stderr doesn't match", @@ -107,6 +129,14 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/aggregation/test_aggregations.py::test_sum_empty_and_null_set": "libcudf sums column of all nulls to null, not zero", "tests/unit/operations/aggregation/test_aggregations.py::test_binary_op_agg_context_no_simplify_expr_12423": "groupby-agg of just literals should not produce collect_list", "tests/unit/operations/aggregation/test_aggregations.py::test_nan_inf_aggregation": "treatment of nans and nulls together is different in libcudf and polars in groupby-agg context", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func0-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func0-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg 
for a particular dtype", @@ -124,13 +154,6 @@ def pytest_configure(config: pytest.Config): "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_by_monday_and_offset_5444": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[left-expected0]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[right-expected1]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_label[datapoint-expected2]": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_rolling_dynamic_sortedness_check": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_validation": "IR needs to expose groupby-dynamic information", - "tests/unit/operations/test_group_by_dynamic.py::test_group_by_dynamic_15225": "IR needs to expose groupby-dynamic information", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", "tests/unit/sql/test_cast.py::test_cast_errors[values0-values::uint8-conversion from `f64` to `u64` failed]": "Casting that raises not supported on GPU", "tests/unit/sql/test_cast.py::test_cast_errors[values1-values::uint4-conversion from `i64` to `u32` failed]": "Casting that raises not supported on GPU", @@ -140,6 +163,7 @@ def pytest_configure(config: pytest.Config): "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics": "Debug output on stderr doesn't match", "tests/unit/test_cse.py::test_cse_predicate_self_join": "Debug output on stderr doesn't match", "tests/unit/test_empty.py::test_empty_9137": "Mismatching dtypes, needs cudf#15852", + "tests/unit/test_errors.py::test_error_on_empty_group_by": "Incorrect exception raised", # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py index 240b11bdf59..a27a3395c35 100644 --- a/python/cudf_polars/cudf_polars/typing/__init__.py +++ b/python/cudf_polars/cudf_polars/typing/__init__.py @@ -5,8 +5,8 @@ from __future__ import annotations -from collections.abc import Mapping -from typing import TYPE_CHECKING, Literal, Protocol, Union +from collections.abc import Hashable, Mapping +from typing import TYPE_CHECKING, Any, Literal, Protocol, TypeVar, Union import pylibcudf as plc @@ -18,7 +18,19 @@ import polars as pl -IR: TypeAlias = Union[ + from cudf_polars.dsl import expr, ir, nodebase + +__all__: list[str] = [ + "PolarsIR", + "PolarsExpr", + "NodeTraverser", + "OptimizationArgs", + "GenericTransformer", + "ExprTransformer", + "IRTransformer", +] + +PolarsIR: TypeAlias = Union[ pl_ir.PythonScan, pl_ir.Scan, pl_ir.Cache, @@ -38,7 +50,7 @@ pl_ir.ExtContext, ] -Expr: TypeAlias = Union[ +PolarsExpr: TypeAlias = Union[ pl_expr.Function, pl_expr.Window, pl_expr.Literal, @@ -68,7 +80,7 @@ def set_node(self, n: int) -> None: """Set the current plan node to n.""" ... - def view_current_node(self) -> IR: + def view_current_node(self) -> PolarsIR: """Convert current plan node to python rep.""" ... @@ -80,7 +92,7 @@ def get_dtype(self, n: int) -> pl.DataType: """Get the datatype of the given expression id.""" ... - def view_expression(self, n: int) -> Expr: + def view_expression(self, n: int) -> PolarsExpr: """Convert the given expression to python rep.""" ... @@ -107,3 +119,29 @@ def set_udf( "cluster_with_columns", "no_optimization", ] + + +U_contra = TypeVar("U_contra", bound=Hashable, contravariant=True) +V_co = TypeVar("V_co", covariant=True) +NodeT = TypeVar("NodeT", bound="nodebase.Node[Any]") + + +class GenericTransformer(Protocol[U_contra, V_co]): + """Abstract protocol for recursive visitors.""" + + def __call__(self, __value: U_contra) -> V_co: + """Apply the visitor to the node.""" + ... + + @property + def state(self) -> Mapping[str, Any]: + """Arbitrary immutable state.""" + ... + + +# Quotes to avoid circular import +ExprTransformer: TypeAlias = GenericTransformer["expr.Expr", "expr.Expr"] +"""Protocol for transformation of Expr nodes.""" + +IRTransformer: TypeAlias = GenericTransformer["ir.IR", "ir.IR"] +"""Protocol for transformation of IR nodes.""" diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index bff44af1468..74b2cd4e5de 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -11,14 +11,17 @@ You will need: environment](https://github.com/rapidsai/cudf/blob/branch-24.12/CONTRIBUTING.md#setting-up-your-build-environment). The combined devcontainer works, or whatever your favourite approach is. -> ![NOTE] These instructions will get simpler as we merge code in. +:::{note} +These instructions will get simpler as we merge code in. +::: ## Installing polars -`cudf-polars` works with polars >= 1.3, as long as the internal IR -version doesn't get a major version bump. So `pip install polars>=1.3` -should work. 
For development, if we're adding things to the polars
-side of things, we will need to build polars from source:
+The `cudf-polars` `pyproject.toml` advertises which polars versions it
+works with. So for pure `cudf-polars` development, installing as
+normal and satisfying the dependencies in the repository is
+sufficient. For development, if we're adding things to the polars side
+of things, we will need to build polars from source:
 
 ```sh
 git clone https://github.com/pola-rs/polars
@@ -36,7 +39,9 @@ pip install --upgrade uv
 uv pip install --upgrade -r py-polars/requirements-dev.txt
 ```
 
-> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster!
+:::{note}
+Plain `pip install` works fine, but `uv` is _much_ faster!
+:::
 
 Now we have the necessary machinery to build polars:
 ```sh
@@ -83,7 +88,7 @@ representation (IR). Second, an execution phase which executes
 using our IR.
 
 The translation phase receives a low-level Rust `NodeTraverser`
-object which delivers Python representations of the plan nodes (and
+object that delivers Python representations of the plan nodes (and
 expressions) one at a time. During translation, we endeavour to raise
 `NotImplementedError` for any unsupported functionality. This way, if
 we can't execute something, we just don't modify the logical plan at
@@ -126,7 +131,6 @@ arguments, at the moment, `raise_on_fail` is also supported, which
 raises, rather than falling back, during translation:
 
 ```python
-
 result = q.collect(engine=pl.GPUEngine(raise_on_fail=True))
 ```
 
@@ -144,13 +148,73 @@ changes. We can therefore attempt to detect the IR version
 appropriately. This should be done during IR translation in
 `translate.py`.
 
-## Adding a handler for a new plan node
+# IR design
+
+As noted, we translate the polars DSL into our own IR. This is both so
+that we can smooth out minor version differences (advertised by
+`NodeTraverser` version changes) within `cudf-polars`, and so that we
+have the freedom to introduce new IR nodes and rewrite rules as might
+be appropriate for GPU execution.
+
+To that end, we provide facilities for definition of nodes as well as
+writing traversals and rewrite rules. The abstract base class `Node`
+in `dsl/nodebase.py` defines the interface for implementing new nodes,
+and provides many useful default methods. See also the docstrings of
+the `Node` class.
+
+:::{note}
+This generic implementation relies on nodes being treated as
+*immutable*. Do not implement in-place modification of nodes; bad
+things will happen.
+:::
+
+## Defining nodes
+
+A concrete node type (`cudf-polars` has expression nodes, `Expr`;
+and plan nodes, `IR`) should inherit from `Node`. Nodes have
+two types of data:
+
+1. `children`: a tuple (possibly empty) of concrete nodes;
+2. non-child: arbitrary data attached to the node that is _not_ a
+   concrete node.
+
+The base `Node` class requires that one advertise the names of the
+non-child attributes in the `_non_child` class variable. The
+constructor of the concrete node should take its arguments in the
+order `*_non_child` (ordered as the class variable does) and then
+`*children`. For example, the `Sort` node, which sorts a column
+generated by an expression, has this definition:
+
+```python
+class Expr(Node):
+    children: tuple[Expr, ...]
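+
+# `Sort` stores the expression producing the column to sort as its
+# only child; `dtype` and `options` are non-child data, named in
+# `_non_child`.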
+
+class Sort(Expr):
+    _non_child = ("dtype", "options")
+    children: tuple[Expr]
+    def __init__(self, dtype, options, column: Expr):
+        self.dtype = dtype
+        self.options = options
+        self.children = (column,)
+```
+
+By following this pattern, we get an automatic (caching)
+implementation of `__hash__` and `__eq__`, as well as a useful
+`reconstruct` method that will rebuild the node with new children.
+
+If you want to control the behaviour of `__hash__` and `__eq__` for a
+single node, override (respectively) the `get_hashable` and `is_equal`
+methods.
+
+## Adding new translation rules from the polars IR
+
+### Plan nodes
 
-Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
-`dataclasses` that inherit from the base `IR` node. The evaluation of
-a plan node is done by implementing the `evaluate` method.
+Plan node definitions live in `cudf_polars/dsl/ir.py`; these all
+inherit from the base `IR` node. The evaluation of a plan node is done
+by implementing the `evaluate` method.
 
-To translate the plan node, add a case handler in `translate_ir` which
+To translate the plan node, add a case handler in `translate_ir` that
 lives in `cudf_polars/dsl/translate.py`.
 
 As well as child nodes that are plans, most plan nodes contain child
@@ -163,25 +227,12 @@ translating a `Join` node, the left keys (expressions) should be
 translated with the left input active (and right keys with right
 input). To facilitate this, use the `set_node` context manager.
 
-## Adding a handler for a new expression node
+### Expression nodes
 
 Adding a handler for an expression node is very similar to adding one
 for a plan node.
-Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit
-from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it
-is simpler for us to implement efficient hashing, repr, and equality if we
-can write that ourselves.
-
-Every expression consists of two types of data:
-1. child data (other `Expr`s)
-2. non-child data (anything other than an `Expr`)
-The generic implementations of special methods in the base `Expr` base
-class require that the subclasses advertise which arguments to the
-constructor are non-child in a `_non_child` class slot. The
-constructor should then take arguments:
-```python
-def __init__(self, *non_child_data: Any, *children: Expr):
-```
-Read the docstrings in the `Expr` class for more details.
+Expressions are defined in `cudf_polars/dsl/expressions/` and exported
+into the `dsl` namespace via `expr.py`. They inherit
+from `Expr`.
 
 Expressions are evaluated by implementing a `do_evaluate` method that
 takes a `DataFrame` as context (this provides columns) along with an
@@ -198,24 +249,142 @@ To simplify state tracking, all columns should be considered immutable
 on construction. This matches the "functional" description coming from
 the logical plan in any case, so is reasonably natural.
 
+## Traversing and transforming nodes
+
+In addition to representing and evaluating nodes, we also provide
+facilities for traversing a tree of nodes and defining transformation
+rules in `dsl/traversal.py`. The simplest is `traversal`, a
+[pre-order](https://en.wikipedia.org/wiki/Tree_traversal) visit of all
+unique nodes in an expression. Use this if you want to know some
+specific thing about an expression.
+For example, to determine if an
+expression contains a `Literal` node:
+
+```python
+def has_literal(node: Expr) -> bool:
+    return any(isinstance(e, Literal) for e in traversal(node))
+```
+
+It is often convenient to provide (immutable) state to a visitor, as
+well as some facility to perform DAG-aware rewrites (reusing a
+transformation for an expression if we have already seen it). We
+therefore adopt the following pattern of writing DAG-aware visitors.
+Suppose we want a rewrite rule (`rewrite`) between expressions
+(`Expr`) and some new type `T`. We define our general transformation
+function `rewrite` with type `Expr -> (Expr -> T) -> T`:
+
+```python
+from cudf_polars.typing import GenericTransformer
+
+@singledispatch
+def rewrite(e: Expr, rec: GenericTransformer[Expr, T]) -> T:
+    ...
+```
+
+Note in particular that the function to perform the recursion is
+passed as the second argument. Rather than defining methods on each
+node in turn for a particular rewrite rule, we prefer free functions
+and use `functools.singledispatch` to provide dispatching. We now, in
+the usual fashion, register handlers for different expression types.
+To use this function, we need to be able to provide both the
+expression to convert and the recursive function itself. To do this we
+must convert our `rewrite` function into something that only takes a
+single argument (the expression to rewrite), but carries around
+information about how to perform the recursion. To this end, we have
+two utilities in `traversal.py`:
+
+- `make_recursive` and
+- `CachingVisitor`.
+
+These both implement the `GenericTransformer` protocol, and can be
+wrapped around a transformation function like `rewrite` to provide a
+function `Expr -> T`. They also allow us to attach arbitrary
+*immutable* state to our visitor by passing a `state` dictionary. This
+dictionary can then be inspected by the concrete transformation
+function. `make_recursive` is very simple, and provides no caching of
+intermediate results (so any DAGs that are visited will be viewed as
+trees). `CachingVisitor` provides the same interface, but maintains a
+cache of intermediate results, and reuses them if the same expression
+is seen again.
+
+Finally, for writing transformations that take nodes and deliver new
+nodes (e.g. rewrite rules), we have a further utility
+`reuse_if_unchanged` that can be used as a base case transformation
+for node-to-node rewrites. It is a depth-first visit that transforms
+children but only returns a new node with new children if the rewrite
+of children returned new nodes.
+
+To see how these pieces fit together, let us consider writing a
+`rename` function that takes an expression (potentially with
+references to columns) along with a mapping defining a renaming
+between (some subset of) column names. The goal is to deliver a new
+expression with appropriate columns renamed.
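+
+Concretely, the behaviour we are after is sketched below. (A sketch
+only: `Col(dtype, name)` and `BinOp(dtype, op, left, right)` are the
+expression constructors, `plc` is `pylibcudf`, and `dt` stands for
+some `plc.DataType`.)
+
+```python
+e = BinOp(dt, plc.binaryop.BinaryOperator.ADD, Col(dt, "a"), Col(dt, "b"))
+renamed = rename(e, {"a": "c"})  # rename is defined below
+# renamed references Col(dt, "c") wherever e referenced Col(dt, "a"),
+# while the untouched Col(dt, "b") subtree is reused, not rebuilt.
+```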
+
+To start, we define the dispatch function:
+```python
+from collections.abc import Mapping
+from functools import singledispatch
+from cudf_polars.dsl.traversal import (
+    CachingVisitor, make_recursive, reuse_if_unchanged
+)
+from cudf_polars.dsl.expr import Col, Expr
+from cudf_polars.typing import ExprTransformer
+
+
+@singledispatch
+def _rename(e: Expr, rec: ExprTransformer) -> Expr:
+    raise NotImplementedError(f"No handler for {type(e)}")
+```
+Then we register specific handlers, first for columns:
+```python
+@_rename.register
+def _(e: Col, rec: ExprTransformer) -> Expr:
+    mapping = rec.state["mapping"]  # state set on rec
+    if e.name in mapping:
+        # If we have a rename, return a new Col reference
+        # with a new name
+        return type(e)(e.dtype, mapping[e.name])
+    return e
+```
+and then for the remaining expressions:
+```python
+_rename.register(Expr)(reuse_if_unchanged)
+```
+
+:::{note}
+In this case, we could have put the generic handler in the `_rename`
+function; however, then we would not get a nice error message if we
+accidentally sent in an object of the incorrect type.
+:::
+
+Finally, we tie everything together with a public function:
+
+```python
+def rename(e: Expr, mapping: Mapping[str, str]) -> Expr:
+    """Rename column references in an expression."""
+    mapper = CachingVisitor(_rename, state={"mapping": mapping})
+    # or
+    # mapper = make_recursive(_rename, state={"mapping": mapping})
+    return mapper(e)
+```
+
 # Containers
 
 Containers should be constructed as relatively lightweight objects
-around their pylibcudf counterparts. We have four (in
+around their pylibcudf counterparts. We have three (in
 `cudf_polars/containers/`):
 
 1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
 2. `Column` (a wrapper around a pylibcudf `Column`)
-3. `NamedColumn` (a `Column` with an additional name)
-4. `DataFrame` (a wrapper around a pylibcudf `Table`)
+3. `DataFrame` (a wrapper around a pylibcudf `Table`)
 
 The interfaces offered by these are somewhat in flux, but broadly
-speaking, a `DataFrame` is just a list of `NamedColumn`s which each
-hold a `Column` plus a string `name`. `NamedColumn`s are only ever
-constructed via `NamedExpr`s, which are the top-level expression node
-that lives inside an `IR` node. This means that the expression
-evaluator never has to concern itself with column names: columns are
-only ever decorated with names when constructing a `DataFrame`.
+speaking, a `DataFrame` is just a mapping from string `name`s to
+`Column`s, and thus also holds a pylibcudf `Table`. Names are only
+attached to `Column`s, and hence inserted into `DataFrame`s, via
+`NamedExpr`s, which are the top-level expression nodes that live
+inside an `IR` node. This means that the expression evaluator never
+has to concern itself with column names: columns are only ever
+decorated with names when constructing a `DataFrame`.
 
 The columns keep track of metadata (for example, whether or not they
 are sorted). We could imagine tracking more metadata, like minimum and
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index f55031e0826..2afdab1be4b 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.8,<1.9",
+    "polars>=1.11,<1.12",
     "pylibcudf==24.12.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [ @@ -50,12 +50,17 @@ license-files = ["LICENSE"] version = {file = "cudf_polars/VERSION"} [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error" +] xfail_strict = true [tool.coverage.report] exclude_also = [ "if TYPE_CHECKING:", - "class .*\\bProtocol\\):", + "class .*\\bProtocol(?:\\[[^]]+\\])?\\):", "assert_never\\(" ] # The cudf_polars test suite doesn't exercise the plugin, so we omit diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py index 19919877f84..1f26ab1af9f 100644 --- a/python/cudf_polars/tests/containers/test_column.py +++ b/python/cudf_polars/tests/containers/test_column.py @@ -3,13 +3,11 @@ from __future__ import annotations -from functools import partial - import pyarrow import pylibcudf as plc import pytest -from cudf_polars.containers import Column, NamedColumn +from cudf_polars.containers import Column def test_non_scalar_access_raises(): @@ -55,11 +53,10 @@ def test_shallow_copy(): @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32]) -@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")]) -def test_mask_nans(typeid, constructor): +def test_mask_nans(typeid): dtype = plc.DataType(typeid) values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype)) - column = constructor(plc.interop.from_arrow(values)) + column = Column(plc.interop.from_arrow(values)) masked = column.mask_nans() assert column.obj.null_count() == masked.obj.null_count() diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py index 39fb44d55a5..5c68fb8f0aa 100644 --- a/python/cudf_polars/tests/containers/test_dataframe.py +++ b/python/cudf_polars/tests/containers/test_dataframe.py @@ -8,18 +8,18 @@ import polars as pl -from cudf_polars.containers import DataFrame, NamedColumn +from cudf_polars.containers import Column, DataFrame from cudf_polars.testing.asserts import assert_gpu_result_equal def test_select_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -30,17 +30,17 @@ def test_select_missing_raises(): def test_replace_missing_raises(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) - replacement = df.columns[0].copy(new_name="b") + replacement = df.column_map["a"].copy().rename("b") with pytest.raises(ValueError): - df.replace_columns(replacement) + df.with_columns([replacement], replace_only=True) def test_from_table_wrong_names(): @@ -55,14 +55,23 @@ def test_from_table_wrong_names(): DataFrame.from_table(table, ["a", "b"]) +def test_unnamed_column_raise(): + payload = plc.column_factories.make_numeric_column( + plc.DataType(plc.TypeId.INT8), 0, plc.MaskState.ALL_VALID + ) + + with pytest.raises(ValueError): + DataFrame([Column(payload, name="a"), Column(payload)]) + + def test_sorted_like_raises_mismatching_names(): df = DataFrame( [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) ] ) @@ -72,11 +81,11 @@ def test_sorted_like_raises_mismatching_names(): def test_shallow_copy(): - column = NamedColumn( + column = Column( 
plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID ), - "a", + name="a", ) column.set_sorted( is_sorted=plc.types.Sorted.YES, @@ -85,13 +94,13 @@ def test_shallow_copy(): ) df = DataFrame([column]) copy = df.copy() - copy.columns[0].set_sorted( + copy.column_map["a"].set_sorted( is_sorted=plc.types.Sorted.NO, order=plc.types.Order.ASCENDING, null_order=plc.types.NullOrder.AFTER, ) - assert df.columns[0].is_sorted == plc.types.Sorted.YES - assert copy.columns[0].is_sorted == plc.types.Sorted.NO + assert df.column_map["a"].is_sorted == plc.types.Sorted.YES + assert copy.column_map["a"].is_sorted == plc.types.Sorted.NO def test_sorted_flags_preserved_empty(): @@ -100,7 +109,7 @@ def test_sorted_flags_preserved_empty(): gf = DataFrame.from_polars(df) - (a,) = gf.columns + a = gf.column_map["a"] assert a.is_sorted == plc.types.Sorted.YES diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py index b7d4672daca..84e33262869 100644 --- a/python/cudf_polars/tests/dsl/test_expr.py +++ b/python/cudf_polars/tests/dsl/test_expr.py @@ -73,3 +73,24 @@ def test_namedexpr_repr_stable(): b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a")) assert repr(b1) == repr(b2) + + +def test_equality_cse(): + dt = plc.DataType(plc.TypeId.INT8) + + def make_expr(n1, n2): + a = expr.Col(plc.DataType(plc.TypeId.INT8), n1) + b = expr.Col(plc.DataType(plc.TypeId.INT8), n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.ADD, a, b) + + e1 = make_expr("a", "b") + e2 = make_expr("a", "b") + e3 = make_expr("a", "c") + + assert e1.children is not e2.children + assert e1 == e2 + assert e1.children is e2.children + assert e1 == e2 + assert e1 != e3 + assert e2 != e3 diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py new file mode 100644 index 00000000000..6505a786855 --- /dev/null +++ b/python/cudf_polars/tests/dsl/test_traversal.py @@ -0,0 +1,229 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from functools import singledispatch + +import pylibcudf as plc + +import polars as pl +from polars.testing import assert_frame_equal + +from cudf_polars import translate_ir +from cudf_polars.dsl import expr, ir +from cudf_polars.dsl.traversal import ( + CachingVisitor, + make_recursive, + reuse_if_unchanged, + traversal, +) +from cudf_polars.typing import ExprTransformer, IRTransformer + + +def make_expr(dt, n1, n2): + a1 = expr.Col(dt, n1) + a2 = expr.Col(dt, n2) + + return expr.BinOp(dt, plc.binaryop.BinaryOperator.MUL, a1, a2) + + +def test_traversal_unique(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "a") + unique_exprs = list(traversal(e1)) + + assert len(unique_exprs) == 2 + assert set(unique_exprs) == {expr.Col(dt, "a"), e1} + assert unique_exprs == [e1, expr.Col(dt, "a")] + + e2 = make_expr(dt, "a", "b") + unique_exprs = list(traversal(e2)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2} + assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")] + + e3 = make_expr(dt, "b", "a") + unique_exprs = list(traversal(e3)) + + assert len(unique_exprs) == 3 + assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3} + assert unique_exprs == [e3, expr.Col(dt, "b"), expr.Col(dt, "a")] + + +def rename(e, rec): + mapping = rec.state["mapping"] + if isinstance(e, expr.Col) and e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return reuse_if_unchanged(e, rec) + + +def test_caching_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + assert len(mapper.cache) == 3 + + e2 = make_expr(dt, "a", "a") + mapper = CachingVisitor(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + assert len(mapper.cache) == 2 + mapper = CachingVisitor(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + assert len(mapper.cache) == 2 + + +def test_noop_visitor(): + dt = plc.DataType(plc.TypeId.INT8) + + e1 = make_expr(dt, "a", "b") + + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e1) + assert renamed == make_expr(dt, "a", "c") + + e2 = make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"b": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "a", "a") + mapper = make_recursive(rename, state={"mapping": {"a": "c"}}) + + renamed = mapper(e2) + assert renamed == make_expr(dt, "c", "c") + + +def test_rewrite_ir_node(): + df = pl.LazyFrame({"a": [1, 2, 1], "b": [1, 3, 4]}) + q = df.group_by("a").agg(pl.col("b").sum()).sort("b") + + orig = translate_ir(q._ldf.visit()) + + new_df = pl.DataFrame({"a": [1, 1, 2], "b": [-1, -2, -4]}) + + def replace_df(node, rec): + if isinstance(node, ir.DataFrameScan): + return ir.DataFrameScan( + node.schema, new_df._df, node.projection, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_df) + + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = pl.DataFrame({"a": [2, 1], "b": [-4, -3]}) + + assert_frame_equal(result, expect) + + +def test_rewrite_scan_node(tmp_path): + left = pl.LazyFrame({"a": [1, 2, 3], "b": [1, 3, 4]}) + right = pl.DataFrame({"a": [1, 4, 2], "c": [1, 2, 
3]}) + + right.write_parquet(tmp_path / "right.pq") + + right_s = pl.scan_parquet(tmp_path / "right.pq") + + q = left.join(right_s, on="a", how="inner") + + def replace_scan(node, rec): + if isinstance(node, ir.Scan): + return ir.DataFrameScan( + node.schema, right._df, node.with_columns, node.predicate + ) + return reuse_if_unchanged(node, rec) + + mapper = CachingVisitor(replace_scan) + + orig = translate_ir(q._ldf.visit()) + new = mapper(orig) + + result = new.evaluate(cache={}).to_polars() + + expect = q.collect() + + assert_frame_equal(result, expect, check_row_order=False) + + +def test_rewrite_names_and_ops(): + df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7], "d": [7, 9, 8]}) + + q = df.select(pl.col("a") - (pl.col("b") + pl.col("c") * 2), pl.col("d")).sort("d") + + # We will replace a -> d, c -> d, and addition with multiplication + expect = ( + df.select( + (pl.col("d") - (pl.col("b") * pl.col("d") * 2)).alias("a"), pl.col("d") + ) + .sort("d") + .collect() + ) + + qir = translate_ir(q._ldf.visit()) + + @singledispatch + def _transform(e: expr.Expr, fn: ExprTransformer) -> expr.Expr: + raise NotImplementedError("Unhandled") + + @_transform.register + def _(e: expr.Col, fn: ExprTransformer): + mapping = fn.state["mapping"] + if e.name in mapping: + return type(e)(e.dtype, mapping[e.name]) + return e + + @_transform.register + def _(e: expr.BinOp, fn: ExprTransformer): + if e.op == plc.binaryop.BinaryOperator.ADD: + return type(e)( + e.dtype, plc.binaryop.BinaryOperator.MUL, *map(fn, e.children) + ) + return reuse_if_unchanged(e, fn) + + _transform.register(expr.Expr)(reuse_if_unchanged) + + @singledispatch + def _rewrite(node: ir.IR, fn: IRTransformer) -> ir.IR: + raise NotImplementedError("Unhandled") + + @_rewrite.register + def _(node: ir.Select, fn: IRTransformer): + expr_mapper = fn.state["expr_mapper"] + return type(node)( + node.schema, + [expr.NamedExpr(e.name, expr_mapper(e.value)) for e in node.exprs], + node.should_broadcast, + fn(node.children[0]), + ) + + _rewrite.register(ir.IR)(reuse_if_unchanged) + + rewriter = CachingVisitor( + _rewrite, + state={ + "expr_mapper": CachingVisitor( + _transform, state={"mapping": {"a": "d", "c": "d"}} + ) + }, + ) + + new_ir = rewriter(qir) + + got = new_ir.evaluate(cache={}).to_polars() + + assert_frame_equal(expect, got) diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 56055f4c6c2..86cb2352dcc 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -93,10 +93,10 @@ def test_bool_agg(agg, request): expr = getattr(pl.col("a"), agg)() q = df.select(expr) - assert_gpu_result_equal(q) + assert_gpu_result_equal(q, check_exact=False) -@pytest.mark.parametrize("cum_agg", expr.UnaryFunction._supported_cum_aggs) +@pytest.mark.parametrize("cum_agg", sorted(expr.UnaryFunction._supported_cum_aggs)) def test_cum_agg_reverse_unsupported(cum_agg): df = pl.LazyFrame({"a": [1, 2, 3]}) expr = getattr(pl.col("a"), cum_agg)(reverse=True) diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py index 76c7648813a..2a37683478b 100644 --- a/python/cudf_polars/tests/expressions/test_sort.py +++ b/python/cudf_polars/tests/expressions/test_sort.py @@ -69,7 +69,7 @@ def test_setsorted(descending, nulls_last, with_nulls): df = translate_ir(q._ldf.visit()).evaluate(cache={}) - (a,) = df.columns + a = df.column_map["a"] assert a.is_sorted == 
plc.types.Sorted.YES null_order = ( diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 3c3986be19b..9900f598e5f 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -10,7 +10,7 @@ import rmm -from cudf_polars.dsl.ir import IR +from cudf_polars.dsl.ir import DataFrameScan from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -18,10 +18,10 @@ def test_polars_verbose_warns(monkeypatch): - def raise_unimplemented(self): + def raise_unimplemented(self, *args): raise NotImplementedError("We don't support this") - monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + monkeypatch.setattr(DataFrameScan, "__init__", raise_unimplemented) q = pl.LazyFrame({}) # Ensure that things raise assert_ir_translation_raises(q, NotImplementedError) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 7d9ec98db97..501560d15b8 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -2,9 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from contextlib import nullcontext + import pytest import polars as pl +from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, @@ -22,6 +25,11 @@ def how(request): return request.param +@pytest.fixture(params=[None, (1, 5), (1, None), (0, 2), (0, None)]) +def zlice(request): + return request.param + + @pytest.fixture def left(): return pl.LazyFrame( @@ -37,8 +45,9 @@ def left(): def right(): return pl.LazyFrame( { - "a": [1, 4, 3, 7, None, None], - "c": [2, 3, 4, 5, 6, 7], + "a": [1, 4, 3, 7, None, None, 1], + "c": [2, 3, 4, 5, 6, 7, 8], + "d": [6, None, 7, 8, -1, 2, 4], } ) @@ -70,11 +79,31 @@ def test_coalesce_join(left, right, how, join_nulls, join_expr): query = left.join( right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True ) - assert_gpu_result_equal(query, check_row_order=False) + assert_gpu_result_equal(query, check_row_order=how == "left") -def test_cross_join(left, right): +def test_left_join_with_slice(left, right, join_nulls, zlice): + q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) + ctx = nullcontext() + if zlice is not None: + q_expect = q.collect().slice(*zlice) + q = q.slice(*zlice) + if zlice == (1, 5) or zlice == (0, 2): + # https://github.com/pola-rs/polars/issues/19403 + # https://github.com/pola-rs/polars/issues/19405 + ctx = pytest.raises(AssertionError) + assert_frame_equal( + q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) + ) + + with ctx: + assert_gpu_result_equal(q) + + +def test_cross_join(left, right, zlice): q = left.join(right, how="cross") + if zlice is not None: + q = q.slice(*zlice) assert_gpu_result_equal(q) @@ -86,3 +115,26 @@ def test_join_literal_key_unsupported(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize( + "conditions", + [ + [pl.col("a") < pl.col("a_right")], + [pl.col("a_right") <= pl.col("a") * 2], + [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], + [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], + ], +) +def test_join_where(left, right, conditions, zlice): + q = left.join_where(right, 
*conditions) + + assert_gpu_result_equal(q, check_row_order=False) + + if zlice is not None: + q_len = q.slice(*zlice).select(pl.len()) + # Can't compare result, since row order is not guaranteed and + # therefore we only check the length + + assert_gpu_result_equal(q_len) diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py index 35aaef44e1f..e7770bfadac 100644 --- a/python/cudf_polars/tests/utils/test_broadcast.py +++ b/python/cudf_polars/tests/utils/test_broadcast.py @@ -6,34 +6,35 @@ import pylibcudf as plc import pytest -from cudf_polars.containers import NamedColumn +from cudf_polars.containers import Column from cudf_polars.dsl.ir import broadcast @pytest.mark.parametrize("target", [4, None]) def test_broadcast_all_scalar(target): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns, target_length=target) expected = 1 if target is None else target + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == expected for column in result) def test_invalid_target_length(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -43,11 +44,11 @@ def test_invalid_target_length(): def test_broadcast_mismatching_column_lengths(): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] @@ -58,16 +59,17 @@ def test_broadcast_mismatching_column_lengths(): @pytest.mark.parametrize("nrows", [0, 5]) def test_broadcast_with_scalars(nrows): columns = [ - NamedColumn( + Column( plc.column_factories.make_numeric_column( plc.DataType(plc.TypeId.INT8), nrows if i == 0 else 1, plc.MaskState.ALL_VALID, ), - f"col{i}", + name=f"col{i}", ) for i in range(3) ] result = broadcast(*columns) + assert [c.name for c in result] == [f"col{i}" for i in range(3)] assert all(column.obj.size() == nrows for column in result) diff --git a/python/custreamz/README.md b/python/custreamz/README.md index 8da17ef09dc..e81fc35c544 100644 --- a/python/custreamz/README.md +++ b/python/custreamz/README.md @@ -26,7 +26,7 @@ tips_df = consumer.read_gdf(topic="custreamz_tips", partition=0, start=0, end=10000, - message_format="CSV") + message_format="csv") print(tips_df.head()) tips_df['tip_percentage'] = tips_df['tip'] / tips_df['total_bill'] * 100 diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py index 0def0ba746e..4cbd7244751 100644 --- a/python/custreamz/custreamz/kafka.py +++ b/python/custreamz/custreamz/kafka.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import confluent_kafka as ck from cudf_kafka._lib.kafka import KafkaDatasource @@ -288,4 +288,4 @@ def poll(self, timeout=None): (default: infinite (None translated into -1 in the library)). 
(Seconds) """ - return self.ck.poll(timeout) + return self.ck_consumer.poll(timeout) diff --git a/python/custreamz/custreamz/tests/conftest.py b/python/custreamz/custreamz/tests/conftest.py index 1cda9b71387..c5135bc6414 100644 --- a/python/custreamz/custreamz/tests/conftest.py +++ b/python/custreamz/custreamz/tests/conftest.py @@ -2,6 +2,7 @@ import socket import pytest + from custreamz import kafka diff --git a/python/custreamz/custreamz/tests/pytest.ini b/python/custreamz/custreamz/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/custreamz/custreamz/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/custreamz/custreamz/tests/test_dataframes.py b/python/custreamz/custreamz/tests/test_dataframes.py index bae4b051cae..8c0130d2818 100644 --- a/python/custreamz/custreamz/tests/test_dataframes.py +++ b/python/custreamz/custreamz/tests/test_dataframes.py @@ -377,24 +377,16 @@ def test_setitem_overwrites(stream): [ ({}, "sum"), ({}, "mean"), - pytest.param({}, "min"), + ({}, "min"), pytest.param( {}, "median", marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), ), - pytest.param({}, "max"), - pytest.param( - {}, - "var", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), - pytest.param({}, "count"), - pytest.param( - {"ddof": 0}, - "std", - marks=pytest.mark.xfail(reason="Unavailable for rolling objects"), - ), + ({}, "max"), + ({}, "var"), + ({}, "count"), + ({"ddof": 0}, "std"), pytest.param( {"quantile": 0.5}, "quantile", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 85ab0024bb5..a8ab05a3922 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -65,52 +65,24 @@ include = [ ] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", - "dask_cudf", -] -known_first_party = [ - "streamz", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", -] +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["streamz"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf", "dask_cudf"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", "ignore:unclosed dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def combine_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property 
+ def aggregate_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + final_columns = self._slice or self._meta.columns + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "final_columns": final_columns, + "as_index": True, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + "str_cols_out": self.spec_info["str_cols_out"], + "aggs_renames": self.spec_info["aggs_renames"], + } + + +class CudfGroupbyAgg(GroupbyAggregation): + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + def _lower(self): + return DecomposableCudfGroupbyAgg( + self.frame, + self.arg, + self.observed, + self.dropna, + self.split_every, + self.split_out, + self.sort, + self.shuffle_method, + self._slice, + *self.by, + ) + + +def _maybe_get_custom_expr( + gb, + aggs, + split_every=None, + split_out=None, + shuffle_method=None, + **kwargs, +): + from dask_cudf.groupby import ( + OPTIMIZED_AGGS, + _aggs_optimized, + _redirect_aggs, + ) + + if kwargs: + # Unsupported keyword arguments + return None + + if not hasattr(gb.obj._meta, "to_pandas"): + # Not cuDF-backed data + return None + + _aggs = _redirect_aggs(aggs) + if not _aggs_optimized(_aggs, OPTIMIZED_AGGS): + # One or more aggregations are unsupported + return None + + return CudfGroupbyAgg( + gb.obj.expr, + _aggs, + gb.observed, + gb.dropna, + split_every, + split_out, + gb.sort, + shuffle_method, + gb._slice, + *gb.by, + ) + + class CudfFusedParquetIO(FusedParquetIO): @staticmethod def _load_multiple_files( @@ -89,16 +302,34 @@ def _dataset_info(self): return dataset_info @staticmethod - def _table_to_pandas( - table, - index_name, - *args, - ): + def _table_to_pandas(table, index_name): df = cudf.DataFrame.from_arrow(table) if index_name is not None: df = df.set_index(index_name) return df + def _filtered_task(self, index: int): + columns = self.columns.copy() + index_name = None + if self.index is not None: + index_name = self.index.name + schema = self._dataset_info["schema"].remove_metadata() + if index_name: + if columns is None: + columns = list(schema.names) + columns.append(index_name) + return ( + self._table_to_pandas, + ( + self._fragment_to_table, + FragmentWrapper(self.fragments[index], filesystem=self.fs), + self.filters, + columns, + schema, + ), + index_name, + ) + def _tune_up(self, parent): if self._fusion_compression_factor >= 1: return diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py index 65688115b59..8a16fe7615d 100644 --- a/python/dask_cudf/dask_cudf/expr/_groupby.py +++ b/python/dask_cudf/dask_cudf/expr/_groupby.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +from dask_expr._collection import new_collection from dask_expr._groupby import ( GroupBy as DXGroupBy, SeriesGroupBy as DXSeriesGroupBy, @@ -11,6 +12,8 @@ from cudf.core.groupby.groupby import _deprecate_collect +from dask_cudf.expr._expr import _maybe_get_custom_expr + ## ## Custom groupby classes ## @@ -54,9 +57,16 @@ def _translate_arg(arg): return arg -# TODO: These classes are mostly a work-around for missing -# `observed=False` support.
-# See: https://github.com/rapidsai/cudf/issues/15173 +# We define our own GroupBy classes in Dask cuDF for +# the following reasons: +# (1) We want to use a custom `aggregate` algorithm +# that performs multiple aggregations on the +# same dataframe partition at once. The upstream +# algorithm breaks distinct aggregations into +# separate tasks. +# (2) We need to work around missing `observed=False` +# support: +# https://github.com/rapidsai/cudf/issues/15173 class GroupBy(DXGroupBy): @@ -89,8 +99,15 @@ def collect(self, **kwargs): _deprecate_collect() return self._single_agg(ListAgg, **kwargs) - def aggregate(self, arg, **kwargs): - return super().aggregate(_translate_arg(arg), **kwargs) + def aggregate(self, arg, fused=True, **kwargs): + if ( + fused + and (expr := _maybe_get_custom_expr(self, arg, **kwargs)) + is not None + ): + return new_collection(expr) + else: + return super().aggregate(_translate_arg(arg), **kwargs) class SeriesGroupBy(DXSeriesGroupBy): diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 76bb2ea99b4..0421bd755f4 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,11 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. -from .csv import read_csv -from .json import read_json -from .orc import read_orc, to_orc -from .text import read_text +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 try: - from .parquet import read_parquet, to_parquet + from .parquet import read_parquet, to_parquet # noqa: F401 except ImportError: pass diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 620a917109e..ae5ca480e31 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -15,7 +15,11 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr +from dask_cudf.tests.utils import ( + require_dask_expr, + skip_dask_expr, + xfail_dask_expr, +) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -371,12 +375,12 @@ def test_split_row_groups(tmpdir, row_groups, index): row_group_size = 5 file_row_groups = 10 # Known apriori npartitions_expected = math.ceil(file_row_groups / row_groups) * 2 - + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "a": np.random.choice(["apple", "banana", "carrot"], size=df_size), - "b": np.random.random(size=df_size), - "c": np.random.randint(1, 5, size=df_size), + "a": rng.choice(["apple", "banana", "carrot"], size=df_size), + "b": rng.random(size=df_size), + "c": rng.integers(1, 5, size=df_size), "index": np.arange(0, df_size), } ) @@ -615,3 +619,28 @@ def test_timezone_column(tmpdir): got = dask_cudf.read_parquet(path) expect = cudf.read_parquet(path) dd.assert_eq(got, expect) + + +@require_dask_expr() +@pytest.mark.skipif( + not dask_cudf.backends.PYARROW_GE_15, + reason="Requires pyarrow 15", +) +@pytest.mark.parametrize("min_part_size", ["1B", "1GB"]) +def test_read_parquet_arrow_filesystem(tmpdir, min_part_size): + tmp_path = str(tmpdir) + with dask.config.set( + { + "dataframe.backend": "cudf", + "dataframe.parquet.minimum-partition-size": min_part_size, + } + ): + dd.from_dict( + {"x": range(1000), "y": ["a", "b", "c", "d"] * 250}, + npartitions=10, + 
).to_parquet(tmp_path, write_index=False) + df = cudf.read_parquet(tmp_path) + ddf = dask_cudf.read_parquet(tmp_path, filesystem="arrow") + dd.assert_eq(df, ddf, check_index=False) + assert isinstance(ddf._meta, cudf.DataFrame) + assert isinstance(ddf.compute(), cudf.DataFrame) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index cf8af82e112..90907f6fb99 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -11,6 +11,8 @@ from dask.dataframe import assert_eq +import cudf + import dask_cudf from dask_cudf.tests.utils import QUERY_PLANNING_ON @@ -168,6 +170,8 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): filesystem=filesystem, ) assert df.b.sum().compute() == 9 + assert isinstance(df._meta, cudf.DataFrame) + assert isinstance(df.compute(), cudf.DataFrame) def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf): diff --git a/python/dask_cudf/dask_cudf/tests/pytest.ini b/python/dask_cudf/dask_cudf/tests/pytest.ini deleted file mode 100644 index 7b0a9f29fb1..00000000000 --- a/python/dask_cudf/dask_cudf/tests/pytest.ini +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 6f04b5737da..3fbb2aacd2c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -25,7 +25,7 @@ def data_dt_1(): def data_dt_2(): - return np.random.randn(100) + return np.random.default_rng(seed=0).standard_normal(size=100) dt_fields = ["year", "month", "day", "hour", "minute", "second"] diff --git a/python/dask_cudf/dask_cudf/tests/test_binops.py b/python/dask_cudf/dask_cudf/tests/test_binops.py index 87bd401accd..8c51f950765 100644 --- a/python/dask_cudf/dask_cudf/tests/test_binops.py +++ b/python/dask_cudf/dask_cudf/tests/test_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
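The test changes above and below all follow one pattern: module-level `np.random.seed(...)` plus calls on the global `np.random` module are replaced by a locally constructed, seeded Generator. A minimal sketch of the mapping (variable names here are illustrative, not from the diff):

```python
import numpy as np

# Replaces np.random.seed(0) followed by calls on the global np.random module.
rng = np.random.default_rng(seed=0)

ints = rng.integers(0, 5, size=10)            # was np.random.randint(0, 5, size=10)
uniforms = rng.random(size=10)                # was np.random.random(size=10) / np.random.rand(10)
normals = rng.standard_normal(size=10)        # was np.random.randn(10)
picks = rng.choice(["a", "b", "c"], size=10)  # was np.random.choice([...], size=10)
```

Seeding the Generator at construction keeps each test deterministic without mutating global RNG state shared across tests.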
import operator @@ -21,10 +21,11 @@ def _make_empty_frame(npartitions=2): def _make_random_frame_float(nelem, npartitions=2): + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(size=nelem) + 1, } ) gdf = cudf.from_pandas(df) @@ -51,7 +52,6 @@ def _make_random_frame_float(nelem, npartitions=2): @pytest.mark.parametrize("binop", _binops) def test_series_binops_integer(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame(size) rhs_df, rhs_gdf = _make_random_frame(size) @@ -62,7 +62,6 @@ def test_series_binops_integer(binop): @pytest.mark.parametrize("binop", _binops) def test_series_binops_float(binop): - np.random.seed(0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) rhs_df, rhs_gdf = _make_random_frame_float(size) @@ -73,10 +72,10 @@ def test_series_binops_float(binop): @pytest.mark.parametrize("operator", _binops) def test_df_series_bind_ops(operator): - np.random.seed(0) + rng = np.random.default_rng(seed=0) size = 1000 lhs_df, lhs_gdf = _make_random_frame_float(size) - rhs = np.random.rand() + rhs = rng.random() for col in lhs_gdf.columns: got = getattr(lhs_gdf[col], operator.__name__)(rhs) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 5f0fae86691..8e42c847ddf 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -22,13 +22,15 @@ xfail_dask_expr, ) +rng = np.random.default_rng(seed=0) + def test_from_dict_backend_dispatch(): # Test ddf.from_dict cudf-backend dispatch - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } expect = cudf.DataFrame(data) with dask.config.set({"dataframe.backend": "cudf"}): @@ -62,10 +64,10 @@ def test_from_dask_dataframe_deprecated(): def test_to_backend(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) data = { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } with dask.config.set({"dataframe.backend": "pandas"}): ddf = dd.from_dict(data, npartitions=2) @@ -114,12 +116,12 @@ def test_to_backend_kwargs(): def test_from_pandas(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -169,10 +171,10 @@ def _fragmented_gdf(df, nsplit): def test_query(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) expr = "x > 2" @@ -188,9 +190,9 @@ def test_query(): def test_query_local_dict(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=10), "y": np.random.normal(size=10)} + {"x": rng.integers(0, 5, size=10), "y": rng.normal(size=10)} ) gdf = cudf.DataFrame.from_pandas(df) ddf = dask_cudf.from_cudf(gdf, npartitions=2) @@ -204,11 +206,11 @@ def test_query_local_dict(): def test_head(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) 
df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=100), - "y": np.random.normal(size=100), + "x": rng.integers(0, 5, size=100), + "y": rng.normal(size=100), } ) gdf = cudf.DataFrame.from_pandas(df) @@ -220,13 +222,11 @@ def test_head(): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # Use unique index range as the sort may not be stable-ordering x = np.arange(nelem) - np.random.shuffle(x) - df = pd.DataFrame( - {"x": x, "y": np.random.randint(0, nelem, size=nelem)} - ) + rng.shuffle(x) + df = pd.DataFrame({"x": x, "y": rng.integers(0, nelem, size=nelem)}) ddf = dd.from_pandas(df, npartitions=2) ddf2 = ddf.to_backend("cudf") @@ -242,7 +242,7 @@ def test_set_index(nelem): def test_set_index_quantile(nelem, nparts, by): df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) - df["b"] = np.random.choice(cudf.datasets.names, size=nelem) + df["b"] = rng.choice(cudf.datasets.names, size=nelem) ddf = dd.from_pandas(df, npartitions=nparts) with pytest.warns(FutureWarning, match="deprecated"): @@ -270,11 +270,11 @@ def assert_frame_equal_by_index_group(expect, got): @pytest.mark.parametrize("nelem", [10, 200, 1333]) def test_set_index_2(nelem): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index("x").sort_index() @@ -289,11 +289,11 @@ def test_set_index_2(nelem): def test_set_index_w_series(): with dask.config.set(scheduler="single-threaded"): nelem = 20 - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": 100 + np.random.randint(0, nelem // 2, size=nelem), - "y": np.random.normal(size=nelem), + "x": 100 + rng.integers(0, nelem // 2, size=nelem), + "y": rng.normal(size=nelem), } ) expect = df.set_index(df.x).sort_index() @@ -327,12 +327,12 @@ def test_set_index_sorted(): @pytest.mark.parametrize("index", [None, "myindex"]) def test_rearrange_by_divisions(nelem, index): with dask.config.set(scheduler="single-threaded"): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 20, size=nelem), - "y": np.random.normal(size=nelem), - "z": np.random.choice(["dog", "cat", "bird"], nelem), + "x": rng.integers(0, 20, size=nelem), + "y": rng.normal(size=nelem), + "z": rng.choice(["dog", "cat", "bird"], nelem), } ) df["z"] = df["z"].astype("category") @@ -355,9 +355,9 @@ def test_rearrange_by_divisions(nelem, index): def test_assign(): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -372,10 +372,10 @@ def test_assign(): @pytest.mark.parametrize("data_type", ["int8", "int16", "int32", "int64"]) def test_setitem_scalar_integer(data_type): - np.random.seed(0) - scalar = np.random.randint(0, 100, dtype=data_type) + rng = np.random.default_rng(seed=0) + scalar = rng.integers(0, 100, dtype=data_type) df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} 
) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -388,10 +388,10 @@ def test_setitem_scalar_integer(data_type): @pytest.mark.parametrize("data_type", ["float32", "float64"]) def test_setitem_scalar_float(data_type): - np.random.seed(0) - scalar = np.random.randn(1).astype(data_type)[0] + rng = np.random.default_rng(seed=0) + scalar = rng.standard_normal(size=1).astype(data_type)[0] df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -403,10 +403,10 @@ def test_setitem_scalar_float(data_type): def test_setitem_scalar_datetime(): - np.random.seed(0) - scalar = np.int64(np.random.randint(0, 100)).astype("datetime64[ms]") + rng = np.random.default_rng(seed=0) + scalar = np.int64(rng.integers(0, 100)).astype("datetime64[ms]") df = pd.DataFrame( - {"x": np.random.randint(0, 5, size=20), "y": np.random.normal(size=20)} + {"x": rng.integers(0, 5, size=20), "y": rng.normal(size=20)} ) dgf = dd.from_pandas(cudf.DataFrame.from_pandas(df), npartitions=2) @@ -422,12 +422,12 @@ def test_setitem_scalar_datetime(): "func", [ lambda: pd.DataFrame( - {"A": np.random.rand(10), "B": np.random.rand(10)}, + {"A": rng.random(10), "B": rng.random(10)}, index=list("abcdefghij"), ), lambda: pd.DataFrame( { - "A": np.random.rand(10), + "A": rng.random(10), "B": list("a" * 10), "C": pd.Series( [str(20090101 + i) for i in range(10)], @@ -438,7 +438,7 @@ def test_setitem_scalar_datetime(): ), lambda: pd.Series(list("abcdefghijklmnop")), lambda: pd.Series( - np.random.rand(10), + rng.random(10), index=pd.Index( [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" ), @@ -497,10 +497,11 @@ def test_repartition_hash_staged(npartitions): by = ["b"] datarange = 35 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(size, dtype="int64"), - "b": np.random.randint(datarange, size=size), + "b": rng.integers(datarange, size=size), } ) # WARNING: Specific npartitions-max_branch combination @@ -537,12 +538,13 @@ def test_repartition_hash(by, npartitions, max_branch): npartitions_i = 4 datarange = 26 size = 100 + rng = np.random.default_rng(seed=0) gdf = cudf.DataFrame( { "a": np.arange(0, stop=size, dtype="int64"), - "b": np.random.randint(datarange, size=size), - "c": np.random.choice(list("abcdefgh"), size=size), - "d": np.random.choice(np.arange(26), size=size), + "b": rng.integers(datarange, size=size), + "c": rng.choice(list("abcdefgh"), size=size), + "d": rng.choice(np.arange(26), size=size), } ) gdf.d = gdf.d.astype("datetime64[ms]") @@ -768,6 +770,7 @@ def test_dataframe_series_replace(data): def test_dataframe_assign_col(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame(list(range(100))) pdf = pd.DataFrame(list(range(100))) @@ -780,7 +783,7 @@ def test_dataframe_assign_col(): pddf = dd.from_pandas(pdf, npartitions=4) pddf["fold"] = 0 pddf["fold"] = pddf["fold"].map_partitions( - lambda p_df: pd.Series(np.random.randint(0, 4, len(p_df))) + lambda p_df: pd.Series(rng.integers(0, 4, len(p_df))) ) dd.assert_eq(ddf[0], pddf[0]) @@ -1015,10 +1018,11 @@ def test_to_backend_simplify(): @pytest.mark.parametrize("numeric_only", [True, False]) @pytest.mark.parametrize("op", ["corr", "cov"]) def test_cov_corr(op, numeric_only): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame.from_dict( { - "x": np.random.randint(0, 5, size=10), - "y": np.random.normal(size=10), + "x": 
rng.integers(0, 5, size=10), + "y": rng.normal(size=10), } ) ddf = dd.from_pandas(df, npartitions=2) diff --git a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py index e6fb58ad6df..84ed3b46598 100644 --- a/python/dask_cudf/dask_cudf/tests/test_delayed_io.py +++ b/python/dask_cudf/dask_cudf/tests/test_delayed_io.py @@ -51,9 +51,13 @@ def test_series_from_delayed(): def test_dataframe_to_delayed(): nelem = 100 - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) @@ -80,8 +84,8 @@ def test_dataframe_to_delayed(): def test_series_to_delayed(): nelem = 100 - - sr = cudf.Series(np.random.randint(nelem, size=nelem)) + rng = np.random.default_rng(seed=0) + sr = cudf.Series(rng.integers(nelem, size=nelem)) dsr = dask_cudf.from_cudf(sr, npartitions=5) @@ -109,11 +113,13 @@ def test_series_to_delayed(): def test_mixing_series_frame_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) - + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf = dask_cudf.from_cudf(df, npartitions=5) delay_frame = ddf.to_delayed() @@ -128,10 +134,13 @@ def test_mixing_series_frame_error(): def test_frame_extra_columns_error(): nelem = 20 - - df = cudf.DataFrame() - df["x"] = np.arange(nelem) - df["y"] = np.random.randint(nelem, size=nelem) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame( + { + "x": np.arange(nelem), + "y": rng.integers(nelem, size=nelem), + } + ) ddf1 = dask_cudf.from_cudf(df, npartitions=5) df["z"] = np.arange(nelem) diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py index a12481a7bb4..fe57d4a4f00 100644 --- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py +++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py @@ -32,8 +32,9 @@ def test_pyarrow_conversion_dispatch(preserve_index, index): to_pyarrow_table_dispatch, ) + rng = np.random.default_rng(seed=0) df1 = cudf.DataFrame( - np.random.randn(10, 3), columns=list("abc"), index=index + rng.standard_normal(size=(10, 3)), columns=list("abc"), index=index ) df2 = from_pyarrow_table_dispatch( df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index) @@ -108,7 +109,8 @@ def test_pyarrow_schema_dispatch(preserve_index): to_pyarrow_table_dispatch, ) - df = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc")) + rng = np.random.default_rng(seed=0) + df = cudf.DataFrame(rng.standard_normal(size=(10, 3)), columns=list("abc")) df["d"] = cudf.Series(["cat", "dog"] * 5) table = to_pyarrow_table_dispatch(df, preserve_index=preserve_index) schema = pyarrow_schema_dispatch(df, preserve_index=preserve_index) diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 7b9f0ca328a..042e69d86f4 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -14,7 +14,11 @@ import dask_cudf from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr +from dask_cudf.tests.utils import ( + QUERY_PLANNING_ON, + require_dask_expr, + xfail_dask_expr, +) def 
assert_cudf_groupby_layers(ddf): @@ -30,21 +34,21 @@ def assert_cudf_groupby_layers(ddf): @pytest.fixture(params=["non_null", "null"]) def pdf(request): - np.random.seed(0) + rng = np.random.default_rng(seed=0) # note that column name "x" is a substring of the groupby key; # this gives us coverage for cudf#10829 pdf = pd.DataFrame( { - "xx": np.random.randint(0, 5, size=10000), - "x": np.random.normal(size=10000), - "y": np.random.normal(size=10000), + "xx": rng.integers(0, 5, size=10000), + "x": rng.normal(size=10000), + "y": rng.normal(size=10000), } ) # insert nulls into dataframe at random if request.param == "null": - pdf = pdf.mask(np.random.choice([True, False], size=pdf.shape)) + pdf = pdf.mask(rng.choice([True, False], size=pdf.shape)) return pdf @@ -173,11 +177,12 @@ def test_groupby_agg_empty_partition(tmpdir, split_out): ], ) def test_groupby_multi_column(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "a": np.random.randint(0, 20, size=1000), - "b": np.random.randint(0, 5, size=1000), - "x": np.random.normal(size=1000), + "a": rng.integers(0, 20, size=1000), + "b": rng.integers(0, 5, size=1000), + "x": rng.normal(size=1000), } ) @@ -371,11 +376,12 @@ def test_groupby_string_index_name(myindex): ], ) def test_groupby_split_out_multiindex(agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 100), - "b": np.random.randint(0, 5, 100), - "c": np.random.random(100), + "a": rng.integers(0, 10, 100), + "b": rng.integers(0, 5, 100), + "c": rng.random(100), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -419,12 +425,13 @@ def test_groupby_multiindex_reset_index(npartitions): ], ) def test_groupby_reset_index_multiindex(groupby_keys, agg_func): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( { - "a": np.random.randint(0, 10, 10), - "b": np.random.randint(0, 5, 10), - "c": np.random.randint(0, 5, 10), - "dd": np.random.randint(0, 5, 10), + "a": rng.integers(0, 10, 10), + "b": rng.integers(0, 5, 10), + "c": rng.integers(0, 5, 10), + "dd": rng.integers(0, 5, 10), } ) ddf = dask_cudf.from_cudf(df, 5) @@ -437,8 +444,9 @@ def test_groupby_reset_index_multiindex(groupby_keys, agg_func): def test_groupby_reset_index_drop_True(): + rng = np.random.default_rng(seed=0) df = cudf.DataFrame( - {"a": np.random.randint(0, 10, 10), "b": np.random.randint(0, 5, 10)} + {"a": rng.integers(0, 10, 10), "b": rng.integers(0, 5, 10)} ) ddf = dask_cudf.from_cudf(df, 5) pddf = dd.from_pandas(df.to_pandas(), 5) @@ -552,10 +560,22 @@ def test_groupby_categorical_key(): ), ], ) +@pytest.mark.parametrize( + "fused", + [ + True, + pytest.param( + False, + marks=require_dask_expr("Not supported by legacy API"), + ), + ], +) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) -def test_groupby_agg_params(npartitions, split_every, split_out, as_index): +def test_groupby_agg_params( + npartitions, split_every, split_out, fused, as_index +): df = cudf.datasets.randomdata( nrows=150, dtypes={"name": str, "a": int, "b": int, "c": float}, @@ -570,6 +590,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "c": ["mean", "std", "var"], } + fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") @@ -589,6 +610,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, 
as_index): ddf.groupby(["name", "a"], sort=True, **maybe_as_index) .aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) .compute() @@ -610,6 +632,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): # Full check (`sort=False`) gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate( agg_dict, + **fused_kwarg, **split_kwargs, ) pr = pddf.groupby(["name", "a"], sort=False).agg( @@ -653,10 +676,11 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index): "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) def test_groupby_agg_redirect(aggregations): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -758,10 +782,11 @@ def test_groupby_with_list_of_series(): ], ) def test_groupby_nested_dict(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) @@ -794,10 +819,11 @@ def test_groupby_nested_dict(func): ], ) def test_groupby_all_columns(func): + rng = np.random.default_rng(seed=0) pdf = pd.DataFrame( { - "x": np.random.randint(0, 5, size=10000), - "y": np.random.normal(size=10000), + "x": rng.integers(0, 5, size=10000), + "y": rng.normal(size=10000), } ) diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py index 3e078c47cdd..61d0f8d7eb9 100644 --- a/python/dask_cudf/dask_cudf/tests/test_join.py +++ b/python/dask_cudf/dask_cudf/tests/test_join.py @@ -22,18 +22,18 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys): chunksize = 50 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows), } ) @@ -84,18 +84,18 @@ def gather(df, grows): def test_join_left(left_nrows, right_nrows, left_nkeys, right_nkeys, how): chunksize = 50 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -153,20 +153,20 @@ def test_merge_left( ): chunksize = 3 - np.random.seed(0) + rng = np.random.default_rng(seed=42) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), - "y": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), + "y": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), - "y": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), + "y": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, 
dtype=np.float64), } ) @@ -200,18 +200,18 @@ def test_merge_1col_left( ): chunksize = 3 - np.random.seed(0) + rng = np.random.default_rng(seed=0) # cuDF left = cudf.DataFrame( { - "x": np.random.randint(0, left_nkeys, size=left_nrows), + "x": rng.integers(0, left_nkeys, size=left_nrows), "a": np.arange(left_nrows, dtype=np.float64), } ) right = cudf.DataFrame( { - "x": np.random.randint(0, right_nkeys, size=right_nrows), + "x": rng.integers(0, right_nkeys, size=right_nrows), "a": 1000 * np.arange(right_nrows, dtype=np.float64), } ) @@ -238,13 +238,19 @@ def test_merge_1col_left( def test_merge_should_fail(): # Expected failure cases described in #2694 - df1 = cudf.DataFrame() - df1["a"] = [1, 2, 3, 4, 5, 6] * 2 - df1["b"] = np.random.randint(0, 12, 12) - - df2 = cudf.DataFrame() - df2["a"] = [7, 2, 3, 8, 5, 9] * 2 - df2["c"] = np.random.randint(0, 12, 12) + rng = np.random.default_rng(seed=0) + df1 = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6] * 2, + "b": rng.integers(0, 12, 12), + } + ) + df2 = pd.DataFrame( + { + "a": [7, 2, 3, 8, 5, 9] * 2, + "c": rng.integers(0, 12, 12), + } + ) left = dask_cudf.from_cudf(df1, 1).groupby("a").b.min().to_frame() right = dask_cudf.from_cudf(df2, 1).groupby("a").c.min().to_frame() @@ -257,7 +263,7 @@ def test_merge_should_fail(): left.merge(right, how="left", on=["c"]) # Same column names - df2["b"] = np.random.randint(0, 12, 12) + df2["b"] = np.random.default_rng(seed=0).integers(0, 12, 12) right = dask_cudf.from_cudf(df2, 1).groupby("a").b.min().to_frame() with pytest.raises(KeyError): diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index d03e92319be..4351b672151 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -13,11 +13,11 @@ def _make_random_frame(nelem, npartitions=2): - np.random.seed(0) + rng = np.random.default_rng(seed=0) df = pd.DataFrame( { - "x": np.random.randint(0, 5, size=nelem), - "y": np.random.normal(size=nelem) + 1, + "x": rng.integers(0, 5, size=nelem), + "y": rng.normal(loc=1.0, scale=1.0, size=nelem), } ) gdf = cudf.DataFrame.from_pandas(df) diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 9bbbbc79561..02c815427f3 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -28,7 +28,7 @@ @pytest.mark.parametrize("nelem", [10, 500]) @pytest.mark.parametrize("nparts", [1, 10]) def test_sort_values(nelem, nparts, by, ascending): - np.random.seed(0) + _ = np.random.default_rng(seed=0) df = cudf.DataFrame() df["a"] = np.ascontiguousarray(np.arange(nelem)[::-1]) df["b"] = np.arange(100, nelem + 100) @@ -82,7 +82,7 @@ def test_sort_repartition(): ], ) def test_sort_values_with_nulls(data, by, ascending, na_position): - np.random.seed(0) + _ = np.random.default_rng(seed=0) cp.random.seed(0) df = cudf.DataFrame(data) ddf = dd.from_pandas(df, npartitions=5) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index cc0c6899804..9aaf6dc8420 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -19,8 +19,9 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): + rng = np.random.default_rng(seed=None) df = pd.DataFrame( - {"x": np.random.random(size=nelem), "y": np.random.random(size=nelem)} + {"x": rng.random(size=nelem), "y": rng.random(size=nelem)} ) if include_na: diff 
--git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 76e47b50c3b..862e8f36eaa 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numpy>=1.23,<3.0a0", - "pandas>=2.0,<2.2.3dev0", + "pandas>=2.0,<2.2.4dev0", "rapids-dask-dependency==24.12.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ @@ -69,52 +69,21 @@ version = {file = "dask_cudf/VERSION"} [tool.setuptools.packages.find] exclude = ["*tests*"] -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true +[tool.ruff] +extend = "../../pyproject.toml" -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", - "cudf", -] -known_first_party = [ - "dask_cudf", -] +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["dask_cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", -] +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm", "cudf"] [tool.pytest.ini_options] +addopts = "--tb=native --strict-config --strict-markers" +empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error::FutureWarning", "error::DeprecationWarning", @@ -124,4 +93,8 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", + # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 + # When the legacy implementation is removed, we can drop this filter and stop running pytest with `DASK_DATAFRAME__QUERY_PLANNING=False` + "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] +xfail_strict = true diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 0a8f5c4807d..5f9a04d3cee 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -22,6 +22,8 @@ project( LANGUAGES CXX ) +option(USE_NVCOMP_RUNTIME_WHEEL "Use the nvcomp wheel at runtime instead of the system library" OFF) + # Check if cudf is already available. If so, it is the user's responsibility to ensure that the # CMake package is also available at build time of the Python cudf package. find_package(cudf "${RAPIDS_VERSION}") @@ -39,14 +41,20 @@ set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) set(CUDF_BUILD_STREAMS_TEST_UTIL OFF) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(CUDF_EXPORT_NVCOMP OFF) +endif() set(CUDA_STATIC_RUNTIME ON) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) add_subdirectory(../../cpp cudf-cpp) -# Ensure other libraries needed by libcudf.so get installed alongside it.
-include(cmake/Modules/WheelHelpers.cmake) -install_aliased_imported_targets( - TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} -) +if(USE_NVCOMP_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nvcomp") + set_property( + TARGET cudf + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) +endif() diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 5bffe9fd96c..84660cbc276 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -37,6 +37,9 @@ classifiers = [ "Programming Language :: C++", "Environment :: GPU :: NVIDIA CUDA", ] +dependencies = [ + "nvidia-nvcomp==4.0.1", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/pylibcudf/LICENSE b/python/pylibcudf/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/pylibcudf/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index a7cb66d7b16..15dd2b4c34f 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -17,6 +17,7 @@ set(cython_sources binaryop.pyx column.pyx column_factories.pyx + contiguous_split.pyx concatenate.pyx copying.pyx datetime.pyx @@ -27,6 +28,7 @@ set(cython_sources groupby.pyx interop.pyx join.pyx + json.pyx labeling.pyx lists.pyx merge.pyx @@ -66,3 +68,4 @@ target_link_libraries(pylibcudf_interop PUBLIC nanoarrow) add_subdirectory(libcudf) add_subdirectory(strings) add_subdirectory(io) +add_subdirectory(nvtext) diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index a384edd456d..9bdfdab97c2 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -6,17 +6,21 @@ from . 
cimport ( binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, expressions, filling, groupby, + interop, join, + json, labeling, lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -50,6 +54,7 @@ __all__ = [ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", @@ -58,7 +63,9 @@ __all__ = [ "filling", "gpumemoryview", "groupby", + "interop", "join", + "json", "lists", "merge", "null_mask", @@ -78,4 +85,5 @@ __all__ = [ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 2a5365e8fad..4033062b7e2 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -15,6 +15,7 @@ binaryop, column_factories, concatenate, + contiguous_split, copying, datetime, experimental, @@ -24,10 +25,12 @@ interop, io, join, + json, labeling, lists, merge, null_mask, + nvtext, partitioning, quantiles, reduce, @@ -61,6 +64,7 @@ "aggregation", "binaryop", "column_factories", + "contiguous_split", "concatenate", "copying", "datetime", @@ -72,6 +76,7 @@ "interop", "io", "join", + "json", "labeling", "lists", "merge", @@ -92,4 +97,5 @@ "transpose", "types", "unary", + "nvtext", ] diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5f9d145139a..51b2b4cfaa3 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -52,33 +52,27 @@ cpdef Column binary_operation( if LeftBinaryOperand is Column and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + rhs.view(), + op, + output_type.c_obj ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: with nogil: - result = move( - cpp_binaryop.binary_operation( - lhs.view(), - dereference(rhs.c_obj), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + lhs.view(), + dereference(rhs.c_obj), + op, + output_type.c_obj ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: with nogil: - result = move( - cpp_binaryop.binary_operation( - dereference(lhs.c_obj), - rhs.view(), - op, - output_type.c_obj - ) + result = cpp_binaryop.binary_operation( + dereference(lhs.c_obj), + rhs.view(), + op, + output_type.c_obj ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index a37a12fc7e1..4e5698566d0 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -138,7 +138,7 @@ cdef class Column: cdef size_type null_count = libcudf_col.get().null_count() - cdef column_contents contents = move(libcudf_col.get().release()) + cdef column_contents contents = libcudf_col.get().release() # Note that when converting to cudf Column objects we'll need to pull # out the base object. 
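For orientation on the `binary_operation` overloads touched above, here is a minimal sketch of the Column/Column branch driven from Python; it assumes the `pylibcudf.interop` pyarrow helpers for building and reading back device columns:

```python
import pyarrow as pa
import pylibcudf as plc

# Two small INT32 device columns, built via the pyarrow interop helpers.
lhs = plc.interop.from_arrow(pa.array([1, 2, 3], type=pa.int32()))
rhs = plc.interop.from_arrow(pa.array([10, 20, 30], type=pa.int32()))

# Column/Column overload; Scalar operands dispatch to the other branches
# shown in the hunk above.
out = plc.binaryop.binary_operation(
    lhs,
    rhs,
    plc.binaryop.BinaryOperator.ADD,
    plc.types.DataType(plc.types.TypeId.INT32),
)
print(plc.interop.to_arrow(out))  # [11, 22, 33]
```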
@@ -247,7 +247,7 @@ cdef class Column: cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(c_scalar), size)) + c_result = make_column_from_scalar(dereference(c_scalar), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -269,7 +269,7 @@ cdef class Column: cdef Scalar slr = Scalar.empty_like(like) cdef unique_ptr[column] c_result with nogil: - c_result = move(make_column_from_scalar(dereference(slr.get()), size)) + c_result = make_column_from_scalar(dereference(slr.get()), size) return Column.from_libcudf(move(c_result)) @staticmethod @@ -373,7 +373,7 @@ cdef class Column: """Create a copy of the column.""" cdef unique_ptr[column] c_result with nogil: - c_result = move(make_unique[column](self.view())) + c_result = make_unique[column](self.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index fef02359240..d556085ab64 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -1,7 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.libcudf.types cimport mask_state from .column cimport Column from .types cimport DataType, size_type, type_id diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index e9085e3ea02..ac942a620b5 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -39,29 +39,17 @@ cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): if isinstance(type_or_id, TypeId): id = type_or_id with nogil: - result = move( - cpp_make_empty_column( - id - ) - ) + result = cpp_make_empty_column(id) else: raise TypeError( "Must pass a TypeId or DataType" ) elif MakeEmptyColumnOperand is DataType: with nogil: - result = move( - cpp_make_empty_column( - type_or_id.c_obj - ) - ) + result = cpp_make_empty_column(type_or_id.c_obj) elif MakeEmptyColumnOperand is type_id: with nogil: - result = move( - cpp_make_empty_column( - type_or_id - ) - ) + result = cpp_make_empty_column(type_or_id) else: raise TypeError( "Must pass a TypeId or DataType" @@ -92,12 +80,10 @@ cpdef Column make_numeric_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_numeric_column( - type_.c_obj, - size, - state - ) + result = cpp_make_numeric_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -121,12 +107,10 @@ cpdef Column make_fixed_point_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_point_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_point_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -151,12 +135,10 @@ cpdef Column make_timestamp_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_timestamp_column( - type_.c_obj, - size, - state - ) + result = cpp_make_timestamp_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -181,12 +163,10 @@ cpdef Column make_duration_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_duration_column( - type_.c_obj, - size, - state - ) + result 
= cpp_make_duration_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) @@ -211,12 +191,10 @@ cpdef Column make_fixed_width_column( else: raise TypeError("Invalid mask argument") with nogil: - result = move( - cpp_make_fixed_width_column( - type_.c_obj, - size, - state - ) + result = cpp_make_fixed_width_column( + type_.c_obj, + size, + state ) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 8bdcc086e0f..10c860d97bb 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -40,14 +40,14 @@ cpdef concatenate(list objects): c_tables.push_back((tbl).view()) with nogil: - c_tbl_result = move(cpp_concatenate.concatenate(c_tables)) + c_tbl_result = cpp_concatenate.concatenate(c_tables) return Table.from_libcudf(move(c_tbl_result)) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((column).view()) with nogil: - c_col_result = move(cpp_concatenate.concatenate(c_columns)) + c_col_result = cpp_concatenate.concatenate(c_columns) return Column.from_libcudf(move(c_col_result)) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd new file mode 100644 index 00000000000..2a10cb5b3d5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.libcudf.contiguous_split cimport packed_columns + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table + + +cdef class PackedColumns: + cdef unique_ptr[packed_columns] c_obj + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data) + +cpdef PackedColumns pack(Table input) + +cpdef Table unpack(PackedColumns input) + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx new file mode 100644 index 00000000000..ed926a3fcc0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -0,0 +1,198 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from cython.operator cimport dereference +from libc.stdint cimport uint8_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf.contiguous_split cimport ( + pack as cpp_pack, + packed_columns, + unpack as cpp_unpack, +) +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.libcudf.table.table_view cimport table_view + +from rmm.pylibrmm.device_buffer cimport DeviceBuffer + +from .gpumemoryview cimport gpumemoryview +from .table cimport Table +from .utils cimport int_to_void_ptr + + +cdef class HostBuffer: + """Owning host buffer that implements the buffer protocol""" + cdef unique_ptr[vector[uint8_t]] c_obj + cdef size_t nbytes + cdef Py_ssize_t[1] shape + cdef Py_ssize_t[1] strides + + @staticmethod + cdef HostBuffer from_unique_ptr( + unique_ptr[vector[uint8_t]] vec + ): + cdef HostBuffer out = HostBuffer() + out.c_obj = move(vec) + out.nbytes = dereference(out.c_obj).size() + out.shape[0] = out.nbytes + out.strides[0] = 1 + return out + + def __getbuffer__(self, Py_buffer *buffer, int flags): + buffer.buf = dereference(self.c_obj).data() + buffer.format = NULL # byte + buffer.internal = NULL + buffer.itemsize = 1 + buffer.len = self.nbytes + buffer.ndim = 1 + buffer.obj = self + buffer.readonly = 0 + buffer.shape = self.shape + buffer.strides = self.strides + buffer.suboffsets = NULL + + def __releasebuffer__(self, Py_buffer *buffer): + pass + +cdef class PackedColumns: + """Column data in a serialized format. + + Contains data from an array of columns in two contiguous buffers: + one on host, which contains the table metadata, and one on device, + which contains the table data. + + For details, see :cpp:class:`cudf::packed_columns`. + """ + def __init__(self): + raise ValueError( + "PackedColumns should not be constructed directly. " + "Use one of the factories." + ) + + @staticmethod + cdef PackedColumns from_libcudf(unique_ptr[packed_columns] data): + """Create a Python PackedColumns from a libcudf packed_columns.""" + cdef PackedColumns out = PackedColumns.__new__(PackedColumns) + out.c_obj = move(data) + return out + + def release(self): + """Releases and returns the underlying serialized metadata and gpu data. + + Ownership of the memory is transferred to the returned buffers. After + this call, `self` is empty. + + Returns + ------- + memoryview (of a HostBuffer) + The serialized metadata as contiguous host memory. + gpumemoryview (of a rmm.DeviceBuffer) + The serialized gpu data as contiguous device memory. + """ + if not (dereference(self.c_obj).metadata and dereference(self.c_obj).gpu_data): + raise ValueError("Cannot release empty PackedColumns") + + return ( + memoryview( + HostBuffer.from_unique_ptr(move(dereference(self.c_obj).metadata)) + ), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(dereference(self.c_obj).gpu_data)) + ) + ) + + +cpdef PackedColumns pack(Table input): + """Deep-copy a table into a serialized contiguous memory format. + + Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized + data back into the table. + + Examples + -------- + >>> packed = pylibcudf.contiguous_split.pack(...) + >>> # Either unpack the whole `PackedColumns` at once. + >>> pylibcudf.contiguous_split.unpack(packed) + >>> # Or unpack the two serialized buffers in `PackedColumns`. + >>> metadata, gpu_data = packed.release() + >>> pylibcudf.contiguous_split.unpack_from_memoryviews(metadata, gpu_data) + + For details, see :cpp:func:`cudf::pack`.
+ + Parameters + ---------- + input : Table + Table to pack. + + Returns + ------- + PackedColumns + The packed columns. + """ + return PackedColumns.from_libcudf( + make_unique[packed_columns](cpp_pack(input.view())) + ) + + +cpdef Table unpack(PackedColumns input): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Unlike the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + input : PackedColumns + The packed columns to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + cdef table_view v = cpp_unpack(dereference(input.c_obj)) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see . + cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) + + +cpdef Table unpack_from_memoryviews(memoryview metadata, gpumemoryview gpu_data): + """Deserialize the result of `pack`. + + Copies the result of a serialized table into a table. + Unlike the libcudf C++ function, the returned table is a copy + of the serialized data. + + For details, see :cpp:func:`cudf::unpack`. + + Parameters + ---------- + metadata : memoryview + The packed metadata to unpack. + gpu_data : gpumemoryview + The packed gpu_data to unpack. + + Returns + ------- + Table + Copy of the packed columns. + """ + if metadata.nbytes == 0: + if gpu_data.__cuda_array_interface__["data"][0] != 0: + raise ValueError("Expected an empty gpu_data from unpacking an empty table") + return Table.from_libcudf(make_unique[table](table_view())) + + # Extract the raw data pointers + cdef const uint8_t[::1] _metadata = metadata + cdef const uint8_t* metadata_ptr = &_metadata[0] + cdef const uint8_t* gpu_data_ptr = int_to_void_ptr(gpu_data.ptr) + + cdef table_view v = cpp_unpack(metadata_ptr, gpu_data_ptr) + # Since `Table.from_table_view` doesn't support an arbitrary owning object, + # we copy the table, see .
+ cdef unique_ptr[table] t = make_unique[table](v) + return Table.from_libcudf(move(t)) diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index 9743119d92a..4938f1a3dda 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -67,13 +67,12 @@ cpdef Table gather( """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_copying.gather( - source_table.view(), - gather_map.view(), - bounds_policy - ) + c_result = cpp_copying.gather( + source_table.view(), + gather_map.view(), + bounds_policy ) + return Table.from_libcudf(move(c_result)) @@ -121,22 +120,18 @@ cpdef Table scatter( cdef vector[reference_wrapper[const scalar]] source_scalars if TableOrListOfScalars is Table: with nogil: - c_result = move( - cpp_copying.scatter( - source.view(), - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source.view(), + scatter_map.view(), + target_table.view(), ) else: source_scalars = _as_vector(source) with nogil: - c_result = move( - cpp_copying.scatter( - source_scalars, - scatter_map.view(), - target_table.view(), - ) + c_result = cpp_copying.scatter( + source_scalars, + scatter_map.view(), + target_table.view(), ) return Table.from_libcudf(move(c_result)) @@ -160,11 +155,11 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input): cdef unique_ptr[column] c_col_result if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.empty_like(input.view())) + c_col_result = cpp_copying.empty_like(input.view()) return Column.from_libcudf(move(c_col_result)) else: with nogil: - c_tbl_result = move(cpp_copying.empty_like(input.view())) + c_tbl_result = cpp_copying.empty_like(input.view()) return Table.from_libcudf(move(c_tbl_result)) @@ -195,13 +190,11 @@ cpdef Column allocate_like( cdef size_type c_size = size if size is not None else input_column.size() with nogil: - c_result = move( - cpp_copying.allocate_like( + c_result = cpp_copying.allocate_like( input_column.view(), c_size, policy, ) - ) return Column.from_libcudf(move(c_result)) @@ -298,12 +291,12 @@ cpdef Column copy_range( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_copying.copy_range( + c_result = cpp_copying.copy_range( input_column.view(), target_column.view(), input_begin, input_end, - target_begin) + target_begin ) return Column.from_libcudf(move(c_result)) @@ -337,13 +330,11 @@ cpdef Column shift(Column input, size_type offset, Scalar fill_value): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_copying.shift( + c_result = cpp_copying.shift( input.view(), offset, dereference(fill_value.c_obj) ) - ) return Column.from_libcudf(move(c_result)) @@ -378,7 +369,7 @@ cpdef list slice(ColumnOrTable input, list indices): cdef int i if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.slice(input.view(), c_indices)) + c_col_result = cpp_copying.slice(input.view(), c_indices) return [ Column.from_column_view(c_col_result[i], input) @@ -386,7 +377,7 @@ cpdef list slice(ColumnOrTable input, list indices): ] else: with nogil: - c_tbl_result = move(cpp_copying.slice(input.view(), c_indices)) + c_tbl_result = cpp_copying.slice(input.view(), c_indices) return [ Table.from_table_view(c_tbl_result[i], input) @@ -418,7 +409,7 @@ cpdef list split(ColumnOrTable input, list splits): if ColumnOrTable is Column: with nogil: - c_col_result = move(cpp_copying.split(input.view(), c_splits)) + c_col_result = cpp_copying.split(input.view(), c_splits) return 
[ Column.from_column_view(c_col_result[i], input) @@ -426,7 +417,7 @@ cpdef list split(ColumnOrTable input, list splits): ] else: with nogil: - c_tbl_result = move(cpp_copying.split(input.view(), c_splits)) + c_tbl_result = cpp_copying.split(input.view(), c_splits) return [ Table.from_table_view(c_tbl_result[i], input) @@ -472,29 +463,25 @@ cpdef Column copy_if_else( if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else(lhs.view(), rhs.view(), boolean_mask.view()) + result = cpp_copying.copy_if_else( + lhs.view(), + rhs.view(), + boolean_mask.view() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: with nogil: - result = move( - cpp_copying.copy_if_else( - lhs.view(), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + lhs.view(), dereference(rhs.c_obj), boolean_mask.view() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), rhs.view(), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), rhs.view(), boolean_mask.view() ) else: with nogil: - result = move( - cpp_copying.copy_if_else( - dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() - ) + result = cpp_copying.copy_if_else( + dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view() ) return Column.from_libcudf(move(result)) @@ -541,22 +528,18 @@ cpdef Table boolean_mask_scatter( if TableOrListOfScalars is Table: with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - input.view(), - target.view(), - boolean_mask.view() - ) + result = cpp_copying.boolean_mask_scatter( + input.view(), + target.view(), + boolean_mask.view() ) else: source_scalars = _as_vector(input) with nogil: - result = move( - cpp_copying.boolean_mask_scatter( - source_scalars, - target.view(), - boolean_mask.view(), - ) + result = cpp_copying.boolean_mask_scatter( + source_scalars, + target.view(), + boolean_mask.view(), ) return Table.from_libcudf(move(result)) @@ -586,8 +569,6 @@ cpdef Scalar get_element(Column input_column, size_type index): """ cdef unique_ptr[scalar] c_output with nogil: - c_output = move( - cpp_copying.get_element(input_column.view(), index) - ) + c_output = cpp_copying.get_element(input_column.view(), index) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 2fce48cf1b4..72ce680ba7a 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,8 +1,15 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
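Taken together, the new `contiguous_split.pyx` bindings above give Python callers a pack/release/unpack round trip. A minimal sketch of that flow, assuming pyarrow is available and using `pylibcudf.interop.from_arrow` for input conversion (the column values are hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    # pack() deep-copies the table into one host (metadata) buffer and
    # one contiguous device (data) buffer.
    tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 3]}))
    packed = plc.contiguous_split.pack(tbl)

    # release() transfers ownership of both buffers to the caller and
    # leaves `packed` empty; HostBuffer's buffer protocol is what lets
    # the metadata travel as a plain memoryview.
    metadata, gpu_data = packed.release()

    # unpack_from_memoryviews() rebuilds a table (as a copy) from the
    # two buffers.
    unpacked = plc.contiguous_split.unpack_from_memoryviews(metadata, gpu_data)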
+from pylibcudf.libcudf.datetime cimport datetime_component + from .column cimport Column cpdef Column extract_year( Column col ) + +cpdef Column extract_datetime_component( + Column col, + datetime_component component +) diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index e8e0caaf42d..ac4335cca56 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -3,19 +3,14 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.datetime cimport ( - day_of_year as cpp_day_of_year, - extract_day as cpp_extract_day, - extract_hour as cpp_extract_hour, - extract_microsecond_fraction as cpp_extract_microsecond_fraction, - extract_millisecond_fraction as cpp_extract_millisecond_fraction, - extract_minute as cpp_extract_minute, - extract_month as cpp_extract_month, - extract_nanosecond_fraction as cpp_extract_nanosecond_fraction, - extract_second as cpp_extract_second, - extract_weekday as cpp_extract_weekday, + datetime_component, + extract_datetime_component as cpp_extract_datetime_component, extract_year as cpp_extract_year, ) +from pylibcudf.libcudf.datetime import \ + datetime_component as DatetimeComponent # no-cython-lint + from .column cimport Column @@ -38,44 +33,32 @@ cpdef Column extract_year( cdef unique_ptr[column] result with nogil: - result = move(cpp_extract_year(values.view())) + result = cpp_extract_year(values.view()) return Column.from_libcudf(move(result)) +cpdef Column extract_datetime_component( + Column values, + datetime_component component +): + """ + Extract a datetime component from a datetime column. -def extract_datetime_component(Column col, str field): + For details, see :cpp:func:`cudf::extract_datetime_component`. - cdef unique_ptr[column] c_result + Parameters + ---------- + values : Column + The column to extract the component from. + component : DatetimeComponent + The datetime component to extract. - with nogil: - if field == "year": - c_result = move(cpp_extract_year(col.view())) - elif field == "month": - c_result = move(cpp_extract_month(col.view())) - elif field == "day": - c_result = move(cpp_extract_day(col.view())) - elif field == "weekday": - c_result = move(cpp_extract_weekday(col.view())) - elif field == "hour": - c_result = move(cpp_extract_hour(col.view())) - elif field == "minute": - c_result = move(cpp_extract_minute(col.view())) - elif field == "second": - c_result = move(cpp_extract_second(col.view())) - elif field == "millisecond": - c_result = move( - cpp_extract_millisecond_fraction(col.view()) - ) - elif field == "microsecond": - c_result = move( - cpp_extract_microsecond_fraction(col.view()) - ) - elif field == "nanosecond": - c_result = move( - cpp_extract_nanosecond_fraction(col.view()) - ) - elif field == "day_of_year": - c_result = move(cpp_day_of_year(col.view())) - else: - raise ValueError(f"Invalid datetime field: '{field}'") + Returns + ------- + Column + Column with the extracted component. 
+ """ + cdef unique_ptr[column] result - return Column.from_libcudf(move(c_result)) + with nogil: + result = cpp_extract_datetime_component(values.view(), component) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 61b430e64aa..0372e1132cc 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -48,13 +48,11 @@ cpdef Column fill( cdef unique_ptr[column] result with nogil: - result = move( - cpp_fill( - destination.view(), - begin, - end, - dereference(( value).c_obj) - ) + result = cpp_fill( + destination.view(), + begin, + end, + dereference(( value).c_obj) ) return Column.from_libcudf(move(result)) @@ -112,12 +110,10 @@ cpdef Column sequence(size_type size, Scalar init, Scalar step): cdef unique_ptr[column] result cdef size_type c_size = size with nogil: - result = move( - cpp_sequence( - c_size, - dereference(init.c_obj), - dereference(step.c_obj), - ) + result = cpp_sequence( + c_size, + dereference(init.c_obj), + dereference(step.c_obj), ) return Column.from_libcudf(move(result)) @@ -152,18 +148,14 @@ cpdef Table repeat( if ColumnOrSize is Column: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count.view() - ) + result = cpp_repeat( + input_table.view(), + count.view() ) if ColumnOrSize is size_type: with nogil: - result = move( - cpp_repeat( - input_table.view(), - count - ) + result = cpp_repeat( + input_table.view(), + count ) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index afb95dba5b3..71f9ecb0453 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -176,7 +176,7 @@ cdef class GroupBy: # We rely on libcudf to tell us this rather than checking the types beforehand # ourselves. 
with nogil: - c_res = move(dereference(self.c_obj).aggregate(c_requests)) + c_res = dereference(self.c_obj).aggregate(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple scan(self, list requests): @@ -205,7 +205,7 @@ cdef class GroupBy: cdef pair[unique_ptr[table], vector[aggregation_result]] c_res with nogil: - c_res = move(dereference(self.c_obj).scan(c_requests)) + c_res = dereference(self.c_obj).scan(c_requests) return GroupBy._parse_outputs(move(c_res)) cpdef tuple shift(self, Table values, list offset, list fill_values): @@ -234,10 +234,11 @@ cdef class GroupBy: cdef vector[size_type] c_offset = offset cdef pair[unique_ptr[table], unique_ptr[table]] c_res with nogil: - c_res = move( - dereference(self.c_obj).shift(values.view(), c_offset, c_fill_values) + c_res = dereference(self.c_obj).shift( + values.view(), + c_offset, + c_fill_values ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), @@ -264,10 +265,10 @@ cdef class GroupBy: cdef pair[unique_ptr[table], unique_ptr[table]] c_res cdef vector[replace_policy] c_replace_policies = replace_policies with nogil: - c_res = move( - dereference(self.c_obj).replace_nulls(value.view(), c_replace_policies) + c_res = dereference(self.c_obj).replace_nulls( + value.view(), + c_replace_policies ) - return ( Table.from_libcudf(move(c_res.first)), Table.from_libcudf(move(c_res.second)), diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd new file mode 100644 index 00000000000..2a0a8c15fdd --- /dev/null +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.table cimport Table + + +cpdef Table from_dlpack(object managed_tensor) + +cpdef object to_dlpack(Table input) diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index 1a03fa5b45b..61e812353b7 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -1,6 +1,11 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New +from cpython.pycapsule cimport ( + PyCapsule_GetPointer, + PyCapsule_IsValid, + PyCapsule_New, + PyCapsule_SetName, +) from libc.stdlib cimport free from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,11 +21,14 @@ from pylibcudf.libcudf.interop cimport ( ArrowArray, ArrowArrayStream, ArrowSchema, + DLManagedTensor, column_metadata, from_arrow_column as cpp_from_arrow_column, from_arrow_stream as cpp_from_arrow_stream, + from_dlpack as cpp_from_dlpack, to_arrow_host_raw, to_arrow_schema_raw, + to_dlpack as cpp_to_dlpack, ) from pylibcudf.libcudf.table.table cimport table @@ -131,7 +139,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[table] c_result with nogil: # The libcudf function here will release the stream. - c_result = move(cpp_from_arrow_stream(c_stream)) + c_result = cpp_from_arrow_stream(c_stream) return Table.from_libcudf(move(c_result)) @@ -166,7 +174,7 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_from_arrow_column(c_schema, c_array)) + c_result = cpp_from_arrow_column(c_schema, c_array) # The capsule destructors should release automatically for us, but we # choose to do it explicitly here for clarity. 
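The `datetime.pyx` change above replaces the string-dispatched `extract_datetime_component` with a binding driven by the new `datetime_component` enum. A small sketch of the new calling convention, assuming pyarrow-based conversion helpers (input values hypothetical):

    import datetime
    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(
        pa.array([datetime.datetime(2024, 10, 9, 12, 34, 56)])
    )

    # Components are now selected with DatetimeComponent members rather
    # than free-form strings such as "year".
    years = plc.datetime.extract_datetime_component(
        col, plc.datetime.DatetimeComponent.YEAR
    )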
@@ -315,3 +323,87 @@ def _to_arrow_scalar(cudf_object, metadata=None):
     # Note that metadata for scalars is primarily important for preserving
     # information on nested types since names are otherwise irrelevant.
     return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0]
+
+
+cpdef Table from_dlpack(object managed_tensor):
+    """
+    Convert a DLPack DLTensor into a cudf table.
+
+    For details, see :cpp:func:`cudf::from_dlpack`
+
+    Parameters
+    ----------
+    managed_tensor : PyCapsule
+        A 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    Table
+        Table with a copy of the tensor data.
+    """
+    if not PyCapsule_IsValid(managed_tensor, "dltensor"):
+        raise ValueError("Invalid PyCapsule object")
+    cdef unique_ptr[table] c_result
+    cdef DLManagedTensor* dlpack_tensor = <DLManagedTensor*>PyCapsule_GetPointer(
+        managed_tensor, "dltensor"
+    )
+    if dlpack_tensor is NULL:
+        raise ValueError("PyCapsule object contained a NULL pointer")
+    PyCapsule_SetName(managed_tensor, "used_dltensor")
+
+    # Note: A copy is always performed when converting the dlpack
+    # data to a libcudf table. We also delete the dlpack_tensor pointer
+    # as the pointer is not deleted by libcudf's from_dlpack function.
+    # TODO: https://github.com/rapidsai/cudf/issues/10874
+    # TODO: https://github.com/rapidsai/cudf/issues/10849
+    with nogil:
+        c_result = cpp_from_dlpack(dlpack_tensor)
+
+    cdef Table result = Table.from_libcudf(move(c_result))
+    dlpack_tensor.deleter(dlpack_tensor)
+    return result
+
+
+cpdef object to_dlpack(Table input):
+    """
+    Convert a cudf table into a DLPack DLTensor.
+
+    For details, see :cpp:func:`cudf::to_dlpack`
+
+    Parameters
+    ----------
+    input : Table
+        Table to convert to a 1D or 2D column-major (Fortran order) tensor.
+
+    Returns
+    -------
+    PyCapsule
+        1D or 2D DLPack tensor with a copy of the table data, or nullptr.
+    """
+    for col in input._columns:
+        if col.null_count():
+            raise ValueError(
+                "Cannot create a DLPack tensor with null values. "
+                "Input is required to have null count as zero."
+ ) + cdef DLManagedTensor *dlpack_tensor + + with nogil: + dlpack_tensor = cpp_to_dlpack(input.view()) + + return PyCapsule_New( + dlpack_tensor, + "dltensor", + dlmanaged_tensor_pycapsule_deleter + ) + + +cdef void dlmanaged_tensor_pycapsule_deleter(object pycap_obj) noexcept: + if PyCapsule_IsValid(pycap_obj, "used_dltensor"): + # we do not call a used capsule's deleter + return + cdef DLManagedTensor* dlpack_tensor = PyCapsule_GetPointer( + pycap_obj, "dltensor" + ) + if dlpack_tensor is not NULL: + dlpack_tensor.deleter(dlpack_tensor) diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 438b0ff1634..fe765b34f82 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -45,7 +45,7 @@ cpdef TableWithMetadata read_avro( for col in columns: c_columns.push_back(str(col).encode()) - cdef avro_reader_options avro_opts = move( + cdef avro_reader_options avro_opts = ( avro_reader_options.builder(source_info.c_obj) .columns(c_columns) .skip_rows(skip_rows) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index b53d6771cd6..2c61cc42d82 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -168,7 +168,7 @@ def read_csv( cdef vector[data_type] c_dtypes_list cdef map[string, data_type] c_dtypes_map - cdef csv_reader_options options = move( + cdef csv_reader_options options = ( csv_reader_options.builder(source_info.c_obj) .compression(compression) .mangle_dupe_cols(mangle_dupe_cols) diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index 29e49083bc6..65f78f830f1 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -59,7 +59,7 @@ cdef json_reader_options _setup_json_reader_options( json_recovery_mode_t recovery_mode): cdef vector[data_type] types_vec - cdef json_reader_options opts = move( + cdef json_reader_options opts = ( json_reader_options.builder(source_info.c_obj) .compression(compression) .lines(lines) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx index 01a5e4b04a1..70e0a7995a2 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyx +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -252,7 +252,7 @@ cpdef TableWithMetadata read_orc( """ cdef orc_reader_options opts cdef vector[vector[size_type]] c_stripes - opts = move( + opts = ( orc_reader_options.builder(source_info.c_obj) .use_index(use_index) .build() diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index e02239d7252..f120b65fb2c 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -33,11 +33,9 @@ cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): cdef string c_tzname = timezone_name.encode() with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) + c_result = cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 25664286f19..bc72647ea8e 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport 
null_equality -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column from .table cimport Table @@ -212,5 +212,5 @@ cpdef Table cross_join(Table left, Table right): """ cdef unique_ptr[table] result with nogil: - result = move(cpp_join.cross_join(left.view(), right.view())) + result = cpp_join.cross_join(left.view(), right.view()) return Table.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd new file mode 100644 index 00000000000..87a87349b8a --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.json cimport get_json_object_options +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + cdef get_json_object_options options + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=* +) diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx new file mode 100644 index 00000000000..ebb82f80408 --- /dev/null +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf cimport json as cpp_json +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.scalar cimport Scalar + + +cdef class GetJsonObjectOptions: + """Settings for ``get_json_object()``""" + def __init__( + self, + *, + allow_single_quotes=False, + strip_quotes_from_single_strings=True, + missing_fields_as_nulls=False + ): + self.set_allow_single_quotes(allow_single_quotes) + self.set_strip_quotes_from_single_strings( + strip_quotes_from_single_strings + ) + self.set_missing_fields_as_nulls(missing_fields_as_nulls) + + def get_allow_single_quotes(self): + """ + Returns true/false depending on whether single-quotes for representing strings + are allowed. + + Returns + ------- + bool + true if single-quotes are allowed, false otherwise. + """ + return self.options.get_allow_single_quotes() + + def get_strip_quotes_from_single_strings(self): + """ + Returns true/false depending on whether individually returned string values have + their quotes stripped. + + Returns + ------- + bool + true if individually returned string values have their quotes stripped. + """ + return self.options.get_strip_quotes_from_single_strings() + + def get_missing_fields_as_nulls(self): + """ + Whether a field not contained by an object is to be interpreted as null. + + Returns + ------- + bool + true if missing fields are interpreted as null. + """ + return self.options.get_missing_fields_as_nulls() + + def set_allow_single_quotes(self, bool val): + """ + Set whether single-quotes for strings are allowed. + + Parameters + ---------- + val : bool + Whether to allow single quotes + + Returns + ------- + None + """ + self.options.set_allow_single_quotes(val) + + def set_strip_quotes_from_single_strings(self, bool val): + """ + Set whether individually returned string values have their quotes stripped. + + Parameters + ---------- + val : bool + Whether to strip quotes from single strings. 
+ + Returns + ------- + None + """ + self.options.set_strip_quotes_from_single_strings(val) + + def set_missing_fields_as_nulls(self, bool val): + """ + Set whether missing fields are interpreted as null. + + Parameters + ---------- + val : bool + Whether to treat missing fields as nulls. + + Returns + ------- + None + """ + self.options.set_missing_fields_as_nulls(val) + + +cpdef Column get_json_object( + Column col, + Scalar json_path, + GetJsonObjectOptions options=None +): + """ + Apply a JSONPath string to all rows in an input strings column. + + For details, see :cpp:func:`cudf::get_json_object` + + Parameters + ---------- + col : Column + The input strings column. Each row must contain a valid json string. + + json_path : Scalar + The JSONPath string to be applied to each row. + + options : GetJsonObjectOptions + Options for controlling the behavior of the function. + + Returns + ------- + Column + New strings column containing the retrieved json object strings. + """ + cdef unique_ptr[column] c_result + cdef string_scalar* c_json_path = ( + json_path.c_obj.get() + ) + if options is None: + options = GetJsonObjectOptions() + + cdef cpp_json.get_json_object_options c_options = options.options + + with nogil: + c_result = cpp_json.get_json_object( + col.view(), + dereference(c_json_path), + c_options + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index b3f6a92d85c..226a9e14172 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -54,14 +54,12 @@ cpdef Column label_bins( ) with nogil: - c_result = move( - cpp_labeling.label_bins( - input.view(), - left_edges.view(), - c_left_inclusive, - right_edges.view(), - c_right_inclusive, - ) + c_result = cpp_labeling.label_bins( + input.view(), + left_edges.view(), + c_left_inclusive, + right_edges.view(), + c_right_inclusive, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt index 2167616690f..15beaee47d4 100644 --- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt @@ -12,8 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx - replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx +set(cython_sources + aggregation.pyx binaryop.pyx copying.pyx datetime.pyx expressions.pyx labeling.pyx reduce.pyx + replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index 7a369701bbd..76f35cbba71 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport ( ) from pylibcudf.libcudf.types cimport data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index f1a326bcd40..b2388858127 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.types cimport ( type_id, ) -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 92f5a185a54..def292148c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -4,9 +4,9 @@ from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.table.table cimport table, table_view -from pylibcudf.libcudf.utilities.host_span cimport host_span +from pylibcudf.libcudf.utilities.span cimport host_span -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index cadac6a0022..12090af16cc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -6,7 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: @@ -26,3 +26,8 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: cdef packed_columns pack (const table_view& input) except + cdef table_view unpack (const packed_columns& input) except + + + cdef table_view unpack ( + const uint8_t* metadata, + const uint8_t* gpu_data + ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 4d4a4ba9b89..e6e719d6436 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ 
-16,7 +16,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a4465343197..73cdfb96af5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint8_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -7,6 +8,18 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: + cpdef enum class datetime_component(uint8_t): + YEAR + MONTH + DAY + WEEKDAY + HOUR + MINUTE + SECOND + MILLISECOND + MICROSECOND + NANOSECOND + cdef unique_ptr[column] extract_year(const column_view& column) except + cdef unique_ptr[column] extract_month(const column_view& column) except + cdef unique_ptr[column] extract_day(const column_view& column) except + @@ -23,6 +36,10 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_nanosecond_fraction( const column_view& column ) except + + cdef unique_ptr[column] extract_datetime_component( + const column_view& column, + datetime_component component + ) except + ctypedef enum rounding_frequency "cudf::datetime::rounding_frequency": DAY "cudf::datetime::rounding_frequency::DAY" diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pyx b/python/pylibcudf/pylibcudf/libcudf/datetime.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd new file mode 100644 index 00000000000..e55574020f4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/fixed_point/fixed_point.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
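For the DLPack interop added in `interop.pyx` above, a round trip looks like the following sketch; it assumes a table whose columns share one numeric type and contain no nulls, which is what `to_dlpack` requires (values hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({
        "x": pa.array([1.0, 2.0], type=pa.float64()),
        "y": pa.array([3.0, 4.0], type=pa.float64()),
    }))

    # to_dlpack() wraps the copied data in a capsule named "dltensor";
    # from_dlpack() consumes it and renames it "used_dltensor" so the
    # deleter is not run twice.
    capsule = plc.interop.to_dlpack(tbl)
    roundtrip = plc.interop.from_dlpack(capsule)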
+ +from libc.stdint cimport int32_t + + +cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + cdef cppclass scale_type: + scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 848462131fe..17ea33a2066 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -22,7 +22,6 @@ from pylibcudf.libcudf.types cimport ( size_type, sorted, ) -from pylibcudf.libcudf.utilities.host_span cimport host_span # workaround for https://github.com/cython/cython/issues/3885 ctypedef const scalar constscalar diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index 30b97fdec34..b75e9ca7001 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -32,11 +32,13 @@ cdef extern from "cudf/interop.hpp" nogil: cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: - cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor - ) except + + cdef unique_ptr[table] from_dlpack( + const DLManagedTensor* managed_tensor + ) except + - DLManagedTensor* to_dlpack(table_view input_table - ) except + + DLManagedTensor* to_dlpack( + const table_view& input + ) except + cdef cppclass column_metadata: column_metadata() except + diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 6f6c145b23c..21033a0284e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm._lib.device_uvector cimport device_uvector +from rmm.librmm.device_uvector cimport device_uvector ctypedef unique_ptr[device_uvector[size_type]] gather_map_type ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/strings/json.pxd rename to python/pylibcudf/pylibcudf/libcudf/json.pxd diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 5f582091b06..27af4a3bdb1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index f2dd22f43aa..41250037dcf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,13 +1,21 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. 
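The new `json.pyx` above exposes JSONPath queries with configurable parsing. A short sketch of `get_json_object` with `GetJsonObjectOptions`, again assuming pyarrow conversion helpers (input rows hypothetical):

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["{'a': 1}", "{'a': 2}"]))
    path = plc.interop.from_arrow(pa.scalar("$.a"))

    # Single-quoted JSON is rejected by default; opt in via the options.
    options = plc.json.GetJsonObjectOptions(allow_single_quotes=True)
    result = plc.json.get_json_object(col, path, options)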
+from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: + cdef unique_ptr[column] minhash( + const column_view &strings, + const numeric_scalar[uint32_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] minhash( const column_view &strings, const column_view &seeds, @@ -20,6 +28,12 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64( + const column_view &strings, + const numeric_scalar[uint64_t] seed, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index 673bffa28ae..be3a2d75718 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -8,9 +9,9 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: - ctypedef enum letter_type: - CONSONANT 'nvtext::letter_type::CONSONANT' - VOWEL 'nvtext::letter_type::VOWEL' + cpdef enum class letter_type: + CONSONANT + VOWEL cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd index 4b40a8a26f6..a51413669c5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd @@ -4,9 +4,9 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool from libcpp.string cimport string from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type -from pylibcudf.libcudf.wrappers.decimals cimport scale_type cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt index b8b4343173e..f5f2113332a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources char_types.pyx regex_flags.pyx side_type.pyx translate.pyx) +set(cython_sources char_types.pyx combine.pyx regex_flags.pyx side_type.pyx translate.pyx) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index e4c9fa5817a..e659993b834 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from libcpp cimport int from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view @@ -9,21 +10,29 @@ from pylibcudf.libcudf.table.table_view cimport table_view cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: - ctypedef enum separator_on_nulls: - YES 'cudf::strings::separator_on_nulls::YES' - NO 'cudf::strings::separator_on_nulls::NO' + cpdef enum class separator_on_nulls(int): + YES + NO - ctypedef enum output_if_empty_list: - EMPTY_STRING 'cudf::strings::output_if_empty_list::EMPTY_STRING' - NULL_ELEMENT 'cudf::strings::output_if_empty_list::NULL_ELEMENT' + cpdef enum class output_if_empty_list(int): + EMPTY_STRING + NULL_ELEMENT cdef unique_ptr[column] concatenate( - table_view source_strings, + table_view strings_columns, string_scalar separator, - string_scalar narep) except + + string_scalar narep, + separator_on_nulls separate_nulls) except + + + cdef unique_ptr[column] concatenate( + table_view strings_columns, + column_view separators, + string_scalar separator_narep, + string_scalar col_narep, + separator_on_nulls separate_nulls) except + cdef unique_ptr[column] join_strings( - column_view source_strings, + column_view input, string_scalar separator, string_scalar narep) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pyx new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index 83a9573baad..e6688cfff81 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -8,10 +8,10 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_booleans( - column_view input_col, + column_view input, string_scalar true_string) except + cdef unique_ptr[column] from_booleans( - column_view input_col, + column_view booleans, string_scalar true_string, string_scalar false_string) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index fa8975c4df9..fceddd58df0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -10,14 +10,14 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_timestamps( - column_view input_col, + column_view input, data_type 
timestamp_type, string format) except + cdef unique_ptr[column] from_timestamps( - column_view input_col, + column_view timestamps, string format, - column_view input_strings_names) except + + column_view names) except + cdef unique_ptr[column] is_timestamp( column_view input_col, diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index ebe10574353..43ffad1d89f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_durations( - const column_view & strings_col, + const column_view & input, data_type duration_type, const string & format) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index 6f820f3c9a4..72ab329f2dd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -9,13 +9,13 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_fixed_point( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] from_fixed_point( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] is_fixed_point( - column_view source_strings, - data_type output_type + column_view input, + data_type decimal_type ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index f4fc4674506..a45c7f9979e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -9,12 +9,12 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_floats( - column_view input_col, + column_view strings, data_type output_type) except + cdef unique_ptr[column] from_floats( - column_view input_col) except + + column_view floats) except + cdef unique_ptr[column] is_float( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index f12aab0a2e4..69d566b8c49 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type @@ -9,23 +10,28 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] to_integers( - column_view input_col, - data_type output_type) except + + column_view input, + data_type output_type) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( - column_view input_col) except + + column_view integers) except +libcudf_exception_handler + + cdef unique_ptr[column] is_integer( + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( - column_view source_strings - ) except + + column_view input, + data_type int_type + ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( - column_view input_col, + column_view input, data_type output_type) except + cdef unique_ptr[column] is_hex( - column_view source_strings - ) except + + column_view input + ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( - column_view input_col) except + + column_view input) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index fe571cfced6..801db438e92 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -8,11 +8,11 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] integers_to_ipv4( - column_view input_col) except + + column_view integers) except + cdef unique_ptr[column] is_ipv4( - column_view source_strings + column_view input ) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 109111568d8..6e1ecd30539 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -9,6 +9,6 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] format_list_column( - column_view input_col, + column_view input, string_scalar na_rep, column_view separators) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index 5c07b698454..cb319ad143b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( - column_view input_col) except + + column_view input) except + cdef unique_ptr[column] url_decode( - column_view input_col) except + + column_view input) except + diff --git 
a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index 0491644a10a..3d048c1f50b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -9,5 +9,5 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ nogil: cdef unique_ptr[column] find_multiple( - column_view source_strings, + column_view input, column_view targets) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index e0a8b776465..0d286c36446 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -11,3 +11,7 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, regex_program prog) except + + + cdef unique_ptr[column] find_re( + column_view input, + regex_program prog) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 657fe61eb14..875f8cafd14 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -12,11 +12,11 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] pad( - column_view source_strings, + column_view input, size_type width, side_type side, string fill_char) except + cdef unique_ptr[column] zfill( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd index 40f0e2fa50c..6b0c90d0acc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd @@ -6,6 +6,7 @@ from libcpp.vector cimport vector from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.regex_flags cimport regex_flags from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type @@ -14,17 +15,18 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] replace_re( - column_view source_strings, - regex_program, - string_scalar repl, - size_type maxrepl) except + - - cdef unique_ptr[column] replace_with_backrefs( - column_view source_strings, - regex_program, - string repl) except + + column_view input, + regex_program prog, + string_scalar replacement, + size_type max_replace_count) except + cdef unique_ptr[column] replace_re( - column_view source_strings, + column_view input, vector[string] patterns, - column_view repls) except + + column_view replacements, + regex_flags flags) except + + + cdef unique_ptr[column] replace_with_backrefs( + column_view input, + regex_program prog, + string replacement) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd index 019ff3f17ba..e92c5dc1d66 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd @@ -1,12 +1,10 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. -from libc.stdint cimport int32_t +from libcpp cimport int cdef extern from "cudf/strings/side_type.hpp" namespace "cudf::strings" nogil: - cpdef enum class side_type(int32_t): - LEFT 'cudf::strings::side_type::LEFT' - RIGHT 'cudf::strings::side_type::RIGHT' - BOTH 'cudf::strings::side_type::BOTH' - -ctypedef int32_t underlying_type_t_side_type + cpdef enum class side_type(int): + LEFT + RIGHT + BOTH diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 4162e886a7d..4299cf62e99 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -12,9 +12,9 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] partition( - column_view source_strings, + column_view input, string_scalar delimiter) except + cdef unique_ptr[table] rpartition( - column_view source_strings, + column_view input, string_scalar delimiter) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 3046149aebb..a22a79fc7d7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -14,22 +14,22 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[table] rsplit( - column_view source_strings, + column_view strings_column, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] split_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record( - column_view source_strings, + column_view strings, string_scalar delimiter, size_type maxsplit) except + @@ -38,21 +38,21 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[table] split_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[table] rsplit_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] split_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + cdef unique_ptr[column] rsplit_record_re( - const column_view& source_strings, - regex_program, + const column_view& input, + regex_program prog, size_type maxsplit) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index b0ca771762d..dd527a78e7f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -10,6 +10,6 @@ from pylibcudf.libcudf.strings.side_type cimport side_type cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] strip( - column_view source_strings, - side_type stype, + column_view input, 
+ side_type side, string_scalar to_strip) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index c0053391328..abc1bd43ad2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -9,5 +9,5 @@ from pylibcudf.libcudf.types cimport size_type cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( - column_view source_strings, + column_view input, size_type width) except + diff --git a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd index 0c8fe1060ac..2eca043e451 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \ diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 38298a7c1f1..d21510bd731 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type -from rmm._lib.device_buffer cimport device_buffer +from rmm.librmm.device_buffer cimport device_buffer cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd similarity index 100% rename from python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd rename to python/pylibcudf/pylibcudf/libcudf/utilities/span.pxd diff --git a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd deleted file mode 100644 index 558299501d6..00000000000 --- a/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport int32_t, int64_t -from pylibcudf.libcudf.types cimport int128 - - -cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: - # cython type stub to help resolve to numeric::decimal64 - ctypedef int64_t decimal64 - # cython type stub to help resolve to numeric::decimal32 - ctypedef int64_t decimal32 - # cython type stub to help resolve to numeric::decimal128 - ctypedef int128 decimal128 - - cdef cppclass scale_type: - scale_type(int32_t) diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 6f82124d06e..ecaf62d6895 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -69,7 +69,7 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx)) + c_result = cpp_explode.explode_outer(input.view(), explode_column_idx) return Table.from_libcudf(move(c_result)) @@ -92,7 +92,7 @@ cpdef Column concatenate_rows(Table input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_rows(input.view())) + c_result = cpp_concatenate_rows(input.view()) return Column.from_libcudf(move(c_result)) @@ -123,10 +123,7 @@ cpdef Column concatenate_list_elements(Column input, bool dropna): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_concatenate_list_elements( - input.view(), - null_policy, - )) + c_result = cpp_concatenate_list_elements(input.view(), null_policy) return Column.from_libcudf(move(c_result)) @@ -161,12 +158,12 @@ cpdef Column contains(Column input, ColumnOrScalar search_key): raise TypeError("Must pass a Column or Scalar") with nogil: - c_result = move(cpp_contains.contains( + c_result = cpp_contains.contains( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), - )) + ) return Column.from_libcudf(move(c_result)) @@ -190,7 +187,7 @@ cpdef Column contains_nulls(Column input): cdef unique_ptr[column] c_result cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_contains.contains_nulls(list_view.view())) + c_result = cpp_contains.contains_nulls(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -229,13 +226,13 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o ) with nogil: - c_result = move(cpp_contains.index_of( + c_result = cpp_contains.index_of( list_view.view(), search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), find_option, - )) + ) return Column.from_libcudf(move(c_result)) @@ -258,9 +255,7 @@ cpdef Column reverse(Column input): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_reverse.reverse( - list_view.view(), - )) + c_result = cpp_reverse.reverse(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -288,10 +283,10 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cdef ListColumnView list_view2 = gather_map_list.list_view() with nogil: - c_result = move(cpp_gather.segmented_gather( + c_result = cpp_gather.segmented_gather( list_view1.view(), list_view2.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -316,10 +311,10 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index): cdef ListColumnView list_view = input.list_view() with nogil: - c_result = move(cpp_extract_list_element( + c_result = cpp_extract_list_element( list_view.view(), 
index.view() if ColumnOrSizeType is Column else index, - )) + ) return Column.from_libcudf(move(c_result)) @@ -344,7 +339,7 @@ cpdef Column count_elements(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_count_elements(list_view.view())) + c_result = cpp_count_elements(list_view.view()) return Column.from_libcudf(move(c_result)) @@ -373,17 +368,14 @@ cpdef Column sequences(Column starts, Column sizes, Column steps = None): if steps is not None: with nogil: - c_result = move(cpp_filling.sequences( + c_result = cpp_filling.sequences( starts.view(), steps.view(), sizes.view(), - )) + ) else: with nogil: - c_result = move(cpp_filling.sequences( - starts.view(), - sizes.view(), - )) + c_result = cpp_filling.sequences(starts.view(), sizes.view()) return Column.from_libcudf(move(c_result)) cpdef Column sort_lists( @@ -423,17 +415,17 @@ cpdef Column sort_lists( with nogil: if stable: - c_result = move(cpp_stable_sort_lists( + c_result = cpp_stable_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) else: - c_result = move(cpp_sort_lists( + c_result = cpp_sort_lists( list_view.view(), c_sort_order, na_position, - )) + ) return Column.from_libcudf(move(c_result)) @@ -477,12 +469,12 @@ cpdef Column difference_distinct( ) with nogil: - c_result = move(cpp_set_operations.difference_distinct( + c_result = cpp_set_operations.difference_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -525,12 +517,12 @@ cpdef Column have_overlap( ) with nogil: - c_result = move(cpp_set_operations.have_overlap( + c_result = cpp_set_operations.have_overlap( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -573,12 +565,12 @@ cpdef Column intersect_distinct( ) with nogil: - c_result = move(cpp_set_operations.intersect_distinct( + c_result = cpp_set_operations.intersect_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -622,12 +614,12 @@ cpdef Column union_distinct( ) with nogil: - c_result = move(cpp_set_operations.union_distinct( + c_result = cpp_set_operations.union_distinct( lhs_view.view(), rhs_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) @@ -652,10 +644,10 @@ cpdef Column apply_boolean_mask(Column input, Column boolean_mask): cdef ListColumnView list_view = input.list_view() cdef ListColumnView mask_view = boolean_mask.list_view() with nogil: - c_result = move(cpp_apply_boolean_mask( + c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -690,9 +682,9 @@ cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal): ) with nogil: - c_result = move(cpp_distinct( + c_result = cpp_distinct( list_view.view(), c_nulls_equal, c_nans_equal, - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index 6d707b67449..61a21aafdb2 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -47,12 +47,10 @@ cpdef Table merge ( cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_merge.merge( - c_tables_to_merge, - c_key_cols, - c_column_order, - c_null_precedence, - ) + c_result = cpp_merge.merge( + c_tables_to_merge, + c_key_cols, + c_column_order, + c_null_precedence, ) return 
Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index ab5c0080312..9bdfaee2842 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -2,7 +2,7 @@ from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 5bdde06f21f..74180951562 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -6,7 +6,8 @@ from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from pylibcudf.libcudf.types import mask_state as MaskState # no-cython-lint @@ -31,13 +32,13 @@ cpdef DeviceBuffer copy_bitmask(Column col): Returns ------- rmm.DeviceBuffer - A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer`` - if ``col`` is not nullable + A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty + ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db with nogil: - db = move(cpp_null_mask.copy_bitmask(col.view())) + db = cpp_null_mask.copy_bitmask(col.view()) return buffer_to_python(move(db)) @@ -89,7 +90,7 @@ cpdef DeviceBuffer create_null_mask( cdef device_buffer db with nogil: - db = move(cpp_null_mask.create_null_mask(size, state)) + db = cpp_null_mask.create_null_mask(size, state) return buffer_to_python(move(db)) @@ -113,7 +114,7 @@ cpdef tuple bitmask_and(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_and(c_table.view())) + c_result = cpp_null_mask.bitmask_and(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second @@ -137,6 +138,6 @@ cpdef tuple bitmask_or(list columns): cdef pair[device_buffer, size_type] c_result with nogil: - c_result = move(cpp_null_mask.bitmask_or(c_table.view())) + c_result = cpp_null_mask.bitmask_or(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt new file mode 100644 index 00000000000..d97c0a73267 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -0,0 +1,24 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx +) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_nvtext_ ASSOCIATED_TARGETS cudf +) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd new file mode 100644 index 00000000000..a658e57018e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . cimport ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, + replace, + stemmer, +) + +__all__ = [ + "edit_distance", + "generate_ngrams", + "jaccard", + "minhash", + "ngrams_tokenize", + "normalize", + "replace", + "stemmer", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py new file mode 100644 index 00000000000..2c1feb089a2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from . import ( + edit_distance, + generate_ngrams, + jaccard, + minhash, + ngrams_tokenize, + normalize, + replace, + stemmer, +) + +__all__ = [ + "edit_distance", + "generate_ngrams", + "jaccard", + "minhash", + "ngrams_tokenize", + "normalize", + "replace", + "stemmer", +] diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd new file mode 100644 index 00000000000..446b95afabb --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column edit_distance(Column input, Column targets) + +cpdef Column edit_distance_matrix(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx new file mode 100644 index 00000000000..dcacb2e1267 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -0,0 +1,63 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.edit_distance cimport ( + edit_distance as cpp_edit_distance, + edit_distance_matrix as cpp_edit_distance_matrix, +) + + +cpdef Column edit_distance(Column input, Column targets): + """ + Returns the edit distance between individual strings in two strings columns + + For details, see :cpp:func:`edit_distance` + + Parameters + ---------- + input : Column + Input strings + targets : Column + Strings to compute edit distance against + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef column_view c_targets = targets.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_edit_distance(c_strings, c_targets) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column edit_distance_matrix(Column input): + """ + Returns the edit distance between all strings in the input strings column + + For details, see :cpp:func:`edit_distance_matrix` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New column of edit distance values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_edit_distance_matrix(c_strings) + + return Column.from_libcudf(move(c_result))
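For orientation, a minimal usage sketch of the new edit_distance binding; the input values are hypothetical and it assumes pyarrow is available for pylibcudf.interop:

import pyarrow as pa
import pylibcudf as plc

# Hypothetical inputs: two strings columns of equal length
strings = plc.interop.from_arrow(pa.array(["kitten", "saturday"]))
targets = plc.interop.from_arrow(pa.array(["sitting", "sunday"]))

# Levenshtein distance of each row pair; expected [3, 3] on these inputs
dist = plc.nvtext.edit_distance.edit_distance(strings, targets)
print(plc.interop.to_arrow(dist))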
diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd new file mode 100644 index 00000000000..f15eb1f25e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -0,0 +1,12 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) + +cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx new file mode 100644 index 00000000000..09859d09e9e --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -0,0 +1,105 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.generate_ngrams cimport ( + generate_character_ngrams as cpp_generate_character_ngrams, + generate_ngrams as cpp_generate_ngrams, + hash_character_ngrams as cpp_hash_character_ngrams, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator): + """ + Returns a single column of strings by generating ngrams from a strings column. + + For details, see :cpp:func:`generate_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef column_view c_strings = input.view() + cdef const string_scalar* c_separator = separator.c_obj.get() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_generate_ngrams( + c_strings, + ngrams, + c_separator[0] + ) + return Column.from_libcudf(move(c_result)) + + +cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of ngrams of characters within each string. + + For details, see :cpp:func:`generate_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of strings + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_generate_character_ngrams( + c_strings, + ngrams, + ) + return Column.from_libcudf(move(c_result)) + +cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): + """ + Returns a lists column of hash values of the characters in each string + + For details, see :cpp:func:`hash_character_ngrams` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + + Returns + ------- + Column + Lists column of hash values + """ + cdef column_view c_strings = input.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_hash_character_ngrams( + c_strings, + ngrams, + ) + return Column.from_libcudf(move(c_result))
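A quick sketch of what the ngram generators produce, with hypothetical data and the same pyarrow-interop assumption:

import pyarrow as pa
import pylibcudf as plc

words = plc.interop.from_arrow(pa.array(["apple", "banana", "cherry"]))
sep = plc.interop.from_arrow(pa.scalar("_"))

# Word bigrams across rows; expected ["apple_banana", "banana_cherry"]
bigrams = plc.nvtext.generate_ngrams.generate_ngrams(words, 2, sep)

# Character bigrams within each row; each output row is a list of strings
char_bigrams = plc.nvtext.generate_ngrams.generate_character_ngrams(words, 2)
print(plc.interop.to_arrow(bigrams), plc.interop.to_arrow(char_bigrams))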
diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd new file mode 100644 index 00000000000..a4d4a15335b --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx new file mode 100644 index 00000000000..3d8669865d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -0,0 +1,45 @@ +# Copyright (c) 2023-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.jaccard cimport ( + jaccard_index as cpp_jaccard_index, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column jaccard_index(Column input1, Column input2, size_type width): + """ + Returns the Jaccard similarity between individual rows in two strings columns. + + For details, see :cpp:func:`jaccard_index` + + Parameters + ---------- + input1 : Column + Input strings column + input2 : Column + Input strings column + width : size_type + The character width of the substrings (ngrams) to compare + + Returns + ------- + Column + New column of Jaccard index values + """ + cdef column_view c_input1 = input1.view() + cdef column_view c_input2 = input2.view() + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_jaccard_index( + c_input1, + c_input2, + width + ) + + return Column.from_libcudf(move(c_result))
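An illustrative sketch of the Jaccard binding; the width argument is the character-ngram size used to build each row's substring set, and the values here are hypothetical:

import pyarrow as pa
import pylibcudf as plc

a = plc.interop.from_arrow(pa.array(["pylibcudf", "rapids"]))
b = plc.interop.from_arrow(pa.array(["pylibcuda", "cudf"]))

# Row-wise Jaccard similarity over 5-character substrings, in [0.0, 1.0]
sim = plc.nvtext.jaccard.jaccard_index(a, b, 5)
print(plc.interop.to_arrow(sim))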
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd new file mode 100644 index 00000000000..97e8c9dc83c --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -0,0 +1,18 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) + +cpdef Column word_minhash(Column input, Column seeds) + +cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx new file mode 100644 index 00000000000..f1e012e60e5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -0,0 +1,152 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libc.stdint cimport uint32_t, uint64_t +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.minhash cimport ( + minhash as cpp_minhash, + minhash64 as cpp_minhash64, + word_minhash as cpp_word_minhash, + word_minhash64 as cpp_word_minhash64, +) +from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for applying substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): + """ + Returns the minhash values for each string per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seeds : Column or Scalar + Seed value(s) used for the hash algorithm. + width : size_type + Character width used for applying substrings; + Default is 4 characters. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + if not isinstance(seeds, (Column, Scalar)): + raise TypeError("Must pass a Column or Scalar") + + with nogil: + c_result = cpp_minhash64( + input.view(), + seeds.view() if ColumnOrScalar is Column else + dereference(seeds.c_obj.get()), + width + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`word_minhash`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column word_minhash64(Column input, Column seeds): + """ + Returns the minhash values for each row of strings per seed. + This function uses MurmurHash3_x64_128 for the hash algorithm though + only the first 64 bits of the hash are used in computing the output. + + For details, see :cpp:func:`word_minhash64`. + + Parameters + ---------- + input : Column + Lists column of strings to compute minhash + seeds : Column + Seed values used for the hash algorithm. + + Returns + ------- + Column + List column of minhash values for each string per seed + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_word_minhash64( + input.view(), + seeds.view() + ) + + return Column.from_libcudf(move(c_result))
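The fused ColumnOrScalar argument lets minhash accept either a single seed or a column of them; a sketch under the same interop assumption (minhash seeds must be uint32):

import pyarrow as pa
import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["deep learning", "deep sea"]))
seeds = plc.interop.from_arrow(pa.array([0, 42], type=pa.uint32()))

# One list row per input string, one hash value per seed (width-4 substrings)
hashes = plc.nvtext.minhash.minhash(strings, seeds, 4)
print(plc.interop.to_arrow(hashes))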
diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd new file mode 100644 index 00000000000..4f791ba1ee9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx new file mode 100644 index 00000000000..8a1854c5f0d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport ( + ngrams_tokenize as cpp_ngrams_tokenize, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column ngrams_tokenize( + Column input, + size_type ngrams, + Scalar delimiter, + Scalar separator +): + """ + Returns a single column of strings by tokenizing the input strings column + and then producing ngrams of each string. + + For details, see :cpp:func:`ngrams_tokenize` + + Parameters + ---------- + input : Column + Input strings + ngrams : size_type + The ngram number to generate + delimiter : Scalar + UTF-8 characters used to separate each string into tokens. + An empty string will separate tokens using whitespace. + separator : Scalar + The string to use for separating ngram tokens + + Returns + ------- + Column + New strings column of tokens + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_ngrams_tokenize( + input.view(), + ngrams, + dereference(delimiter.get()), + dereference(separator.get()), + ) + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd new file mode 100644 index 00000000000..90676145afa --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column + + +cpdef Column normalize_spaces(Column input) + +cpdef Column normalize_characters(Column input, bool do_lower_case) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx new file mode 100644 index 00000000000..637d900b659 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.normalize cimport ( + normalize_characters as cpp_normalize_characters, + normalize_spaces as cpp_normalize_spaces, +) + + +cpdef Column normalize_spaces(Column input): + """ + Returns a new strings column by normalizing the whitespace in + each string in the input column. + + For details, see :cpp:func:`normalize_spaces` + + Parameters + ---------- + input : Column + Input strings + + Returns + ------- + Column + New strings column of normalized strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_spaces(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, bool do_lower_case): + """ + Normalizes string characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + do_lower_case : bool + If true, upper-case characters are converted to lower-case + and accents are stripped from those characters. If false, + accented and upper-case characters are not transformed. + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize_characters(input.view(), do_lower_case) + + return Column.from_libcudf(move(c_result))
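A sketch combining the new normalize and ngrams_tokenize bindings, again with hypothetical data and the pyarrow-interop assumption:

import pyarrow as pa
import pylibcudf as plc

text = plc.interop.from_arrow(pa.array(["the  quick   fox"]))

# Collapse runs of whitespace first; expected ["the quick fox"]
clean = plc.nvtext.normalize.normalize_spaces(text)

empty = plc.interop.from_arrow(pa.scalar(""))  # empty delimiter: split on whitespace
sep = plc.interop.from_arrow(pa.scalar("_"))

# Token bigrams per row; expected ["the_quick", "quick_fox"]
out = plc.nvtext.ngrams_tokenize.ngrams_tokenize(clean, 2, empty, sep)
print(plc.interop.to_arrow(out))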
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd new file mode 100644 index 00000000000..624f90e7486 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd @@ -0,0 +1,20 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=*, +) + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=*, + Scalar delimiter=* +) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx new file mode 100644 index 00000000000..b65348ce14d --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -0,0 +1,109 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.replace cimport ( + filter_tokens as cpp_filter_tokens, + replace_tokens as cpp_replace_tokens, +) +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar + + +cpdef Column replace_tokens( + Column input, + Column targets, + Column replacements, + Scalar delimiter=None, +): + """ + Replaces specified tokens with corresponding replacement strings. + + For details, see :cpp:func:`replace_tokens` + + Parameters + ---------- + input : Column + Strings column whose tokens will be replaced + targets : Column + Strings to compare against tokens found in ``input`` + replacements : Column + Replacement strings for each string in ``targets`` + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column with replaced strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = cpp_replace_tokens( + input.view(), + targets.view(), + replacements.view(), + dereference(delimiter.get()), + ) + return Column.from_libcudf(move(c_result))
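A short sketch of whole-token replacement with the default whitespace delimiter (hypothetical data; pyarrow interop assumed):

import pyarrow as pa
import pylibcudf as plc

docs = plc.interop.from_arrow(pa.array(["hello world", "goodbye world"]))
targets = plc.interop.from_arrow(pa.array(["world"]))
repls = plc.interop.from_arrow(pa.array(["gpu"]))

# Only whole-token matches are replaced; expected ["hello gpu", "goodbye gpu"]
out = plc.nvtext.replace.replace_tokens(docs, targets, repls)
print(plc.interop.to_arrow(out))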
+ + +cpdef Column filter_tokens( + Column input, + size_type min_token_length, + Scalar replacement=None, + Scalar delimiter=None +): + """ + Removes tokens whose lengths are less than a specified number of characters. + + For details, see :cpp:func:`filter_tokens` + + Parameters + ---------- + input : Column + Strings column to filter + min_token_length : size_type + The minimum number of characters to retain a + token in the output string + replacement : Scalar, optional + Optional replacement string to be used in place of removed tokens + delimiter : Scalar, optional + Characters used to separate each string into tokens. + The default of empty string will identify tokens using whitespace. + + Returns + ------- + Column + New strings column of filtered strings + """ + cdef unique_ptr[column] c_result + if delimiter is None: + delimiter = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + with nogil: + c_result = cpp_filter_tokens( + input.view(), + min_token_length, + dereference(replacement.get()), + dereference(delimiter.get()), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd new file mode 100644 index 00000000000..48762efc01f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -0,0 +1,14 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.stemmer cimport letter_type +from pylibcudf.libcudf.types cimport size_type + +ctypedef fused ColumnOrSize: + Column + size_type + +cpdef Column is_letter(Column input, bool check_vowels, ColumnOrSize indices) + +cpdef Column porter_stemmer_measure(Column input) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx new file mode 100644 index 00000000000..854d1053624 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.stemmer cimport ( + is_letter as cpp_is_letter, + letter_type, + porter_stemmer_measure as cpp_porter_stemmer_measure, +) +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column is_letter( + Column input, + bool check_vowels, + ColumnOrSize indices +): + """ + Returns a boolean column indicating whether the characters at the + provided character index or indices are vowels or consonants + + For details, see :cpp:func:`is_letter` + + Parameters + ---------- + input : Column + Input strings + check_vowels : bool + If true, the check is for vowels. Otherwise the check is + for consonants. + indices : Union[Column, size_type] + The character position(s) to check in each string + + Returns + ------- + Column + New boolean column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_is_letter( + input.view(), + letter_type.VOWEL if check_vowels else letter_type.CONSONANT, + indices if ColumnOrSize is size_type else indices.view() + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column porter_stemmer_measure(Column input): + """ + Returns the Porter Stemmer measurements of a strings column.
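A brief sketch of the stemmer helpers (hypothetical data; pyarrow interop assumed):

import pyarrow as pa
import pylibcudf as plc

words = plc.interop.from_arrow(pa.array(["trouble", "tree"]))

# Is the character at index 1 a vowel? Expected [False, False] ('r' in both rows)
flags = plc.nvtext.stemmer.is_letter(words, True, 1)

# Porter measure per word (the vowel-consonant pattern count used by the Porter algorithm)
measure = plc.nvtext.stemmer.porter_stemmer_measure(words)
print(plc.interop.to_arrow(flags), plc.interop.to_arrow(measure))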
+ + For details, see :cpp:func:`porter_stemmer_measure` + + Parameters + ---------- + input : Column + Strings column of words to measure + + Returns + ------- + Column + New column of measure values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_porter_stemmer_measure(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index 8fa70daab5a..3cff4843735 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -41,10 +41,10 @@ cpdef tuple[Table, list] hash_partition( cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.hash_partition( - input.view(), c_columns_to_hash, c_num_partitions - ) + c_result = cpp_partitioning.hash_partition( + input.view(), + c_columns_to_hash, + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -74,8 +74,10 @@ cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partit cdef int c_num_partitions = num_partitions with nogil: - c_result = move( - cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) + c_result = cpp_partitioning.partition( + t.view(), + partition_map.view(), + c_num_partitions ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) @@ -111,10 +113,8 @@ cpdef tuple[Table, list] round_robin_partition( cdef int c_start_partition = start_partition with nogil: - c_result = move( - cpp_partitioning.round_robin_partition( - input.view(), c_num_partitions, c_start_partition - ) + c_result = cpp_partitioning.round_robin_partition( + input.view(), c_num_partitions, c_start_partition ) return Table.from_libcudf(move(c_result.first)), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index 3a771fbe7ef..7d92b598bd0 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -66,14 +66,12 @@ cpdef Column quantile( ordered_indices_view = ordered_indices.view() with nogil: - c_result = move( - cpp_quantile( - input.view(), - q, - interp, - ordered_indices_view, - exact, - ) + c_result = cpp_quantile( + input.view(), + q, + interp, + ordered_indices_view, + exact, ) return Column.from_libcudf(move(c_result)) @@ -141,15 +139,13 @@ cpdef Table quantiles( null_precedence_vec = null_precedence with nogil: - c_result = move( - cpp_quantiles( - input.view(), - q, - interp, - is_input_sorted, - column_order_vec, - null_precedence_vec, - ) + c_result = cpp_quantiles( + input.view(), + q, + interp, + is_input_sorted, + column_order_vec, + null_precedence_vec, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index b0212a5b9c1..d9ec3a9bdc4 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -39,12 +39,10 @@ cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type): cdef unique_ptr[scalar] result cdef const reduce_aggregation *c_agg = agg.view_underlying_as_reduce() with nogil: - result = move( - cpp_reduce.cpp_reduce( - col.view(), - dereference(c_agg), - data_type.c_obj - ) + result = cpp_reduce.cpp_reduce( + col.view(), + dereference(c_agg), + data_type.c_obj ) return Scalar.from_libcudf(move(result)) @@ -71,12 +69,10 @@ cpdef Column scan(Column col, Aggregation agg, scan_type inclusive): cdef 
unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() with nogil: - result = move( - cpp_reduce.cpp_scan( - col.view(), - dereference(c_agg), - inclusive, - ) + result = cpp_reduce.cpp_scan( + col.view(), + dereference(c_agg), + inclusive, ) return Column.from_libcudf(move(result)) @@ -99,7 +95,7 @@ cpdef tuple minmax(Column col): """ cdef pair[unique_ptr[scalar], unique_ptr[scalar]] result with nogil: - result = move(cpp_reduce.cpp_minmax(col.view())) + result = cpp_reduce.cpp_minmax(col.view()) return ( Scalar.from_libcudf(move(result.first)), diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index 115dee132fd..f77eba7ace5 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -56,28 +56,23 @@ cpdef Column replace_nulls(Column source_column, ReplacementType replacement): if isinstance(replacement, ReplacePolicy): policy = replacement with nogil: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), policy) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), policy) return Column.from_libcudf(move(c_result)) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") with nogil: if ReplacementType is Column: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement.view()) + c_result = cpp_replace.replace_nulls( + source_column.view(), + replacement.view() ) elif ReplacementType is Scalar: - c_result = move( - cpp_replace.replace_nulls( - source_column.view(), dereference(replacement.c_obj) - ) + c_result = cpp_replace.replace_nulls( + source_column.view(), dereference(replacement.c_obj) ) elif ReplacementType is replace_policy: - c_result = move( - cpp_replace.replace_nulls(source_column.view(), replacement) - ) + c_result = cpp_replace.replace_nulls(source_column.view(), replacement) else: assert False, "Internal error. 
Please contact pylibcudf developers" return Column.from_libcudf(move(c_result)) @@ -109,12 +104,10 @@ cpdef Column find_and_replace_all( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_replace.find_and_replace_all( - source_column.view(), - values_to_replace.view(), - replacement_values.view(), - ) + c_result = cpp_replace.find_and_replace_all( + source_column.view(), + values_to_replace.view(), + replacement_values.view(), ) return Column.from_libcudf(move(c_result)) @@ -156,22 +149,18 @@ cpdef Column clamp( cdef unique_ptr[column] c_result with nogil: if lo_replace is None: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), ) else: - c_result = move( - cpp_replace.clamp( - source_column.view(), - dereference(lo.c_obj), - dereference(hi.c_obj), - dereference(lo_replace.c_obj), - dereference(hi_replace.c_obj), - ) + c_result = cpp_replace.clamp( + source_column.view(), + dereference(lo.c_obj), + dereference(hi.c_obj), + dereference(lo_replace.c_obj), + dereference(hi_replace.c_obj), ) return Column.from_libcudf(move(c_result)) @@ -199,9 +188,7 @@ cpdef Column normalize_nans_and_zeros(Column source_column, bool inplace=False): if inplace: cpp_replace.normalize_nans_and_zeros(source_column.mutable_view()) else: - c_result = move( - cpp_replace.normalize_nans_and_zeros(source_column.view()) - ) + c_result = cpp_replace.normalize_nans_and_zeros(source_column.view()) if not inplace: return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index eb1499ebbea..6540b5198ab 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -38,7 +38,7 @@ cpdef Column interleave_columns(Table source_table): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_interleave_columns(source_table.view())) + c_result = cpp_interleave_columns(source_table.view()) return Column.from_libcudf(move(c_result)) @@ -63,6 +63,6 @@ cpdef Table tile(Table source_table, size_type count): cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_tile(source_table.view(), count)) + c_result = cpp_tile(source_table.view(), count) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index a46540d7ffa..4fd0b005431 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -49,24 +49,21 @@ cpdef Column rolling_window( cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() if WindowType is Column: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window.view(), - following_window.view(), - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window.view(), + following_window.view(), + min_periods, + dereference(c_agg), ) else: with nogil: - result = move( - cpp_rolling.rolling_window( - source.view(), - preceding_window, - following_window, - min_periods, - dereference(c_agg), - ) + result = cpp_rolling.rolling_window( + source.view(), + preceding_window, + following_window, + min_periods, + dereference(c_agg), ) + return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 
dc60d53b07e..689363e652d 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -39,12 +39,10 @@ cpdef Column round( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_round( - source.view(), - decimal_places, - round_method - ) + c_result = cpp_round( + source.view(), + decimal_places, + round_method ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 8664dfa4b7e..a273647c98d 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -4,7 +4,7 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm._lib.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 3e20938af0c..d4888a62ad1 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -6,7 +6,7 @@ from libcpp.utility cimport move from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like -from rmm._lib.memory_resource cimport get_current_device_resource +from rmm.pylibrmm.memory_resource cimport get_current_device_resource from .column cimport Column from .types cimport DataType diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 814bc6553d8..1a870248046 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -41,13 +41,11 @@ cpdef Column lower_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.lower_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.lower_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -82,13 +80,11 @@ cpdef Column upper_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_search.upper_bound( - haystack.view(), - needles.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_search.upper_bound( + haystack.view(), + needles.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -112,10 +108,8 @@ cpdef Column contains(Column haystack, Column needles): """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_search.contains( - haystack.view(), - needles.view(), - ) + c_result = cpp_search.contains( + haystack.view(), + needles.view(), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index 42289d54bca..fc40f03e1fd 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -36,12 +36,10 @@ cpdef Column sorted_order(Table source_table, list column_order, list null_prece cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sorted_order( + source_table.view(), + 
c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -74,12 +72,10 @@ cpdef Column stable_sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sorted_order( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sorted_order( + source_table.view(), + c_orders, + c_null_precedence, ) return Column.from_libcudf(move(c_result)) @@ -118,15 +114,13 @@ cpdef Column rank( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_sorting.rank( - input_view.view(), - method, - column_order, - null_handling, - null_precedence, - percentage, - ) + c_result = cpp_sorting.rank( + input_view.view(), + method, + column_order, + null_handling, + null_precedence, + percentage, ) return Column.from_libcudf(move(c_result)) @@ -154,12 +148,10 @@ cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.is_sorted( - tbl.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.is_sorted( + tbl.view(), + c_orders, + c_null_precedence, ) return c_result @@ -197,14 +189,12 @@ cpdef Table segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -243,14 +233,12 @@ cpdef Table stable_segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_segmented_sort_by_key( - values.view(), - keys.view(), - segment_offsets.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_segmented_sort_by_key( + values.view(), + keys.view(), + segment_offsets.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -285,13 +273,11 @@ cpdef Table sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -326,13 +312,11 @@ cpdef Table stable_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort_by_key( - values.view(), - keys.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort_by_key( + values.view(), + keys.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -360,12 +344,10 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = 
cpp_sorting.sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) @@ -393,11 +375,9 @@ cpdef Table stable_sort(Table source_table, list column_order, list null_precede cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence with nogil: - c_result = move( - cpp_sorting.stable_sort( - source_table.view(), - c_orders, - c_null_precedence, - ) + c_result = cpp_sorting.stable_sort( + source_table.view(), + c_orders, + c_null_precedence, ) return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index d5475ea79d5..2145398a191 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -44,10 +44,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -74,10 +72,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold - ) + c_result = cpp_stream_compaction.drop_nulls( + source_table.view(), c_keys, keep_threshold ) return Table.from_libcudf(move(c_result)) @@ -101,10 +97,8 @@ cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """ cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view() - ) + c_result = cpp_stream_compaction.apply_boolean_mask( + source_table.view(), boolean_mask.view() ) return Table.from_libcudf(move(c_result)) @@ -144,10 +138,8 @@ cpdef Table unique( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal - ) + c_result = cpp_stream_compaction.unique( + input.view(), c_keys, keep, nulls_equal ) return Table.from_libcudf(move(c_result)) @@ -185,10 +177,8 @@ cpdef Table distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return Table.from_libcudf(move(c_result)) @@ -221,10 +211,8 @@ cpdef Column distinct_indices( """ cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.distinct_indices( + input.view(), keep, nulls_equal, nans_equal ) return Column.from_libcudf(move(c_result)) @@ -262,10 +250,8 @@ cpdef Table stable_distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys with nogil: - c_result = move( - cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal - ) + c_result = cpp_stream_compaction.stable_distinct( + input.view(), c_keys, keep, nulls_equal, nans_equal ) return 
Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 052a0cf3c56..5d7fbd24b91 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -18,17 +18,22 @@ set(cython_sources case.pyx char_types.pyx contains.pyx + combine.pyx extract.pyx find.pyx + find_multiple.pyx findall.pyx + padding.pyx regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx + replace_re.pyx side_type.pyx slice.pyx strip.pyx translate.pyx + wrap.pyx ) set(linked_libraries cudf::cudf) @@ -39,3 +44,4 @@ rapids_cython_create_modules( ) add_subdirectory(convert) +add_subdirectory(split) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 142637ff577..da1c1c576c0 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -5,17 +5,25 @@ from . cimport ( capitalize, case, char_types, + combine, contains, convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, + repeat, replace, + replace_re, + side_type, slice, + split, strip, translate, + wrap, ) from .side_type cimport side_type @@ -28,12 +36,18 @@ __all__ = [ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", + "replace_re", "slice", "strip", + "split", "side_type", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index decfadd63a4..40fa8261905 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -5,18 +5,25 @@ capitalize, case, char_types, + combine, contains, convert, extract, find, + find_multiple, findall, + padding, regex_flags, regex_program, repeat, replace, + replace_re, + side_type, slice, + split, strip, translate, + wrap, ) from .side_type import SideType @@ -29,12 +36,18 @@ "convert", "extract", "find", + "find_multiple", "findall", + "padding", "regex_flags", "regex_program", + "repeat", "replace", + "replace_re", "slice", "strip", + "split", "SideType", "translate", + "wrap", ] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 36bee7bd1d9..8e46a32835d 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -25,7 +25,7 @@ cpdef Column count_characters(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_characters(source_strings.view())) + c_result = cpp_attributes.count_characters(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -48,7 +48,7 @@ cpdef Column count_bytes(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.count_bytes(source_strings.view())) + c_result = cpp_attributes.count_bytes(source_strings.view()) return Column.from_libcudf(move(c_result)) @@ -71,6 +71,6 @@ cpdef Column code_points(Column source_strings): cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_attributes.code_points(source_strings.view())) + c_result = cpp_attributes.code_points(source_strings.view()) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index 
6a24d79bc4b..cb04efe5e8f 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -38,12 +38,10 @@ cpdef Column all_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.all_characters_of_type( - source_strings.view(), - types, - verify_types, - ) + c_result = cpp_char_types.all_characters_of_type( + source_strings.view(), + types, + verify_types, ) return Column.from_libcudf(move(c_result)) @@ -81,13 +79,11 @@ cpdef Column filter_characters_of_type( cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_char_types.filter_characters_of_type( - source_strings.view(), - types_to_remove, - dereference(c_replacement), - types_to_keep, - ) + c_result = cpp_char_types.filter_characters_of_type( + source_strings.view(), + types_to_remove, + dereference(c_replacement), + types_to_keep, ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd new file mode 100644 index 00000000000..ea22f626973 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.combine cimport ( + output_if_empty_list, + separator_on_nulls, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +ctypedef fused ColumnOrScalar: + Column + Scalar + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=*, + Scalar col_narep=*, + separator_on_nulls separate_nulls=*, +) + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep) + + +cpdef Column join_list_elements( + Column source_strings, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx new file mode 100644 index 00000000000..f17d5265ab4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -0,0 +1,223 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport combine as cpp_combine +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference +from pylibcudf.libcudf.strings.combine import \ + output_if_empty_list as OutputIfEmptyList # no-cython-lint +from pylibcudf.libcudf.strings.combine import \ + separator_on_nulls as SeparatorOnNulls # no-cython-lint + + +cpdef Column concatenate( + Table strings_columns, + ColumnOrScalar separator, + Scalar narep=None, + Scalar col_narep=None, + separator_on_nulls separate_nulls=separator_on_nulls.YES, +): + """ + Concatenate all columns in the table horizontally into one new string + delimited by an optional separator string. + + Parameters + ---------- + strings_columns : Table + Strings for this operation + + separator : Column or Scalar + Separator(s) for a given row + + narep : Scalar + String to replace a null separator for a given row. 
+ + col_narep : Scalar + String that should be used in place of any null strings found in any column. + Cannot be specified when separator is a Scalar; doing so raises a ValueError. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows. + + Returns + ------- + Column + New column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_col_narep + cdef const string_scalar* c_separator + + if narep is None: + narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + cdef const string_scalar* c_narep = <const string_scalar*>( + narep.c_obj.get() + ) + + if ColumnOrScalar is Column: + if col_narep is None: + col_narep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + c_col_narep = <const string_scalar*>( + col_narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + separator.view(), + dereference(c_narep), + dereference(c_col_narep), + separate_nulls + ) + ) + elif ColumnOrScalar is Scalar: + if col_narep is not None: + raise ValueError( + "col_narep cannot be specified when separator is a Scalar" + ) + c_separator = <const string_scalar*>(separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.concatenate( + strings_columns.view(), + dereference(c_separator), + dereference(c_narep), + separate_nulls + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_strings(Column input, Scalar separator, Scalar narep): + """ + Concatenates all strings in the column into one new string delimited + by an optional separator string. + + Parameters + ---------- + input : Column + Strings column to concatenate into a single string + + separator : Scalar + String to insert between each string + + narep : Scalar + String to replace any null strings found. + + Returns + ------- + Column + New column containing one string + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator = <const string_scalar*>( + separator.c_obj.get() + ) + cdef const string_scalar* c_narep = <const string_scalar*>( + narep.c_obj.get() + ) + with nogil: + c_result = move( + cpp_combine.join_strings( + input.view(), + dereference(c_separator), + dereference(c_narep), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column join_list_elements( + Column lists_strings_column, + ColumnOrScalar separator, + Scalar separator_narep, + Scalar string_narep, + separator_on_nulls separate_nulls, + output_if_empty_list empty_list_policy, +): + """ + Given a lists column of strings (each row is a list of strings), + concatenates the strings within each row and returns a single strings + column result. + + Parameters + ---------- + lists_strings_column : Column + Column containing lists of strings to concatenate + + separator : Column or Scalar + String(s) that should be inserted between each string from each row. + + separator_narep : Scalar + String that should be used to replace a null separator. + + string_narep : Scalar + String to replace null strings in any non-null list row. + Ignored if separator is a Scalar. + + separate_nulls : SeparatorOnNulls + If YES, then the separator is included for null rows + if `separator_narep` is valid + + empty_list_policy : OutputIfEmptyList + If set to EMPTY_STRING, any input row that is an empty + list will result in an empty string. Otherwise, it will + result in a null.
+ + Returns + ------- + Column + New strings column with concatenated results + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_separator_narep = <const string_scalar*>( + separator_narep.c_obj.get() + ) + cdef const string_scalar* c_string_narep = <const string_scalar*>( + string_narep.c_obj.get() + ) + cdef const string_scalar* c_separator + + if ColumnOrScalar is Column: + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + separator.view(), + dereference(c_separator_narep), + dereference(c_string_narep), + separate_nulls, + empty_list_policy, + ) + ) + elif ColumnOrScalar is Scalar: + c_separator = <const string_scalar*>(separator.c_obj.get()) + with nogil: + c_result = move( + cpp_combine.join_list_elements( + lists_strings_column.view(), + dereference(c_separator), + dereference(c_separator_narep), + separate_nulls, + empty_list_policy, + ) + ) + else: + raise ValueError("separator must be a Column or a Scalar") + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 82bd1fbea32..d4b1130241d 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -38,10 +38,10 @@ cpdef Column contains_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.contains_re( + result = cpp_contains.contains_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -71,10 +71,10 @@ cpdef Column count_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.count_re( + result = cpp_contains.count_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -105,10 +105,10 @@ cpdef Column matches_re( cdef unique_ptr[column] result with nogil: - result = move(cpp_contains.matches_re( + result = cpp_contains.matches_re( input.view(), prog.c_obj.get()[0] - )) + ) return Column.from_libcudf(move(result)) @@ -149,19 +149,19 @@ cpdef Column like(Column input, ColumnOrScalar pattern, Scalar escape_character= if ColumnOrScalar is Column: with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), pattern.view(), dereference(c_escape_character) - )) + ) elif ColumnOrScalar is Scalar: c_pattern = <const string_scalar*>(pattern.c_obj.get()) with nogil: - result = move(cpp_contains.like( + result = cpp_contains.like( input.view(), dereference(c_pattern), dereference(c_escape_character) - )) + ) else: raise ValueError("pattern must be a Column or a Scalar") diff --git a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt index 175c9b3738e..8ba84ba7d50 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt @@ -12,7 +12,10 @@ # the License.
# ============================================================================= -set(cython_sources convert_durations.pyx convert_datetime.pyx) +set(cython_sources + convert_booleans.pyx convert_datetime.pyx convert_durations.pyx convert_fixed_point.pyx + convert_floats.pyx convert_integers.pyx convert_ipv4.pyx convert_lists.pyx convert_urls.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd index 05324cb49df..85300936e4d 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.pxd @@ -1,2 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . cimport convert_datetime, convert_durations +from . cimport ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_floats, + convert_integers, + convert_ipv4, + convert_lists, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/strings/convert/__init__.py index d803399d53c..aa27a7c8929 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/convert/__init__.py @@ -1,2 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import convert_datetime, convert_durations +from . import ( + convert_booleans, + convert_datetime, + convert_durations, + convert_fixed_point, + convert_floats, + convert_integers, + convert_ipv4, + convert_lists, + convert_urls, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd new file mode 100644 index 00000000000..312ac3c0ca0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column to_booleans(Column input, Scalar true_string) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx new file mode 100644 index 00000000000..dc12b291b11 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -0,0 +1,87 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.strings.convert cimport ( + convert_booleans as cpp_convert_booleans, +) +from pylibcudf.scalar cimport Scalar + +from cython.operator import dereference + + +cpdef Column to_booleans(Column input, Scalar true_string): + """ + Returns a new bool column by parsing boolean values from the strings + in the provided strings column. + + For details, see :cpp:func:`cudf::strings::to_booleans`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + true_string : Scalar + String to expect for true. Non-matching strings are false + + Returns + ------- + Column + New bool column converted from strings. 
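The new boolean conversions keep libcudf's single-sentinel parsing model: only exact matches of `true_string` parse as True, and `from_booleans` (shown just below) reverses the mapping. A minimal usage sketch, assuming the usual `pylibcudf.interop.from_arrow` helper for building the column and scalars (the interop spelling is an assumption, not part of this diff):

```python
import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["true", "false", "maybe"]))
true_str = plc.interop.from_arrow(pa.scalar("true"))

# Only exact matches of true_string become True; "false" and "maybe" are False.
bools = plc.strings.convert.convert_booleans.to_booleans(s, true_str)

# Round-trip back to strings with explicit representations for both values.
false_str = plc.interop.from_arrow(pa.scalar("false"))
strs = plc.strings.convert.convert_booleans.from_booleans(bools, true_str, false_str)
```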
+ """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + + with nogil: + c_result = cpp_convert_booleans.to_booleans( + input.view(), + dereference(c_true_string) + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_booleans(Column booleans, Scalar true_string, Scalar false_string): + """ + Returns a new strings column converting the boolean values from the + provided column into strings. + + For details, see :cpp:func:`cudf::strings::from_booleans`. + + Parameters + ---------- + booleans : Column + Boolean column to convert. + + true_string : Scalar + String to use for true in the output column. + + false_string : Scalar + String to use for false in the output column. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef const string_scalar* c_true_string = ( + true_string.c_obj.get() + ) + cdef const string_scalar* c_false_string = ( + false_string.c_obj.get() + ) + + with nogil: + c_result = cpp_convert_booleans.from_booleans( + booleans.view(), + dereference(c_true_string), + dereference(c_false_string), + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 07c84d263d6..80ec168644b 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -8,11 +8,16 @@ from pylibcudf.types cimport DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ) + +cpdef Column is_timestamp( + Column input, + str format, +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index fcacb096f87..0ee60812e00 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -15,28 +15,74 @@ from pylibcudf.types import DataType cpdef Column to_timestamps( Column input, DataType timestamp_type, - const string& format + str format ): + """ + Returns a new timestamp column converting a strings column into + timestamps using the provided format pattern. + + For details, see cpp:`cudf::strings::to_timestamps`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + timestamp_type : DataType + The timestamp type used for creating the output column. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New datetime column + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_timestamps( - Column input, - const string& format, + Column timestamps, + str format, Column input_strings_names ): + """ + Returns a new strings column converting a timestamp column into + strings using the provided format pattern. + + For details, see cpp:`cudf::strings::from_timestamps`. + + Parameters + ---------- + timestamps : Column + Timestamp values to convert + + format : str + The string specifying output format. 
+ + input_strings_names : Column + The string names to use for weekdays ("%a", "%A") and months ("%b", "%B"). + + Returns + ------- + Column + New strings column with formatted timestamps. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.from_timestamps( - input.view(), - format, + timestamps.view(), + c_format, input_strings_names.view() ) @@ -44,13 +90,33 @@ cpdef Column from_timestamps( cpdef Column is_timestamp( Column input, - const string& format + str format ): + """ + Verifies the given strings column can be parsed to timestamps + using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::is_timestamp`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + format : str + String specifying the timestamp format in strings. + + Returns + ------- + Column + New bool column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), - format + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index ac11b8959ed..eecdade4ef9 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -8,10 +8,10 @@ from pylibcudf.types cimport DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index f3e0b7c9c8e..31980ace418 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -15,27 +15,76 @@ from pylibcudf.types import DataType cpdef Column to_durations( Column input, DataType duration_type, - const string& format + str format ): + """ + Returns a new duration column converting a strings column into + durations using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::to_durations`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + duration_type : DataType + The duration type used for creating the output column. + + format : str + String specifying the duration format in strings. + + Returns + ------- + Column + New duration column. + """ cdef unique_ptr[column] c_result + cdef string c_format = format.encode() + + with nogil: c_result = cpp_convert_durations.to_durations( input.view(), duration_type.c_obj, - format + c_format ) return Column.from_libcudf(move(c_result)) cpdef Column from_durations( - Column input, - const string& format + Column durations, + str format=None ): + """ + Returns a new strings column converting a duration column into + strings using the provided format pattern. + + For details, see :cpp:func:`cudf::strings::from_durations`. + + Parameters + ---------- + durations : Column + Duration values to convert. + + format : str + The string specifying output format. + Default format is "%D days %H:%M:%S". + + Returns + ------- + Column + New strings column with formatted durations.
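With the format argument now a Python `str` (encoded to a C++ `std::string` inside the binding), the datetime and duration converters can be driven directly from Python. A sketch, assuming Arrow interop via `pylibcudf.interop.from_arrow` and the usual `DataType`/`TypeId` spellings (both assumptions here):

```python
import pyarrow as pa
import pylibcudf as plc

dates = plc.interop.from_arrow(pa.array(["2024-10-09", "2024-01-31"]))
# Validate before parsing when the input is untrusted.
ok = plc.strings.convert.convert_datetime.is_timestamp(dates, "%Y-%m-%d")
ts = plc.strings.convert.convert_datetime.to_timestamps(
    dates, plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), "%Y-%m-%d"
)

durs = plc.strings.convert.convert_durations.to_durations(
    plc.interop.from_arrow(pa.array(["0 days 01:23:45"])),
    plc.DataType(plc.TypeId.DURATION_SECONDS),
    "%D days %H:%M:%S",
)
# from_durations now defaults its format to "%D days %H:%M:%S".
back = plc.strings.convert.convert_durations.from_durations(durs)
```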
+ """ cdef unique_ptr[column] c_result + + if format is None: + format = "%D days %H:%M:%S" + cdef string c_format = format.encode() + with nogil: c_result = cpp_convert_durations.from_durations( - input.view(), - format + durations.view(), + c_format ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd new file mode 100644 index 00000000000..049b9b3fffe --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_fixed_point(Column input, DataType output_type) + +cpdef Column from_fixed_point(Column input) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx new file mode 100644 index 00000000000..962a47dfadf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -0,0 +1,99 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_fixed_point as cpp_fixed_point, +) +from pylibcudf.types cimport DataType, type_id + + +cpdef Column to_fixed_point(Column input, DataType output_type): + """ + Returns a new fixed-point column parsing decimal values from the + provided strings column. + + For details, see :cpp:func:`cudf::strings::to_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of fixed-point column to return including the scale value. + + Returns + ------- + Column + New column of output_type. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_fixed_point.to_fixed_point( + input.view(), + output_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column from_fixed_point(Column input): + """ + Returns a new strings column converting the fixed-point values + into a strings column. + + For details, see :cpp:func:`cudf::strings::from_fixed_point` + + Parameters + ---------- + input : Column + Fixed-point column to convert. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_fixed_point.from_fixed_point(input.view()) + + return Column.from_libcudf(move(c_result)) + +cpdef Column is_fixed_point(Column input, DataType decimal_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to fixed-point. + + For details, see :cpp:func:`cudf::strings::is_fixed_point` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + decimal_type : DataType + Fixed-point type (with scale) used only for checking overflow. + Defaults to Decimal64 + + Returns + ------- + Column + New column of boolean results for each string. 
+ """ + cdef unique_ptr[column] c_result + + if decimal_type is None: + decimal_type = DataType(type_id.DECIMAL64) + + with nogil: + c_result = cpp_fixed_point.is_fixed_point( + input.view(), + decimal_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd new file mode 100644 index 00000000000..1284ff552aa --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type) + +cpdef Column from_floats(Column floats) + +cpdef Column is_float(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx new file mode 100644 index 00000000000..1296f4f9db5 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_floats as cpp_convert_floats, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_floats(Column strings, DataType output_type): + """ + Returns a new numeric column by parsing float values from each string + in the provided strings column. + + For details, see cpp:func:`cudf::strings::to_floats` + + Parameters + ---------- + strings : Column + Strings instance for this operation. + + output_type : DataType + Type of float numeric column to return. + + Returns + ------- + Column + New column with floats converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.to_floats( + strings.view(), + output_type.c_obj, + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_floats(Column floats): + """ + Returns a new strings column converting the float values from the + provided column into strings. + + For details, see cpp:func:`cudf::strings::from_floats` + + Parameters + ---------- + floats : Column + Numeric column to convert. + + Returns + ------- + Column + New strings column with floats as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.from_floats(floats.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_float(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to floats. + + For details, see cpp:func:`cudf::strings::is_float` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_floats.is_float(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd new file mode 100644 index 00000000000..eff2e080c27 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -0,0 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type) + +cpdef Column from_integers(Column integers) + +cpdef Column is_integer(Column input, DataType int_type=*) + +cpdef Column hex_to_integers(Column input, DataType output_type) + +cpdef Column is_hex(Column input) + +cpdef Column integers_to_hex(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx new file mode 100644 index 00000000000..5558683a502 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -0,0 +1,206 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport ( + convert_integers as cpp_convert_integers, +) +from pylibcudf.types cimport DataType + + +cpdef Column to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing integer values from the + provided strings column. + + For details, cpp:func:`cudf::strings::to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column from_integers(Column integers): + """ + Returns a new strings column converting the integer values from the + provided column into strings. + + For details, cpp:func:`cudf::strings::from_integers`. + + Parameters + ---------- + integers : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column with integers as strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.from_integers( + integers.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_integer(Column input, DataType int_type=None): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers. + + For details, cpp:func:`cudf::strings::is_integer`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + int_type : DataType + Integer type used for checking underflow and overflow. + By default, does not check an integer type for underflow + or overflow. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + if int_type is None: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + ) + ) + else: + with nogil: + c_result = move( + cpp_convert_integers.is_integer( + input.view(), + int_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column hex_to_integers(Column input, DataType output_type): + """ + Returns a new integer numeric column parsing hexadecimal values + from the provided strings column. + + For details, cpp:func:`cudf::strings::hex_to_integers`. + + Parameters + ---------- + input : Column + Strings instance for this operation. 
+ + output_type : DataType + Type of integer numeric column to return. + + Returns + ------- + Column + New column with integers converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.hex_to_integers( + input.view(), + output_type.c_obj + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_hex(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from hex. + + For details, cpp:func:`cudf::strings::is_hex`. + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.is_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_hex(Column input): + """ + Returns a new strings column converting integer columns to hexadecimal + characters. + + For details, cpp:func:`cudf::strings::integers_to_hex`. + + Parameters + ---------- + input : Column + Integer column to convert to hex. + + Returns + ------- + Column + New strings column with hexadecimal characters. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_convert_integers.integers_to_hex( + input.view(), + ) + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd new file mode 100644 index 00000000000..c61f5c0bdca --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column ipv4_to_integers(Column input) + +cpdef Column integers_to_ipv4(Column integers) + +cpdef Column is_ipv4(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx new file mode 100644 index 00000000000..834781f95f3 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -0,0 +1,80 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 + + +cpdef Column ipv4_to_integers(Column input): + """ + Converts IPv4 addresses into integers. + + For details, see cpp:func:`cudf::strings::ipv4_to_integers` + + Parameters + ---------- + input : Column + Strings instance for this operation + + Returns + ------- + Column + New uint32 column converted from strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.ipv4_to_integers(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column integers_to_ipv4(Column integers): + """ + Converts integers into IPv4 addresses as strings. + + For details, see cpp:func:`cudf::strings::integers_to_ipv4` + + Parameters + ---------- + integers : Column + Integer (uint32) column to convert. + + Returns + ------- + Column + New strings column. 
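The integer converters also gain hexadecimal support, parsed without any `0x` prefix convention; a sketch (interop helpers assumed as before):

```python
import pyarrow as pa
import pylibcudf as plc

h = plc.interop.from_arrow(pa.array(["1A", "ff", "cafe"]))

# is_hex validates, hex_to_integers parses (no "0x" prefix expected),
# and integers_to_hex renders the values back out as hex strings.
mask = plc.strings.convert.convert_integers.is_hex(h)
ints = plc.strings.convert.convert_integers.hex_to_integers(
    h, plc.DataType(plc.TypeId.INT64)
)
hex_back = plc.strings.convert.convert_integers.integers_to_hex(ints)
```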
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.integers_to_ipv4(integers.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column is_ipv4(Column input): + """ + Returns a boolean column identifying strings in which all + characters are valid for conversion to integers from IPv4 format. + + For details, see cpp:func:`cudf::strings::is_ipv4` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New column of boolean results for each string. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_ipv4.is_ipv4(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd new file mode 100644 index 00000000000..1ba4272afa2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=*, + Column separators=* +) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx new file mode 100644 index 00000000000..cbfe5f5aa8b --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.column_factories cimport make_empty_column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.convert cimport ( + convert_lists as cpp_convert_lists, +) +from pylibcudf.scalar cimport Scalar +from pylibcudf.types cimport type_id + +from cython.operator import dereference + + +cpdef Column format_list_column( + Column input, + Scalar na_rep=None, + Column separators=None +): + """ + Convert a list column of strings into a formatted strings column. + + For details, see :cpp:func`cudf::strings::format_list_column` + + Parameters + ---------- + input : Column + Lists column to format + + na_rep : Scalar + Replacement string for null elements. + Default, empty string + + separators : Column + Strings to use for enclosing list components and separating elements. + Default, ``,``, ``[``, ``]`` + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + + if na_rep is None: + na_rep = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + + cdef const string_scalar* c_na_rep = ( + na_rep.c_obj.get() + ) + + if separators is None: + separators = make_empty_column(type_id.STRING) + + with nogil: + c_result = cpp_convert_lists.format_list_column( + input.view(), + dereference(c_na_rep), + separators.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd new file mode 100644 index 00000000000..da05ce93426 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +from pylibcudf.column cimport Column + + +cpdef Column url_encode(Column input) + +cpdef Column url_decode(Column input) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx new file mode 100644 index 00000000000..82f8a75f1d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls + + +cpdef Column url_encode(Column input): + """ + Encodes each string using URL encoding. + + For details, see :cpp:func:`cudf::strings::url_encode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_urls.url_encode(input.view()) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column url_decode(Column input): + """ + Decodes each string from URL encoding. + + For details, see :cpp:func:`cudf::strings::url_decode` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_convert_urls.url_decode(input.view()) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index dcb11ca10ce..b56eccc8287 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -33,11 +33,9 @@ cpdef Table extract(Column input, RegexProgram prog): cdef unique_ptr[table] c_result with nogil: - c_result = move( - cpp_extract.extract( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract( + input.view(), + prog.c_obj.get()[0] ) return Table.from_libcudf(move(c_result)) @@ -66,11 +64,9 @@ cpdef Column extract_all_record(Column input, RegexProgram prog): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_extract.extract_all_record( - input.view(), - prog.c_obj.get()[0] - ) + c_result = cpp_extract.extract_all_record( + input.view(), + prog.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 22d370bf7e8..6fc6dca24fd 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -50,22 +50,18 @@ cpdef Column find( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.find( - input.view(), - target.view(), - start - ) + result = cpp_find.find( + input.view(), + target.view(), + start ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.find( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.find( + input.view(), + dereference((target.c_obj.get())), + start, + stop ) else: raise ValueError(f"Invalid target {target}") @@ -104,13 +100,11 @@ cpdef Column rfind( """ cdef unique_ptr[column] result with nogil: - result = move( - cpp_find.rfind( - input.view(), - dereference((target.c_obj.get())), - start, - stop - ) + result = cpp_find.rfind( + input.view(), +
dereference((target.c_obj.get())), + start, + stop ) return Column.from_libcudf(move(result)) @@ -149,19 +143,15 @@ cpdef Column contains( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.contains( - input.view(), - target.view() - ) + result = cpp_find.contains( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.contains( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.contains( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -204,19 +194,15 @@ cpdef Column starts_with( if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - target.view() - ) + result = cpp_find.starts_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.starts_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.starts_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") @@ -256,19 +242,15 @@ cpdef Column ends_with( cdef unique_ptr[column] result if ColumnOrScalar is Column: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - target.view() - ) + result = cpp_find.ends_with( + input.view(), + target.view() ) elif ColumnOrScalar is Scalar: with nogil: - result = move( - cpp_find.ends_with( - input.view(), - dereference((target.c_obj.get())) - ) + result = cpp_find.ends_with( + input.view(), + dereference((target.c_obj.get())) ) else: raise ValueError(f"Invalid target {target}") diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd new file mode 100644 index 00000000000..b7b3aefa336 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column + + +cpdef Column find_multiple(Column input, Column targets) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx new file mode 100644 index 00000000000..672aa606bd0 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -0,0 +1,37 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport find_multiple as cpp_find_multiple + + +cpdef Column find_multiple(Column input, Column targets): + """ + Returns a lists column with character position values where each + of the target strings are found in each string. + + For details, see :cpp:func:`cudf::strings::find_multiple`. 
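Most of the churn across extract.pyx, find.pyx, and the other touched bindings is one mechanical cleanup: under Cython 3, assigning a C++ function's return value to a typed local already compiles to an rvalue (move) assignment, so wrapping the call itself in `move()` was redundant. A self-contained sketch of the idiom, with a hypothetical `make_widget` API standing in for the libcudf calls:

```cython
# distutils: language = c++
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move

cdef extern from "widget.hpp" nogil:  # hypothetical header
    cdef cppclass widget
    unique_ptr[widget] make_widget()

cdef unique_ptr[widget] w
with nogil:
    # The call expression is a temporary, so Cython move-assigns it;
    # the old `w = move(make_widget())` spelling added nothing.
    w = make_widget()

# An explicit move() is still required to transfer ownership out of a
# named local, which is why `move(c_result)` survives in every binding.
cdef unique_ptr[widget] w2 = move(w)
```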
+ + Parameters + ---------- + input : Column + Strings instance for this operation + targets : Column + Strings to search for in each string + + Returns + ------- + Column + Lists column with character position values + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_find_multiple.find_multiple( + input.view(), + targets.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 54afa088141..3c35a9c9aa9 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -4,4 +4,5 @@ from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram +cpdef Column find_re(Column input, RegexProgram pattern) cpdef Column findall(Column input, RegexProgram pattern) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 3a6b87504b3..89fa4302824 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -30,11 +30,39 @@ cpdef Column findall(Column input, RegexProgram pattern): cdef unique_ptr[column] c_result with nogil: - c_result = move( - cpp_findall.findall( - input.view(), - pattern.c_obj.get()[0] - ) + c_result = cpp_findall.findall( + input.view(), + pattern.c_obj.get()[0] + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column find_re(Column input, RegexProgram pattern): + """ + Returns character positions where the pattern first matches + the elements in input strings. + + For details, see :cpp:func:`cudf::strings::find_re` + + Parameters + ---------- + input : Column + Strings instance for this operation + pattern : RegexProgram + Regex pattern + + Returns + ------- + Column + New column of integers + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_findall.find_re( + input.view(), + pattern.c_obj.get()[0] ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd new file mode 100644 index 00000000000..a035a5ad187 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.string cimport string +from pylibcudf.column cimport Column +from pylibcudf.libcudf.strings.side_type cimport side_type +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char) + +cpdef Column zfill(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx new file mode 100644 index 00000000000..f6950eecf60 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -0,0 +1,71 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport padding as cpp_padding +from pylibcudf.libcudf.strings.side_type cimport side_type + + +cpdef Column pad(Column input, size_type width, side_type side, str fill_char): + """ + Add padding to each string using a provided character. + + For details, see :cpp:func:`cudf::strings::pad`. 
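`find_re` rounds out the regex family: where `findall` materializes every match per row as a lists column, `find_re` returns only the position of the first match. A sketch; the `RegexProgram.create(pattern, flags)` factory is assumed to be pylibcudf's existing spelling:

```python
import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["a1b22", "no digits"]))
prog = plc.strings.regex_program.RegexProgram.create(
    r"\d+", plc.strings.regex_flags.RegexFlags.DEFAULT
)

matches = plc.strings.findall.findall(s, prog)  # lists column: [["1", "22"], []]
first = plc.strings.findall.find_re(s, prog)    # first match position per row
```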
+ + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + side : SideType + Where to place the padding characters. + fill_char : str + Single UTF-8 character to use for padding + + Returns + ------- + Column + New column with padded strings. + """ + cdef unique_ptr[column] c_result + cdef string c_fill_char = fill_char.encode("utf-8") + + with nogil: + c_result = cpp_padding.pad( + input.view(), + width, + side, + c_fill_char, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column zfill(Column input, size_type width): + """ + Add '0' as padding to the left of each string. + + For details, see :cpp:func:`cudf::strings::zfill`. + + Parameters + ---------- + input : Column + Strings instance for this operation + width : int + The minimum number of characters for each string. + + Returns + ------- + Column + New column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_padding.zfill( + input.view(), + width, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index 5f627218f6e..fb2bb13c666 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -31,19 +31,15 @@ cpdef Column repeat_strings(Column input, ColumnorSizeType repeat_times): if ColumnorSizeType is Column: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times.view() - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times.view() ) elif ColumnorSizeType is size_type: with nogil: - c_result = move( - cpp_repeat.repeat_strings( - input.view(), - repeat_times - ) + c_result = cpp_repeat.repeat_strings( + input.view(), + repeat_times ) else: raise ValueError("repeat_times must be size_type or integer") diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index 9d0ebf4a814..6db7f04fcbb 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -55,12 +55,12 @@ cpdef Column replace( repl_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace( + c_result = cpp_replace( input.view(), target_str[0], repl_str[0], maxrepl, - )) + ) return Column.from_libcudf(move(c_result)) @@ -98,11 +98,11 @@ cpdef Column replace_multiple( cdef unique_ptr[column] c_result with nogil: - c_result = move(cpp_replace_multiple( + c_result = cpp_replace_multiple( input.view(), target.view(), repl.view(), - )) + ) return Column.from_libcudf(move(c_result)) @@ -151,11 +151,11 @@ cpdef Column replace_slice( cdef const string_scalar* scalar_str = (repl.c_obj.get()) with nogil: - c_result = move(cpp_replace_slice( + c_result = cpp_replace_slice( input.view(), scalar_str[0], start, stop - )) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd new file mode 100644 index 00000000000..e27ccd55f7d --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
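A short sketch of the new padding entry points, which mirror `str.ljust`/`str.rjust`/`str.zfill` (interop helpers assumed as before):

```python
import pyarrow as pa
import pylibcudf as plc

s = plc.interop.from_arrow(pa.array(["7", "42"]))

# Pad on the left with '*' up to width 5, or zero-fill like str.zfill.
padded = plc.strings.padding.pad(s, 5, plc.strings.side_type.SideType.LEFT, "*")
zeroed = plc.strings.padding.zfill(s, 5)  # ["00007", "00042"]
```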
+ +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + +ctypedef fused Replacement: + Column + Scalar + +ctypedef fused Patterns: + RegexProgram + list + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=*, + size_type max_replace_count=*, + regex_flags flags=* +) + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx new file mode 100644 index 00000000000..ccc33fd4425 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -0,0 +1,134 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings cimport replace_re as cpp_replace_re +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.scalar cimport Scalar +from pylibcudf.strings.regex_flags cimport regex_flags +from pylibcudf.strings.regex_program cimport RegexProgram + + +cpdef Column replace_re( + Column input, + Patterns patterns, + Replacement replacement=None, + size_type max_replace_count=-1, + regex_flags flags=regex_flags.DEFAULT, +): + """ + For each string, replaces any character sequence matching the given patterns + with the provided replacement. + + For details, see :cpp:func:`cudf::strings::replace_re` + + Parameters + ---------- + input : Column + Strings instance for this operation. + patterns: RegexProgram or list[str] + If RegexProgram, the regex to match to each string. + If list[str], a list of regex strings to search within each string. + replacement : Scalar or Column + If Scalar, the string used to replace the matched sequence in each string. + ``patterns`` must be a RegexProgram. + If Column, the strings used for replacement. + ``patterns`` must be a list[str]. + max_replace_count : int + The maximum number of times to replace the matched pattern + within each string. ``patterns`` must be a RegexProgram. + Default replaces every substring that is matched. + flags : RegexFlags + Regex flags for interpreting special characters in the patterns. 
+ ``patterns`` must be a list[str] + + Returns + ------- + Column + New strings column + """ + cdef unique_ptr[column] c_result + cdef vector[string] c_patterns + + if Patterns is RegexProgram and Replacement is Scalar: + if replacement is None: + replacement = Scalar.from_libcudf( + cpp_make_string_scalar("".encode()) + ) + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + patterns.c_obj.get()[0], + dereference((replacement.get())), + max_replace_count + ) + ) + + return Column.from_libcudf(move(c_result)) + elif Patterns is list and Replacement is Column: + c_patterns.reserve(len(patterns)) + for pattern in patterns: + c_patterns.push_back(pattern.encode()) + + with nogil: + c_result = move( + cpp_replace_re.replace_re( + input.view(), + c_patterns, + replacement.view(), + flags, + ) + ) + + return Column.from_libcudf(move(c_result)) + else: + raise TypeError("Must pass either a RegexProgram and a Scalar or a list") + + +cpdef Column replace_with_backrefs( + Column input, + RegexProgram prog, + str replacement +): + """ + For each string, replaces any character sequence matching the given regex + using the replacement template for back-references. + + For details, see :cpp:func:`cudf::strings::replace_with_backrefs` + + Parameters + ---------- + input : Column + Strings instance for this operation. + + prog: RegexProgram + Regex program instance. + + replacement : str + The replacement template for creating the output string. + + Returns + ------- + Column + New strings column. + """ + cdef unique_ptr[column] c_result + cdef string c_replacement = replacement.encode() + + with nogil: + c_result = cpp_replace_re.replace_with_backrefs( + input.view(), + prog.c_obj.get()[0], + c_replacement, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/strings/side_type.pxd index 34b7a580380..34b03e9bc27 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pxd +++ b/python/pylibcudf/pylibcudf/strings/side_type.pxd @@ -1,3 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type cimport side_type diff --git a/python/pylibcudf/pylibcudf/strings/side_type.pyx b/python/pylibcudf/pylibcudf/strings/side_type.pyx index acdc7d6ff1f..cf0c770cc11 100644 --- a/python/pylibcudf/pylibcudf/strings/side_type.pyx +++ b/python/pylibcudf/pylibcudf/strings/side_type.pyx @@ -1,4 +1,3 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - from pylibcudf.libcudf.strings.side_type import \ side_type as SideType # no-cython-lint diff --git a/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt new file mode 100644 index 00000000000..8f544f6f537 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/CMakeLists.txt @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +set(cython_sources partition.pyx split.pyx) + +set(linked_libraries cudf::cudf) +rapids_cython_create_modules( + CXX + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf +) diff --git a/python/cudf_kafka/cudf_kafka/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd similarity index 56% rename from python/cudf_kafka/cudf_kafka/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.pxd index 7b0a9f29fb1..72086e57d9f 100644 --- a/python/cudf_kafka/cudf_kafka/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.pxd @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . cimport partition, split diff --git a/python/cudf_polars/tests/pytest.ini b/python/pylibcudf/pylibcudf/strings/split/__init__.py similarity index 56% rename from python/cudf_polars/tests/pytest.ini rename to python/pylibcudf/pylibcudf/strings/split/__init__.py index 7b0a9f29fb1..2033e5e275b 100644 --- a/python/cudf_polars/tests/pytest.ini +++ b/python/pylibcudf/pylibcudf/strings/split/__init__.py @@ -1,4 +1,2 @@ # Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -addopts = --tb=native +from . import partition, split diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd new file mode 100644 index 00000000000..c18257a4787 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + + +cpdef Table partition(Column input, Scalar delimiter=*) + +cpdef Table rpartition(Column input, Scalar delimiter=*) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx new file mode 100644 index 00000000000..0fb4f186c41 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -0,0 +1,91 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from pylibcudf.libcudf.scalar.scalar_factories cimport ( + make_string_scalar as cpp_make_string_scalar, +) +from pylibcudf.libcudf.strings.split cimport partition as cpp_partition +from pylibcudf.libcudf.table.table cimport table +from pylibcudf.scalar cimport Scalar +from pylibcudf.table cimport Table + +from cython.operator import dereference + + +cpdef Table partition(Column input, Scalar delimiter=None): + """ + Returns a set of 3 columns by splitting each string using the + specified delimiter. + + For details, see :cpp:func:`cudf::strings::partition`. + + Parameters + ---------- + input : Column + Strings instance for this operation + + delimiter : Scalar + UTF-8 encoded string indicating where to split each string. 
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    c_delimiter = <const string_scalar*>(delimiter.c_obj.get())
+
+    with nogil:
+        c_result = cpp_partition.partition(
+            input.view(),
+            dereference(c_delimiter)
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Table rpartition(Column input, Scalar delimiter=None):
+    """
+    Returns a set of 3 columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rpartition`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating where to split each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter
+
+    if delimiter is None:
+        delimiter = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    c_delimiter = <const string_scalar*>(delimiter.c_obj.get())
+
+    with nogil:
+        c_result = cpp_partition.rpartition(
+            input.view(),
+            dereference(c_delimiter)
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd
new file mode 100644
index 00000000000..355a1874298
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit)
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit)
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit)
+
+cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit)
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx
new file mode 100644
index 00000000000..e3827f6645e
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx
@@ -0,0 +1,310 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split cimport split as cpp_split
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.table cimport Table
+
+from cython.operator import dereference
+
+
+cpdef Table split(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter.
+
+    For details, see :cpp:func:`cudf::strings::split`.
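+
+    As an illustrative sketch of the expected behavior (not captured
+    output): splitting ``"a b c"`` on the delimiter ``" "`` with
+    ``maxsplit=1`` yields the two column values ``"a"`` and ``"b c"``.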
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.split(
+            strings_column.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+
+cpdef Table rsplit(Column strings_column, Scalar delimiter, size_type maxsplit):
+    """
+    Returns a list of columns by splitting each string using the
+    specified delimiter starting from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit`.
+
+    Parameters
+    ----------
+    strings_column : Column
+        Strings instance for this operation
+
+    delimiter : Scalar
+        UTF-8 encoded string indicating the split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Table
+        New table of strings columns.
+    """
+    cdef unique_ptr[table] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.rsplit(
+            strings_column.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Table.from_libcudf(move(c_result))
+
+cpdef Column split_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings.
+
+    For details, see :cpp:func:`cudf::strings::split_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.split_record(
+            strings.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column rsplit_record(Column strings, Scalar delimiter, size_type maxsplit):
+    """
+    Splits individual strings elements into a list of strings starting
+    from the end of each string.
+
+    For details, see :cpp:func:`cudf::strings::rsplit_record`.
+
+    Parameters
+    ----------
+    strings : Column
+        A column of string elements to be split.
+
+    delimiter : Scalar
+        The string to identify split points in each string.
+
+    maxsplit : int
+        Maximum number of splits to perform. -1 indicates all possible
+        splits on each string.
+
+    Returns
+    -------
+    Column
+        Lists column of strings.
+    """
+    cdef unique_ptr[column] c_result
+    cdef const string_scalar* c_delimiter = <const string_scalar*>(
+        delimiter.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_split.rsplit_record(
+            strings.view(),
+            dereference(c_delimiter),
+            maxsplit,
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table split_re(Column input, RegexProgram prog, size_type maxsplit):
+    """
+    Splits strings elements into a table of strings columns
+    using a regex_program's pattern to delimit each string.
+
+    For details, see :cpp:func:`cudf::strings::split_re`.
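+
+    For example (a sketch of the expected behavior): with the pattern
+    ``"[_,]"``, the single string ``"a_b,c"`` splits into the three
+    column values ``"a"``, ``"b"`` and ``"c"``.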
+ + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = cpp_split.split_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Table rsplit_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a table of strings columns + using a regex_program's pattern to delimit each string starting from + the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Table + A table of columns of strings. + """ + cdef unique_ptr[table] c_result + + with nogil: + c_result = cpp_split.rsplit_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Table.from_libcudf(move(c_result)) + +cpdef Column split_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string. + + For details, see :cpp:func:`cudf::strings::split_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_split.split_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column rsplit_record_re(Column input, RegexProgram prog, size_type maxsplit): + """ + Splits strings elements into a list column of strings using the given + regex_program to delimit each string starting from the end of the string. + + For details, see :cpp:func:`cudf::strings::rsplit_record_re`. + + Parameters + ---------- + input : Column + A column of string elements to be split. + + prog : RegexProgram + Regex program instance. + + maxsplit : int + Maximum number of splits to perform. -1 indicates all possible + splits on each string. + + Returns + ------- + Column + Lists column of strings. 
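+
+    Examples
+    --------
+    A minimal usage sketch (assuming round-tripping through
+    ``plc.interop`` and pyarrow; the result is shown as a host list):
+
+    >>> import pyarrow as pa
+    >>> import pylibcudf as plc
+    >>> col = plc.interop.from_arrow(pa.array(["a1b2c3"]))
+    >>> prog = plc.strings.regex_program.RegexProgram.create(
+    ...     "[0-9]", plc.strings.regex_flags.RegexFlags.DEFAULT
+    ... )
+    >>> result = plc.strings.split.split.rsplit_record_re(col, prog, 1)
+    >>> plc.interop.to_arrow(result).to_pylist()
+    [['a1b2c', '']]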
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_split.rsplit_record_re( + input.view(), + prog.c_obj.get()[0], + maxsplit, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index a62c7ec4528..d85da8e6cdd 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -62,11 +62,9 @@ cpdef Column translate(Column input, dict chars_table): ) with nogil: - c_result = move( - cpp_translate.translate( - input.view(), - c_chars_table - ) + c_result = cpp_translate.translate( + input.view(), + c_chars_table ) return Column.from_libcudf(move(c_result)) @@ -111,12 +109,10 @@ cpdef Column filter_characters( ) with nogil: - c_result = move( - cpp_translate.filter_characters( - input.view(), - c_characters_to_filter, - keep_characters, - dereference(c_replacement), - ) + c_result = cpp_translate.filter_characters( + input.view(), + c_characters_to_filter, + keep_characters, + dereference(c_replacement), ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd new file mode 100644 index 00000000000..fcc86650acf --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd @@ -0,0 +1,7 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx new file mode 100644 index 00000000000..2ced250f837 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -0,0 +1,40 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport wrap as cpp_wrap +from pylibcudf.libcudf.types cimport size_type + + +cpdef Column wrap(Column input, size_type width): + """ + Wraps strings onto multiple lines shorter than `width` by + replacing appropriate white space with + new-line characters (ASCII 0x0A). + + For details, see :cpp:func:`cudf::strings::wrap`. + + Parameters + ---------- + input : Column + String column + + width : int + Maximum character width of a line within each string + + Returns + ------- + Column + Column of wrapped strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_wrap.wrap( + input.view(), + width, + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index 5f77b89a605..d0d6f2343d0 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -49,9 +49,7 @@ cdef class Table: calling libcudf algorithms, and should generally not be needed by users (even direct pylibcudf Cython users). 
""" - cdef vector[unique_ptr[column]] c_columns = move( - dereference(libcudf_tbl).release() - ) + cdef vector[unique_ptr[column]] c_columns = dereference(libcudf_tbl).release() cdef vector[unique_ptr[column]].size_type i return Table([ diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index 9f389fa42c4..d95849ef371 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -7,10 +7,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table + +import pylibcudf as plc from pylibcudf.io.types import CompressionType diff --git a/python/pylibcudf/pylibcudf/tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py index fdce6f353ca..a19a8835498 100644 --- a/python/pylibcudf/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/pylibcudf/tests/conftest.py @@ -8,8 +8,9 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest + +import pylibcudf as plc from pylibcudf.io.types import CompressionType sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common")) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py index 0cd5064a697..3d9d99ffa61 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py @@ -5,10 +5,11 @@ import fastavro import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_and_meta_eq +import pylibcudf as plc + avro_dtype_pairs = [ ("boolean", pa.bool_()), ("int", pa.int32()), diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ab26f23418d..22c83acc47c 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -5,9 +5,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( _convert_types, assert_table_and_meta_eq, @@ -15,6 +13,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_CSV_SOURCE_KWARGS = { "format": "csv", diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py index 9d976fedf00..453e5ce32a8 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_json.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py @@ -3,9 +3,7 @@ import pandas as pd import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.io.types import CompressionType from utils import ( assert_table_and_meta_eq, make_source, @@ -13,6 +11,9 @@ write_source_str, ) +import pylibcudf as plc +from pylibcudf.io.types import CompressionType + # Shared kwargs to pass to make_source _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py index 42b14b1feff..5ed660ba6cf 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import _convert_types, assert_table_and_meta_eq, make_source +import pylibcudf as plc + # Shared kwargs to pass to make_source _COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py index f6e843ccf66..41298601539 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py @@ -1,9 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from pyarrow.parquet import read_table +from utils import assert_table_and_meta_eq, make_source + +import pylibcudf as plc from pylibcudf.expressions import ( ASTOperator, ColumnNameReference, @@ -11,7 +13,6 @@ Literal, Operation, ) -from utils import assert_table_and_meta_eq, make_source # Shared kwargs to pass to make_source _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py index 747f58ec8cf..0c43c363e55 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py @@ -2,9 +2,10 @@ import io -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo]) def io_class(request): diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py index 76b0424b2af..b3555013927 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import zoneinfo -import pylibcudf as plc import pytest +import pylibcudf as plc + def test_make_timezone_transition_table(): if len(zoneinfo.TZPATH) == 0: diff --git a/python/pylibcudf/pylibcudf/tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini deleted file mode 100644 index f572f85ca49..00000000000 --- a/python/pylibcudf/pylibcudf/tests/pytest.ini +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -[pytest] -xfail_strict = true -filterwarnings = - error - ignore:::.*xdist.* - ignore:::.*pytest.* -addopts = --tb=native diff --git a/python/pylibcudf/pylibcudf/tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py index f784cb3c191..bbb08e8b95a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_binaryops.py +++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py @@ -4,10 +4,11 @@ import numpy as np import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + def idfn(param): ltype, rtype, outtype, plc_op, _ = param diff --git a/python/pylibcudf/pylibcudf/tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py index 8cedbc6d42f..e317362a76b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_factories.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq +import pylibcudf as plc + EMPTY_COL_SIZE = 3 NUMERIC_TYPES = [ diff --git a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py index 0e129fdf0ef..24cd6b9e35f 100644 --- a/python/pylibcudf/pylibcudf/tests/test_column_from_device.py +++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq import rmm +import pylibcudf as plc + VALID_TYPES = [ pa.int8(), pa.int16(), diff --git a/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py new file mode 100644 index 00000000000..6d8b5993964 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_contiguous_split.py @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_table_eq + +import pylibcudf as plc + +param_pyarrow_tables = [ + pa.table([]), + pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), + pa.table({"a": [1, 2, 3]}), + pa.table({"a": [1], "b": [2], "c": [3]}), + pa.table({"a": ["a", "bb", "ccc"]}), + pa.table({"a": [1, 2, None], "b": [None, 3, 4]}), + pa.table( + { + "a": [["a", "b"], ["cde"]], + "b": [ + {"alpha": [1, 2], "beta": None}, + {"alpha": [3, 4], "beta": 5}, + ], + } + ), +] + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + res = plc.contiguous_split.unpack(packed) + assert_table_eq(arrow_tbl, res) + + +@pytest.mark.parametrize("arrow_tbl", param_pyarrow_tables) +def test_pack_and_unpack_from_memoryviews(arrow_tbl): + plc_tbl = plc.interop.from_arrow(arrow_tbl) + packed = plc.contiguous_split.pack(plc_tbl) + + metadata, gpudata = packed.release() + + with pytest.raises(ValueError, match="Cannot release empty"): + packed.release() + + del packed # `metadata` and `gpudata` will survive + + res = plc.contiguous_split.unpack_from_memoryviews(metadata, gpudata) + assert_table_eq(arrow_tbl, res) diff --git a/python/pylibcudf/pylibcudf/tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py index 628682d0a66..c0a41b96b1a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_copying.py +++ b/python/pylibcudf/pylibcudf/tests/test_copying.py @@ -2,7 +2,6 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import ( DEFAULT_STRUCT_TESTING_TYPE, @@ -16,6 +15,8 @@ metadata_from_arrow_type, ) +import pylibcudf as plc + # TODO: consider moving this to conftest and "pairing" # it with pa_type, so that they don't get out of sync diff --git a/python/pylibcudf/pylibcudf/tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py index 89c96829e71..a80ab8d9f65 100644 --- a/python/pylibcudf/pylibcudf/tests/test_datetime.py +++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py @@ -1,26 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import datetime -import functools import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq - -@pytest.fixture -def date_column(has_nulls): - values = [ - datetime.date(1999, 1, 1), - datetime.date(2024, 10, 12), - datetime.date(1, 1, 1), - datetime.date(9999, 1, 1), - ] - if has_nulls: - values[2] = None - return plc.interop.from_arrow(pa.array(values, type=pa.date32())) +import pylibcudf as plc @pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) @@ -40,24 +27,35 @@ def datetime_column(has_nulls, request): ) -@pytest.mark.parametrize( - "component, pc_fun", - [ - ("year", pc.year), - ("month", pc.month), - ("day", pc.day), - ("weekday", functools.partial(pc.day_of_week, count_from_zero=False)), - ("hour", pc.hour), - ("minute", pc.minute), - ("second", pc.second), - ("millisecond", pc.millisecond), - ("microsecond", pc.microsecond), - ("nanosecond", pc.nanosecond), +@pytest.fixture( + params=[ + ("year", plc.datetime.DatetimeComponent.YEAR), + ("month", plc.datetime.DatetimeComponent.MONTH), + ("day", plc.datetime.DatetimeComponent.DAY), + ("day_of_week", plc.datetime.DatetimeComponent.WEEKDAY), + ("hour", plc.datetime.DatetimeComponent.HOUR), + ("minute", plc.datetime.DatetimeComponent.MINUTE), + ("second", plc.datetime.DatetimeComponent.SECOND), + ("millisecond", plc.datetime.DatetimeComponent.MILLISECOND), + ("microsecond", plc.datetime.DatetimeComponent.MICROSECOND), + ("nanosecond", plc.datetime.DatetimeComponent.NANOSECOND), ], + ids=lambda x: x[0], ) -def test_extraction(datetime_column, component, pc_fun): +def component(request): + return request.param + + +def test_extract_datetime_component(datetime_column, component): + attr, component = component + kwargs = {} + if attr == "day_of_week": + kwargs = {"count_from_zero": False} got = plc.datetime.extract_datetime_component(datetime_column, component) # libcudf produces an int16, arrow produces an int64 - expect = pc_fun(plc.interop.to_arrow(datetime_column)).cast(pa.int16()) + + expect = getattr(pc, attr)( + plc.interop.to_arrow(datetime_column), **kwargs + ).cast(pa.int16()) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py index 5894ef4624c..6eabd6db617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_expressions.py +++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + # We can't really evaluate these expressions, so just make sure # construction works properly diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py index 01c998f16d4..af80b6e5978 100644 --- a/python/pylibcudf/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/pylibcudf/tests/test_interop.py @@ -1,8 +1,12 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
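+# The tests added below exercise the DLPack interop: a table (or a
+# __dlpack__-capable numpy/cupy array) round-trips through a DLPack
+# capsule, while columns containing nulls cannot be represented and
+# raise a ValueError.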
+import cupy as cp
+import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 import pytest
+from utils import assert_table_eq
+
+import pylibcudf as plc
 
 
 def test_list_dtype_roundtrip():
@@ -66,3 +70,31 @@ def test_decimal_other(data_type):
     arrow_type = plc.interop.to_arrow(data_type, precision=precision)
 
     assert arrow_type == pa.decimal128(precision, 0)
+
+
+def test_round_trip_dlpack_plc_table():
+    expected = pa.table({"a": [1, 2, 3], "b": [5, 6, 7]})
+    plc_table = plc.interop.from_arrow(expected)
+    result = plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+    assert_table_eq(expected, result)
+
+
+@pytest.mark.parametrize("array", [np.array, cp.array])
+def test_round_trip_dlpack_array(array):
+    arr = array([1, 2, 3])
+    result = plc.interop.from_dlpack(arr.__dlpack__())
+    expected = pa.table({"a": [1, 2, 3]})
+    assert_table_eq(expected, result)
+
+
+def test_to_dlpack_error():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": [1, None, 3], "b": [5, 6, 7]})
+    )
+    with pytest.raises(ValueError, match="Cannot create a DLPack tensor"):
+        plc.interop.from_dlpack(plc.interop.to_dlpack(plc_table))
+
+
+def test_from_dlpack_error():
+    with pytest.raises(ValueError, match="Invalid PyCapsule object"):
+        plc.interop.from_dlpack(1)
diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py
index 61e02f4d28d..f43a56046a4 100644
--- a/python/pylibcudf/pylibcudf/tests/test_join.py
+++ b/python/pylibcudf/pylibcudf/tests/test_join.py
@@ -2,9 +2,10 @@
 
 import numpy as np
 import pyarrow as pa
-import pylibcudf as plc
 from utils import assert_table_eq
 
+import pylibcudf as plc
+
 
 def test_cross_join():
     left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
diff --git a/python/pylibcudf/pylibcudf/tests/test_json.py b/python/pylibcudf/pylibcudf/tests/test_json.py
new file mode 100644
index 00000000000..486a9524e92
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_json.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def plc_col():
+    arr = pa.array(
+        ['{"foo": {"bar": [{"a": 1, "b": 2}, {"a": 3, "b": 4}]}}', None]
+    )
+    return plc.interop.from_arrow(arr)
+
+
+@pytest.fixture(scope="module")
+def json_path():
+    slr = pa.scalar("$.foo.bar")
+    return plc.interop.from_arrow(slr)
+
+
+@pytest.mark.parametrize("allow_single_quotes", [True, False])
+@pytest.mark.parametrize("strip_quotes_from_single_strings", [True, False])
+@pytest.mark.parametrize("missing_fields_as_nulls", [True, False])
+def test_get_json_object(
+    plc_col,
+    json_path,
+    allow_single_quotes,
+    strip_quotes_from_single_strings,
+    missing_fields_as_nulls,
+):
+    result = plc.json.get_json_object(
+        plc_col,
+        json_path,
+        plc.json.GetJsonObjectOptions(
+            allow_single_quotes=allow_single_quotes,
+            strip_quotes_from_single_strings=strip_quotes_from_single_strings,
+            missing_fields_as_nulls=missing_fields_as_nulls,
+        ),
+    )
+    expected = pa.array(['[{"a": 1, "b": 2}, {"a": 3, "b": 4}]', None])
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
index f7fb7463b50..beacfc63ce5 100644
--- a/python/pylibcudf/pylibcudf/tests/test_labeling.py
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("left_inclusive", [True, False]) @pytest.mark.parametrize("right_inclusive", [True, False]) diff --git a/python/pylibcudf/pylibcudf/tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py index 2353a6ff8f9..f3ef555f11d 100644 --- a/python/pylibcudf/pylibcudf/tests/test_lists.py +++ b/python/pylibcudf/pylibcudf/tests/test_lists.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def test_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py index 3edcae59edc..cd3da856de2 100644 --- a/python/pylibcudf/pylibcudf/tests/test_null_mask.py +++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py @@ -1,12 +1,13 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest -from pylibcudf.null_mask import MaskState import rmm +import pylibcudf as plc +from pylibcudf.null_mask import MaskState + @pytest.fixture(params=[False, True]) def nullable(request): diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py new file mode 100644 index 00000000000..8b14e0db576 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_edit_distance.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def edit_distance_data(): + arr1 = ["hallo", "goodbye", "world"] + arr2 = ["hello", "", "world"] + return pa.array(arr1), pa.array(arr2) + + +def test_edit_distance(edit_distance_data): + input_col, targets = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + ) + expected = pa.array([1, 7, 0], type=pa.int32()) + assert_column_eq(result, expected) + + +def test_edit_distance_matrix(edit_distance_data): + input_col, _ = edit_distance_data + result = plc.nvtext.edit_distance.edit_distance_matrix( + plc.interop.from_arrow(input_col) + ) + expected = pa.array( + [[0, 7, 4], [7, 0, 6], [4, 6, 0]], type=pa.list_(pa.int32()) + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py new file mode 100644 index 00000000000..fae4685f81b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
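+#
+# A string of length L has max(0, L - n + 1) character n-grams, e.g. "cde"
+# has the 2-grams ["cd", "de"] and the single 3-gram ["cde"]; the expected
+# values below encode exactly that.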
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["ab", "cde", "fgh"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngram", [2, 3]) +@pytest.mark.parametrize("sep", ["_", "**", ","]) +def test_generate_ngrams(input_col, ngram, sep): + result = plc.nvtext.generate_ngrams.generate_ngrams( + plc.interop.from_arrow(input_col), + ngram, + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array([f"ab{sep}cde", f"cde{sep}fgh"]) + if ngram == 3: + expected = pa.array([f"ab{sep}cde{sep}fgh"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_generate_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.generate_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + expected = pa.array([["ab"], ["cd", "de"], ["fg", "gh"]]) + if ngram == 3: + expected = pa.array([[], ["cde"], ["fgh"]]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("ngram", [2, 3]) +def test_hash_character_ngrams(input_col, ngram): + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + plc.interop.from_arrow(input_col), + ngram, + ) + pa_result = plc.interop.to_arrow(result) + assert all( + len(got) == max(0, len(s.as_py()) - ngram + 1) + for got, s in zip(pa_result, input_col) + ) + assert pa_result.type == pa.list_( + pa.field("element", pa.uint32(), nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py new file mode 100644 index 00000000000..05fe7b53c16 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_jaccard.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_data(): + input1 = ["the fuzzy dog", "little piggy", "funny bunny", "chatty parrot"] + input2 = ["the fuzzy cat", "bitty piggy", "funny bunny", "silent partner"] + return pa.array(input1), pa.array(input2) + + +@pytest.mark.parametrize("width", [2, 3]) +def test_jaccard_index(input_data, width): + def get_tokens(s, width): + return [s[i : i + width] for i in range(len(s) - width + 1)] + + def jaccard_index(s1, s2, width): + x = set(get_tokens(s1, width)) + y = set(get_tokens(s2, width)) + return len(x & y) / len(x | y) + + input1, input2 = input_data + result = plc.nvtext.jaccard.jaccard_index( + plc.interop.from_arrow(input1), plc.interop.from_arrow(input2), width + ) + expected = pa.array( + [ + jaccard_index(s1.as_py(), s2.as_py(), width) + for s1, s2 in zip(input1, input2) + ], + type=pa.float32(), + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py new file mode 100644 index 00000000000..ead9ee094af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
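+#
+# minhash/minhash64 emit one hash per seed for every input row, so each
+# output list below must have len(seeds) entries, typed uint32 for minhash
+# and uint64 for minhash64; width is the substring window that is hashed.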
+ +import pyarrow as pa +import pytest + +import pylibcudf as plc + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_input_data(request): + input_arr = pa.array(["foo", "bar", "foo foo", "bar bar"]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def word_minhash_input_data(request): + input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]]) + seeds = pa.array([2, 3, 4, 5], request.param) + return input_arr, seeds, request.param + + +@pytest.mark.parametrize("width", [5, 12]) +def test_minhash(minhash_input_data, width): + input_arr, seeds, seed_type = minhash_input_data + minhash_func = ( + plc.nvtext.minhash.minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64 + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) + + +def test_word_minhash(word_minhash_input_data): + input_arr, seeds, seed_type = word_minhash_input_data + word_minhash_func = ( + plc.nvtext.minhash.word_minhash + if seed_type == pa.uint32() + else plc.nvtext.minhash.word_minhash64 + ) + result = word_minhash_func( + plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds) + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py new file mode 100644 index 00000000000..84748b5597e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_ngrams_tokenize.py @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["a*b*c*d", "a b c d", "a-b-c-d", "a*b c-d"] + return pa.array(arr) + + +@pytest.mark.parametrize("ngrams", [2, 3]) +@pytest.mark.parametrize("delim", ["*", " ", "-"]) +@pytest.mark.parametrize("sep", ["_", "&", ","]) +def test_ngrams_tokenize(input_col, ngrams, delim, sep): + def ngrams_tokenize(strings, ngrams, delim, sep): + tokens = [] + for s in strings: + ss = s.split(delim) + for i in range(len(ss) - ngrams + 1): + token = sep.join(ss[i : i + ngrams]) + tokens.append(token) + return tokens + + result = plc.nvtext.ngrams_tokenize.ngrams_tokenize( + plc.interop.from_arrow(input_col), + ngrams, + plc.interop.from_arrow(pa.scalar(delim)), + plc.interop.from_arrow(pa.scalar(sep)), + ) + expected = pa.array( + ngrams_tokenize(input_col.to_pylist(), ngrams, delim, sep) + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py new file mode 100644 index 00000000000..25b6d1389ec --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
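+#
+# normalize_spaces collapses runs of whitespace into single spaces and trims
+# the ends; normalize_characters pads punctuation with spaces and, when
+# do_lower is set, also lowercases and strips accents (compare the two
+# expected arrays below).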
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def norm_spaces_input_data(): + arr = ["a b", " c d\n", "e \t f "] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def norm_chars_input_data(): + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + return pa.array(arr) + + +def test_normalize_spaces(norm_spaces_input_data): + result = plc.nvtext.normalize.normalize_spaces( + plc.interop.from_arrow(norm_spaces_input_data) + ) + expected = pa.array(["a b", "c d", "e f"]) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalize_characters(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + do_lower, + ) + expected = pa.array( + ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + ) + if not do_lower: + expected = pa.array( + ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py new file mode 100644 index 00000000000..65687f31c85 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_replace.py @@ -0,0 +1,64 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.fixture(scope="module") +def targets(): + arr = ["the quick", "brown fox", "jumps*over the", "lazy dog"] + return pa.array(arr) + + +@pytest.mark.parametrize("delim", ["*", None]) +def test_replace_tokens(input_col, targets, delim): + replacements = pa.array(["slow", "cat", "looked", "rat"]) + result = plc.nvtext.replace.replace_tokens( + plc.interop.from_arrow(input_col), + plc.interop.from_arrow(targets), + plc.interop.from_arrow(replacements), + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array(["slow", "cat", "jumps*over the", "rat"]) + if not delim: + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("min_token_length", [4, 5]) +@pytest.mark.parametrize("replace", ["---", None]) +@pytest.mark.parametrize("delim", ["*", None]) +def test_filter_tokens(input_col, min_token_length, replace, delim): + result = plc.nvtext.replace.filter_tokens( + plc.interop.from_arrow(input_col), + min_token_length, + plc.interop.from_arrow(pa.scalar(replace)) if replace else None, + plc.interop.from_arrow(pa.scalar(delim)) if delim else None, + ) + expected = pa.array( + ["the quick", "brown fox", "jumps*over the", "lazy dog"] + ) + if not delim and not replace and min_token_length == 4: + expected = pa.array([" quick", "brown ", "jumps*over ", "lazy "]) + if not delim and not replace and min_token_length == 5: + expected = pa.array([" quick", "brown ", "jumps*over ", " "]) + if not delim and replace == "---" and min_token_length == 4: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "lazy ---"] + ) + if not delim and replace == "---" and min_token_length == 5: + expected = pa.array( + ["--- quick", "brown ---", "jumps*over ---", "--- ---"] + ) + assert_column_eq(result, expected) diff --git 
a/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_stemmer.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py index 444d0089d2c..c55e54cebc6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_partitioning.py +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def partitioning_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py index bac56691306..e4a24fb1c98 100644 --- a/python/pylibcudf/pylibcudf/tests/test_quantiles.py +++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py @@ -3,10 +3,11 @@ import numpy as np import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + # Map pylibcudf interpolation options to pyarrow options interp_mapping = { plc.types.Interpolation.LINEAR: "linear", diff --git a/python/pylibcudf/pylibcudf/tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py index 777315df538..52598f2c462 100644 --- a/python/pylibcudf/pylibcudf/tests/test_regex_program.py +++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py @@ -1,8 +1,9 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("pat", ["(", "*", "\\"]) def test_regex_program_invalid(pat): diff --git a/python/pylibcudf/pylibcudf/tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py index 01115bc363a..ef23e23766a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq, assert_table_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def reshape_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py index 0b30316b9a0..2526580bc13 100644 --- a/python/pylibcudf/pylibcudf/tests/test_round.py +++ b/python/pylibcudf/pylibcudf/tests/test_round.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(params=["float32", "float64"]) def column(request, has_nulls): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py index a1820def0b1..f461657281a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture() def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py index 176ccc55b96..3e31c75c38a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def str_data(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py index 233cc253b14..08ac371fd96 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_case.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def string_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py index bcd030c019e..06b44210d74 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_char_types.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_char_types.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_all_characters_of_type(): pa_array = pa.array(["1", "A"]) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_combine.py b/python/pylibcudf/pylibcudf/tests/test_string_combine.py new file mode 100644 index 00000000000..eea3ac68e84 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_combine.py @@ -0,0 +1,84 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
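+#
+# Semantics exercised below: concatenate joins each row's columns with the
+# separator. With a scalar separator, narep substitutes for null entries;
+# with a column separator, narep replaces null separators and col_narep
+# replaces null entries; col_narep together with a scalar separator raises
+# ValueError.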
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq
+
+import pylibcudf as plc
+
+
+def test_concatenate_scalar_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.scalar("-"))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "-b", "c-"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table, sep, narep=plc.interop.from_arrow(pa.scalar("!"))
+    )
+    expected = pa.array(["a-a", "!-b", "c-!"])
+    assert_column_eq(result, expected)
+
+    with pytest.raises(ValueError):
+        plc.strings.combine.concatenate(
+            plc_table,
+            sep,
+            narep=plc.interop.from_arrow(pa.scalar("!")),
+            col_narep=plc.interop.from_arrow(pa.scalar("?")),
+        )
+
+
+def test_concatenate_column_separator():
+    plc_table = plc.interop.from_arrow(
+        pa.table({"a": ["a", None, "c"], "b": ["a", "b", None]})
+    )
+    sep = plc.interop.from_arrow(pa.array(["-", "?", ","]))
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        sep,
+    )
+    expected = pa.array(["a-a", "?b", "c,"])
+    assert_column_eq(result, expected)
+
+    result = plc.strings.combine.concatenate(
+        plc_table,
+        plc.interop.from_arrow(pa.array([None, "?", ","])),
+        narep=plc.interop.from_arrow(pa.scalar("1")),
+        col_narep=plc.interop.from_arrow(pa.scalar("*")),
+    )
+    expected = pa.array(["a1a", "*?b", "c,*"])
+    assert_column_eq(result, expected)
+
+
+def test_join_strings():
+    pa_arr = pa.array(list("abc"))
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_strings(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+    )
+    expected = pa.array(["abc"])
+    assert_column_eq(result, expected)
+
+
+def test_join_list_elements():
+    pa_arr = pa.array([["a", "a"], ["b", "b"]])
+    sep = pa.scalar("")
+    result = plc.strings.combine.join_list_elements(
+        plc.interop.from_arrow(pa_arr),
+        plc.interop.from_arrow(sep),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.interop.from_arrow(pa.scalar("")),
+        plc.strings.combine.SeparatorOnNulls.YES,
+        plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
+    )
+    expected = pc.binary_join(pa.array([["a", "a"], ["b", "b"]]), sep)
+    assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
index 4e4dd7cbb00..ba9a4a7d3b8 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -2,10 +2,11 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
-import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
+import pylibcudf as plc
+
 
 @pytest.fixture(scope="module")
 def target_col():
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert.py b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
index e9e95459d0e..3f3f452c4f6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_convert.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_convert.py
@@ -1,12 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from datetime import datetime - import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture( scope="module", @@ -21,39 +20,16 @@ def timestamp_type(request): return request.param -@pytest.fixture( - scope="module", - params=[ - pa.duration("ns"), - pa.duration("us"), - pa.duration("ms"), - pa.duration("s"), - ], -) -def duration_type(request): - return request.param - - @pytest.fixture(scope="module") def pa_timestamp_col(): return pa.array(["2011-01-01", "2011-01-02", "2011-01-03"]) -@pytest.fixture(scope="module") -def pa_duration_col(): - return pa.array(["05:20:25"]) - - @pytest.fixture(scope="module") def plc_timestamp_col(pa_timestamp_col): return plc.interop.from_arrow(pa_timestamp_col) -@pytest.fixture(scope="module") -def plc_duration_col(pa_duration_col): - return plc.interop.from_arrow(pa_duration_col) - - @pytest.mark.parametrize("format", ["%Y-%m-%d"]) def test_to_datetime( pa_timestamp_col, plc_timestamp_col, timestamp_type, format @@ -62,24 +38,6 @@ def test_to_datetime( got = plc.strings.convert.convert_datetime.to_timestamps( plc_timestamp_col, plc.interop.from_arrow(timestamp_type), - format.encode(), - ) - assert_column_eq(expect, got) - - -@pytest.mark.parametrize("format", ["%H:%M:%S"]) -def test_to_duration(pa_duration_col, plc_duration_col, duration_type, format): - def to_timedelta(duration_str): - date = datetime.strptime(duration_str, format) - return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date - - expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( - duration_type - ) - - got = plc.strings.convert.convert_durations.to_durations( - plc_duration_col, - plc.interop.from_arrow(duration_type), - format.encode(), + format, ) assert_column_eq(expect, got) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py new file mode 100644 index 00000000000..b391d2b290e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_booleans.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_booleans(): + pa_array = pa.array(["true", None, "True"]) + result = plc.strings.convert.convert_booleans.to_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("True")), + ) + expected = pa.array([False, None, True]) + assert_column_eq(result, expected) + + +def test_from_booleans(): + pa_array = pa.array([True, None, False]) + result = plc.strings.convert.convert_booleans.from_booleans( + plc.interop.from_arrow(pa_array), + plc.interop.from_arrow(pa.scalar("A")), + plc.interop.from_arrow(pa.scalar("B")), + ) + expected = pa.array(["A", None, "B"]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py new file mode 100644 index 00000000000..c9368d858a4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_datetime.py @@ -0,0 +1,47 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import datetime + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture +def fmt(): + return "%Y-%m-%dT%H:%M:%S" + + +def test_to_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None]) + result = plc.strings.convert.convert_datetime.to_timestamps( + plc.interop.from_arrow(arr), + plc.DataType(plc.TypeId.TIMESTAMP_SECONDS), + fmt, + ) + expected = pc.strptime(arr, fmt, "s") + assert_column_eq(result, expected) + + +def test_from_timestamp(fmt): + arr = pa.array([datetime.datetime(2020, 1, 1, 1, 1, 1), None]) + result = plc.strings.convert.convert_datetime.from_timestamps( + plc.interop.from_arrow(arr), + fmt, + plc.interop.from_arrow(pa.array([], type=pa.string())), + ) + # pc.strftime will add the extra %f + expected = pa.array(["2020-01-01T01:01:01", None]) + assert_column_eq(result, expected) + + +def test_is_timestamp(fmt): + arr = pa.array(["2020-01-01T01:01:01", None, "2020-01-01"]) + result = plc.strings.convert.convert_datetime.is_timestamp( + plc.interop.from_arrow(arr), + fmt, + ) + expected = pa.array([True, None, False]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py new file mode 100644 index 00000000000..2d3578e4e71 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_durations.py @@ -0,0 +1,62 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from datetime import datetime, timedelta + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture( + params=[ + pa.duration("ns"), + pa.duration("us"), + pa.duration("ms"), + pa.duration("s"), + ], +) +def duration_type(request): + return request.param + + +@pytest.fixture(scope="module") +def pa_duration_col(): + return pa.array(["05:20:25"]) + + +@pytest.fixture(scope="module") +def plc_duration_col(pa_duration_col): + return plc.interop.from_arrow(pa_duration_col) + + +def test_to_duration(pa_duration_col, plc_duration_col, duration_type): + format = "%H:%M:%S" + + def to_timedelta(duration_str): + date = datetime.strptime(duration_str, format) + return date - datetime(1900, 1, 1) # "%H:%M:%S" zero date + + expect = pa.array([to_timedelta(d.as_py()) for d in pa_duration_col]).cast( + duration_type + ) + + got = plc.strings.convert.convert_durations.to_durations( + plc_duration_col, + plc.interop.from_arrow(duration_type), + format, + ) + assert_column_eq(expect, got) + + +@pytest.mark.parametrize("format", [None, "%D days %H:%M:%S"]) +def test_from_durations(format): + pa_array = pa.array( + [timedelta(days=1, hours=1, minutes=1, seconds=1), None] + ) + result = plc.strings.convert.convert_durations.from_durations( + plc.interop.from_arrow(pa_array), format + ) + expected = pa.array(["1 days 01:01:01", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py new file mode 100644 index 00000000000..012e722038e --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_fixed_point.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
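+#
+# decimal128(38, 2) below means 38 significant digits with 2 of them after
+# the decimal point; strings like "1.2.3" or "" are not parseable as
+# fixed-point, which is what is_fixed_point reports.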
+import decimal + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_fixed_point(): + typ = pa.decimal128(38, 2) + arr = pa.array(["123", "1.23", None]) + result = plc.strings.convert.convert_fixed_point.to_fixed_point( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_fixed_point(): + arr = pa.array([decimal.Decimal("1.1"), None]) + result = plc.strings.convert.convert_fixed_point.from_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["1.1", None]) + assert_column_eq(result, expected) + + +def test_is_fixed_point(): + arr = pa.array(["123", "1.23", "1.2.3", "", None]) + result = plc.strings.convert.convert_fixed_point.is_fixed_point( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py new file mode 100644 index 00000000000..8ee2b5075af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_floats.py @@ -0,0 +1,34 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_floats(): + typ = pa.float32() + arr = pa.array(["-1.23", "1", None]) + result = plc.strings.convert.convert_floats.to_floats( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_floats(): + arr = pa.array([-1.23, 1, None]) + result = plc.strings.convert.convert_floats.from_floats( + plc.interop.from_arrow(arr), + ) + expected = pa.array(["-1.23", "1.0", None]) + assert_column_eq(result, expected) + + +def test_is_float(): + arr = pa.array(["-1.23", "1", "1.2.3", "A", None]) + result = plc.strings.convert.convert_floats.is_float( + plc.interop.from_arrow(arr), + ) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py new file mode 100644 index 00000000000..01192c2d1f8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_integers.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
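+# Covers plc.strings.convert.convert_integers, including the hex helpers:
+# is_integer optionally takes a target type to check range fit (uint8 below),
+# hex_to_integers parses "0x"-style strings, and integers_to_hex renders
+# negatives in two's complement, so -42 becomes "FFFFFFFFFFFFFFD6".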
+import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_to_integers(): + typ = pa.int8() + arr = pa.array(["1", "-1", None]) + result = plc.strings.convert.convert_integers.to_integers( + plc.interop.from_arrow(arr), plc.interop.from_arrow(typ) + ) + expected = arr.cast(typ) + assert_column_eq(result, expected) + + +def test_from_integers(): + arr = pa.array([1, -1, None]) + result = plc.strings.convert.convert_integers.from_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["1", "-1", None]) + assert_column_eq(result, expected) + + +def test_is_integer(): + arr = pa.array(["1", "-1", "1.2", "A", None]) + plc_column = plc.interop.from_arrow(arr) + result = plc.strings.convert.convert_integers.is_integer(plc_column) + expected = pa.array([True, True, False, False, None]) + assert_column_eq(result, expected) + + result = plc.strings.convert.convert_integers.is_integer( + plc_column, plc.interop.from_arrow(pa.uint8()) + ) + expected = pa.array([True, False, False, False, None]) + assert_column_eq(result, expected) + + +def test_hex_to_integers(): + typ = pa.int32() + data = ["0xff", "0x2a", None] + result = plc.strings.convert.convert_integers.hex_to_integers( + plc.interop.from_arrow(pa.array(data)), plc.interop.from_arrow(typ) + ) + expected = pa.array( + [int(val, 16) if isinstance(val, str) else val for val in data], + type=typ, + ) + assert_column_eq(result, expected) + + +def test_is_hex(): + arr = pa.array(["0xff", "123", "!", None]) + result = plc.strings.convert.convert_integers.is_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, True, False, None]) + assert_column_eq(result, expected) + + +def test_integers_to_hex(): + data = [255, -42, None] + arr = pa.array(data) + result = plc.strings.convert.convert_integers.integers_to_hex( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["FF", "FFFFFFFFFFFFFFD6", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py new file mode 100644 index 00000000000..b533809f106 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_ipv4.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_ipv4_to_integers(): + arr = pa.array(["123.45.67.890", None]) + result = plc.strings.convert.convert_ipv4.ipv4_to_integers( + plc.interop.from_arrow(arr) + ) + expected = pa.array([2066564730, None], type=pa.uint32()) + assert_column_eq(result, expected) + + +def test_integers_to_ipv4(): + arr = pa.array([1, 0, None], type=pa.uint32()) + result = plc.strings.convert.convert_ipv4.integers_to_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array(["0.0.0.1", "0.0.0.0", None]) + assert_column_eq(result, expected) + + +def test_is_ipv4(): + arr = pa.array(["0.0.0.1", "1.2.34", "A", None]) + result = plc.strings.convert.convert_ipv4.is_ipv4( + plc.interop.from_arrow(arr) + ) + expected = pa.array([True, False, False, None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py new file mode 100644 index 00000000000..737036a4f0f --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_lists.py @@ -0,0 +1,22 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
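+# format_list_column renders a LIST-of-strings column as a strings column;
+# na_rep and separators are optional, so both None and the explicit values
+# used in the expected output are parametrized below.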
+ +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("na_rep", [None, pa.scalar("")]) +@pytest.mark.parametrize("separators", [None, pa.array([",", "[", "]"])]) +def test_format_list_column(na_rep, separators): + arr = pa.array([["1", "A"], None]) + result = plc.strings.convert.convert_lists.format_list_column( + plc.interop.from_arrow(arr), + na_rep if na_rep is None else plc.interop.from_arrow(na_rep), + separators + if separators is None + else plc.interop.from_arrow(separators), + ) + expected = pa.array(["[1,A]", ""]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py new file mode 100644 index 00000000000..528736798c7 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py @@ -0,0 +1,37 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import urllib + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_url_encode(): + data = ["/home/nfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_encode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.quote(url, safe="") if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) + + +def test_url_decode(): + data = ["%2Fhome%2fnfs", None] + arr = pa.array(data) + result = plc.strings.convert.convert_urls.url_decode( + plc.interop.from_arrow(arr) + ) + expected = pa.array( + [ + urllib.parse.unquote(url) if isinstance(url, str) else url + for url in data + ] + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_extract.py b/python/pylibcudf/pylibcudf/tests/test_string_extract.py index 788b86423c4..e70edf4fb33 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_extract.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_extract.py @@ -2,6 +2,7 @@ import pyarrow as pa import pyarrow.compute as pc + import pylibcudf as plc diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py index db3b13a5aae..82ec18832a9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_find.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py new file mode 100644 index 00000000000..fa9eee3594b --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_find_multiple.py @@ -0,0 +1,23 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
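+# find_multiple returns one LIST<INT32> row per input string holding the
+# first-match position of each target, with -1 where a target is absent,
+# mirroring per-target str.find as the expected value shows.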
+ +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_find_multiple(): + arr = pa.array(["abc", "def"]) + targets = pa.array(["a", "c", "e"]) + result = plc.strings.find_multiple.find_multiple( + plc.interop.from_arrow(arr), + plc.interop.from_arrow(targets), + ) + expected = pa.array( + [ + [elem.find(target) for target in targets.to_pylist()] + for elem in arr.to_pylist() + ], + type=pa.list_(pa.int32()), + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_findall.py b/python/pylibcudf/pylibcudf/tests/test_string_findall.py index 994552fa276..b73d812c898 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_findall.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_findall.py @@ -2,9 +2,10 @@ import re import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_findall(): arr = pa.array(["bunny", "rabbit", "hare", "dog"]) @@ -21,3 +22,20 @@ def test_findall(): type=pa_result.type, ) assert_column_eq(result, expected) + + +def test_find_re(): + arr = pa.array(["bunny", "rabbit", "hare", "dog"]) + pattern = "[eb]" + result = plc.strings.findall.find_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pattern, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + ) + pa_result = plc.interop.to_arrow(result) + expected = pa.array( + [0, 2, 3, -1], + type=pa_result.type, + ) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_padding.py b/python/pylibcudf/pylibcudf/tests/test_string_padding.py new file mode 100644 index 00000000000..79498132097 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_padding.py @@ -0,0 +1,27 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
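+# pad with SideType.LEFT and zfill both fill to the requested width from
+# the left, so each is compared against pyarrow.compute.utf8_lpad with the
+# matching fill character.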
+ +import pyarrow as pa +import pyarrow.compute as pc + +import pylibcudf as plc + + +def test_pad(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.pad( + plc.interop.from_arrow(arr), + 2, + plc.strings.side_type.SideType.LEFT, + "!", + ) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="!")) + assert result.equals(expected) + + +def test_zfill(): + arr = pa.array(["a", "1", None]) + plc_result = plc.strings.padding.zfill(plc.interop.from_arrow(arr), 2) + result = plc.interop.to_arrow(plc_result) + expected = pa.chunked_array(pc.utf8_lpad(arr, 2, padding="0")) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py index 18b5d8bf4d0..c06c06be7c6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_repeat.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_repeat.py @@ -2,9 +2,10 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize("repeats", [pa.array([2, 2]), 2]) def test_repeat_strings(repeats): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py index 5a9c2007b73..2c7d25133de 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_replace.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py @@ -2,10 +2,11 @@ import pyarrow as pa import pyarrow.compute as pc -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py new file mode 100644 index 00000000000..511f826441a --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_replace_re.py @@ -0,0 +1,72 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.mark.parametrize("max_replace_count", [-1, 1]) +def test_replace_re_regex_program_scalar(max_replace_count): + arr = pa.array(["foo", "fuz", None]) + pat = "f." 
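+    # "f." matches the leading two characters of "foo" and "fuz";
+    # max_replace_count=-1 replaces every occurrence, while 1 stops after
+    # the first, mirrored below via pyarrow's max_replacements.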
+ repl = "ba" + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + pat, plc.strings.regex_flags.RegexFlags.DEFAULT + ), + plc.interop.from_arrow(pa.scalar(repl)), + max_replace_count=max_replace_count, + ) + expected = pc.replace_substring_regex( + arr, + pat, + repl, + max_replacements=max_replace_count + if max_replace_count != -1 + else None, + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize( + "flags", + [ + plc.strings.regex_flags.RegexFlags.DEFAULT, + plc.strings.regex_flags.RegexFlags.DOTALL, + ], +) +def test_replace_re_list_str_columns(flags): + arr = pa.array(["foo", "fuz", None]) + pats = ["oo", "uz"] + repls = ["a", "b"] + result = plc.strings.replace_re.replace_re( + plc.interop.from_arrow(arr), + pats, + plc.interop.from_arrow(pa.array(repls)), + flags=flags, + ) + expected = arr + for pat, repl in zip(pats, repls): + expected = pc.replace_substring_regex( + expected, + pat, + repl, + ) + assert_column_eq(result, expected) + + +def test_replace_with_backrefs(): + arr = pa.array(["Z756", None]) + result = plc.strings.replace_re.replace_with_backrefs( + plc.interop.from_arrow(arr), + plc.strings.regex_program.RegexProgram.create( + "(\\d)(\\d)", plc.strings.regex_flags.RegexFlags.DEFAULT + ), + "V\\2\\1", + ) + expected = pa.array(["ZV576", None]) + assert_column_eq(result, expected) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py index d9ce5591b98..1759f739e31 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_slice.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture(scope="module") def pa_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py new file mode 100644 index 00000000000..4e80f19b814 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_partition.py @@ -0,0 +1,44 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def data_col(): + pa_arr = pa.array(["ab_cd", "def_g_h", None]) + plc_column = plc.interop.from_arrow(pa_arr) + return pa_arr, plc_column + + +def test_partition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.partition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def", None], + "b": ["_", "_", None], + "c": ["cd", "g_h", None], + } + ) + assert_table_eq(expected, result) + + +def test_rpartition(data_col): + pa_arr, plc_column = data_col + result = plc.strings.split.partition.rpartition( + plc_column, plc.interop.from_arrow(pa.scalar("_")) + ) + expected = pa.table( + { + "a": ["ab", "def_g", None], + "b": ["_", "_", None], + "c": ["cd", "h", None], + } + ) + assert_table_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_string_split_split.py b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py new file mode 100644 index 00000000000..450b336ce65 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_split_split.py @@ -0,0 +1,131 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
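+# split/rsplit produce a Table with one column per piece, while the
+# *_record variants produce a LIST column per row; the *_re forms take a
+# compiled RegexProgram in place of a scalar delimiter.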
+
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+import pylibcudf as plc
+
+
+@pytest.fixture
+def data_col():
+    pa_array = pa.array(["a_b_c", "d-e-f", None])
+    plc_column = plc.interop.from_arrow(pa_array)
+    return pa_array, plc_column
+
+
+@pytest.fixture
+def delimiter():
+    delimiter = "_"
+    plc_delimiter = plc.interop.from_arrow(pa.scalar(delimiter))
+    return delimiter, plc_delimiter
+
+
+@pytest.fixture
+def re_delimiter():
+    return "[_-]"
+
+
+def test_split(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.split(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a", "d-e-f", None],
+            "b": ["b_c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit(data_col, delimiter):
+    _, plc_column = data_col
+    _, plc_delimiter = delimiter
+    result = plc.strings.split.split.rsplit(plc_column, plc_delimiter, 1)
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e-f", None],
+            "b": ["c", None, None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.split_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record(data_col, delimiter):
+    pa_array, plc_column = data_col
+    delim, plc_delim = delimiter
+    result = plc.strings.split.split.rsplit_record(plc_column, plc_delim, 1)
+    expected = pc.split_pattern(pa_array, delim, max_splits=1, reverse=True)
+    assert_column_eq(expected, result)
+
+
+def test_split_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.split_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a", "d", None],
+            "b": ["b_c", "e-f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_rsplit_re(data_col, re_delimiter):
+    _, plc_column = data_col
+    result = plc.strings.split.split.rsplit_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pa.table(
+        {
+            "a": ["a_b", "d-e", None],
+            "b": ["c", "f", None],
+        }
+    )
+    assert_table_eq(expected, result)
+
+
+def test_split_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.split_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter, max_splits=1)
+    assert_column_eq(expected, result)
+
+
+def test_rsplit_record_re(data_col, re_delimiter):
+    pa_array, plc_column = data_col
+    result = plc.strings.split.split.rsplit_record_re(
+        plc_column,
+        plc.strings.regex_program.RegexProgram.create(
+            re_delimiter, plc.strings.regex_flags.RegexFlags.DEFAULT
+        ),
+        -1,
+    )
+    expected = pc.split_pattern_regex(pa_array, re_delimiter)
+    assert_column_eq(expected, result)
diff --git a/python/pylibcudf/pylibcudf/tests/test_string_strip.py b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
index 005e5e4a405..5869e5f4920 100644
--- a/python/pylibcudf/pylibcudf/tests/test_string_strip.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_strip.py
@@ -1,10 +1,11 @@
 # Copyright (c) 2024, NVIDIA
CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + data_strings = [ "AbC", "123abc", diff --git a/python/pylibcudf/pylibcudf/tests/test_string_translate.py b/python/pylibcudf/pylibcudf/tests/test_string_translate.py index 2ae893e69fb..84fd3354ac6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_string_translate.py +++ b/python/pylibcudf/pylibcudf/tests/test_string_translate.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest from utils import assert_column_eq +import pylibcudf as plc + @pytest.fixture def data_col(): diff --git a/python/pylibcudf/pylibcudf/tests/test_string_wrap.py b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py new file mode 100644 index 00000000000..00442d866e9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_wrap.py @@ -0,0 +1,26 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import textwrap + +import pyarrow as pa +from utils import assert_column_eq + +import pylibcudf as plc + + +def test_wrap(): + width = 12 + pa_array = pa.array( + [ + "the quick brown fox jumped over the lazy brown dog", + "hello, world", + None, + ] + ) + result = plc.strings.wrap.wrap(plc.interop.from_arrow(pa_array), width) + expected = pa.array( + [ + textwrap.fill(val, width) if isinstance(val, str) else val + for val in pa_array.to_pylist() + ] + ) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py index e822d6a97a8..ac39ef4c5c9 100644 --- a/python/pylibcudf/pylibcudf/tests/test_table.py +++ b/python/pylibcudf/pylibcudf/tests/test_table.py @@ -1,9 +1,10 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import pyarrow as pa -import pylibcudf as plc import pytest +import pylibcudf as plc + @pytest.mark.parametrize( "arrow_tbl", diff --git a/python/pylibcudf/pylibcudf/tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py index d5c618f07e4..49802fe64ac 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transform.py +++ b/python/pylibcudf/pylibcudf/tests/test_transform.py @@ -3,9 +3,10 @@ import math import pyarrow as pa -import pylibcudf as plc from utils import assert_column_eq +import pylibcudf as plc + def test_nans_to_nulls(has_nans): if has_nans: diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py index ac11123f680..b0c0bc72ead 100644 --- a/python/pylibcudf/pylibcudf/tests/test_transpose.py +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -1,10 +1,11 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
import pyarrow as pa -import pylibcudf as plc import pytest from packaging.version import parse +import pylibcudf as plc + @pytest.mark.skipif( parse(pa.__version__) < parse("16.0.0"), diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index de425a27c15..bce9702752a 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -9,7 +9,8 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, size_type -from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer +from rmm.librmm.device_buffer cimport device_buffer +from rmm.pylibrmm.device_buffer cimport DeviceBuffer from .column cimport Column from .gpumemoryview cimport gpumemoryview @@ -34,7 +35,7 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.nans_to_nulls(input.view())) + c_result = cpp_transform.nans_to_nulls(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -58,7 +59,7 @@ cpdef tuple[gpumemoryview, int] bools_to_mask(Column input): cdef pair[unique_ptr[device_buffer], size_type] c_result with nogil: - c_result = move(cpp_transform.bools_to_mask(input.view())) + c_result = cpp_transform.bools_to_mask(input.view()) return ( gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))), @@ -87,7 +88,7 @@ cpdef Column mask_to_bools(Py_ssize_t bitmask, int begin_bit, int end_bit): cdef bitmask_type * bitmask_ptr = int_to_bitmask_ptr(bitmask) with nogil: - c_result = move(cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit)) + c_result = cpp_transform.mask_to_bools(bitmask_ptr, begin_bit, end_bit) return Column.from_libcudf(move(c_result)) @@ -118,10 +119,8 @@ cpdef Column transform(Column input, str unary_udf, DataType output_type, bool i cdef bool c_is_ptx = is_ptx with nogil: - c_result = move( - cpp_transform.transform( - input.view(), c_unary_udf, output_type.c_obj, c_is_ptx - ) + c_result = cpp_transform.transform( + input.view(), c_unary_udf, output_type.c_obj, c_is_ptx ) return Column.from_libcudf(move(c_result)) @@ -143,7 +142,7 @@ cpdef tuple[Table, Column] encode(Table input): cdef pair[unique_ptr[table], unique_ptr[column]] c_result with nogil: - c_result = move(cpp_transform.encode(input.view())) + c_result = cpp_transform.encode(input.view()) return ( Table.from_libcudf(move(c_result.first)), @@ -171,7 +170,7 @@ cpdef Table one_hot_encode(Column input, Column categories): cdef Table owner_table with nogil: - c_result = move(cpp_transform.one_hot_encode(input.view(), categories.view())) + c_result = cpp_transform.one_hot_encode(input.view(), categories.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index a708f6cc37f..a24f937ced3 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -29,7 +29,7 @@ cpdef Table transpose(Table input_table): cdef Table owner_table with nogil: - c_result = move(cpp_transpose.transpose(input_table.view())) + c_result = cpp_transpose.transpose(input_table.view()) owner_table = Table( [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() diff --git a/python/pylibcudf/pylibcudf/unary.pyx 
b/python/pylibcudf/pylibcudf/unary.pyx index 839360ef406..53e8c382b5e 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -34,7 +34,7 @@ cpdef Column unary_operation(Column input, unary_operator op): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.unary_operation(input.view(), op)) + result = cpp_unary.unary_operation(input.view(), op) return Column.from_libcudf(move(result)) @@ -57,7 +57,7 @@ cpdef Column is_null(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_null(input.view())) + result = cpp_unary.is_null(input.view()) return Column.from_libcudf(move(result)) @@ -80,7 +80,7 @@ cpdef Column is_valid(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_valid(input.view())) + result = cpp_unary.is_valid(input.view()) return Column.from_libcudf(move(result)) @@ -105,7 +105,7 @@ cpdef Column cast(Column input, DataType data_type): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.cast(input.view(), data_type.c_obj)) + result = cpp_unary.cast(input.view(), data_type.c_obj) return Column.from_libcudf(move(result)) @@ -128,7 +128,7 @@ cpdef Column is_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_nan(input.view())) + result = cpp_unary.is_nan(input.view()) return Column.from_libcudf(move(result)) @@ -151,7 +151,7 @@ cpdef Column is_not_nan(Column input): cdef unique_ptr[column] result with nogil: - result = move(cpp_unary.is_not_nan(input.view())) + result = cpp_unary.is_not_nan(input.view()) return Column.from_libcudf(move(result)) diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index a8224f54e1c..a80c85a1fa8 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -22,7 +22,8 @@ dependencies = [ "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<18.0.0a0", + "pyarrow>=14.0.0,<18.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<18.0.0a0; platform_machine=='x86_64'", "rmm==24.12.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
@@ -52,48 +53,31 @@ test = [ Homepage = "https://github.com/rapidsai/cudf" Documentation = "https://docs.rapids.ai/api/cudf/stable/" -[tool.isort] -line_length = 79 -multi_line_output = 3 -include_trailing_comma = true -force_grid_wrap = 0 -combine_as_imports = true -order_by_type = true -known_dask = [ - "dask", - "distributed", - "dask_cuda", -] -known_rapids = [ - "rmm", -] -known_first_party = [ - "cudf", -] -default_section = "THIRDPARTY" -sections = [ - "FUTURE", - "STDLIB", - "THIRDPARTY", - "DASK", - "RAPIDS", - "FIRSTPARTY", - "LOCALFOLDER", -] -skip = [ - "thirdparty", - ".eggs", - ".git", - ".hg", - ".mypy_cache", - ".tox", - ".venv", - "_build", - "buck-out", - "build", - "dist", - "__init__.py", +[tool.ruff] +extend = "../../pyproject.toml" + +[tool.ruff.lint.isort] +combine-as-imports = true +known-first-party = ["cudf"] +section-order = ["future", "standard-library", "third-party", "dask", "rapids", "first-party", "local-folder"] + +[tool.ruff.lint.isort.sections] +dask = ["dask", "distributed", "dask_cuda"] +rapids = ["rmm"] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] + +[tool.pytest.ini_options] +# --import-mode=importlib because two test_json.py exists and tests directory is not a structured module +addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" +empty_parameter_set_mark = "fail_at_collect" +filterwarnings = [ + "error", + "ignore:::.*xdist.*", + "ignore:::.*pytest.*" ] +xfail_strict = true [tool.rapids-build-backend] build-backend = "scikit_build_core.build"
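The [tool.ruff.lint.isort] settings above mirror the removed isort configuration: the custom dask and rapids sections sort between third-party and first-party imports, the same scheme the import-reorder hunks throughout this diff apply (moving `import pylibcudf as plc` into its own later group). A minimal sketch of the resulting order, using only module names declared in the config above:

from __future__ import annotations  # future

import os  # standard-library

import pyarrow as pa  # third-party

import dask  # custom "dask" section (dask, distributed, dask_cuda)

import rmm  # custom "rapids" section

import cudf  # first-party via known-first-party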