Merge branch 'branch-25.02' into go

rapidsai · Feb 3, 2025 · b70d6e2 · b70d6e2
2 parents a656870 + 2b7dce6
commit b70d6e2
Show file tree

Hide file tree

Showing 176 changed files with 30,049 additions and 1,495 deletions.
diff --git a/...ontainer/cuda12.5-conda/devcontainer.json → ...ontainer/cuda12.8-conda/devcontainer.json b/...ontainer/cuda12.5-conda/devcontainer.json → ...ontainer/cuda12.8-conda/devcontainer.json
@@ -3,15 +3,15 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
       "BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
@@ -20,7 +20,7 @@
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent",
@@ -29,7 +29,7 @@
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {

diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json → .devcontainer/cuda12.8-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json → .devcontainer/cuda12.8-pip/devcontainer.json
@@ -3,20 +3,20 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.5",
+      "CUDA": "12.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.18.0-openmpi-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ucx1.18.0-openmpi-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
-      "version": "12.5",
+      "version": "12.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
@@ -29,15 +29,15 @@
     "ghcr.io/rapidsai/devcontainers/features/cuda",
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent",
   "mounts": [
     "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -128,7 +128,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
+      container_image: "rapidsai/ci-conda:cuda12.8.0-ubuntu24.04-py3.12"
       run_script: "ci/build_docs.sh"
   rust-build:
     needs: conda-cpp-build
@@ -180,7 +180,7 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-25.02
     with:
       arch: '["amd64"]'
-      cuda: '["12.5"]'
+      cuda: '["12.8"]'
       build_command: |
         sccache -z;
         build-all --verbose;

diff --git a/.gitignore b/.gitignore
@@ -83,3 +83,6 @@ ivf_pq_index
 # cuvs_bench
 datasets/
 /*.json
+
+# java
+.classpath
diff --git a/build.sh b/build.sh
@@ -18,7 +18,7 @@ ARGS=$*
 # scripts, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcuvs python rust go docs tests bench-ann examples --uninstall  -v -g -n --compile-static-lib --allgpuarch --no-mg --no-cpu --cpu-only --no-shared-libs --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
+VALIDARGS="clean libcuvs python rust go java docs tests bench-ann examples --uninstall  -v -g -n --compile-static-lib --allgpuarch --no-mg --no-cpu --cpu-only --no-shared-libs --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
 HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
@@ -27,6 +27,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
    python           - build the cuvs Python package
    rust             - build the cuvs Rust bindings
    go               - build the cuvs Go bindings
+   java             - build the cuvs Java bindings
    docs             - build the documentation
    tests            - build the tests
    bench-ann        - build end-to-end ann benchmarks
@@ -62,7 +63,8 @@ SPHINX_BUILD_DIR=${REPODIR}/docs
 DOXYGEN_BUILD_DIR=${REPODIR}/cpp/doxygen
 PYTHON_BUILD_DIR=${REPODIR}/python/cuvs/_skbuild
 RUST_BUILD_DIR=${REPODIR}/rust/target
-BUILD_DIRS="${LIBCUVS_BUILD_DIR} ${PYTHON_BUILD_DIR} ${RUST_BUILD_DIR}"
+JAVA_BUILD_DIR=${REPODIR}/java/cuvs-java/target
+BUILD_DIRS="${LIBCUVS_BUILD_DIR} ${PYTHON_BUILD_DIR} ${RUST_BUILD_DIR} ${JAVA_BUILD_DIR}"
 
 # Set defaults for vars modified by flags to this script
 CMAKE_LOG_LEVEL=""
@@ -446,11 +448,21 @@ if (( ${NUMARGS} == 0 )) || hasArg rust; then
     cargo test
 fi
 
+# Build the cuvs Go bindings
+if (( ${NUMARGS} == 0 )) || hasArg go; then
+    cd ${REPODIR}/go
+    go build ./...
+    go test ./...
+fi
+
+# Build the cuvs Java bindings
+if (( ${NUMARGS} == 0 )) || hasArg java; then
+    # Reminder only: the message is printed and the Java build continues either way.
+    if ! hasArg libcuvs; then
+        echo "Please add 'libcuvs' to this script's arguments (ex. './build.sh libcuvs java') if libcuvs libraries are not already built"
+    fi
+    cd ${REPODIR}/java
+    ./build.sh
 fi
 
 export RAPIDS_VERSION="$(sed -E -e 's/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/' "${REPODIR}/VERSION")"

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
@@ -28,6 +28,7 @@ NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG
 # Need to distutils-normalize the original version
 NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
 NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))")
+PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))")
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 
@@ -96,3 +97,10 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapids-\${localWorkspaceFolderBasename}-${CURRENT_SHORT_TAG}@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done
+
+# Update Java API version
+NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}"
+sed_runner "s/VERSION=\".*\"/VERSION=\"${NEXT_FULL_JAVA_TAG}\"/g" java/build.sh
+for FILE in java/*/pom.xml; do
+  sed_runner "/<!--CUVS_JAVA#VERSION_UPDATE_MARKER_START-->.*<!--CUVS_JAVA#VERSION_UPDATE_MARKER_END-->/s//<!--CUVS_JAVA#VERSION_UPDATE_MARKER_START--><version>${NEXT_FULL_JAVA_TAG}<\/version><!--CUVS_JAVA#VERSION_UPDATE_MARKER_END-->/g" "${FILE}"
+done
diff --git a/...vironments/all_cuda-125_arch-aarch64.yaml → ...vironments/all_cuda-128_arch-aarch64.yaml b/...vironments/all_cuda-125_arch-aarch64.yaml → ...vironments/all_cuda-128_arch-aarch64.yaml
@@ -17,14 +17,14 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-profiler-api
 - cuda-python>=12.6.2,<13.0a0
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
 - doxygen>=1.8.20
-- go
 - gcc_linux-aarch64=13.*
+- go
 - graphviz
 - ipython
 - libclang==16.0.6
@@ -54,4 +54,4 @@ dependencies:
 - sysroot_linux-aarch64==2.28
 - pip:
   - nvidia-sphinx-theme
-name: all_cuda-125_arch-aarch64
+name: all_cuda-128_arch-aarch64
diff --git a/...nvironments/all_cuda-125_arch-x86_64.yaml → ...nvironments/all_cuda-128_arch-x86_64.yaml b/...nvironments/all_cuda-125_arch-x86_64.yaml → ...nvironments/all_cuda-128_arch-x86_64.yaml
@@ -17,14 +17,14 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-profiler-api
 - cuda-python>=12.6.2,<13.0a0
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
 - dlpack>=0.8,<1.0
 - doxygen>=1.8.20
-- go
 - gcc_linux-64=13.*
+- go
 - graphviz
 - ipython
 - libclang==16.0.6
@@ -54,4 +54,4 @@ dependencies:
 - sysroot_linux-64==2.28
 - pip:
   - nvidia-sphinx-theme
-name: all_cuda-125_arch-x86_64
+name: all_cuda-128_arch-x86_64
diff --git a/...ents/bench_ann_cuda-125_arch-aarch64.yaml → ...ents/bench_ann_cuda-128_arch-aarch64.yaml b/...ents/bench_ann_cuda-125_arch-aarch64.yaml → ...ents/bench_ann_cuda-128_arch-aarch64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-profiler-api
 - cuda-python>=12.6.2,<13.0a0
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cuvs==25.2.*,>=0.0.0a0
 - cxx-compiler
@@ -46,4 +46,4 @@ dependencies:
 - setuptools
 - sysroot_linux-aarch64==2.28
 - wheel
-name: bench_ann_cuda-125_arch-aarch64
+name: bench_ann_cuda-128_arch-aarch64
diff --git a/...ments/bench_ann_cuda-125_arch-x86_64.yaml → ...ments/bench_ann_cuda-128_arch-x86_64.yaml b/...ments/bench_ann_cuda-125_arch-x86_64.yaml → ...ments/bench_ann_cuda-128_arch-x86_64.yaml
@@ -18,7 +18,7 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-profiler-api
 - cuda-python>=12.6.2,<13.0a0
-- cuda-version=12.5
+- cuda-version=12.8
 - cupy>=12.0.0
 - cuvs==25.2.*,>=0.0.0a0
 - cxx-compiler
@@ -46,4 +46,4 @@ dependencies:
 - setuptools
 - sysroot_linux-64==2.28
 - wheel
-name: bench_ann_cuda-125_arch-x86_64
+name: bench_ann_cuda-128_arch-x86_64
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -173,6 +173,9 @@ include(cmake/modules/ConfigureCUDA.cmake)
 rapids_cpm_init()
 
 if(NOT BUILD_CPU_ONLY)
+  # We must find CCCL ourselves before raft so that we get the right version.
+  include(${rapids-cmake-dir}/cpm/cccl.cmake)
+  rapids_cpm_cccl(BUILD_EXPORT_SET cuvs-exports INSTALL_EXPORT_SET cuvs-exports)
   include(cmake/thirdparty/get_raft.cmake)
   include(cmake/thirdparty/get_cutlass.cmake)
 endif()

diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
@@ -37,6 +37,19 @@ enum class MemoryType {
   kHostMmap,
   kHostPinned,
   kDevice,
+  kManaged,
+};
+
+/**
+ * Request 2MB huge pages support for an allocation.
+ *
+ * The levels are ordered by increasing strictness (0 to 3): each enumerator's comment states
+ * what is expected to happen when huge pages cannot be provided (nothing, a warning, or an
+ * exception). This enum only names the policy; the code that applies it is not part of this
+ * definition.
+ */
+enum class HugePages {
+  /** Don't use huge pages if possible. */
+  kDisable = 0,
+  /** Enable huge pages if possible, ignore otherwise. */
+  kAsk = 1,
+  /** Enable huge pages if possible, warn the user otherwise. */
+  kRequire = 2,
+  /** Force enable huge pages, throw an exception if not possible. */
+  kDemand = 3
 };
 
 enum class Metric {
@@ -65,6 +78,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
     return MemoryType::kHostPinned;
   } else if (memory_type == "device") {
     return MemoryType::kDevice;
+  } else if (memory_type == "managed") {
+    return MemoryType::kManaged;
   } else {
     throw std::runtime_error("invalid memory type: '" + memory_type + "'");
   }
@@ -130,7 +145,7 @@ class algo : public algo_base {
 
   virtual void build(const T* dataset, size_t nrow) = 0;
 
-  virtual void set_search_param(const search_param& param) = 0;
+  virtual void set_search_param(const search_param& param, const void* filter_bitset) = 0;
   // TODO(snanditale): this assumes that an algorithm can always return k results.
   // This is not always possible.
   virtual void search(const T* queries,

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
@@ -271,7 +271,8 @@ void bench_search(::benchmark::State& state,
       }
     }
     try {
-      a->set_search_param(*search_param);
+      a->set_search_param(*search_param,
+                          dataset->filter_bitset(current_algo_props->dataset_memory_type));
     } catch (const std::exception& ex) {
       state.SkipWithError("An error occurred setting search parameters: " + std::string(ex.what()));
       return;
@@ -359,13 +360,19 @@ void bench_search(::benchmark::State& state,
   // Each thread calculates recall on their partition of queries.
   // evaluate recall
   if (dataset->max_k() >= k) {
-    const std::int32_t* gt    = dataset->gt_set();
+    const std::int32_t* gt             = dataset->gt_set();
+    const std::uint32_t* filter_bitset = dataset->filter_bitset(MemoryType::kHostMmap);
+    auto filter                        = [filter_bitset](std::int32_t i) -> bool {
+      if (filter_bitset == nullptr) { return true; }
+      auto word = filter_bitset[i >> 5];
+      return word & (1 << (i & 31));
+    };
     const std::uint32_t max_k = dataset->max_k();
     result_buf.transfer_data(MemoryType::kHost, current_algo_props->query_memory_type);
     auto* neighbors_host    = reinterpret_cast<index_type*>(result_buf.data(MemoryType::kHost));
     std::size_t rows        = std::min(queries_processed, query_set_size);
     std::size_t match_count = 0;
-    std::size_t total_count = rows * static_cast<size_t>(k);
+    std::size_t total_count = 0;
 
     // We go through the groundtruth with same stride as the benchmark loop.
     size_t out_offset   = 0;
@@ -375,22 +382,44 @@ void bench_search(::benchmark::State& state,
         size_t i_orig_idx = batch_offset + i;
         size_t i_out_idx  = out_offset + i;
         if (i_out_idx < rows) {
-          for (std::uint32_t j = 0; j < k; j++) {
-            auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
-            for (std::uint32_t l = 0; l < k; l++) {
-              auto exp_idx = gt[i_orig_idx * max_k + l];
+          /* NOTE: recall correctness & filtering
+
+          In the loop below, we filter the ground truth values on-the-fly.
+          We need enough ground truth values to compute recall correctly though.
+          But the ground truth file only contains `max_k` values per row; if there are less valid
+          values than k among them, we overestimate the recall. Essentially, we compare the first
+          `filter_pass_count` values of the algorithm output, and this counter can be less than `k`.
+          In the extreme case of very high filtering rate, we may be bypassing entire rows of
+          results. However, this is still better than no recall estimate at all.
+
+          TODO: consider generating the filtered ground truth on-the-fly
+          */
+          uint32_t filter_pass_count = 0;
+          for (std::uint32_t l = 0; l < max_k && filter_pass_count < k; l++) {
+            auto exp_idx = gt[i_orig_idx * max_k + l];
+            if (!filter(exp_idx)) { continue; }
+            filter_pass_count++;
+            for (std::uint32_t j = 0; j < k; j++) {
+              auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
               if (act_idx == exp_idx) {
                 match_count++;
                 break;
               }
             }
           }
+          total_count += filter_pass_count;
         }
       }
       out_offset += n_queries;
       batch_offset = (batch_offset + queries_stride) % query_set_size;
     }
     double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
+    /* NOTE: recall in the throughput mode & filtering
+
+    When filtering is enabled, `total_count` may vary between individual threads, but we still take
+    the simple average across in-thread recalls. Strictly speaking, this is incorrect, but it's good
+    enough under assumption that the filtering is more-or-less uniform.
+    */
     state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}});
   }
 }
@@ -515,13 +544,15 @@ void dispatch_benchmark(std::string cmdline,
   auto query_file         = combine_path(data_prefix, dataset_conf.query_file);
   auto gt_file            = dataset_conf.groundtruth_neighbors_file;
   if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
-  auto dataset = std::make_shared<bin_dataset<T>>(dataset_conf.name,
-                                                  base_file,
-                                                  dataset_conf.subset_first_row,
-                                                  dataset_conf.subset_size,
-                                                  query_file,
-                                                  dataset_conf.distance,
-                                                  gt_file);
+  auto dataset =
+    std::make_shared<bench::dataset<T>>(dataset_conf.name,
+                                        base_file,
+                                        dataset_conf.subset_first_row,
+                                        dataset_conf.subset_size,
+                                        query_file,
+                                        dataset_conf.distance,
+                                        gt_file,
+                                        search_mode ? dataset_conf.filtering_rate : std::nullopt);
   ::benchmark::AddCustomContext("dataset", dataset_conf.name);
   ::benchmark::AddCustomContext("distance", dataset_conf.distance);
   std::vector<configuration::index> indices = conf.get_indices();