Skip to content

Commit

Permalink
Merge branch 'branch-25.02' into go
Browse files Browse the repository at this point in the history
  • Loading branch information
benfred committed Feb 3, 2025
2 parents a656870 + 2b7dce6 commit b70d6e2
Show file tree
Hide file tree
Showing 176 changed files with 30,049 additions and 1,495 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
"context": "${localWorkspaceFolder}/.devcontainer",
"dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
"args": {
"CUDA": "12.5",
"CUDA": "12.8",
"PYTHON_PACKAGE_MANAGER": "conda",
"BASE": "rapidsai/devcontainers:25.02-cpp-mambaforge-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-conda"
],
"hostRequirements": {"gpu": "optional"},
"features": {
Expand All @@ -20,7 +20,7 @@
"overrideFeatureInstallOrder": [
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
],
"initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
"initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.8-envs}"],
"postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
"workspaceFolder": "/home/coder",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent",
Expand All @@ -29,7 +29,7 @@
"source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
"source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.8-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
"context": "${localWorkspaceFolder}/.devcontainer",
"dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
"args": {
"CUDA": "12.5",
"CUDA": "12.8",
"PYTHON_PACKAGE_MANAGER": "pip",
"BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.5-ucx1.18.0-openmpi-ubuntu22.04"
"BASE": "rapidsai/devcontainers:25.02-cpp-cuda12.8-ucx1.18.0-openmpi-ubuntu22.04"
}
},
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.5-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-25.02-cuda12.8-pip"
],
"hostRequirements": {"gpu": "optional"},
"features": {
"ghcr.io/rapidsai/devcontainers/features/cuda:25.2": {
"version": "12.5",
"version": "12.8",
"installcuBLAS": true,
"installcuSOLVER": true,
"installcuRAND": true,
Expand All @@ -29,15 +29,15 @@
"ghcr.io/rapidsai/devcontainers/features/cuda",
"ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
],
"initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"],
"initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs}"],
"postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
"workspaceFolder": "/home/coder",
"workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cuvs,type=bind,consistency=consistent",
"mounts": [
"source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
"source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
"source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.8-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
],
"customizations": {
"vscode": {
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ jobs:
build_type: pull-request
node_type: "gpu-v100-latest-1"
arch: "amd64"
container_image: "rapidsai/ci-conda:latest"
container_image: "rapidsai/ci-conda:cuda12.8.0-ubuntu24.04-py3.12"
run_script: "ci/build_docs.sh"
rust-build:
needs: conda-cpp-build
Expand Down Expand Up @@ -180,7 +180,7 @@ jobs:
uses: rapidsai/shared-workflows/.github/workflows/[email protected]
with:
arch: '["amd64"]'
cuda: '["12.5"]'
cuda: '["12.8"]'
build_command: |
sccache -z;
build-all --verbose;
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,6 @@ ivf_pq_index
# cuvs_bench
datasets/
/*.json

# java
.classpath
16 changes: 14 additions & 2 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ ARGS=$*
# scripts, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean libcuvs python rust go docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-mg --no-cpu --cpu-only --no-shared-libs --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
VALIDARGS="clean libcuvs python rust go java docs tests bench-ann examples --uninstall -v -g -n --compile-static-lib --allgpuarch --no-mg --no-cpu --cpu-only --no-shared-libs --no-nvtx --show_depr_warn --incl-cache-stats --time -h"
HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<tool>] [--limit-tests=<targets>] [--limit-bench-ann=<targets>] [--build-metrics=<filename>]
where <target> is:
clean - remove all existing build artifacts and configuration (start over)
Expand All @@ -27,6 +27,7 @@ HELP="$0 [<target> ...] [<flag> ...] [--cmake-args=\"<args>\"] [--cache-tool=<to
python - build the cuvs Python package
rust - build the cuvs Rust bindings
go - build the cuvs Go bindings
java - build the cuvs Java bindings
docs - build the documentation
tests - build the tests
bench-ann - build end-to-end ann benchmarks
Expand Down Expand Up @@ -62,7 +63,8 @@ SPHINX_BUILD_DIR=${REPODIR}/docs
DOXYGEN_BUILD_DIR=${REPODIR}/cpp/doxygen
PYTHON_BUILD_DIR=${REPODIR}/python/cuvs/_skbuild
RUST_BUILD_DIR=${REPODIR}/rust/target
BUILD_DIRS="${LIBCUVS_BUILD_DIR} ${PYTHON_BUILD_DIR} ${RUST_BUILD_DIR}"
JAVA_BUILD_DIR=${REPODIR}/java/cuvs-java/target
BUILD_DIRS="${LIBCUVS_BUILD_DIR} ${PYTHON_BUILD_DIR} ${RUST_BUILD_DIR} ${JAVA_BUILD_DIR}"

# Set defaults for vars modified by flags to this script
CMAKE_LOG_LEVEL=""
Expand Down Expand Up @@ -446,11 +448,21 @@ if (( ${NUMARGS} == 0 )) || hasArg rust; then
cargo test
fi

# Build the cuvs Go bindings
# Runs when no target was given on the command line or when 'go' was requested.
if (( ${NUMARGS} == 0 )) || hasArg go; then
    cd ${REPODIR}/go
    go build ./...
    go test ./...
fi

# Build the cuvs Java bindings
# Runs when no target was given on the command line or when 'java' was requested.
if (( ${NUMARGS} == 0 )) || hasArg java; then
    # Reminder only: the build proceeds either way. Presumably the Java build
    # needs libcuvs libraries that were built beforehand.
    if ! hasArg libcuvs; then
        echo "Please add 'libcuvs' to this script's arguments (ex. './build.sh libcuvs java') if libcuvs libraries are not already built"
    fi
    cd ${REPODIR}/java
    ./build.sh
fi

export RAPIDS_VERSION="$(sed -E -e 's/^([0-9]{2})\.([0-9]{2})\.([0-9]{2}).*$/\1.\2.\3/' "${REPODIR}/VERSION")"
Expand Down
8 changes: 8 additions & 0 deletions ci/release/update-version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ NEXT_UCXX_SHORT_TAG="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG
# Need to distutils-normalize the original version
NEXT_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_SHORT_TAG}'))")
NEXT_UCXX_SHORT_TAG_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_UCXX_SHORT_TAG}'))")
PATCH_PEP440=$(python -c "from packaging.version import Version; print(Version('${NEXT_PATCH}'))")

echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"

Expand Down Expand Up @@ -96,3 +97,10 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r
sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
sed_runner "s@rapids-\${localWorkspaceFolderBasename}-${CURRENT_SHORT_TAG}@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
done

# Update Java API version
# The Java version string is "<NEXT_SHORT_TAG>.<PATCH_PEP440>", where PATCH_PEP440 is
# NEXT_PATCH after being passed through packaging.version.Version.
NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}"
# Rewrite every VERSION="..." assignment in the Java build script.
sed_runner "s/VERSION=\".*\"/VERSION=\"${NEXT_FULL_JAVA_TAG}\"/g" java/build.sh
# In each java/*/pom.xml, replace whatever sits between the CUVS_JAVA#VERSION_UPDATE markers
# with a fresh <version> element. The address regex spans both markers, so they must be on
# the same line for a substitution to happen (assumes sed_runner wraps plain sed -- TODO confirm).
for FILE in java/*/pom.xml; do
sed_runner "/<!--CUVS_JAVA#VERSION_UPDATE_MARKER_START-->.*<!--CUVS_JAVA#VERSION_UPDATE_MARKER_END-->/s//<!--CUVS_JAVA#VERSION_UPDATE_MARKER_START--><version>${NEXT_FULL_JAVA_TAG}<\/version><!--CUVS_JAVA#VERSION_UPDATE_MARKER_END-->/g" "${FILE}"
done
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ dependencies:
- cuda-nvtx-dev
- cuda-profiler-api
- cuda-python>=12.6.2,<13.0a0
- cuda-version=12.5
- cuda-version=12.8
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dlpack>=0.8,<1.0
- doxygen>=1.8.20
- go
- gcc_linux-aarch64=13.*
- go
- graphviz
- ipython
- libclang==16.0.6
Expand Down Expand Up @@ -54,4 +54,4 @@ dependencies:
- sysroot_linux-aarch64==2.28
- pip:
- nvidia-sphinx-theme
name: all_cuda-125_arch-aarch64
name: all_cuda-128_arch-aarch64
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@ dependencies:
- cuda-nvtx-dev
- cuda-profiler-api
- cuda-python>=12.6.2,<13.0a0
- cuda-version=12.5
- cuda-version=12.8
- cupy>=12.0.0
- cxx-compiler
- cython>=3.0.0
- dlpack>=0.8,<1.0
- doxygen>=1.8.20
- go
- gcc_linux-64=13.*
- go
- graphviz
- ipython
- libclang==16.0.6
Expand Down Expand Up @@ -54,4 +54,4 @@ dependencies:
- sysroot_linux-64==2.28
- pip:
- nvidia-sphinx-theme
name: all_cuda-125_arch-x86_64
name: all_cuda-128_arch-x86_64
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies:
- cuda-nvtx-dev
- cuda-profiler-api
- cuda-python>=12.6.2,<13.0a0
- cuda-version=12.5
- cuda-version=12.8
- cupy>=12.0.0
- cuvs==25.2.*,>=0.0.0a0
- cxx-compiler
Expand Down Expand Up @@ -46,4 +46,4 @@ dependencies:
- setuptools
- sysroot_linux-aarch64==2.28
- wheel
name: bench_ann_cuda-125_arch-aarch64
name: bench_ann_cuda-128_arch-aarch64
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies:
- cuda-nvtx-dev
- cuda-profiler-api
- cuda-python>=12.6.2,<13.0a0
- cuda-version=12.5
- cuda-version=12.8
- cupy>=12.0.0
- cuvs==25.2.*,>=0.0.0a0
- cxx-compiler
Expand Down Expand Up @@ -46,4 +46,4 @@ dependencies:
- setuptools
- sysroot_linux-64==2.28
- wheel
name: bench_ann_cuda-125_arch-x86_64
name: bench_ann_cuda-128_arch-x86_64
3 changes: 3 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,9 @@ include(cmake/modules/ConfigureCUDA.cmake)
rapids_cpm_init()

# Third-party dependencies that are only pulled in when BUILD_CPU_ONLY is not set.
# The order of the includes below matters: CCCL first, then raft, then cutlass.
if(NOT BUILD_CPU_ONLY)
# We must find CCCL ourselves before raft so that we get the right version.
include(${rapids-cmake-dir}/cpm/cccl.cmake)
# CCCL is attached to the cuvs-exports set for both the build and the install exports.
rapids_cpm_cccl(BUILD_EXPORT_SET cuvs-exports INSTALL_EXPORT_SET cuvs-exports)
include(cmake/thirdparty/get_raft.cmake)
include(cmake/thirdparty/get_cutlass.cmake)
endif()
Expand Down
17 changes: 16 additions & 1 deletion cpp/bench/ann/src/common/ann_types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,19 @@ enum class MemoryType {
kHostMmap,
kHostPinned,
kDevice,
kManaged,
};

/**
 * Request 2MB huge pages support for an allocation.
 *
 * The enumerators are ordered from the weakest request to the strongest one.
 */
enum class HugePages {
  kDisable = 0,  ///< Don't use huge pages if possible.
  kAsk     = 1,  ///< Enable huge pages if possible, ignore otherwise.
  kRequire = 2,  ///< Enable huge pages if possible, warn the user otherwise.
  kDemand  = 3   ///< Force enable huge pages, throw an exception if not possible.
};

enum class Metric {
Expand Down Expand Up @@ -65,6 +78,8 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
return MemoryType::kHostPinned;
} else if (memory_type == "device") {
return MemoryType::kDevice;
} else if (memory_type == "managed") {
return MemoryType::kManaged;
} else {
throw std::runtime_error("invalid memory type: '" + memory_type + "'");
}
Expand Down Expand Up @@ -130,7 +145,7 @@ class algo : public algo_base {

virtual void build(const T* dataset, size_t nrow) = 0;

virtual void set_search_param(const search_param& param) = 0;
virtual void set_search_param(const search_param& param, const void* filter_bitset) = 0;
// TODO(snanditale): this assumes that an algorithm can always return k results.
// This is not always possible.
virtual void search(const T* queries,
Expand Down
59 changes: 45 additions & 14 deletions cpp/bench/ann/src/common/benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -271,7 +271,8 @@ void bench_search(::benchmark::State& state,
}
}
try {
a->set_search_param(*search_param);
a->set_search_param(*search_param,
dataset->filter_bitset(current_algo_props->dataset_memory_type));
} catch (const std::exception& ex) {
state.SkipWithError("An error occurred setting search parameters: " + std::string(ex.what()));
return;
Expand Down Expand Up @@ -359,13 +360,19 @@ void bench_search(::benchmark::State& state,
// Each thread calculates recall on their partition of queries.
// evaluate recall
if (dataset->max_k() >= k) {
const std::int32_t* gt = dataset->gt_set();
const std::int32_t* gt = dataset->gt_set();
const std::uint32_t* filter_bitset = dataset->filter_bitset(MemoryType::kHostMmap);
// Predicate telling whether index `i` passes the filter: the bitset is read as packed
// 32-bit words, with bit (i & 31) of word (i >> 5) belonging to index `i`.
auto filter = [filter_bitset](std::int32_t i) -> bool {
  if (filter_bitset != nullptr) {
    const auto bit_pos = i & 31;
    return ((filter_bitset[i >> 5] >> bit_pos) & 1u) != 0u;
  }
  // A null bitset lets every index through.
  return true;
};
const std::uint32_t max_k = dataset->max_k();
result_buf.transfer_data(MemoryType::kHost, current_algo_props->query_memory_type);
auto* neighbors_host = reinterpret_cast<index_type*>(result_buf.data(MemoryType::kHost));
std::size_t rows = std::min(queries_processed, query_set_size);
std::size_t match_count = 0;
std::size_t total_count = rows * static_cast<size_t>(k);
std::size_t total_count = 0;

// We go through the groundtruth with same stride as the benchmark loop.
size_t out_offset = 0;
Expand All @@ -375,22 +382,44 @@ void bench_search(::benchmark::State& state,
size_t i_orig_idx = batch_offset + i;
size_t i_out_idx = out_offset + i;
if (i_out_idx < rows) {
for (std::uint32_t j = 0; j < k; j++) {
auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
for (std::uint32_t l = 0; l < k; l++) {
auto exp_idx = gt[i_orig_idx * max_k + l];
/* NOTE: recall correctness & filtering
In the loop below, we filter the ground truth values on-the-fly.
We need enough ground truth values to compute recall correctly though.
But the ground truth file only contains `max_k` values per row; if there are fewer valid
values than k among them, we overestimate the recall. Essentially, we compare the first
`filter_pass_count` values of the algorithm output, and this counter can be less than `k`.
In the extreme case of a very high filtering rate, we may be bypassing entire rows of
results. However, this is still better than no recall estimate at all.
TODO: consider generating the filtered ground truth on-the-fly
*/
uint32_t filter_pass_count = 0;
for (std::uint32_t l = 0; l < max_k && filter_pass_count < k; l++) {
auto exp_idx = gt[i_orig_idx * max_k + l];
if (!filter(exp_idx)) { continue; }
filter_pass_count++;
for (std::uint32_t j = 0; j < k; j++) {
auto act_idx = static_cast<std::int32_t>(neighbors_host[i_out_idx * k + j]);
if (act_idx == exp_idx) {
match_count++;
break;
}
}
}
total_count += filter_pass_count;
}
}
out_offset += n_queries;
batch_offset = (batch_offset + queries_stride) % query_set_size;
}
double actual_recall = static_cast<double>(match_count) / static_cast<double>(total_count);
/* NOTE: recall in the throughput mode & filtering
When filtering is enabled, `total_count` may vary between individual threads, but we still take
the simple average of the per-thread recalls. Strictly speaking, this is incorrect, but it's good
enough under the assumption that the filtering is more-or-less uniform.
*/
state.counters.insert({"Recall", {actual_recall, benchmark::Counter::kAvgThreads}});
}
}
Expand Down Expand Up @@ -515,13 +544,15 @@ void dispatch_benchmark(std::string cmdline,
auto query_file = combine_path(data_prefix, dataset_conf.query_file);
auto gt_file = dataset_conf.groundtruth_neighbors_file;
if (gt_file.has_value()) { gt_file.emplace(combine_path(data_prefix, gt_file.value())); }
auto dataset = std::make_shared<bin_dataset<T>>(dataset_conf.name,
base_file,
dataset_conf.subset_first_row,
dataset_conf.subset_size,
query_file,
dataset_conf.distance,
gt_file);
auto dataset =
std::make_shared<bench::dataset<T>>(dataset_conf.name,
base_file,
dataset_conf.subset_first_row,
dataset_conf.subset_size,
query_file,
dataset_conf.distance,
gt_file,
search_mode ? dataset_conf.filtering_rate : std::nullopt);
::benchmark::AddCustomContext("dataset", dataset_conf.name);
::benchmark::AddCustomContext("distance", dataset_conf.distance);
std::vector<configuration::index> indices = conf.get_indices();
Expand Down
Loading

0 comments on commit b70d6e2

Please sign in to comment.