// test_nccl.cc
#include <catch2/catch.hpp>
#include <algorithm>
#include <array>
#include <absl/strings/str_format.h>
#include <cuda_runtime.h>
#include <omp.h>
#include "cuda_helper.h"
#include "lkvs_impl.h"
#include "nccl_communicator.h"

TEMPLATE_TEST_CASE("nccl communicator",
                   "[communicator][nccl]",
                   int8_t,
                   int32_t,
                   int64_t,
                   uint8_t,
                   uint32_t,
                   uint64_t,
                   float,
                   double) {
  int device_count;
  CUDA_CHECK(cudaGetDeviceCount(&device_count));
  if (device_count < 2) {
    WARN("Fewer than 2 CUDA devices. This test may fail.");
  }

  elf::LocalKeyValueStore lkvs;
  // Extra communicators registered under different variable names; they must
  // not interfere with the "var1" communicator created below.
  auto interfere1 = elf::create_nccl_communicator(&lkvs, "other-var-1", 0, 1);
  auto interfere2 = elf::create_nccl_communicator(&lkvs, "other-var-2", 0, 1);

#pragma omp parallel num_threads(2)
  {
    // Each OpenMP thread acts as one rank; with a single device both ranks
    // share device 0.
    CUDA_CHECK(cudaSetDevice(std::min(omp_get_thread_num(), device_count - 1)));
    auto comm = elf::create_nccl_communicator(&lkvs, "var1", omp_get_thread_num(), 2);

    std::array<TestType, 4> H;
    if (omp_get_thread_num() == 0) {
      H = {1, 2, 3, 4};
    } else {
      H = {0, 8, 3, 6};
    }
    gpu_array<TestType, 4> Dsrc, Ddst;
    Dsrc = H;

    // Allreduce sums element-wise across both ranks:
    // {1, 2, 3, 4} + {0, 8, 3, 6} == {1, 10, 6, 10}.
    comm->allreduce(Dsrc.data(), Ddst.data(), 4, elf::Communicator::datatype_of<TestType>());
#pragma omp critical
    { CHECK(Ddst.cpu() == std::array<TestType, 4>{1, 10, 6, 10}); }

    // Broadcast from root rank 0: every rank receives rank 0's source buffer.
    comm->broadcast(Dsrc.data(), Ddst.data(), 0, 4, elf::Communicator::datatype_of<TestType>());
#pragma omp critical
    { CHECK(Ddst.cpu() == std::array<TestType, 4>{1, 2, 3, 4}); }

    // Broadcast from root rank 1: every rank receives rank 1's source buffer.
    comm->broadcast(Dsrc.data(), Ddst.data(), 1, 4, elf::Communicator::datatype_of<TestType>());
#pragma omp critical
    { CHECK(Ddst.cpu() == std::array<TestType, 4>{0, 8, 3, 6}); }
  }
}
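
// For reference, a minimal sketch of the device-buffer helper this test
// assumes from cuda_helper.h: a fixed-size GPU array with host assignment and
// a cpu() readback used by the CHECK comparisons above. The name
// gpu_array_sketch and its exact member set are illustrative assumptions, not
// the real cuda_helper.h interface.
template <typename T, size_t N>
class gpu_array_sketch {
 public:
  gpu_array_sketch() { CUDA_CHECK(cudaMalloc(&ptr_, N * sizeof(T))); }
  ~gpu_array_sketch() { cudaFree(ptr_); }
  gpu_array_sketch(const gpu_array_sketch&) = delete;
  gpu_array_sketch& operator=(const gpu_array_sketch&) = delete;

  // Copy host contents into the device buffer (Dsrc = H above).
  gpu_array_sketch& operator=(const std::array<T, N>& host) {
    CUDA_CHECK(cudaMemcpy(ptr_, host.data(), N * sizeof(T), cudaMemcpyHostToDevice));
    return *this;
  }

  // Read the device buffer back to the host for comparison.
  std::array<T, N> cpu() const {
    std::array<T, N> host;
    CUDA_CHECK(cudaMemcpy(host.data(), ptr_, N * sizeof(T), cudaMemcpyDeviceToHost));
    return host;
  }

  T* data() { return ptr_; }

 private:
  T* ptr_ = nullptr;
};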