Comparing changes

Choose two branches to see what’s changed or to start a new pull request.

base repository: baidu-research/DeepBench
base: master
head repository: WilliamTambellini/DeepBench
compare: master
Can’t automatically merge. Don’t worry, you can still create the pull request.
  • 1 commit
  • 1 file changed
  • 1 contributor

Commits on Dec 21, 2018

  1. Commit 3be22c9
Showing with 28 additions and 11 deletions.
  1. +28 −11 code/nvidia/gemm_bench.cu
39 changes: 28 additions & 11 deletions code/nvidia/gemm_bench.cu
@@ -9,6 +9,7 @@
 #include <sstream>
 
 #include <cuda.h>
+#include <cuda_fp16.h>
 #include <cublas_v2.h>
 #include <curand.h>
 
@@ -54,7 +55,7 @@ Supported precision types:
 For Maxwell GPUS:
 float for training and inference
-For Pascal GPUS:
+For Pascal/Volta GPUS:
 float, half for training
 float, half, int8 for inference
@@ -85,11 +86,22 @@ int time_gemm(Tensor<T1> A, Tensor<T1> B, Tensor<T2> C, bool a_t, bool b_t, cubl
     cudaDataType_t compute_type = CUDA_R_32F;
     cublasGemmAlgo_t algo;
 
-    if (std::is_same<T1, uint16_t>::value) {
+    if (std::is_same<T1, __half>::value) {
         A_type = CUDA_R_16F;
         B_type = CUDA_R_16F;
-        C_type = CUDA_R_16F;
-        compute_type = CUDA_R_16F;
     }
+
+    if (std::is_same<T2, float>::value) {
+        C_type = CUDA_R_32F;
+        compute_type = CUDA_R_32F;
+    } else if (std::is_same<T2, __half>::value) {
+        C_type = CUDA_R_16F;
+        compute_type = CUDA_R_16F;
+    } else if (std::is_same<T2, int>::value) {
+        compute_type = CUDA_R_32I;
+    } else {
+        std::cerr << "Unsuported T2 (output) type" << std::endl;
+        exit(1);
+    }
 
     if (std::is_same<T1, uint8_t>::value) {
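Review note: the A_type/B_type/C_type/compute_type values selected above are the ones time_gemm hands to cublasGemmEx further down in this file. A minimal sketch of what the new split selection boils down to for the "mixed" case (T1 = __half, T2 = float); the helper name, scalars, and algo choice are illustrative rather than copied from the file, and the signature is the pre-CUDA-11 cublasGemmEx, where computeType is still a cudaDataType_t:

```cpp
#include <cuda_fp16.h>
#include <cublas_v2.h>

// Hypothetical helper (not in this file): the cublasGemmEx call that the
// type selection above produces for "mixed", i.e. fp16 A/B inputs with an
// fp32 C and fp32 accumulation.
cublasStatus_t gemm_ex_mixed(cublasHandle_t handle, bool a_t, bool b_t,
                             int m, int n, int k,
                             const __half *A, int lda,
                             const __half *B, int ldb,
                             float *C, int ldc) {
    // alpha/beta must match the compute type, hence fp32 scalars here.
    float alpha = 1.0f, beta = 0.0f;
    return cublasGemmEx(handle,
                        a_t ? CUBLAS_OP_T : CUBLAS_OP_N,
                        b_t ? CUBLAS_OP_T : CUBLAS_OP_N,
                        m, n, k,
                        &alpha,
                        A, CUDA_R_16F, lda,  // A_type
                        B, CUDA_R_16F, ldb,  // B_type
                        &beta,
                        C, CUDA_R_32F, ldc,  // C_type
                        CUDA_R_32F,          // compute_type
                        CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}
```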
@@ -219,8 +231,7 @@ int main(int argc, char **argv) {
 
     if (status != CUBLAS_STATUS_SUCCESS) {
         std::cout << "CUBLAS math mode failed" << std::endl;
-    }
-
+    } else std::cout << "CUBALS_TENSOR_OP_MATH ON" << std::endl;
 
 
     curandGenerator_t curand_gen;
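Review note: the new else branch only logs (and the added string spells the mode "CUBALS_…"; the actual cuBLAS enum is CUBLAS_TENSOR_OP_MATH). For reference, the mode being reported is enabled via cublasSetMathMode; a minimal sketch of the enable-and-report pattern, assuming a CUDA 9/10-era cuBLAS where that enum exists:

```cpp
#include <cublas_v2.h>
#include <iostream>

// Minimal sketch: turn on Tensor Core ("tensor op") math for a cuBLAS
// handle and report the outcome, mirroring the check in the hunk above.
void enable_tensor_op_math(cublasHandle_t handle) {
    cublasStatus_t status = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
    if (status != CUBLAS_STATUS_SUCCESS)
        std::cout << "CUBLAS math mode failed" << std::endl;
    else
        std::cout << "CUBLAS_TENSOR_OP_MATH ON" << std::endl;
}
```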
@@ -290,18 +301,24 @@ int main(int argc, char **argv) {
         if (!skip_kernel)
             time_ms = time_gemm<uint8_t, int>(a, b, c, a_t, b_t, cublas_handle);
     } else if (precision == "half") {
-        auto a = rand<uint16_t>({a_t ? k : m, a_t ? m : k}, curand_gen);
-        auto b = rand<uint16_t>({b_t ? n : k, b_t ? k : n}, curand_gen);
-        auto c = zeros<uint16_t>({m, n});
+        auto a = rand<__half>({a_t ? k : m, a_t ? m : k}, curand_gen);
+        auto b = rand<__half>({b_t ? n : k, b_t ? k : n}, curand_gen);
+        auto c = zeros<__half>({m, n});
         std::cout << std::setw(13) << precision;
-        time_ms = time_gemm<uint16_t, uint16_t>(a, b, c, a_t, b_t, cublas_handle);
+        time_ms = time_gemm<__half, __half>(a, b, c, a_t, b_t, cublas_handle);
     } else if (precision == "float") {
         auto a = rand<float>({a_t ? k : m, a_t ? m : k}, curand_gen);
         auto b = rand<float>({b_t ? n : k, b_t ? k : n}, curand_gen);
         auto c = zeros<float>({m, n});
         std::cout << std::setw(13) << precision;
         time_ms = time_gemm<float, float>(a, b, c, a_t, b_t, cublas_handle);
-    } else {
+    } else if (precision == "mixed") { // f16 x f16 to f32
+        auto a = rand<__half>({a_t ? k : m, a_t ? m : k}, curand_gen);
+        auto b = rand<__half>({b_t ? n : k, b_t ? k : n}, curand_gen);
+        auto c = zeros<float>({m, n});
+        std::cout << std::setw(13) << precision;
+        time_ms = time_gemm<__half, float>(a, b, c, a_t, b_t, cublas_handle);
+    } else {
         throw std::runtime_error(ss.str());
     }
 #else
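Review note: after this change the precision dispatch accepts four keywords: "int8", "half", "float", and the new "mixed" (fp16 inputs with an fp32 output). A hypothetical invocation would be something like `gemm_bench train mixed`; the exact argument order follows the benchmark's existing CLI parsing, which is outside this hunk.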