In-place shift of uint vectors corrupts s1 and further components #358

Open

proski opened this issue Jan 8, 2025 · 1 comment

Comments

proski commented Jan 8, 2025

This is a duplicate of intel/compute-runtime#790 - moved here as there is evidence that the issue is compiler related.

The issue can be reproduced with Intel Compute Runtime versions from 23.22.26516.18 to 24.45.31740.9 (inclusive on both ends). Versions 23.17.26241.22 and older are not affected. Version 24.48.31907.7 (currently the latest release) is not affected either. Even though the latest release is not affected, I'd like someone to take a closer look, as the fix might be accidental.

Following is an improved version of the demo I posted in the original ticket.

#define CL_TARGET_OPENCL_VERSION 300

#include <CL/opencl.h>
#include <stdio.h>
#include <stdlib.h>

#define VECTOR_SIZE 4
#define INPUT_SIZE (3 * VECTOR_SIZE)

#define CODE(...) #__VA_ARGS__

const char *KernelSource = CODE(

    __kernel void shift_test(__global uint *input, __global uint *output) {
      uint4 d0 = vload4(0, input);
      uint4 d1 = vload4(1, input);
      uint4 d2 = vload4(2, input);
      d0 = (d0 << 1) | (d1 >> 31);
      d1 = (d1 << 1) | (d2 >> 31);
      d2 = d2 << 1;

      int i = get_global_id(0);
      if (i == 0) {
        vstore4(d0, 0, output);
        vstore4(d1, 1, output);
        vstore4(d2, 2, output);
      }
    }

);

int main(int argc, char **argv) {
  unsigned int data[INPUT_SIZE] = {10, 10, 10, 10, 11, 11,
                                   11, 11, 12, 12, 12, 12};
  unsigned int results[INPUT_SIZE];

  cl_platform_id platform;
  int err = clGetPlatformIDs(1, &platform, NULL);
  if (err < 0) {
    printf("Error: Couldn't identify a platform! %d\n", err);
    exit(1);
  }

  cl_device_id device_id;
  int gpu = 1;
  err = clGetDeviceIDs(platform, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to create a device group!\n");
    return EXIT_FAILURE;
  }

  cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context) {
    printf("Error: Failed to create a compute context!\n");
    return EXIT_FAILURE;
  }

  cl_command_queue commands =
      clCreateCommandQueueWithProperties(context, device_id, 0, &err);
  if (!commands) {
    printf("Error: Failed to create a command queue!\n");
    return EXIT_FAILURE;
  }

  cl_program program = clCreateProgramWithSource(
      context, 1, (const char **)&KernelSource, NULL, &err);
  if (!program) {
    printf("Error: Failed to create compute program!\n");
    return EXIT_FAILURE;
  }

  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    size_t len;
    char *buffer;

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
                          &len);
    buffer = malloc(len);
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, buffer,
                          NULL);
    printf("%s\n", buffer);
    free(buffer);
    exit(1);
  }

  cl_kernel kernel = clCreateKernel(program, "shift_test", &err);
  if (!kernel || err != CL_SUCCESS) {
    printf("Error: Failed to create compute kernel!\n");
    exit(1);
  }

  cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                sizeof(int) * INPUT_SIZE, NULL, NULL);
  cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(int) * INPUT_SIZE, NULL, NULL);
  if (!input || !output) {
    printf("Error: Failed to allocate device memory!\n");
    exit(1);
  }

  err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0,
                             sizeof(int) * INPUT_SIZE, data, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to write to source array!\n");
    exit(1);
  }

  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    exit(1);
  }

  size_t local;
  err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                                 sizeof(local), &local, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to retrieve kernel work group info! %d\n", err);
    exit(1);
  }

  size_t global = local;
  err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0,
                               NULL, NULL);
  if (err) {
    printf("Error: Failed to execute kernel!\n");
    return EXIT_FAILURE;
  }

  clFinish(commands);

  err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0,
                            sizeof(int) * INPUT_SIZE, results, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
  }

  for (int c = 0; c < VECTOR_SIZE; c++) {
    printf("result.s%d = %u:%u:%u\n", c, results[0 * VECTOR_SIZE + c],
           results[1 * VECTOR_SIZE + c], results[2 * VECTOR_SIZE + c]);
  }

  clReleaseMemObject(input);
  clReleaseMemObject(output);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

  return 0;
}

Three uint4 vectors (d0, d1 and d2) are loaded with identical component values 10, 11 and 12 respectively. Each vector is then shifted left by 1 bit in place, and the top bit of the next vector (which is always 0 for these inputs) is ORed into the lowest bit.

The expected result is that the values are multiplied by 2:

result.s0 = 20:22:24
result.s1 = 20:22:24
result.s2 = 20:22:24
result.s3 = 20:22:24

The actual output shows corruption of s1 and further components of the vectors that received the top bit from another vector.

result.s0 = 20:22:24
result.s1 = 0:0:24
result.s2 = 0:0:24
result.s3 = 0:0:24

The issue was originally observed in mfakto: primesearch/mfakto#15

mfakto saves the compiled kernel binary and reuses it as long as the configuration remains the same. If mfakto is run and another version of Intel Compute Runtime is then installed, mfakto's behavior doesn't change: the behavior (buggy or correct) is baked into the compiled binary. That makes me think the issue is in the compiler.

proski (Author) commented Jan 8, 2025

I assumed that ulong is not affected, but it looks like it is also affected if a shift by 63 bits is used. Moreover, I see that s0 is affected as well. The output of the attached program is

result.s0 = 0:22:24
result.s1 = 0:0:24
result.s2 = 0:0:24
result.s3 = 0:22:24

demo-ulong.c.txt

proski added a commit to proski/mfakto that referenced this issue Jan 9, 2025
The fallback implementation of amd_bitalign() triggers a bug with Intel Compute
Runtime (NEO) versions from 23.22.26516.18 to 24.45.31740.9 inclusive.

intel/intel-graphics-compiler#358

The bug affects all but the first component of the vectors, so the self-tests
would pass with VectorSize=1. For higher values of VectorSize, including the
default VectorSize=2, approximately half of the self-tests fail, all in
barrett32 kernels.

Add generic_bitalign() that is always implemented using shifts. Use it in all
cases when the destination is the same as one of the sources.

If Intel Compute Runtime is detected, use 64-bit shifts in generic_bitalign().
For other platforms, keep using 32-bit shifts.

Make amd_bitalign() an alias to generic_bitalign() on systems where
amd_bitalign() is not available. That way, it would also expand to 64-bit
shifts for Intel Compute Runtime.
proski added a commit to proski/mfakto that referenced this issue Jan 9, 2025 (same commit message as above)
proski added a commit to proski/mfakto that referenced this issue Jan 11, 2025 (same commit message as above)