Fix support for Intel Compute Runtime with VectorSize > 1
The fallback implementation of amd_bitalign() triggers a bug with Intel Compute
Runtime (NEO) versions from 23.22.26516.18 to 24.45.31740.9 inclusive.

intel/intel-graphics-compiler#358

The bug affects all but the first component of the vectors, so the self-tests
would pass with VectorSize=1. For higher values of VectorSize, including the
default VectorSize=2, approximately half of the self-tests fail, all in
barrett32 kernels.

Add generic_bitalign(), which is always implemented using shifts. Use it in all
cases where the destination is the same as one of the sources.

If Intel Compute Runtime is detected, use 64-bit shifts in generic_bitalign().
For other platforms, keep using 32-bit shifts.

Make amd_bitalign() an alias for generic_bitalign() on systems where
amd_bitalign() is not available. That way, it also expands to 64-bit shifts for
Intel Compute Runtime.
proski committed Jan 11, 2025
1 parent b7da6f8 commit c8dc049
Showing 2 changed files with 46 additions and 28 deletions.
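For quick reference, the new helper added in src/common.cl below boils down to the following scalar C model. This is a sketch only: the real generic_bitalign() operates on the uint_v vector type through the CONVERT_* macros and is selected via cl_intel_subgroups, and the _32/_64 names here are made up for illustration.

#include <stdint.h>

/* Other platforms: two 32-bit shifts, the same arithmetic as the old fallback macro. */
static uint32_t generic_bitalign_32(uint32_t high, uint32_t low, int shift)
{
    return (high << (32 - shift)) | (low >> shift);
}

/* Intel Compute Runtime: one shift of the concatenated 64-bit value, which avoids the
 * NEO/IGC bug (intel/intel-graphics-compiler#358) triggered by the two-shift form. */
static uint32_t generic_bitalign_64(uint32_t high, uint32_t low, int shift)
{
    return (uint32_t)((((uint64_t)high << 32) | low) >> shift);
}

For 0 < shift < 32 both variants return the same value; only how it is computed differs.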
src/barrett.cl: 54 changes (27 additions, 27 deletions)
@@ -253,27 +253,27 @@ Adding x*x to a few carries will not cascade the carry

 void shl_96(int96_v * const a)
 /* shiftleft a one bit */
-{ /* here, bitalign improves the 92-bit kernel, and slows down 76-bit */
+{ /* here, amd_bitalign improves the 92-bit kernel, and slows down 76-bit */
 a->d2 = amd_bitalign(a->d2, a->d1, 31);
 a->d1 = amd_bitalign(a->d1, a->d0, 31);
-// a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-// a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+// a->d2 = generic_bitalign(a->d2, a->d1, 31);
+// a->d1 = generic_bitalign(a->d1, a->d0, 31);
 a->d0 = a->d0 << 1;
 }

 void shl_192(int192_v * const a)
 /* shiftleft a one bit */
-{ /* in this function, bitalign slows down all kernels */
+{ /* in this function, amd_bitalign slows down all kernels */
 // a->d5 = amd_bitalign(a->d5, a->d4, 31);
 // a->d4 = amd_bitalign(a->d4, a->d3, 31);
 // a->d3 = amd_bitalign(a->d3, a->d2, 31);
 // a->d2 = amd_bitalign(a->d2, a->d1, 31);
 // a->d1 = amd_bitalign(a->d1, a->d0, 31);
-a->d5 = (a->d5 << 1) | (a->d4 >> 31);
-a->d4 = (a->d4 << 1) | (a->d3 >> 31);
-a->d3 = (a->d3 << 1) | (a->d2 >> 31);
-a->d2 = (a->d2 << 1) | (a->d1 >> 31);
-a->d1 = (a->d1 << 1) | (a->d0 >> 31);
+a->d5 = generic_bitalign(a->d5, a->d4, 31);
+a->d4 = generic_bitalign(a->d4, a->d3, 31);
+a->d3 = generic_bitalign(a->d3, a->d2, 31);
+a->d2 = generic_bitalign(a->d2, a->d1, 31);
+a->d1 = generic_bitalign(a->d1, a->d0, 31);
 a->d0 = a->d0 << 1;
 }

@@ -442,12 +442,12 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const

 // shiftleft nn 11 bits
 #ifndef DIV_160_96
-nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
 nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
 nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-// nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-// nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+// nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
 nn.d0 = nn.d0 << 11;

 // q = q - nn
@@ -510,11 +510,11 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 nn.d4 = nn.d3 >> 9;
 #endif
 // nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
 nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-// nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
 // nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
 nn.d0 = nn.d0 << 23;

 // q = q - nn
@@ -642,9 +642,9 @@ void div_192_96(int96_v * const res, __private uint qd5, const int96_v n, const
 #ifdef CHECKS_MODBASECASE
 nn.d4 = nn.d3 >> 17;
 #endif
-nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
 nn.d0 = nn.d0 << 15;

 // q = q - nn
@@ -877,12 +877,12 @@ DIV_160_96 here. */

 // shiftleft nn 11 bits
 #ifndef DIV_160_96
-nn.d3 = (nn.d3 << 11) + (nn.d2 >> 21);
+nn.d3 = generic_bitalign(nn.d3, nn.d2, 21);
 #endif
 nn.d2 = amd_bitalign(nn.d2, nn.d1, 21);
 nn.d1 = amd_bitalign(nn.d1, nn.d0, 21);
-// nn.d2 = (nn.d2 << 11) + (nn.d1 >> 21);
-// nn.d1 = (nn.d1 << 11) + (nn.d0 >> 21);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 21);
+// nn.d1 = generic_bitalign(nn.d1, nn.d0, 21);
 nn.d0 = nn.d0 << 11;

 // q = q - nn
@@ -945,11 +945,11 @@ DIV_160_96 here. */
 nn.d4 = nn.d3 >> 9;
 #endif
 // nn.d3 = amd_bitalign(nn.d3, nn.d2, 9);
-nn.d3 = (nn.d3 << 23) + (nn.d2 >> 9);
+nn.d3 = generic_bitalign(nn.d3, nn.d2, 9);
 nn.d2 = amd_bitalign(nn.d2, nn.d1, 9);
-// nn.d2 = (nn.d2 << 23) + (nn.d1 >> 9);
+// nn.d2 = generic_bitalign(nn.d2, nn.d1, 9);
 // nn.d1 = amd_bitalign(nn.d1, nn.d0, 9);
-nn.d1 = (nn.d1 << 23) + (nn.d0 >> 9);
+nn.d1 = generic_bitalign(nn.d1, nn.d0, 9);
 nn.d0 = nn.d0 << 23;

 // q = q - nn
@@ -1077,9 +1077,9 @@ DIV_160_96 here. */
 #ifdef CHECKS_MODBASECASE
 nn.d4 = nn.d3 >> 17;
 #endif
-nn.d3 = (nn.d3 << 15) + (nn.d2 >> 17);
-nn.d2 = (nn.d2 << 15) + (nn.d1 >> 17);
-nn.d1 = (nn.d1 << 15) + (nn.d0 >> 17);
+nn.d3 = generic_bitalign(nn.d3, nn.d2, 17);
+nn.d2 = generic_bitalign(nn.d2, nn.d1, 17);
+nn.d1 = generic_bitalign(nn.d1, nn.d0, 17);
 nn.d0 = nn.d0 << 15;

 // q = q - nn
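Each of the replacements above keeps the arithmetic identical: a left shift of a word pair by k bits equals generic_bitalign() with shift 32 - k (hence the 1/31, 11/21, 23/9 and 15/17 pairs), and because the two pieces occupy disjoint bit ranges, the removed + and bitalign's | produce the same word. A throwaway scalar check of the 11/21 case (hypothetical host-side C, not part of the commit):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t d3 = 0x12345678u, d2 = 0x9abcdef0u;
    uint32_t old_way = (d3 << 11) + (d2 >> 21);                       /* removed form */
    uint32_t new_way = (uint32_t)((((uint64_t)d3 << 32) | d2) >> 21); /* generic_bitalign(d3, d2, 21) */
    assert(old_way == new_way);
    return 0;
}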
src/common.cl: 20 changes (19 additions, 1 deletion)
@@ -170,6 +170,24 @@ uint popcount(uint x)
 #define ATOMIC_INC(x) ((x)++)
 #endif

+// generic_bitalign() emulates amd_bitalign() using shifts.
+#ifdef cl_intel_subgroups
+// Workaround for Intel Compute Runtime (NEO) versions 23.22.26516.18 to
+// 24.45.31740.9: https://github.com/intel/intel-graphics-compiler/issues/358
+// Use 64-bit shifts. They are faster than 32-bit shifts on Intel, so it's not
+// needed to limit this workaround to specific versions.
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+return CONVERT_UINT_V(((CONVERT_ULONG_V(high) << 32) | CONVERT_ULONG_V(low)) >> shift);
+}
+#else
+// Use 32-bit shifts for other platforms.
+inline uint_v generic_bitalign(const uint_v high, const uint_v low, const int shift)
+{
+return (high << (32 - shift)) | (low >> shift);
+}
+#endif
+
 #ifdef cl_amd_media_ops
 #pragma OPENCL EXTENSION cl_amd_media_ops : enable
 #else
@@ -180,7 +198,7 @@ uint popcount(uint x)
 // Description
 // dst.s0 = (uint) (((((long)src0.s0) << 32) | (long)src1.s0) >> (src2.s0 & 31))
 // similar operation applied to other components of the vectors.
-#define amd_bitalign(src0, src1, src2) (src0 << (32-src2)) | (src1 >> src2)
+#define amd_bitalign(src0, src1, src2) generic_bitalign(src0, src1, src2)
 #endif

 #ifdef cl_amd_media_ops2
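The Description comment above is the contract that amd_bitalign() and generic_bitalign() now share: concatenate src0:src1 into a 64-bit value and return the 32 bits starting src2 bits from the bottom. A worked scalar example matching the shl_96() call amd_bitalign(a->d2, a->d1, 31), with made-up values:

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t d2 = 0x00000001u, d1 = 0x80000000u;
    uint64_t cat = ((uint64_t)d2 << 32) | d1;     /* 0x0000000180000000 */
    uint32_t r = (uint32_t)(cat >> 31);           /* 0x00000003 */
    assert(r == ((d2 << 1) | (d1 >> 31)));        /* a 1-bit left shift across the word boundary */
    return 0;
}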
