diff --git a/gpu/bitnet_kernels/bitnet_kernels.h b/gpu/bitnet_kernels/bitnet_kernels.h index 1d897908..a21d5cc8 100644 --- a/gpu/bitnet_kernels/bitnet_kernels.h +++ b/gpu/bitnet_kernels/bitnet_kernels.h @@ -49,7 +49,7 @@ __global__ void __launch_bounds__(128) ladder_int8xint2_kernel(int8_t* __restric constexpr int wmma_K = 32; constexpr int wmma_N = 16; int in_thread_C_local[1]; - signed char A_local[K_per_loop]; + alignas(16) signed char A_local[K_per_loop]; int B_reshape_local[1]; signed char B_decode_local[K_per_loop]; int red_buf0[1]; @@ -80,4 +80,4 @@ __global__ void __launch_bounds__(128) ladder_int8xint2_kernel(int8_t* __restric int ws_idx = out_idx / (N / ws_num); if (threadIdx.x == 0) dtype_transform[out_idx] = (__nv_bfloat16)(((float)red_buf0[0])/(float)s[0]*(float)ws[ws_idx]); -} \ No newline at end of file +}