@@ -1671,6 +1671,7 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
1671
1671
1672
1672
// Device helper that returns a float computed from one block_iq1_s (passed
// untyped through vbq) and the block_q8_1 entry at index iqs.
// NOTE(review): this text is a diff excerpt; the statements between the two
// hunks (including the definitions of qh, sumi and d1q) are not visible here,
// so the comments below only describe the lines that are shown.
static __device__ __forceinline__ float vec_dot_iq1_s_q8_1 (
1673
1673
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1674
// Everything down to the matching #endif is compiled only when __CUDA_ARCH__
// is defined and is at least 610.
+ #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
1674
1675
// Reinterpret the untyped block pointer as an IQ1_S block.
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
1675
1676
1676
1677
// Fetch one packed int from the block's qs field, selected by iqs.
const int qs_packed = get_int_b2 (bq1->qs , iqs);
@@ -1697,10 +1698,12 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1(
1697
1698
// (qh & 0x8000) is either 0 or 0x8000, so delta is -1 + IQ1S_DELTA when bit 15
// of qh is clear and -1 - IQ1S_DELTA when it is set (assumes qh is an integer
// declared in the hidden lines -- TODO confirm).
const float delta = -1 .0f + IQ1S_DELTA - (qh & 0x8000 ) * (2 .0f *IQ1S_DELTA/0x8000 );
1698
1699
// Widen the half2 ds field of the selected q8_1 block to a float2.
const float2 ds = __half22float2 (bq8_1[iqs].ds );
1699
1700
// ds.x weights sumi, ds.y weights delta; the combined value is scaled by d1q.
return d1q * (ds.x *sumi + ds.y *delta);
1701
// NOTE(review): the guard opened after the signature closes here, directly
// before the closing brace. If the condition is false the visible body is
// empty and this float-returning function has no return statement -- confirm
// whether an #else fallback is needed (hidden lines could not be checked).
+ #endif
1700
1702
}
1701
1703
1702
1704
// Device helper that returns a float computed from one block_iq1_m (passed
// untyped through vbq).
// NOTE(review): this text is a diff excerpt; the statements between the two
// hunks (including the definitions of tmp, d, sumi and sumf, and every use of
// bq8_1 and iqs) are not visible here, so the comments below only describe
// the lines that are shown.
static __device__ __forceinline__ float vec_dot_iq1_m_q8_1 (
1703
1705
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
1706
// Everything down to the matching #endif is compiled only when __CUDA_ARCH__
// is defined and is at least 610.
+ #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 610
1704
1707
1705
1708
// Reinterpret the untyped block pointer as an IQ1_M block.
const block_iq1_m * bq1 = (const block_iq1_m *) vbq;
1706
1709
@@ -1741,6 +1744,7 @@ static __device__ __forceinline__ float vec_dot_iq1_m_q8_1(
1741
1744
// Bits 0-2 of tmp form a 3-bit value x; sc0 = 2*x + 1, i.e. an odd value in 1..15.
const int sc0 = 2 *((tmp >> 0 ) & 0x07 ) + 1 ;
1742
1745
// Bits 3-5 of tmp are mapped the same way to give sc1.
const int sc1 = 2 *((tmp >> 3 ) & 0x07 ) + 1 ;
1743
1746
// Each half adds its sumi and sumf terms, is weighted by its own scale
// (sc0 / sc1), and the total is multiplied by d.
return d * ((sumi[0 ] + sumf[0 ]) * sc0 + (sumi[1 ] + sumf[1 ]) * sc1);
1747
// NOTE(review): the guard opened after the signature closes here, directly
// before the closing brace. If the condition is false the visible body is
// empty and this float-returning function has no return statement -- confirm
// whether an #else fallback is needed (hidden lines could not be checked).
+ #endif
1744
1748
}
1745
1749
1746
1750
static __device__ __forceinline__ void get_int_from_table_16 (const uint32_t & q4, const uint8_t * values,
0 commit comments