quants : use MM256_SET_M128I consistently to fix gcc 7 build (llama/5889)

ggml-quants.c CHANGED (+14 -13)

--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -51,6 +51,7 @@
 
 #define UNUSED GGML_UNUSED
 
+// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
@@ -9563,7 +9564,7 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
         const __m128i odd_bits = _mm_shuffle_epi8(bit_helper, partial_sign_bits_for_counting);
         const __m128i full_sign_bits = _mm_or_si128(partial_sign_bits, odd_bits);
-        const __m256i full_signs = _mm256_set_m128i(full_sign_bits, full_sign_bits);
+        const __m256i full_signs = MM256_SET_M128I(full_sign_bits, full_sign_bits);
 
         const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)y[i].qs);
         const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)(y[i].qs+32));
@@ -9585,8 +9586,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
         const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1);
         const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2);
 
-        const __m256i sc1 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
-        const __m256i sc2 = _mm256_set_m128i(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
+        const __m256i sc1 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[0] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[0] & 0xf)+1));
+        const __m256i sc2 = MM256_SET_M128I(_mm_set1_epi16(2*(x[i].scales[1] >> 4)+1), _mm_set1_epi16(2*(x[i].scales[1] & 0xf)+1));
 
         const __m256i sum = _mm256_add_epi32(_mm256_madd_epi16(sc1, dot1), _mm256_madd_epi16(sc2, dot2));
 
@@ -9653,8 +9654,8 @@ void ggml_vec_dot_iq2_xs_q8_K(int n, float * restrict s, size_t bs, const void *
 
         const __m128i full_signs_l = _mm256_castsi256_si128(full_sign_bits);
         const __m128i full_signs_h = _mm256_extractf128_si256(full_sign_bits, 1);
-        const __m256i full_signs_1 = _mm256_set_m128i(full_signs_l, full_signs_l);
-        const __m256i full_signs_2 = _mm256_set_m128i(full_signs_h, full_signs_h);
+        const __m256i full_signs_1 = MM256_SET_M128I(full_signs_l, full_signs_l);
+        const __m256i full_signs_2 = MM256_SET_M128I(full_signs_h, full_signs_h);
 
         __m256i signs;
         signs = _mm256_shuffle_epi8(full_signs_1, block_sign_shuffle_1);
@@ -10551,10 +10552,10 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)x[1].qs);
         const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)y[0].qs);
         const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)y[1].qs);
-        const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-        const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                               _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+        const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+        const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                              _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
         const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
         const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
         const __m256i p_1 = _mm256_madd_epi16(p16_1, mone);
@@ -10661,10 +10662,10 @@ void ggml_vec_dot_iq4_xs_q8_K(int n, float * restrict s, size_t bs, const void *
             const __m128i q4bits_2 = _mm_loadu_si128((const __m128i*)qs); qs += 16;
             const __m256i q8b_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
             const __m256i q8b_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32;
-            const __m256i q4b_1 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
-                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
-            const __m256i q4b_2 = _mm256_set_m128i(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
-                                                   _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
+            const __m256i q4b_1 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_1, m4b)));
+            const __m256i q4b_2 = MM256_SET_M128I(_mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b)),
+                                                  _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b)));
             const __m256i p16_1 = mul_add_epi8(q4b_1, q8b_1);
             const __m256i p16_2 = mul_add_epi8(q4b_2, q8b_2);
             const int16_t ls1 = ((x[ibl].scales_l[ib/2] & 0xf) | ((sh << 4) & 0x30)) - 32;
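
For context on the change itself: MM256_SET_M128I builds a 256-bit vector from two 128-bit halves using _mm256_castsi128_si256 and _mm256_insertf128_si256, intrinsics that gcc 7 does provide, so it can stand in for _mm256_set_m128i wherever that intrinsic is missing (for example, full_signs above simply duplicates one 128-bit sign mask into both halves). A minimal standalone sketch of the macro's behavior follows; the demo program around it is illustrative only, not part of ggml-quants.c.

// Sketch: MM256_SET_M128I(a, b) places b in the low 128 bits and a in the
// high 128 bits, matching _mm256_set_m128i(a, b).
// Build (illustrative): gcc -mavx2 -O2 set_m128i_demo.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const __m128i lo = _mm_set1_epi8(0x11);   // ends up in bits 0..127
    const __m128i hi = _mm_set1_epi8(0x22);   // ends up in bits 128..255

    const __m256i v = MM256_SET_M128I(hi, lo);

    uint8_t out[32];
    _mm256_storeu_si256((__m256i *)out, v);
    printf("out[0] = 0x%02x, out[16] = 0x%02x\n", out[0], out[16]);  // 0x11, 0x22
    return 0;
}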
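The sc1/sc2 lines use the same macro to broadcast sub-block scales: each byte of x[i].scales packs two 4-bit scales, each nibble s becomes the odd multiplier 2*s+1, that value is replicated across eight 16-bit lanes with _mm_set1_epi16, and the two halves are joined into one 256-bit multiplier. A small sketch of that step, with a made-up scale byte standing in for x[i].scales[0]:

// Sketch: expand one packed scale byte into a 256-bit vector of 16-bit
// multipliers, low nibble driving the low half and high nibble the high half.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    const uint8_t scale = 0x53;  // made-up packed byte: low nibble 3, high nibble 5

    const __m256i sc = MM256_SET_M128I(_mm_set1_epi16(2*(scale >>  4) + 1),   // high half: 2*5+1 = 11
                                       _mm_set1_epi16(2*(scale & 0xf) + 1));  // low half:  2*3+1 = 7

    int16_t out[16];
    _mm256_storeu_si256((__m256i *)out, sc);
    printf("out[0] = %d, out[8] = %d\n", out[0], out[8]);  // prints 7 and 11
    return 0;
}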
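The q4b_1/q4b_2 lines follow the usual 4-bit dequantization pattern: each source byte packs two 4-bit indices, the low nibbles are isolated with the 0x0f mask m4b, the high nibbles with a 4-bit shift plus the same mask, each index is mapped through a 16-entry table with _mm_shuffle_epi8, and the two 128-bit results are combined with MM256_SET_M128I. A self-contained sketch of the pattern; the table contents and input bytes below are made up (in ggml-quants.c, values128 is the 16-entry IQ4 value table):

// Sketch: expand 16 bytes of packed 4-bit indices into 32 looked-up values
// using a 16-entry table, as in the iq4_nl/iq4_xs kernels above.
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)

int main(void) {
    // Hypothetical 16-entry table: table[i] is the dequantized value for index i.
    const int8_t table[16] = {-8,-7,-6,-5,-4,-3,-2,-1, 0, 1, 2, 3, 4, 5, 6, 7};
    const __m128i values128 = _mm_loadu_si128((const __m128i *)table);
    const __m128i m4b       = _mm_set1_epi8(0x0f);

    // 16 packed bytes -> 32 4-bit indices (made-up input pattern).
    uint8_t packed[16];
    for (int i = 0; i < 16; ++i) packed[i] = (uint8_t)((i << 4) | (15 - i));
    const __m128i q4bits = _mm_loadu_si128((const __m128i *)packed);

    // Low nibbles fill the low 128-bit half, high nibbles the high half.
    const __m256i q4b = MM256_SET_M128I(
        _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4b)),
        _mm_shuffle_epi8(values128, _mm_and_si128(q4bits, m4b)));

    int8_t out[32];
    _mm256_storeu_si256((__m256i *)out, q4b);
    printf("out[0] = %d, out[16] = %d\n", out[0], out[16]);  // table[15] = 7, table[0] = -8
    return 0;
}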