junchao-loongson commited on
Commit
133ffbf
·
1 Parent(s): 562afce

ggml : fix loongarch build (O2 issue) (llama/7636)

Browse files
Files changed (2) hide show
  1. ggml-quants.c +14 -6
  2. ggml.c +1 -1
ggml-quants.c CHANGED
@@ -6828,6 +6828,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6828
 
6829
  int bit = 0;
6830
  int is = 0;
 
6831
 
6832
  const uint8_t * restrict q3 = x[i].qs;
6833
  const int8_t * restrict q8 = y[i].qs;
@@ -6836,21 +6837,25 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
6836
  // load low 2 bits
6837
  const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
6838
 
 
6839
  // prepare low and high bits
6840
  const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
6841
- const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6842
  ++bit;
6843
 
 
6844
  const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
6845
- const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6846
  ++bit;
6847
 
 
6848
  const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
6849
- const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6850
  ++bit;
6851
 
 
6852
  const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
6853
- const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
6854
  ++bit;
6855
 
6856
  // load Q8 quants
@@ -8033,6 +8038,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8033
  __m256i sumi = __lasx_xvldi(0);
8034
 
8035
  int bit = 0;
 
8036
 
8037
  for (int j = 0; j < QK_K/64; ++j) {
8038
 
@@ -8041,13 +8047,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
8041
 
8042
  const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
8043
 
 
8044
  const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
8045
- const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
8046
  const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
8047
  hmask = __lasx_xvslli_h(hmask, 1);
8048
 
 
8049
  const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
8050
- const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
8051
  const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
8052
  hmask = __lasx_xvslli_h(hmask, 1);
8053
 
 
6828
 
6829
  int bit = 0;
6830
  int is = 0;
6831
+ __m256i xvbit;
6832
 
6833
  const uint8_t * restrict q3 = x[i].qs;
6834
  const int8_t * restrict q8 = y[i].qs;
 
6837
  // load low 2 bits
6838
  const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
6839
 
6840
+ xvbit = __lasx_xvreplgr2vr_h(bit);
6841
  // prepare low and high bits
6842
  const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
6843
+ const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
6844
  ++bit;
6845
 
6846
+ xvbit = __lasx_xvreplgr2vr_h(bit);
6847
  const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
6848
+ const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
6849
  ++bit;
6850
 
6851
+ xvbit = __lasx_xvreplgr2vr_h(bit);
6852
  const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
6853
+ const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
6854
  ++bit;
6855
 
6856
+ xvbit = __lasx_xvreplgr2vr_h(bit);
6857
  const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
6858
+ const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
6859
  ++bit;
6860
 
6861
  // load Q8 quants
 
8038
  __m256i sumi = __lasx_xvldi(0);
8039
 
8040
  int bit = 0;
8041
+ __m256i xvbit;
8042
 
8043
  for (int j = 0; j < QK_K/64; ++j) {
8044
 
 
8047
 
8048
  const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
8049
 
8050
+ xvbit = __lasx_xvreplgr2vr_h(bit++);
8051
  const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
8052
+ const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
8053
  const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
8054
  hmask = __lasx_xvslli_h(hmask, 1);
8055
 
8056
+ xvbit = __lasx_xvreplgr2vr_h(bit++);
8057
  const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
8058
+ const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
8059
  const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
8060
  hmask = __lasx_xvslli_h(hmask, 1);
8061
 
ggml.c CHANGED
@@ -1580,7 +1580,7 @@ do { \
1580
  #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
1581
  #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
1582
 
1583
- static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
1584
  float tmp[8];
1585
 
1586
  for (int i = 0; i < 8; i++) {
 
1580
  #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
1581
  #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
1582
 
1583
+ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
1584
  float tmp[8];
1585
 
1586
  for (int i = 0; i < 8; i++) {