Spaces:
Running
Running
junchao-loongson
commited on
Commit
·
133ffbf
1
Parent(s):
562afce
ggml : fix loongarch build (O2 issue) (llama/7636)
Browse files- ggml-quants.c +14 -6
- ggml.c +1 -1
ggml-quants.c
CHANGED
|
@@ -6828,6 +6828,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 6828 |
|
| 6829 |
int bit = 0;
|
| 6830 |
int is = 0;
|
|
|
|
| 6831 |
|
| 6832 |
const uint8_t * restrict q3 = x[i].qs;
|
| 6833 |
const int8_t * restrict q8 = y[i].qs;
|
|
@@ -6836,21 +6837,25 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 6836 |
// load low 2 bits
|
| 6837 |
const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
|
| 6838 |
|
|
|
|
| 6839 |
// prepare low and high bits
|
| 6840 |
const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
|
| 6841 |
-
const __m256i q3h_0 = __lasx_xvslli_h(
|
| 6842 |
++bit;
|
| 6843 |
|
|
|
|
| 6844 |
const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
|
| 6845 |
-
const __m256i q3h_1 = __lasx_xvslli_h(
|
| 6846 |
++bit;
|
| 6847 |
|
|
|
|
| 6848 |
const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
|
| 6849 |
-
const __m256i q3h_2 = __lasx_xvslli_h(
|
| 6850 |
++bit;
|
| 6851 |
|
|
|
|
| 6852 |
const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
|
| 6853 |
-
const __m256i q3h_3 = __lasx_xvslli_h(
|
| 6854 |
++bit;
|
| 6855 |
|
| 6856 |
// load Q8 quants
|
|
@@ -8033,6 +8038,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 8033 |
__m256i sumi = __lasx_xvldi(0);
|
| 8034 |
|
| 8035 |
int bit = 0;
|
|
|
|
| 8036 |
|
| 8037 |
for (int j = 0; j < QK_K/64; ++j) {
|
| 8038 |
|
|
@@ -8041,13 +8047,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
|
|
| 8041 |
|
| 8042 |
const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
|
| 8043 |
|
|
|
|
| 8044 |
const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
|
| 8045 |
-
const __m256i q5h_0 = __lasx_xvslli_h(
|
| 8046 |
const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
|
| 8047 |
hmask = __lasx_xvslli_h(hmask, 1);
|
| 8048 |
|
|
|
|
| 8049 |
const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
|
| 8050 |
-
const __m256i q5h_1 = __lasx_xvslli_h(
|
| 8051 |
const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
|
| 8052 |
hmask = __lasx_xvslli_h(hmask, 1);
|
| 8053 |
|
|
|
|
| 6828 |
|
| 6829 |
int bit = 0;
|
| 6830 |
int is = 0;
|
| 6831 |
+
__m256i xvbit;
|
| 6832 |
|
| 6833 |
const uint8_t * restrict q3 = x[i].qs;
|
| 6834 |
const int8_t * restrict q8 = y[i].qs;
|
|
|
|
| 6837 |
// load low 2 bits
|
| 6838 |
const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;
|
| 6839 |
|
| 6840 |
+
xvbit = __lasx_xvreplgr2vr_h(bit);
|
| 6841 |
// prepare low and high bits
|
| 6842 |
const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
|
| 6843 |
+
const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
|
| 6844 |
++bit;
|
| 6845 |
|
| 6846 |
+
xvbit = __lasx_xvreplgr2vr_h(bit);
|
| 6847 |
const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
|
| 6848 |
+
const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
|
| 6849 |
++bit;
|
| 6850 |
|
| 6851 |
+
xvbit = __lasx_xvreplgr2vr_h(bit);
|
| 6852 |
const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
|
| 6853 |
+
const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
|
| 6854 |
++bit;
|
| 6855 |
|
| 6856 |
+
xvbit = __lasx_xvreplgr2vr_h(bit);
|
| 6857 |
const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
|
| 6858 |
+
const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
|
| 6859 |
++bit;
|
| 6860 |
|
| 6861 |
// load Q8 quants
|
|
|
|
| 8038 |
__m256i sumi = __lasx_xvldi(0);
|
| 8039 |
|
| 8040 |
int bit = 0;
|
| 8041 |
+
__m256i xvbit;
|
| 8042 |
|
| 8043 |
for (int j = 0; j < QK_K/64; ++j) {
|
| 8044 |
|
|
|
|
| 8047 |
|
| 8048 |
const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;
|
| 8049 |
|
| 8050 |
+
xvbit = __lasx_xvreplgr2vr_h(bit++);
|
| 8051 |
const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
|
| 8052 |
+
const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
|
| 8053 |
const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
|
| 8054 |
hmask = __lasx_xvslli_h(hmask, 1);
|
| 8055 |
|
| 8056 |
+
xvbit = __lasx_xvreplgr2vr_h(bit++);
|
| 8057 |
const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
|
| 8058 |
+
const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
|
| 8059 |
const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
|
| 8060 |
hmask = __lasx_xvslli_h(hmask, 1);
|
| 8061 |
|
ggml.c
CHANGED
|
@@ -1580,7 +1580,7 @@ do { \
|
|
| 1580 |
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
| 1581 |
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
| 1582 |
|
| 1583 |
-
static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
|
| 1584 |
float tmp[8];
|
| 1585 |
|
| 1586 |
for (int i = 0; i < 8; i++) {
|
|
|
|
| 1580 |
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
|
| 1581 |
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
|
| 1582 |
|
| 1583 |
+
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
|
| 1584 |
float tmp[8];
|
| 1585 |
|
| 1586 |
for (int i = 0; i < 8; i++) {
|