Spaces:
Sleeping
Sleeping
ggml : resolve merge (ggml/0)
Browse files
- examples/common-ggml.cpp +2 -0
- ggml-metal.metal +3 -3
- ggml.c +4 -1
examples/common-ggml.cpp
CHANGED
|
@@ -71,6 +71,7 @@ bool ggml_common_quantize_0(
|
|
| 71 |
case GGML_FTYPE_MOSTLY_IQ4_NL:
|
| 72 |
case GGML_FTYPE_MOSTLY_IQ4_XS:
|
| 73 |
case GGML_FTYPE_MOSTLY_IQ1_M:
|
|
|
|
| 74 |
{
|
| 75 |
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
| 76 |
return false;
|
|
@@ -207,6 +208,7 @@ bool ggml_common_quantize_0(
|
|
| 207 |
case GGML_TYPE_IQ4_NL:
|
| 208 |
case GGML_TYPE_IQ4_XS:
|
| 209 |
case GGML_TYPE_IQ1_M:
|
|
|
|
| 210 |
case GGML_TYPE_COUNT:
|
| 211 |
{
|
| 212 |
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
|
|
|
| 71 |
case GGML_FTYPE_MOSTLY_IQ4_NL:
|
| 72 |
case GGML_FTYPE_MOSTLY_IQ4_XS:
|
| 73 |
case GGML_FTYPE_MOSTLY_IQ1_M:
|
| 74 |
+
case GGML_FTYPE_MOSTLY_BF16:
|
| 75 |
{
|
| 76 |
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
| 77 |
return false;
|
|
|
|
| 208 |
case GGML_TYPE_IQ4_NL:
|
| 209 |
case GGML_TYPE_IQ4_XS:
|
| 210 |
case GGML_TYPE_IQ1_M:
|
| 211 |
+
case GGML_TYPE_BF16:
|
| 212 |
case GGML_TYPE_COUNT:
|
| 213 |
{
|
| 214 |
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
ggml-metal.metal
CHANGED
|
@@ -296,7 +296,7 @@ kernel void kernel_silu(
|
|
| 296 |
dst[tpig] = x / (1.0f + exp(-x));
|
| 297 |
}
|
| 298 |
|
| 299 |
-
|
| 300 |
device const float4 * src0,
|
| 301 |
device float4 * dst,
|
| 302 |
uint tpig[[thread_position_in_grid]]) {
|
|
@@ -2217,7 +2217,7 @@ kernel void kernel_flash_attn_ext_f16(
|
|
| 2217 |
|
| 2218 |
// ALiBi
|
| 2219 |
if (max_bias > 0.0f) {
|
| 2220 |
-
const
|
| 2221 |
|
| 2222 |
const float base = h < n_head_log2 ? m0 : m1;
|
| 2223 |
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
@@ -2473,7 +2473,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
|
|
| 2473 |
|
| 2474 |
// ALiBi
|
| 2475 |
if (max_bias > 0.0f) {
|
| 2476 |
-
const
|
| 2477 |
|
| 2478 |
const float base = h < n_head_log2 ? m0 : m1;
|
| 2479 |
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
|
|
| 296 |
dst[tpig] = x / (1.0f + exp(-x));
|
| 297 |
}
|
| 298 |
|
| 299 |
+
kernel void kernel_silu_4(
|
| 300 |
device const float4 * src0,
|
| 301 |
device float4 * dst,
|
| 302 |
uint tpig[[thread_position_in_grid]]) {
|
|
|
|
| 2217 |
|
| 2218 |
// ALiBi
|
| 2219 |
if (max_bias > 0.0f) {
|
| 2220 |
+
const uint32_t h = iq2;
|
| 2221 |
|
| 2222 |
const float base = h < n_head_log2 ? m0 : m1;
|
| 2223 |
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
|
|
|
| 2473 |
|
| 2474 |
// ALiBi
|
| 2475 |
if (max_bias > 0.0f) {
|
| 2476 |
+
const uint32_t h = iq2;
|
| 2477 |
|
| 2478 |
const float base = h < n_head_log2 ? m0 : m1;
|
| 2479 |
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
ggml.c
CHANGED
|
@@ -4,7 +4,6 @@
|
|
| 4 |
#include "ggml-impl.h"
|
| 5 |
#include "ggml-quants.h"
|
| 6 |
#include "ggml.h"
|
| 7 |
-
#include "sgemm.h"
|
| 8 |
|
| 9 |
#if defined(_MSC_VER) || defined(__MINGW32__)
|
| 10 |
#include <malloc.h> // using malloc.h with MSC/MINGW
|
|
@@ -37,6 +36,10 @@
|
|
| 37 |
#undef GGML_USE_LLAMAFILE
|
| 38 |
#endif
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
#if defined(_MSC_VER)
|
| 41 |
// disable "possible loss of data" to avoid hundreds of casts
|
| 42 |
// we should just be careful :)
|
|
|
|
| 4 |
#include "ggml-impl.h"
|
| 5 |
#include "ggml-quants.h"
|
| 6 |
#include "ggml.h"
|
|
|
|
| 7 |
|
| 8 |
#if defined(_MSC_VER) || defined(__MINGW32__)
|
| 9 |
#include <malloc.h> // using malloc.h with MSC/MINGW
|
|
|
|
| 36 |
#undef GGML_USE_LLAMAFILE
|
| 37 |
#endif
|
| 38 |
|
| 39 |
+
#ifdef GGML_USE_LLAMAFILE
|
| 40 |
+
#include "sgemm.h"
|
| 41 |
+
#endif
|
| 42 |
+
|
| 43 |
#if defined(_MSC_VER)
|
| 44 |
// disable "possible loss of data" to avoid hundreds of casts
|
| 45 |
// we should just be careful :)
|