Make GGML vector ops go faster across hardware
jart committed May 2, 2024
1 parent 38311f2 commit c9d7393
Showing 26 changed files with 3,436 additions and 944 deletions.
9 changes: 9 additions & 0 deletions llama.cpp/BUILD.mk
@@ -48,6 +48,15 @@ o/$(MODE)/llama.cpp/ggml-quants-amd-avx.o: private TARGET_ARCH += -Xx86_64-mtune
o/$(MODE)/llama.cpp/ggml-quants-amd-avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2
o/$(MODE)/llama.cpp/ggml-quants-amd-avx512.o: private TARGET_ARCH += -Xx86_64-mtune=cannonlake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2 -Xx86_64-mavx512f

o/$(MODE)/llama.cpp/ggml-vector.o: private CXXFLAGS += -Os
o/$(MODE)/llama.cpp/ggml-vector-amd-avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge
o/$(MODE)/llama.cpp/ggml-vector-amd-fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mfma
o/$(MODE)/llama.cpp/ggml-vector-amd-f16c.o: private TARGET_ARCH += -Xx86_64-mtune=ivybridge -Xx86_64-mf16c
o/$(MODE)/llama.cpp/ggml-vector-amd-avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2
o/$(MODE)/llama.cpp/ggml-vector-amd-avx512.o: private TARGET_ARCH += -Xx86_64-mtune=cannonlake -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2 -Xx86_64-mavx512f
o/$(MODE)/llama.cpp/ggml-vector-amd-avx512bf16.o: private TARGET_ARCH += -Xx86_64-mtune=znver4 -Xx86_64-mf16c -Xx86_64-mfma -Xx86_64-mavx2 -Xx86_64-mavx512f -Xx86_64-mavx512vl -Xx86_64-mavx512bf16
o/$(MODE)/llama.cpp/ggml-vector-arm82.o: private TARGET_ARCH += -Xaarch64-march=armv8.2-a+fp16

$(LLAMA_CPP_OBJS): llama.cpp/BUILD.mk

.PHONY: o/$(MODE)/llama.cpp
80 changes: 77 additions & 3 deletions llama.cpp/ggml-impl.h
@@ -19,6 +19,83 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* ┌sign
* │
* │ ┌exponent
* │ │
* │ │ ┌mantissa
* │ │ │
* │┌──┴───┐┌─┴───┐
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* ┌sign
* │
* │ ┌exponent
* │ │
* │ │ ┌mantissa
* │ │ │
* │┌──┴───┐┌─┴───────────────────┐
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* ┌sign
* │
* │ ┌exponent
* │ │
* │ │ ┌mantissa
* │ │ │
* │┌─┴─┐┌─┴──────┐
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    union {
        float f;
        uint32_t i;
    } u;
    u.i = (uint32_t)h.bits << 16;
    return u.f;
}

/**
* Converts float32 to brain16.
*
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
* Subnormals shall be flushed to zero, and NANs will be quiet.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
    ggml_bf16_t h;
    union {
        float f;
        uint32_t i;
    } u;
    u.f = s;
    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
        h.bits = (u.i >> 16) | 64; /* force to quiet */
        return h;
    }
    if (!(u.i & 0x7f800000)) { /* subnormal */
        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
        return h;
    }
    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
    return h;
}

#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)

#ifdef __cplusplus
extern "C" {
#endif
@@ -520,9 +597,6 @@ size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml
// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);

#define GGML_FP32_TO_BF16(x) ggml_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_bf16_to_fp32(x)

#ifdef __cplusplus
}
#endif
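
As a quick editorial illustration (not part of the commit): the standalone sketch below copies the conversion logic of ggml_compute_bf16_to_fp32 / ggml_compute_fp32_to_bf16 so the rounding behavior can be run and inspected. The ggml_bf16_t layout (a struct whose bits field is a uint16_t) is assumed from the code above; the driver and local names are otherwise hypothetical.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint16_t bits; } ggml_bf16_t;      /* layout assumed from the helpers above */

static inline float bf16_to_fp32(ggml_bf16_t h) {   /* same logic as ggml_compute_bf16_to_fp32 */
    union { float f; uint32_t i; } u;
    u.i = (uint32_t)h.bits << 16;                   /* bf16 is just the top 16 bits of binary32 */
    return u.f;
}

static inline ggml_bf16_t fp32_to_bf16(float s) {   /* same logic as ggml_compute_fp32_to_bf16 */
    ggml_bf16_t h;
    union { float f; uint32_t i; } u;
    u.f = s;
    if ((u.i & 0x7fffffff) > 0x7f800000) {          /* NaN: keep sign and exponent, set quiet bit */
        h.bits = (u.i >> 16) | 64;
        return h;
    }
    if (!(u.i & 0x7f800000)) {                      /* subnormal: flush to signed zero */
        h.bits = (u.i & 0x80000000) >> 16;
        return h;
    }
    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;  /* round to nearest, ties to even */
    return h;
}

int main(void) {
    float x = 1.00390625f;                          /* 1 + 2^-8: exactly halfway between two bf16 values */
    ggml_bf16_t b = fp32_to_bf16(x);
    printf("%.9g -> 0x%04x -> %.9g\n", x, (unsigned)b.bits, bf16_to_fp32(b));
    return 0;
}

Running it prints 1.00390625 -> 0x3f80 -> 1: the halfway case rounds to the even bf16 neighbor (1.0), while values more than half an ulp above a representable bf16 round up instead.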
31 changes: 8 additions & 23 deletions llama.cpp/ggml-quants.py
@@ -42,36 +42,21 @@
f.write(' typeof(%s) *ptr_%s;\n' % (func, func))
f.write('\n')
f.write(' QuantFuncs() {\n')
f.write('#ifdef __x86_64__\n')
for arch, mac, needs in ARCHS:
if mac == '__x86_64__':
f.write(' if (%s) {\n' % (' && '.join('X86_HAVE(%s)' % (need) for need in needs) or '1'))
for func, proto in FUNCS:
f.write(' ptr_%s = %s_%s;\n' % (func, func, arch))
f.write(' return;\n')
f.write(' }\n')
f.write('#else\n')
for func, proto in FUNCS:
f.write(' ptr_%s = %s_arm80;\n' % (func, func))
f.write('#endif\n')
f.write('#ifdef %s\n' % (mac))
f.write(' if (%s) {\n' % (' && '.join('X86_HAVE(%s)' % (need) for need in needs) or '1'))
for func, proto in FUNCS:
f.write(' ptr_%s = %s_%s;\n' % (func, func, arch))
f.write(' return;\n')
f.write(' }\n')
f.write('#endif\n')
f.write(' }\n')
f.write('} funcs;\n')
f.write('\n')
for func, proto in FUNCS:
proto = proto.replace(';', '')
args = [s.split(' ')[-1] for s in re.search(r'(?<=\().*(?=\))', proto).group(0).split(',')]
f.write(proto + ' {\n')
if 'imatrix' in proto:
args = 'src, dst, nrows, n_per_row, imatrix'
elif 'quantize' in proto:
args = 'x, y, k'
elif 'vec_dot' in proto:
args = 'n, s, bs, vx, bx, vy, by, nrc'
elif 'grid_size' in proto:
args = 'grid_size'
elif 'validate' in proto:
args = 'type, data, nbytes'
else:
args = 'type'
f.write(' return funcs.ptr_%s(%s);\n' % (func, args))
f.write('}\n')
f.write('\n')
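
For context, the dispatcher this script emits has roughly the shape sketched below. Everything named here is a stand-in: quantize_row_x8 and its *_amd_* variants are hypothetical placeholders for entries in the FUNCS and ARCHS tables, the real output declares its pointers with typeof(), and the X86_HAVE() feature test is approximated with GCC/Clang's __builtin_cpu_supports().

#include <cstdint>

/* Stand-ins for the architecture-suffixed implementations the build produces. */
static void quantize_row_x8_amd_avx2(const float *x, int8_t *y, int64_t k) { (void)x; (void)y; (void)k; }
static void quantize_row_x8_baseline(const float *x, int8_t *y, int64_t k) { (void)x; (void)y; (void)k; }

#define X86_HAVE(f) __builtin_cpu_supports(#f)      /* stand-in for the real macro */

/* One pointer per entry in FUNCS, a constructor that picks the best variant
 * once at startup, and a thin public wrapper per function. */
static struct QuantFuncs {
    void (*ptr_quantize_row_x8)(const float *, int8_t *, int64_t);

    QuantFuncs() {
#ifdef __x86_64__
        if (X86_HAVE(avx2) && X86_HAVE(fma) && X86_HAVE(f16c)) {
            ptr_quantize_row_x8 = quantize_row_x8_amd_avx2;
            return;
        }
#endif
        ptr_quantize_row_x8 = quantize_row_x8_baseline;   /* fallback entry */
    }
} funcs;

/* Public symbol: forwards to whichever variant the constructor selected. */
void quantize_row_x8(const float *x, int8_t *y, int64_t k) {
    return funcs.ptr_quantize_row_x8(x, y, k);            /* 'quantize' wrappers pass (x, y, k) */
}

Because the selection happens once in a constructor, later calls pay only a single indirect jump instead of re-probing CPU features.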
57 changes: 57 additions & 0 deletions llama.cpp/ggml-vector-amd-avx.c
@@ -0,0 +1,57 @@
#ifdef __x86_64__
#define ggml_fp16_to_fp32_row ggml_fp16_to_fp32_row_amd_avx
#define ggml_fp32_to_fp16_row ggml_fp32_to_fp16_row_amd_avx
#define ggml_bf16_to_fp32_row ggml_bf16_to_fp32_row_amd_avx
#define ggml_fp32_to_bf16_row ggml_fp32_to_bf16_row_amd_avx
#define ggml_vec_dot_f32 ggml_vec_dot_f32_amd_avx
#define ggml_vec_dot_f16 ggml_vec_dot_f16_amd_avx
#define ggml_vec_dot_bf16 ggml_vec_dot_bf16_amd_avx
#define ggml_vec_dot_f16_unroll ggml_vec_dot_f16_unroll_amd_avx
#define ggml_vec_mad_f32_unroll ggml_vec_mad_f32_unroll_amd_avx
#define ggml_vec_set_i8 ggml_vec_set_i8_amd_avx
#define ggml_vec_set_i16 ggml_vec_set_i16_amd_avx
#define ggml_vec_set_i32 ggml_vec_set_i32_amd_avx
#define ggml_vec_set_f16 ggml_vec_set_f16_amd_avx
#define ggml_vec_set_bf16 ggml_vec_set_bf16_amd_avx
#define ggml_vec_add_f32 ggml_vec_add_f32_amd_avx
#define ggml_vec_add1_f32 ggml_vec_add1_f32_amd_avx
#define ggml_vec_acc_f32 ggml_vec_acc_f32_amd_avx
#define ggml_vec_acc1_f32 ggml_vec_acc1_f32_amd_avx
#define ggml_vec_sub_f32 ggml_vec_sub_f32_amd_avx
#define ggml_vec_set_f32 ggml_vec_set_f32_amd_avx
#define ggml_vec_cpy_f32 ggml_vec_cpy_f32_amd_avx
#define ggml_vec_neg_f32 ggml_vec_neg_f32_amd_avx
#define ggml_vec_mul_f32 ggml_vec_mul_f32_amd_avx
#define ggml_vec_div_f32 ggml_vec_div_f32_amd_avx
#define ggml_vec_scale_f32 ggml_vec_scale_f32_amd_avx
#define ggml_vec_scale_f16 ggml_vec_scale_f16_amd_avx
#define ggml_vec_mad_f32 ggml_vec_mad_f32_amd_avx
#define ggml_vec_mad_f16 ggml_vec_mad_f16_amd_avx
#define ggml_vec_norm_f32 ggml_vec_norm_f32_amd_avx
#define ggml_vec_sqr_f32 ggml_vec_sqr_f32_amd_avx
#define ggml_vec_sqrt_f32 ggml_vec_sqrt_f32_amd_avx
#define ggml_vec_log_f32 ggml_vec_log_f32_amd_avx
#define ggml_vec_abs_f32 ggml_vec_abs_f32_amd_avx
#define ggml_vec_sgn_f32 ggml_vec_sgn_f32_amd_avx
#define ggml_vec_step_f32 ggml_vec_step_f32_amd_avx
#define ggml_vec_tanh_f32 ggml_vec_tanh_f32_amd_avx
#define ggml_vec_elu_f32 ggml_vec_elu_f32_amd_avx
#define ggml_vec_relu_f32 ggml_vec_relu_f32_amd_avx
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx
#define ggml_silu_backward_f32 ggml_silu_backward_f32_amd_avx
#define ggml_vec_silu_backward_f32 ggml_vec_silu_backward_f32_amd_avx
#define ggml_vec_sum_f32 ggml_vec_sum_f32_amd_avx
#define ggml_vec_sum_f32_ggf ggml_vec_sum_f32_ggf_amd_avx
#define ggml_vec_sum_f16_ggf ggml_vec_sum_f16_ggf_amd_avx
#define ggml_vec_max_f32 ggml_vec_max_f32_amd_avx
#define ggml_vec_norm_inv_f32 ggml_vec_norm_inv_f32_amd_avx
#define ggml_vec_argmax_f32 ggml_vec_argmax_f32_amd_avx
#define GGML_VECTOR
#include "ggml-vector.inc"
#endif // __x86_64__
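
This file, and the three ggml-vector-amd-*.c variants that follow, all repeat one trick: #define every public symbol to an architecture-suffixed name, then #include the shared ggml-vector.inc, so BUILD.mk can compile the same implementation once per microarchitecture with different -mtune/-mavx* flags and the resulting objects can coexist in one binary. A stripped-down, two-file sketch of the pattern, with hypothetical file and function names:

/* myvec.inc -- the shared implementation, written once (hypothetical) */
#include <stddef.h>

float my_vec_sum_f32(const float *x, size_t n) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i)   /* the compiler can auto-vectorize this per the flags in use */
        sum += x[i];
    return sum;
}

/* myvec-avx2.c -- rename the symbol, then pull in the shared code.
 * Built with something like: cc -O3 -mavx2 -mfma -mf16c -c myvec-avx2.c */
#define my_vec_sum_f32 my_vec_sum_f32_avx2
#include "myvec.inc"

Each extra variant is just another tiny .c file with a different suffix and a different set of compiler flags, which is exactly what the avx2, avx512, and avx512bf16 files below add.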
57 changes: 57 additions & 0 deletions llama.cpp/ggml-vector-amd-avx2.c
@@ -0,0 +1,57 @@
#ifdef __x86_64__
#define ggml_fp16_to_fp32_row ggml_fp16_to_fp32_row_amd_avx2
#define ggml_fp32_to_fp16_row ggml_fp32_to_fp16_row_amd_avx2
#define ggml_bf16_to_fp32_row ggml_bf16_to_fp32_row_amd_avx2
#define ggml_fp32_to_bf16_row ggml_fp32_to_bf16_row_amd_avx2
#define ggml_vec_dot_f32 ggml_vec_dot_f32_amd_avx2
#define ggml_vec_dot_f16 ggml_vec_dot_f16_amd_avx2
#define ggml_vec_dot_bf16 ggml_vec_dot_bf16_amd_avx2
#define ggml_vec_dot_f16_unroll ggml_vec_dot_f16_unroll_amd_avx2
#define ggml_vec_mad_f32_unroll ggml_vec_mad_f32_unroll_amd_avx2
#define ggml_vec_set_i8 ggml_vec_set_i8_amd_avx2
#define ggml_vec_set_i16 ggml_vec_set_i16_amd_avx2
#define ggml_vec_set_i32 ggml_vec_set_i32_amd_avx2
#define ggml_vec_set_f16 ggml_vec_set_f16_amd_avx2
#define ggml_vec_set_bf16 ggml_vec_set_bf16_amd_avx2
#define ggml_vec_add_f32 ggml_vec_add_f32_amd_avx2
#define ggml_vec_add1_f32 ggml_vec_add1_f32_amd_avx2
#define ggml_vec_acc_f32 ggml_vec_acc_f32_amd_avx2
#define ggml_vec_acc1_f32 ggml_vec_acc1_f32_amd_avx2
#define ggml_vec_sub_f32 ggml_vec_sub_f32_amd_avx2
#define ggml_vec_set_f32 ggml_vec_set_f32_amd_avx2
#define ggml_vec_cpy_f32 ggml_vec_cpy_f32_amd_avx2
#define ggml_vec_neg_f32 ggml_vec_neg_f32_amd_avx2
#define ggml_vec_mul_f32 ggml_vec_mul_f32_amd_avx2
#define ggml_vec_div_f32 ggml_vec_div_f32_amd_avx2
#define ggml_vec_scale_f32 ggml_vec_scale_f32_amd_avx2
#define ggml_vec_scale_f16 ggml_vec_scale_f16_amd_avx2
#define ggml_vec_mad_f32 ggml_vec_mad_f32_amd_avx2
#define ggml_vec_mad_f16 ggml_vec_mad_f16_amd_avx2
#define ggml_vec_norm_f32 ggml_vec_norm_f32_amd_avx2
#define ggml_vec_sqr_f32 ggml_vec_sqr_f32_amd_avx2
#define ggml_vec_sqrt_f32 ggml_vec_sqrt_f32_amd_avx2
#define ggml_vec_log_f32 ggml_vec_log_f32_amd_avx2
#define ggml_vec_abs_f32 ggml_vec_abs_f32_amd_avx2
#define ggml_vec_sgn_f32 ggml_vec_sgn_f32_amd_avx2
#define ggml_vec_step_f32 ggml_vec_step_f32_amd_avx2
#define ggml_vec_tanh_f32 ggml_vec_tanh_f32_amd_avx2
#define ggml_vec_elu_f32 ggml_vec_elu_f32_amd_avx2
#define ggml_vec_relu_f32 ggml_vec_relu_f32_amd_avx2
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx2
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx2
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx2
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx2
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx2
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx2
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx2
#define ggml_silu_backward_f32 ggml_silu_backward_f32_amd_avx2
#define ggml_vec_silu_backward_f32 ggml_vec_silu_backward_f32_amd_avx2
#define ggml_vec_sum_f32 ggml_vec_sum_f32_amd_avx2
#define ggml_vec_sum_f32_ggf ggml_vec_sum_f32_ggf_amd_avx2
#define ggml_vec_sum_f16_ggf ggml_vec_sum_f16_ggf_amd_avx2
#define ggml_vec_max_f32 ggml_vec_max_f32_amd_avx2
#define ggml_vec_norm_inv_f32 ggml_vec_norm_inv_f32_amd_avx2
#define ggml_vec_argmax_f32 ggml_vec_argmax_f32_amd_avx2
#define GGML_VECTOR
#include "ggml-vector.inc"
#endif // __x86_64__
57 changes: 57 additions & 0 deletions llama.cpp/ggml-vector-amd-avx512.c
@@ -0,0 +1,57 @@
#ifdef __x86_64__
#define ggml_fp16_to_fp32_row ggml_fp16_to_fp32_row_amd_avx512
#define ggml_fp32_to_fp16_row ggml_fp32_to_fp16_row_amd_avx512
#define ggml_bf16_to_fp32_row ggml_bf16_to_fp32_row_amd_avx512
#define ggml_fp32_to_bf16_row ggml_fp32_to_bf16_row_amd_avx512
#define ggml_vec_dot_f32 ggml_vec_dot_f32_amd_avx512
#define ggml_vec_dot_f16 ggml_vec_dot_f16_amd_avx512
#define ggml_vec_dot_bf16 ggml_vec_dot_bf16_amd_avx512
#define ggml_vec_dot_f16_unroll ggml_vec_dot_f16_unroll_amd_avx512
#define ggml_vec_mad_f32_unroll ggml_vec_mad_f32_unroll_amd_avx512
#define ggml_vec_set_i8 ggml_vec_set_i8_amd_avx512
#define ggml_vec_set_i16 ggml_vec_set_i16_amd_avx512
#define ggml_vec_set_i32 ggml_vec_set_i32_amd_avx512
#define ggml_vec_set_f16 ggml_vec_set_f16_amd_avx512
#define ggml_vec_set_bf16 ggml_vec_set_bf16_amd_avx512
#define ggml_vec_add_f32 ggml_vec_add_f32_amd_avx512
#define ggml_vec_add1_f32 ggml_vec_add1_f32_amd_avx512
#define ggml_vec_acc_f32 ggml_vec_acc_f32_amd_avx512
#define ggml_vec_acc1_f32 ggml_vec_acc1_f32_amd_avx512
#define ggml_vec_sub_f32 ggml_vec_sub_f32_amd_avx512
#define ggml_vec_set_f32 ggml_vec_set_f32_amd_avx512
#define ggml_vec_cpy_f32 ggml_vec_cpy_f32_amd_avx512
#define ggml_vec_neg_f32 ggml_vec_neg_f32_amd_avx512
#define ggml_vec_mul_f32 ggml_vec_mul_f32_amd_avx512
#define ggml_vec_div_f32 ggml_vec_div_f32_amd_avx512
#define ggml_vec_scale_f32 ggml_vec_scale_f32_amd_avx512
#define ggml_vec_scale_f16 ggml_vec_scale_f16_amd_avx512
#define ggml_vec_mad_f32 ggml_vec_mad_f32_amd_avx512
#define ggml_vec_mad_f16 ggml_vec_mad_f16_amd_avx512
#define ggml_vec_norm_f32 ggml_vec_norm_f32_amd_avx512
#define ggml_vec_sqr_f32 ggml_vec_sqr_f32_amd_avx512
#define ggml_vec_sqrt_f32 ggml_vec_sqrt_f32_amd_avx512
#define ggml_vec_log_f32 ggml_vec_log_f32_amd_avx512
#define ggml_vec_abs_f32 ggml_vec_abs_f32_amd_avx512
#define ggml_vec_sgn_f32 ggml_vec_sgn_f32_amd_avx512
#define ggml_vec_step_f32 ggml_vec_step_f32_amd_avx512
#define ggml_vec_tanh_f32 ggml_vec_tanh_f32_amd_avx512
#define ggml_vec_elu_f32 ggml_vec_elu_f32_amd_avx512
#define ggml_vec_relu_f32 ggml_vec_relu_f32_amd_avx512
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx512
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx512
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx512
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx512
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx512
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx512
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx512
#define ggml_silu_backward_f32 ggml_silu_backward_f32_amd_avx512
#define ggml_vec_silu_backward_f32 ggml_vec_silu_backward_f32_amd_avx512
#define ggml_vec_sum_f32 ggml_vec_sum_f32_amd_avx512
#define ggml_vec_sum_f32_ggf ggml_vec_sum_f32_ggf_amd_avx512
#define ggml_vec_sum_f16_ggf ggml_vec_sum_f16_ggf_amd_avx512
#define ggml_vec_max_f32 ggml_vec_max_f32_amd_avx512
#define ggml_vec_norm_inv_f32 ggml_vec_norm_inv_f32_amd_avx512
#define ggml_vec_argmax_f32 ggml_vec_argmax_f32_amd_avx512
#define GGML_VECTOR
#include "ggml-vector.inc"
#endif // __x86_64__
57 changes: 57 additions & 0 deletions llama.cpp/ggml-vector-amd-avx512bf16.c
@@ -0,0 +1,57 @@
#ifdef __x86_64__
#define ggml_fp16_to_fp32_row ggml_fp16_to_fp32_row_amd_avx512bf16
#define ggml_fp32_to_fp16_row ggml_fp32_to_fp16_row_amd_avx512bf16
#define ggml_bf16_to_fp32_row ggml_bf16_to_fp32_row_amd_avx512bf16
#define ggml_fp32_to_bf16_row ggml_fp32_to_bf16_row_amd_avx512bf16
#define ggml_vec_dot_f32 ggml_vec_dot_f32_amd_avx512bf16
#define ggml_vec_dot_f16 ggml_vec_dot_f16_amd_avx512bf16
#define ggml_vec_dot_bf16 ggml_vec_dot_bf16_amd_avx512bf16
#define ggml_vec_dot_f16_unroll ggml_vec_dot_f16_unroll_amd_avx512bf16
#define ggml_vec_mad_f32_unroll ggml_vec_mad_f32_unroll_amd_avx512bf16
#define ggml_vec_set_i8 ggml_vec_set_i8_amd_avx512bf16
#define ggml_vec_set_i16 ggml_vec_set_i16_amd_avx512bf16
#define ggml_vec_set_i32 ggml_vec_set_i32_amd_avx512bf16
#define ggml_vec_set_f16 ggml_vec_set_f16_amd_avx512bf16
#define ggml_vec_set_bf16 ggml_vec_set_bf16_amd_avx512bf16
#define ggml_vec_add_f32 ggml_vec_add_f32_amd_avx512bf16
#define ggml_vec_add1_f32 ggml_vec_add1_f32_amd_avx512bf16
#define ggml_vec_acc_f32 ggml_vec_acc_f32_amd_avx512bf16
#define ggml_vec_acc1_f32 ggml_vec_acc1_f32_amd_avx512bf16
#define ggml_vec_sub_f32 ggml_vec_sub_f32_amd_avx512bf16
#define ggml_vec_set_f32 ggml_vec_set_f32_amd_avx512bf16
#define ggml_vec_cpy_f32 ggml_vec_cpy_f32_amd_avx512bf16
#define ggml_vec_neg_f32 ggml_vec_neg_f32_amd_avx512bf16
#define ggml_vec_mul_f32 ggml_vec_mul_f32_amd_avx512bf16
#define ggml_vec_div_f32 ggml_vec_div_f32_amd_avx512bf16
#define ggml_vec_scale_f32 ggml_vec_scale_f32_amd_avx512bf16
#define ggml_vec_scale_f16 ggml_vec_scale_f16_amd_avx512bf16
#define ggml_vec_mad_f32 ggml_vec_mad_f32_amd_avx512bf16
#define ggml_vec_mad_f16 ggml_vec_mad_f16_amd_avx512bf16
#define ggml_vec_norm_f32 ggml_vec_norm_f32_amd_avx512bf16
#define ggml_vec_sqr_f32 ggml_vec_sqr_f32_amd_avx512bf16
#define ggml_vec_sqrt_f32 ggml_vec_sqrt_f32_amd_avx512bf16
#define ggml_vec_log_f32 ggml_vec_log_f32_amd_avx512bf16
#define ggml_vec_abs_f32 ggml_vec_abs_f32_amd_avx512bf16
#define ggml_vec_sgn_f32 ggml_vec_sgn_f32_amd_avx512bf16
#define ggml_vec_step_f32 ggml_vec_step_f32_amd_avx512bf16
#define ggml_vec_tanh_f32 ggml_vec_tanh_f32_amd_avx512bf16
#define ggml_vec_elu_f32 ggml_vec_elu_f32_amd_avx512bf16
#define ggml_vec_relu_f32 ggml_vec_relu_f32_amd_avx512bf16
#define ggml_vec_leaky_relu_f32 ggml_vec_leaky_relu_f32_amd_avx512bf16
#define ggml_vec_hardswish_f32 ggml_vec_hardswish_f32_amd_avx512bf16
#define ggml_vec_hardsigmoid_f32 ggml_vec_hardsigmoid_f32_amd_avx512bf16
#define ggml_vec_gelu_f16 ggml_vec_gelu_f16_amd_avx512bf16
#define ggml_vec_gelu_f32 ggml_vec_gelu_f32_amd_avx512bf16
#define ggml_vec_gelu_quick_f32 ggml_vec_gelu_quick_f32_amd_avx512bf16
#define ggml_vec_silu_f32 ggml_vec_silu_f32_amd_avx512bf16
#define ggml_silu_backward_f32 ggml_silu_backward_f32_amd_avx512bf16
#define ggml_vec_silu_backward_f32 ggml_vec_silu_backward_f32_amd_avx512bf16
#define ggml_vec_sum_f32 ggml_vec_sum_f32_amd_avx512bf16
#define ggml_vec_sum_f32_ggf ggml_vec_sum_f32_ggf_amd_avx512bf16
#define ggml_vec_sum_f16_ggf ggml_vec_sum_f16_ggf_amd_avx512bf16
#define ggml_vec_max_f32 ggml_vec_max_f32_amd_avx512bf16
#define ggml_vec_norm_inv_f32 ggml_vec_norm_inv_f32_amd_avx512bf16
#define ggml_vec_argmax_f32 ggml_vec_argmax_f32_amd_avx512bf16
#define GGML_VECTOR
#include "ggml-vector.inc"
#endif // __x86_64__
