Sync with upstream llama.cpp project
This improves tokenization for Command-R, Refact, Olmo, and StarCoder.
jart committed May 8, 2024
1 parent 0e2845a commit 94d0940
Showing 15 changed files with 540 additions and 535 deletions.
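The pre-tokenizer changes in llama.cpp/llama.cpp below are easiest to verify by dumping token ids before and after the sync. A minimal sketch follows (hypothetical helper, not part of this commit; it assumes the public llama.cpp C API declared in llama.h):

// tokdump.cpp - hypothetical sketch, not part of this commit.
// Prints the token ids a model produces for a prompt, which is enough to
// compare tokenizer behavior across a sync like this one.
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s MODEL.gguf TEXT\n", argv[0]);
        return 1;
    }
    llama_backend_init();
    llama_model * model = llama_load_model_from_file(argv[1], llama_model_default_params());
    if (!model) {
        return 1;
    }
    const char * text = argv[2];
    const int n_text = (int) strlen(text);
    std::vector<llama_token> toks(n_text + 16);
    const int n = llama_tokenize(model, text, n_text, toks.data(), (int) toks.size(),
                                 /*add_special*/ true, /*parse_special*/ false);
    printf("%d tokens:", n);
    for (int i = 0; i < n; ++i) {
        printf(" %d", toks[i]);
    }
    printf("\n");
    llama_free_model(model);
    llama_backend_free();
    return 0;
}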
4 changes: 2 additions & 2 deletions llama.cpp/README.llamafile
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
a8f9b076316e16aadd0791015b3bfd446fe1e904
2024-04-30
c780e75305dba1f67691a8dc0e8bc8425838a452
2024-05-07

LOCAL MODIFICATIONS

2 changes: 1 addition & 1 deletion llama.cpp/common.cpp
@@ -82,7 +82,7 @@ int32_t get_num_physical_cores() {
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
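For context, the loop patched above can be read as the following standalone sketch (hypothetical, Linux-only; it relies on the same sysfs layout as the original code). The old path was missing the trailing "cpu" component, so the stream never opened and the loop exited on its first iteration:

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <unordered_set>

int count_physical_cores() {
    // hyperthreads of one physical core report the same thread_siblings value,
    // so the number of unique values is the number of physical cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
                                      + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break;  // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    return (int) siblings.size();
}

int main() {
    printf("physical cores: %d\n", count_physical_cores());
    return 0;
}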
4 changes: 0 additions & 4 deletions llama.cpp/ggml-alloc.c
@@ -11,10 +11,6 @@
#include <stdlib.h>
#include <string.h>

#ifndef NDEBUG
#define NDEBUG // [jart] delete printf debugging
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

6 changes: 2 additions & 4 deletions llama.cpp/ggml-quants.inc
@@ -1017,9 +1017,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int6
// ===================== Helper functions
//
static inline int nearest_int(float fval) {
// [jart] https://github.com/ggerganov/llama.cpp/issues/2982
// assert(fval <= 4194303.f);
fval = fminf(fval, 4194303.f);
assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
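A standalone worked example of the magic-number rounding used by nearest_int() above (not part of the diff). Adding 12582912.0f, which is 1.5 * 2^23, places the sum in [2^23, 2^24), where one mantissa ulp equals 1, so the FPU rounds to the nearest integer for free; masking the mantissa and subtracting the 0x00400000 bias (2^22) recovers the rounded value. That is also why the restored assert caps fval at 4194303, i.e. 2^22 - 1:

#include <assert.h>
#include <stdio.h>
#include <string.h>

static inline int nearest_int(float fval) {
    assert(fval <= 4194303.f);      // value must fit in 22 bits for the trick to hold
    float val = fval + 12582912.f;  // 12582912 = 1.5 * 2^23
    int i; memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000;
}

int main(void) {
    const float samples[] = { 0.4f, 1.49f, -2.51f, 3.7f, 4194303.f };
    for (int k = 0; k < 5; ++k) {
        printf("nearest_int(%g) = %d\n", samples[k], nearest_int(samples[k]));
    }
    return 0;  // prints 0, 1, -3, 4, 4194303
}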
@@ -3779,7 +3777,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

int sumi = __riscv_vmv_x_s_i32m1_i32(vs2)mul_sum_us8_pairs_float;
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
71 changes: 0 additions & 71 deletions llama.cpp/ggml.c
@@ -17742,7 +17742,6 @@ struct ggml_compute_state_shared {
const int n_threads;

// synchronization primitives
atomic_int n_alive; // num threads alive
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
@@ -18083,7 +18082,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
state->ec = GGML_STATUS_ABORTED;
atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
return 0;
}

@@ -18218,72 +18216,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
else {
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}

#if 0
// expensive integrity check
// not useful to test mt-safety
if (!state->ith) {
int64_t nans = 0;
int64_t infs = 0;
if (node->type == GGML_TYPE_F32)
for (int i3 = 0; i3 < node->ne[3]; ++i3)
for (int i2 = 0; i2 < node->ne[2]; ++i2)
for (int i1 = 0; i1 < node->ne[1]; ++i1)
for (int i0 = 0; i0 < node->ne[0]; ++i0) {
float x = *(const float *)((const char *)node->data +
i3 * node->nb[3] +
i2 * node->nb[2] +
i1 * node->nb[1] +
i0 * node->nb[0]);
if (isnan(x))
++nans;
if (isinf(x))
++infs;
}
else if (node->type == GGML_TYPE_F16)
for (int i3 = 0; i3 < node->ne[3]; ++i3)
for (int i2 = 0; i2 < node->ne[2]; ++i2)
for (int i1 = 0; i1 < node->ne[1]; ++i1)
for (int i0 = 0; i0 < node->ne[0]; ++i0) {
float x = GGML_FP16_TO_FP32(*(const ggml_fp16_t *)((const char *)node->data +
i3 * node->nb[3] +
i2 * node->nb[2] +
i1 * node->nb[1] +
i0 * node->nb[0]));
if (isnan(x))
++nans;
if (isinf(x))
++infs;
}
else if (node->type == GGML_TYPE_BF16)
for (int i3 = 0; i3 < node->ne[3]; ++i3)
for (int i2 = 0; i2 < node->ne[2]; ++i2)
for (int i1 = 0; i1 < node->ne[1]; ++i1)
for (int i0 = 0; i0 < node->ne[0]; ++i0) {
float x = GGML_BF16_TO_FP32(*(const ggml_bf16_t *)((const char *)node->data +
i3 * node->nb[3] +
i2 * node->nb[2] +
i1 * node->nb[1] +
i0 * node->nb[0]));
if (isnan(x))
++nans;
if (isinf(x))
++infs;
}
else
continue;
if (nans || infs) {
flockfile(stderr);
fprintf(stderr, "ERROR: node #%d produced %" PRId64 " NaNs and %" PRId64 " infinities in sequence %s -> %s\n",
node_n, nans, infs, node_n ? ggml_op_name(cgraph->nodes[node_n - 1]->op) : "n/a",
ggml_op_name(node->op));
for (int i = 0; i < GGML_MAX_SRC && node->src[i]; ++i)
fprintf(stderr, "\t- src[%d] is %s\n", i, ggml_op_name(node->src[i]->op));
exit(1);
}
}
#endif

}

#ifdef LLAMAFILE_SYNC_REPORT
@@ -18293,7 +18225,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
fprintf(stderr, "SYNC %03d %3d%% working\n", state->ith, workpercent);
#endif

atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
return 0;
}

@@ -18570,7 +18501,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
/*.perf_node_start_cycles =*/ 0,
/*.perf_node_start_time_us =*/ 0,
/*.n_threads =*/ n_threads,
/*.n_alive =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
@@ -21748,4 +21678,3 @@ int ggml_cpu_has_matmul_int8(void) {
}

////////////////////////////////////////////////////////////////////////////////

9 changes: 0 additions & 9 deletions llama.cpp/grammar-parser.cpp
@@ -145,9 +145,6 @@ namespace grammar_parser {
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
if (!*pos) { // [jart] don't sync until upstream fixes bug
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -162,9 +159,6 @@
}
last_sym_start = out_elements.size();
while (*pos != ']') {
if (!*pos) { // [jart] don't sync until upstream fixes bug
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
@@ -173,9 +167,6 @@

out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) { // [jart] don't sync until upstream fixes bug
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
36 changes: 29 additions & 7 deletions llama.cpp/imatrix/imatrix.cpp
@@ -23,6 +23,7 @@

struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};

@@ -125,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto & e = m_stats[wname];

++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@@ -157,6 +156,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
}
}
}
@@ -174,6 +174,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
Expand All @@ -187,6 +188,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
}
}
if (e.ncall > m_last_call) {
@@ -226,7 +228,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
}

// Write the number of call the matrix was computed with
@@ -274,14 +282,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
imatrix_data = {};
return false;
}
e.values.resize(nval);
in.read((char*)e.values.data(), nval*sizeof(float));

// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
}

std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return false;
}
e.ncall = ncall;

// Recreate the state as expected by save_imatrix(), and correct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;

}
return true;
}
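A condensed sketch (hypothetical, not part of the diff) of the bookkeeping added above: collect_imatrix() accumulates a sum of squares plus a per-element count, and save_imatrix() now writes mean * ncall per element rather than the raw sum, which is the state load_imatrix() reconstructs when it adds the stored values back and bumps counts by ncall:

#include <cstdio>
#include <vector>

struct Stats {
    std::vector<float> values;  // running sum of x[j]*x[j]
    std::vector<int>   counts;  // samples seen per element
    int ncall = 0;              // number of collect() calls
};

static void collect(Stats & e, const std::vector<float> & x) {
    if (e.values.empty()) {
        e.values.resize(x.size(), 0.0f);
        e.counts.resize(x.size(), 0);
    }
    ++e.ncall;
    for (size_t j = 0; j < x.size(); ++j) {
        e.values[j] += x[j] * x[j];
        e.counts[j]++;
    }
}

// what the save path writes per element: (sum / count) * ncall
static std::vector<float> serialized(const Stats & e) {
    std::vector<float> out(e.values.size());
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] = (e.values[i] / (float) e.counts[i]) * (float) e.ncall;
    }
    return out;
}

int main() {
    Stats e;
    collect(e, {1.0f, 2.0f});
    collect(e, {3.0f, 0.0f});
    for (float v : serialized(e)) {
        printf("%g\n", v);  // prints 10 then 4
    }
    return 0;
}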
32 changes: 25 additions & 7 deletions llama.cpp/llama.cpp
@@ -4190,6 +4190,15 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else if (
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
} else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -11768,7 +11777,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
@@ -12028,14 +12037,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
@@ -12051,7 +12059,15 @@
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
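As a rough illustration of how these per-vocabulary regex lists are consumed (hypothetical snippet, assuming the unicode_regex_split() helper from unicode.h that the code above already calls):

#include "unicode.h"
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::string text = "Hello world, it's 2024!";
    // the GPT-2 / OLMO expression from the case directly above
    const std::vector<std::string> regexes = {
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    };
    for (const auto & piece : unicode_regex_split(text, regexes)) {
        printf("[%s]", piece.c_str());
    }
    printf("\n");  // roughly: [Hello][ world][,][ it]['s][ 2024][!]
    return 0;
}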
@@ -17130,15 +17146,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embd;
}

// [jart] DO NOT SYNC this function
static float * llama_get_embeddings_ith_fail(int i, std::string reason) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, reason);
return nullptr;
}

// [jart] DO NOT SYNC this function
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
int32_t j = -1;
llama_synchronize(ctx);
// [jart] DO NOT SYNC this function
if (ctx->embd == nullptr) {
return llama_get_embeddings_ith_fail(i, "no embeddings");
}
@@ -17268,9 +17285,10 @@ int32_t llama_tokenize(

static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));

const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}

return decoded_text;
5 changes: 4 additions & 1 deletion llama.cpp/llama.h
@@ -81,6 +81,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_OLMO = 10,
};

// note: these values should be synchronized with ggml_rope
@@ -174,7 +177,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;

typedef bool (*llama_progress_callback)(float progress, void *ctx);
typedef bool (*llama_progress_callback)(float progress, void * user_data);

// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
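A usage sketch (hypothetical, not part of the diff) for the renamed llama_progress_callback parameter above; the loader passes a progress value in [0, 1] together with the opaque pointer supplied in llama_model_params:

#include "llama.h"
#include <cstdio>

static bool print_progress(float progress, void * user_data) {
    const char * tag = (const char *) user_data;
    fprintf(stderr, "\r%s: %3.0f%%", tag, progress * 100.0f);
    return true;  // returning false asks the loader to abort
}

// Hooking it up (assumed fields of llama_model_params):
//   llama_model_params mparams = llama_model_default_params();
//   mparams.progress_callback           = print_progress;
//   mparams.progress_callback_user_data = (void *) "loading model";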
