Sync with upstream llama.cpp project
This improves tokenization for Command-R, Refact, Olmo, and StarCoder.
jart committed May 8, 2024
1 parent 0e2845a commit 94d0940
Showing 15 changed files with 540 additions and 535 deletions.
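The pre-tokenizer changes in llama.cpp/llama.cpp below are easiest to verify by dumping token ids before and after the sync. A minimal sketch follows (hypothetical helper, not part of this commit; it assumes the public llama.cpp C API declared in llama.h):

// tokdump.cpp - hypothetical sketch, not part of this commit.
// Prints the token ids a model produces for a prompt, which is enough to
// compare tokenizer behavior across a sync like this one.
#include "llama.h"
#include <cstdio>
#include <cstring>
#include <vector>

int main(int argc, char ** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s MODEL.gguf TEXT\n", argv[0]);
        return 1;
    }
    llama_backend_init();
    llama_model * model = llama_load_model_from_file(argv[1], llama_model_default_params());
    if (!model) {
        return 1;
    }
    const char * text = argv[2];
    const int n_text = (int) strlen(text);
    std::vector<llama_token> toks(n_text + 16);
    const int n = llama_tokenize(model, text, n_text, toks.data(), (int) toks.size(),
                                 /*add_special*/ true, /*parse_special*/ false);
    printf("%d tokens:", n);
    for (int i = 0; i < n; ++i) {
        printf(" %d", toks[i]);
    }
    printf("\n");
    llama_free_model(model);
    llama_backend_free();
    return 0;
}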
4 changes: 2 additions & 2 deletions llama.cpp/README.llamafile
@@ -9,8 +9,8 @@ LICENSE
ORIGIN

https://github.com/ggerganov/llama.cpp/pull/4406/
a8f9b076316e16aadd0791015b3bfd446fe1e904
2024-04-30
c780e75305dba1f67691a8dc0e8bc8425838a452
2024-05-07

LOCAL MODIFICATIONS

2 changes: 1 addition & 1 deletion llama.cpp/common.cpp
@@ -82,7 +82,7 @@ int32_t get_num_physical_cores() {
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
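For context, the loop patched above can be read as the following standalone sketch (hypothetical, Linux-only; it relies on the same sysfs layout as the original code). The old path was missing the trailing "cpu" component, so the stream never opened and the loop exited on its first iteration:

#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>
#include <unordered_set>

int count_physical_cores() {
    // hyperthreads of one physical core report the same thread_siblings value,
    // so the number of unique values is the number of physical cores
    std::unordered_set<std::string> siblings;
    for (uint32_t cpu = 0; cpu < UINT32_MAX; ++cpu) {
        std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
                                      + std::to_string(cpu) + "/topology/thread_siblings");
        if (!thread_siblings.is_open()) {
            break;  // no more cpus
        }
        std::string line;
        if (std::getline(thread_siblings, line)) {
            siblings.insert(line);
        }
    }
    return (int) siblings.size();
}

int main() {
    printf("physical cores: %d\n", count_physical_cores());
    return 0;
}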
4 changes: 0 additions & 4 deletions llama.cpp/ggml-alloc.c
@@ -11,10 +11,6 @@
#include <stdlib.h>
#include <string.h>

#ifndef NDEBUG
#define NDEBUG // [jart] delete printf debugging
#endif

#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256

6 changes: 2 additions & 4 deletions llama.cpp/ggml-quants.inc
@@ -1017,9 +1017,7 @@ void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int6
// ===================== Helper functions
//
static inline int nearest_int(float fval) {
// [jart] https://github.com/ggerganov/llama.cpp/issues/2982
// assert(fval <= 4194303.f);
fval = fminf(fval, 4194303.f);
assert(fval <= 4194303.f);
float val = fval + 12582912.f;
int i; memcpy(&i, &val, sizeof(int));
return (i & 0x007fffff) - 0x00400000;
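A standalone worked example of the magic-number rounding used by nearest_int() above (not part of the diff). Adding 12582912.0f, which is 1.5 * 2^23, places the sum in [2^23, 2^24), where one mantissa ulp equals 1, so the FPU rounds to the nearest integer for free; masking the mantissa and subtracting the 0x00400000 bias (2^22) recovers the rounded value. That is also why the restored assert caps fval at 4194303, i.e. 2^22 - 1:

#include <assert.h>
#include <stdio.h>
#include <string.h>

static inline int nearest_int(float fval) {
    assert(fval <= 4194303.f);      // value must fit in 22 bits for the trick to hold
    float val = fval + 12582912.f;  // 12582912 = 1.5 * 2^23
    int i; memcpy(&i, &val, sizeof(int));
    return (i & 0x007fffff) - 0x00400000;
}

int main(void) {
    const float samples[] = { 0.4f, 1.49f, -2.51f, 3.7f, 4194303.f };
    for (int k = 0; k < 5; ++k) {
        printf("nearest_int(%g) = %d\n", samples[k], nearest_int(samples[k]));
    }
    return 0;  // prints 0, 1, -3, 4, 4194303
}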
@@ -3779,7 +3777,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
vint32m1_t vs1 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul1, vec_zero, vl);
vint32m1_t vs2 = __riscv_vwredsum_vs_i16m1_i32m1(vec_mul2, vs1, vl);

int sumi = __riscv_vmv_x_s_i32m1_i32(vs2)mul_sum_us8_pairs_float;
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);

sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d);
}
71 changes: 0 additions & 71 deletions llama.cpp/ggml.c
@@ -17742,7 +17742,6 @@ struct ggml_compute_state_shared {
const int n_threads;

// synchronization primitives
atomic_int n_alive; // num threads alive
atomic_int n_active; // num active threads
atomic_int node_n; // active graph node
atomic_int node_task; // active graph node task phase
@@ -18083,7 +18082,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
state->shared->node_n += 1;
state->ec = GGML_STATUS_ABORTED;
atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
return 0;
}

@@ -18218,72 +18216,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
else {
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
}

#if 0
// expensive integrity check
// not useful to test mt-safety
if (!state->ith) {
int64_t nans = 0;
int64_t infs = 0;
if (node->type == GGML_TYPE_F32)
for (int i3 = 0; i3 < node->ne[3]; ++i3)
for (int i2 = 0; i2 < node->ne[2]; ++i2)
for (int i1 = 0; i1 < node->ne[1]; ++i1)
for (int i0 = 0; i0 < node->ne[0]; ++i0) {
float x = *(const float *)((const char *)node->data +
i3 * node->nb[3] +
i2 * node->nb[2] +
i1 * node->nb[1] +
i0 * node->nb[0]);
if (isnan(x))
++nans;
if (isinf(x))
++infs;
}
else if (node->type == GGML_TYPE_F16)
for (int i3 = 0; i3 < node->ne[3]; ++i3)
for (int i2 = 0; i2 < node->ne[2]; ++i2)
for (int i1 = 0; i1 < node->ne[1]; ++i1)
for (int i0 = 0; i0 < node->ne[0]; ++i0) {
float x = GGML_FP16_TO_FP32(*(const ggml_fp16_t *)((const char *)node->data +
i3 * node->nb[3] +
i2 * node->nb[2] +
i1 * node->nb[1] +
i0 * node->nb[0]));
if (isnan(x))
++nans;
if (isinf(x))
++infs;
}
else if (node->type == GGML_TYPE_BF16)
for (int i3 = 0; i3 < node->ne[3]; ++i3)
for (int i2 = 0; i2 < node->ne[2]; ++i2)
for (int i1 = 0; i1 < node->ne[1]; ++i1)
for (int i0 = 0; i0 < node->ne[0]; ++i0) {
float x = GGML_BF16_TO_FP32(*(const ggml_bf16_t *)((const char *)node->data +
i3 * node->nb[3] +
i2 * node->nb[2] +
i1 * node->nb[1] +
i0 * node->nb[0]));
if (isnan(x))
++nans;
if (isinf(x))
++infs;
}
else
continue;
if (nans || infs) {
flockfile(stderr);
fprintf(stderr, "ERROR: node #%d produced %" PRId64 " NaNs and %" PRId64 " infinities in sequence %s -> %s\n",
node_n, nans, infs, node_n ? ggml_op_name(cgraph->nodes[node_n - 1]->op) : "n/a",
ggml_op_name(node->op));
for (int i = 0; i < GGML_MAX_SRC && node->src[i]; ++i)
fprintf(stderr, "\t- src[%d] is %s\n", i, ggml_op_name(node->src[i]->op));
exit(1);
}
}
#endif

}

#ifdef LLAMAFILE_SYNC_REPORT
@@ -18293,7 +18225,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
fprintf(stderr, "SYNC %03d %3d%% working\n", state->ith, workpercent);
#endif

atomic_fetch_sub_explicit(&state->shared->n_alive, 1, memory_order_release);
return 0;
}

@@ -18570,7 +18501,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
/*.perf_node_start_cycles =*/ 0,
/*.perf_node_start_time_us =*/ 0,
/*.n_threads =*/ n_threads,
/*.n_alive =*/ n_threads,
/*.n_active =*/ n_threads,
/*.node_n =*/ -1,
/*.node_task =*/ GGML_TASK_TYPE_FINALIZE,
@@ -21748,4 +21678,3 @@ int ggml_cpu_has_matmul_int8(void) {
}

////////////////////////////////////////////////////////////////////////////////

9 changes: 0 additions & 9 deletions llama.cpp/grammar-parser.cpp
@@ -145,9 +145,6 @@ namespace grammar_parser {
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
if (!*pos) { // [jart] don't sync until upstream fixes bug
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -162,9 +159,6 @@
}
last_sym_start = out_elements.size();
while (*pos != ']') {
if (!*pos) { // [jart] don't sync until upstream fixes bug
throw std::runtime_error("unexpected end of input");
}
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
@@ -173,9 +167,6 @@

out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
if (!pos[1]) { // [jart] don't sync until upstream fixes bug
throw std::runtime_error("unexpected end of input");
}
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
36 changes: 29 additions & 7 deletions llama.cpp/imatrix/imatrix.cpp
@@ -23,6 +23,7 @@

struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};

@@ -125,12 +126,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto & e = m_stats[wname];

++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@@ -157,6 +156,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
}
}
}
@@ -174,6 +174,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
Expand All @@ -187,6 +188,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
}
}
if (e.ncall > m_last_call) {
@@ -226,7 +228,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
}

// Write the number of call the matrix was computed with
@@ -274,14 +282,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
imatrix_data = {};
return false;
}
e.values.resize(nval);
in.read((char*)e.values.data(), nval*sizeof(float));

// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
}

std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return false;
}
e.ncall = ncall;

// Recreate the state as expected by save_imatrix(), and correct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;

}
return true;
}
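A condensed sketch (hypothetical, not part of the diff) of the bookkeeping added above: collect_imatrix() accumulates a sum of squares plus a per-element count, and save_imatrix() now writes mean * ncall per element rather than the raw sum, which is the state load_imatrix() reconstructs when it adds the stored values back and bumps counts by ncall:

#include <cstdio>
#include <vector>

struct Stats {
    std::vector<float> values;  // running sum of x[j]*x[j]
    std::vector<int>   counts;  // samples seen per element
    int ncall = 0;              // number of collect() calls
};

static void collect(Stats & e, const std::vector<float> & x) {
    if (e.values.empty()) {
        e.values.resize(x.size(), 0.0f);
        e.counts.resize(x.size(), 0);
    }
    ++e.ncall;
    for (size_t j = 0; j < x.size(); ++j) {
        e.values[j] += x[j] * x[j];
        e.counts[j]++;
    }
}

// what the save path writes per element: (sum / count) * ncall
static std::vector<float> serialized(const Stats & e) {
    std::vector<float> out(e.values.size());
    for (size_t i = 0; i < out.size(); ++i) {
        out[i] = (e.values[i] / (float) e.counts[i]) * (float) e.ncall;
    }
    return out;
}

int main() {
    Stats e;
    collect(e, {1.0f, 2.0f});
    collect(e, {3.0f, 0.0f});
    for (float v : serialized(e)) {
        printf("%g\n", v);  // prints 10 then 4
    }
    return 0;
}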
32 changes: 25 additions & 7 deletions llama.cpp/llama.cpp
@@ -4190,6 +4190,15 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else if (
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
} else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@@ -11768,7 +11777,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
@@ -12028,14 +12037,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
@@ -12051,7 +12059,15 @@
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
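As a rough illustration of how these per-vocabulary regex lists are consumed (hypothetical snippet, assuming the unicode_regex_split() helper from unicode.h that the code above already calls):

#include "unicode.h"
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::string text = "Hello world, it's 2024!";
    // the GPT-2 / OLMO expression from the case directly above
    const std::vector<std::string> regexes = {
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    };
    for (const auto & piece : unicode_regex_split(text, regexes)) {
        printf("[%s]", piece.c_str());
    }
    printf("\n");  // roughly: [Hello][ world][,][ it]['s][ 2024][!]
    return 0;
}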
@@ -17130,15 +17146,16 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embd;
}

// [jart] DO NOT SYNC this function
static float * llama_get_embeddings_ith_fail(int i, std::string reason) {
LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, reason);
return nullptr;
}

// [jart] DO NOT SYNC this function
float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) {
int32_t j = -1;
llama_synchronize(ctx);
// [jart] DO NOT SYNC this function
if (ctx->embd == nullptr) {
return llama_get_embeddings_ith_fail(i, "no embeddings");
}
@@ -17268,9 +17285,10 @@ int32_t llama_tokenize(

static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));

const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}

return decoded_text;
5 changes: 4 additions & 1 deletion llama.cpp/llama.h
@@ -81,6 +81,9 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_OLMO = 10,
};

// note: these values should be synchronized with ggml_rope
@@ -174,7 +177,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;

typedef bool (*llama_progress_callback)(float progress, void *ctx);
typedef bool (*llama_progress_callback)(float progress, void * user_data);

// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences
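A usage sketch (hypothetical, not part of the diff) for the renamed llama_progress_callback parameter above; the loader passes a progress value in [0, 1] together with the opaque pointer supplied in llama_model_params:

#include "llama.h"
#include <cstdio>

static bool print_progress(float progress, void * user_data) {
    const char * tag = (const char *) user_data;
    fprintf(stderr, "\r%s: %3.0f%%", tag, progress * 100.0f);
    return true;  // returning false asks the loader to abort
}

// Hooking it up (assumed fields of llama_model_params):
//   llama_model_params mparams = llama_model_default_params();
//   mparams.progress_callback           = print_progress;
//   mparams.progress_callback_user_data = (void *) "loading model";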
