server: multimodal - fix misreported prompt and num prompt tokens #392

Merged 1 commit on May 7, 2024
13 changes: 13 additions & 0 deletions llama.cpp/server/server.cpp
@@ -1371,6 +1371,7 @@ struct llama_server_context
     bool ingest_images(llama_client_slot &slot, int n_batch)
     {
         int image_idx = 0;
+        std::string prompt = "";
 
         while (image_idx < (int) slot.images.size())
         {
@@ -1432,6 +1433,11 @@
                 slot.params.input_suffix : // no more images, then process suffix prompt
                 (json)(slot.images[image_idx].prefix_prompt);
 
+            // rebuild the prompt since it was cleared earlier
+            prompt += img.prefix_prompt;
+            prompt += "[img-" + std::to_string(img.id) + "]";
+            prompt += json_prompt;
+
             std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
             for (int i = 0; i < (int) append_tokens.size(); ++i)
             {
@@ -1440,6 +1446,13 @@
             }
         }
 
+        // There is no prompt caching in multimodal currently
+        slot.num_prompt_tokens = slot.n_past;
+        slot.num_prompt_tokens_processed = slot.n_past;
+
+        // restore the prompt (it was cleared earlier so its tokens are not processed again here)
+        slot.prompt = prompt;
+
         return true;
     }
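For context, the reporting fix can be modeled in isolation. The sketch below is a minimal, self-contained illustration for the common single-image case, not the server's actual implementation: FakeImage and FakeSlot are hypothetical stand-ins for server.cpp's slot_image and llama_client_slot. It shows the two things the patch does: rebuild the textual prompt as prefix text plus an "[img-<id>]" placeholder plus the suffix, and set both prompt-token counters to n_past, since the multimodal path has no prompt caching.

#include <cstdio>
#include <string>

struct FakeImage {             // hypothetical stand-in for a slot_image entry
    int id;                    // id embedded in the "[img-<id>]" placeholder
    std::string prefix_prompt; // text that precedes the image
};

struct FakeSlot {              // hypothetical stand-in for the relevant slot fields
    int n_past = 0;            // tokens already evaluated for this slot
    int num_prompt_tokens = 0;
    int num_prompt_tokens_processed = 0;
    std::string prompt;
};

// Mirror the patch for one image: rebuild the reported prompt string and
// count every past token as a processed prompt token (no prompt caching).
static void report_prompt(FakeSlot &slot, const FakeImage &img,
                          const std::string &input_suffix) {
    std::string prompt;
    prompt += img.prefix_prompt;
    prompt += "[img-" + std::to_string(img.id) + "]";
    prompt += input_suffix;

    slot.num_prompt_tokens           = slot.n_past;
    slot.num_prompt_tokens_processed = slot.n_past;
    slot.prompt                      = prompt;
}

int main() {
    FakeSlot slot;
    slot.n_past = 640; // e.g. prefix + image embedding + suffix tokens
    report_prompt(slot, {12, "Describe the image: "}, " Answer briefly.");
    std::printf("%s (%d prompt tokens)\n", slot.prompt.c_str(),
                slot.num_prompt_tokens);
    // prints: Describe the image: [img-12] Answer briefly. (640 prompt tokens)
}

Before the patch, the slot's prompt had been cleared (so its tokens would not be decoded twice) and the counters never accounted for the image-embedding tokens, so /completion responses reported an empty prompt and a too-small token count; rebuilding the string and deriving both counters from n_past fixes the reporting without changing what is decoded.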
