fix maximum input token count exceeds limit of 4096 #244

Open · wants to merge 2 commits into base: main
2 changes: 1 addition & 1 deletion 03_Model_customization/01_fine-tuning-titan-lite.ipynb
@@ -191,7 +191,7 @@
"\n",
"Amazon Titan text model customization hyperparameters: \n",
"- `epochs`: The number of iterations through the entire training dataset and can take up any integer values in the range of 1-10, with a default value of 5.\n",
"- `batchSize`: The number of samples processed before updating model parametersand can take up any integer values in the range of 1-64, with a default value of 1.\n",
"- `batchSize`: The number of samples processed before updating model parameters and can take up any integer values in the range of 1-64, with a default value of 1.\n",
"- `learningRate`:\tThe rate at which model parameters are updated after each batch\twhich can take up a float value betweek 0.0-1.0 with a default value set to\t1.00E-5.\n",
"- `learningRateWarmupSteps`: The number of iterations over which the learning rate is gradually increased to the specified rate and can take any integer value between 0-250 with a default value of 5.\n",
"\n",
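For context on the hyperparameters documented in the hunk above, here is a minimal sketch (not part of this PR) of how they are typically passed to a Bedrock model customization job with boto3. The job name, custom model name, role ARN, and S3 URIs are placeholders, and the exact hyperparameter key names and base model identifier should be checked against the current Bedrock API before use.

```python
import boto3

# Placeholder names/ARNs for illustration only; substitute your own resources.
bedrock = boto3.client("bedrock", region_name="us-east-1")

response = bedrock.create_model_customization_job(
    jobName="titan-lite-finetune-demo",            # hypothetical job name
    customModelName="titan-lite-custom-demo",      # hypothetical model name
    roleArn="arn:aws:iam::111122223333:role/BedrockFineTuneRole",  # placeholder role
    baseModelIdentifier="amazon.titan-text-lite-v1",               # verify current model ID
    trainingDataConfig={"s3Uri": "s3://your-bucket/train.jsonl"},  # placeholder S3 path
    outputDataConfig={"s3Uri": "s3://your-bucket/output/"},        # placeholder S3 path
    hyperParameters={
        # Values are passed as strings; key names mirror the list above but may
        # differ from the exact keys the service expects (e.g. the epoch setting
        # is exposed as "epochCount" in some SDK versions).
        "epochCount": "5",
        "batchSize": "1",
        "learningRate": "0.00001",
        "learningRateWarmupSteps": "5",
    },
)
print(response["jobArn"])
```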
@@ -268,8 +268,8 @@
"# - in our testing Character split works better with this PDF data set\n",
"text_splitter = RecursiveCharacterTextSplitter(\n",
" # Set a really small chunk size, just to show.\n",
" chunk_size = 20000, # 4096 tokens * 6 chars per token = 24,576 \n",
" chunk_overlap = 2000, # overlap for continuity across chunks\n",
" chunk_size = 4000, # when set to 20000, got error "Maximum input token count 4919 exceeds limit of 4096". Orginal comment was 4096 tokens * 6 chars per token = 24,576 \n",
" chunk_overlap = 1000, # overlap for continuity across chunks\n",
")\n",
"\n",
"docs = text_splitter.split_documents(document)"
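As a rough sanity check on the new chunk size (not part of the PR), the error quoted in the comment above implies roughly 4 characters per token for this PDF (20,000 chars ≈ 4,919 tokens), not the 6 chars per token assumed originally. A sketch like the following, assuming the `docs` list produced by the cell above, can flag chunks that are likely to exceed the 4096-token limit; the chars-per-token ratio is an estimate, not an exact Titan tokenizer.

```python
# Rough sanity check on chunk sizes (sketch; assumes `docs` from the cell above).
# ~4 chars per token is inferred from the error above (20,000 chars -> 4,919 tokens).
CHARS_PER_TOKEN = 4   # rough estimate, not an exact tokenizer
TOKEN_LIMIT = 4096

for i, doc in enumerate(docs):
    est_tokens = len(doc.page_content) / CHARS_PER_TOKEN
    if est_tokens > TOKEN_LIMIT:
        print(f"chunk {i}: ~{est_tokens:.0f} tokens - likely over the {TOKEN_LIMIT} limit")
```

With `chunk_size = 4000` characters, the estimate works out to roughly 1,000 tokens per chunk, which leaves comfortable headroom under the 4096-token limit.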