support code interpreter finetune

InternLM · Mar 27, 2024 · e571688 · e571688
1 parent 5c8c265
commit e571688
Show file tree

Hide file tree

Showing 9 changed files with 249 additions and 70 deletions.
diff --git a/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/agent.json b/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/agent.json
@@ -0,0 +1,62 @@
+{
+    "messages": [
+        {"role": "system", "content": "You are InternLM2-Chat, a harmless AI assistant"},
+        {
+            "role": "user", 
+            "content": "Please help me process and visualize this dataset.", 
+            "files": [{"path": "data.csv", "size": "10K"}]
+        }, 
+        {
+            "role": "assistant", 
+            "content": "I have processed the data and visualized it for you.", 
+            "code_interpreter_call": "```python\nimport plotly.express as px\nimport pandas as pd\n\n# Load the data into a pandas dataframe\ndf = pd.read_csv('data.csv')\n\n# Create a scatter plot of rainfall vs wind direction\nfig = px.scatter(df, x='WindDir9am', y='Rainfall', color='WindDir3pm',\n                 labels={'WindDir9am': 'Wind Direction 9am', 'Rainfall': '\n\nRainfall', 'WindDir3pm': 'Wind Direction 3pm'},\n                 title='Rainfall vs Wind Direction',\n                 template='plotly_dark',\n                 width=600, height=500)\n\n# Add a hover effect to show the date\nfig.update_traces(hovertemplate='<b>Date: %{text}</b><br>Wind Direction 9am: %{x}<br>Rainfall: %{y}<br>Wind Direction 3pm: %{marker.color}')\n\n# Show the plot\nfig.show()\n```"
+        }, 
+        {
+            "role": "code_interpreter", 
+            "content": "![image](xxx.png)"
+        }, 
+        {
+            "role": "assistant", 
+            "content": "Since the code output is not included here, I cannot provide specific chart content. However, if the code executed correctly, it should display a polar plot with two filled areas representing the relationship between wind direction at 9 am and rainfall, and between wind direction at 3 pm and rainfall, respectively. The values for each direction are based on the average rainfall calculated from the provided dataset. The chart should have a clear title, a legend, and be intuitive for comparing rainfall with different wind directions. Given the use of a dark theme, the overall appearance of the chart should be bright lines and filled areas on a dark background."
+        }, 
+        {
+            "role": "user", 
+            "content": "I want to know today's weather in Shanghai"
+        },
+        {
+            "role": "assistant", 
+            "content": "Sure, I will search for the weather of Shanghai.", 
+            "function_call": {
+                "name": "get_current_weather", 
+                "parameters": {"location": "Shanghai"}
+            }
+        }, 
+        {
+            "role": "function", 
+            "name": "get_current_weather", 
+            "content": "{'temperature': 22}"
+        }, 
+        {
+            "role": "assistant", 
+            "content": "The weather in Shanghai is 22 celsius"
+        }
+    ], 
+
+    "functions": [
+        {
+            "name": "get_current_weather", 
+            "description": "Get the current weather in a given location", 
+            "parameters": {
+                "type": "object", 
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "The city and state, e.g. San Francisco, CA"},
+                    "unit": {"type": "string"}
+                },
+                "required": ["location"]
+            }
+        }
+    ], 
+
+    "code_interpreter": "You now have access to a Jupyter notebook environment supporting Python code execution. Just send code to python to run in this stateful environment. This feature is suitable for:\n- Data analysis or processing (such as data manipulation and graphic creation)\n- Complex calculations (such as math and physics problems)\n- Programming examples (for understanding programming concepts or language features)\n- Text processing and analysis (including text analysis and natural language processing)\n- Machine learning and data science (model training and data visualization)\n- File operations and data import (handling CSV, JSON, etc. formats)"}
diff --git a/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/example.py b/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/example.py
@@ -0,0 +1,48 @@
+"""Render and tokenize the hybrid agent sample stored in ``agent.json``.
+
+The sample conversation mixes plain chat, a code-interpreter call and a
+function call; this script prints the templated prompt and its tokens.
+"""
+import json
+from pathlib import Path
+
+from transformers import AutoTokenizer
+
+from xtuner.types import HybridChatTemplate, TrainingHybridChatMessages
+
+# Prompt pieces for every message kind used by the sample: plain chat turns,
+# attached files, plugin (function) calls and code-interpreter calls.
+chat_template = HybridChatTemplate(
+    system='<|im_start|>system\n{system}<|im_end|>\n',
+    user='<|im_start|>user\n{user}<|im_end|>\n<|im_start|>assistant\n',
+    assistant='{assistant}<|im_end|>\n',
+    stop_words=['<|im_end|>'],
+    image_token='<image>',
+    files='<|im_start|>user name=file\n{files}<|im_end|>\n',
+    function_call='{assistant}<|action_start|><|plugin|>\n{function_call}<|action_end|><|im_end|>\n',  # noqa: E501, E251
+    function_result='<|im_start|>environment name=<|plugin|>\n{function_result}<|im_end|>\n<|im_start|>assistant\n',  # noqa: E501, E251
+    functions='<|im_start|>system name=<|plugin|>\n{functions}<|im_end|>\n',
+    code_interpreter_call='{assistant}<|action_start|><|interpreter|>\n{code_interpreter_call}<|action_end|><|im_end|>\n',  # noqa: E501, E251
+    code_interpreter_result='<|im_start|>environment name=<|interpreter|>\n{code_interpreter_result}<|im_end|>\n<|im_start|>assistant\n',  # noqa: E501, E251
+    code_interpreter='<|im_start|>system name=<|interpreter|>\n{code_interpreter}<|im_end|>\n',  # noqa: E501
+)
+
+
+def main():
+    """Print the templated text and the tokenized form of the sample."""
+    # Resolve the sample relative to this file so the example runs from any
+    # working directory; the context manager closes the handle promptly.
+    sample_path = Path(__file__).with_name('agent.json')
+    with sample_path.open(encoding='utf-8') as f:
+        agent_data = json.load(f)
+
+    msg = TrainingHybridChatMessages.from_dict(agent_data)
+    print(msg.apply_chat_template(chat_template))
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        'internlm/internlm2-chat-7b', trust_remote_code=True)
+    print(msg.tokenize(tokenizer, chat_template))
+
+
+if __name__ == '__main__':
+    main()
diff --git a/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/internlm2_chat_1_8b_function_call.py b/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/internlm2_chat_1_8b_function_call.py
@@ -4,10 +4,8 @@
 from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                             LoggerHook, ParamSchedulerHook)
 from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
-
 from torch.optim import AdamW
 from transformers import AutoModelForCausalLM, AutoTokenizer
-
 
 from xtuner.dataset.hybrid import HybridDataset, hybrid_collate_fn
 from xtuner.dataset.hybrid.mappings import openai_to_raw_training
@@ -74,7 +72,6 @@
     trust_remote_code=True,
     padding_side='right')
 
-
 model = dict(
     type=HybridFinetune,
     llm=dict(
@@ -95,7 +92,7 @@
     chat_template=chat_template,
     max_length=max_length,
     pack_to_max_length=True,
-    num_workers = dataloader_num_workers,
+    num_workers=dataloader_num_workers,
     mappings=[openai_to_raw_training])
 
 train_dataloader = dict(

diff --git a/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/internlm2_chat_1_8b_llava_sft.py b/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/internlm2_chat_1_8b_llava_sft.py
@@ -4,9 +4,11 @@
 from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                             LoggerHook, ParamSchedulerHook)
 from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
+from peft import LoraConfig
 from torch.optim import AdamW
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
-                          CLIPImageProcessor, CLIPVisionModel)
+                          BitsAndBytesConfig, CLIPImageProcessor,
+                          CLIPVisionModel)
 
 from xtuner.dataset.hybrid import HybridDataset, hybrid_collate_fn
 from xtuner.dataset.hybrid.mappings import (insert_img_pad_tokens,
@@ -21,15 +23,17 @@
 #                          PART 1  Settings                           #
 #######################################################################
 # Model
+# llm_name_or_path = '/mnt/petrelfs/share_data/basemodel/checkpoints/llm/hf_hub/models--internlm--internlm2-chat-1_8b/snapshots/aa8a7450c2227a3b6733b3c6fe33fefbb2ca54f9/'
 llm_name_or_path = '/mnt/petrelfs/share_data/linzhihao/model/models--internlm--internlm2-chat-7b/snapshots/2292b86b21cb856642782cebed0a453997453b1f/'
 visual_encoder_name_or_path = 'openai/clip-vit-large-patch14-336'
+use_varlen_attn = False
 # Specify the pretrained pth
 pretrained_pth = None
 # Data
 data_dir = './llava_data/'
 data_files = ['LLaVA-Instruct-150K/llava_v1_5_mix665k.json']
 image_dir = data_dir + 'llava_images'
-max_length = 1024 * 32
+max_length = 1024 * 2
 
 # Chat Template
 chat_template = dict(
@@ -46,12 +50,12 @@
     functions='<|im_start|>system name=<|plugin|>\n{functions}<|im_end|>\n')
 
 # Scheduler & Optimizer
-batch_size = 1  # per_device
+batch_size = 16  # per_device
 accumulative_counts = 1
-dataloader_num_workers = 4
+dataloader_num_workers = 0
 max_epochs = 1
 optim_type = AdamW
-lr = 2e-4
+lr = 0
 betas = (0.9, 0.999)
 weight_decay = 0
 max_norm = 1  # grad clip
@@ -86,14 +90,34 @@
     freeze_llm=False,
     freeze_visual_encoder=True,
     pretrained_pth=pretrained_pth,
+    use_varlen_attn=use_varlen_attn,
     llm=dict(
         type=AutoModelForCausalLM.from_pretrained,
         pretrained_model_name_or_path=llm_name_or_path,
         trust_remote_code=True,
-        torch_dtype=torch.float16),
+        torch_dtype=torch.bfloat16,
+        attn_implementation='flash_attention_2',
+        quantization_config=dict(
+            type=BitsAndBytesConfig,
+            load_in_4bit=True,
+            load_in_8bit=False,
+            llm_int8_threshold=6.0,
+            llm_int8_has_fp16_weight=False,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4')),
+    llm_lora=dict(
+        type=LoraConfig,
+        r=512,
+        lora_alpha=256,
+        lora_dropout=0.05,
+        bias='none',
+        task_type='CAUSAL_LM'),
     visual_encoder=dict(
         type=CLIPVisionModel.from_pretrained,
-        pretrained_model_name_or_path=visual_encoder_name_or_path))
+        pretrained_model_name_or_path=visual_encoder_name_or_path),
+    visual_encoder_lora=dict(
+        type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, bias='none'))
 
 #######################################################################
 #                      PART 3  Dataset & Dataloader                   #
@@ -102,16 +126,16 @@
     type=HybridDataset,
     data_dir=data_dir,
     data_files=data_files,
-    data_cached='cached_llava',
+    # data_cached='cached_llava',
     image_dir=image_dir,
-    sample_ratio=1,
+    sample_ratio=0.1,
     tokenizer=tokenizer,
     chat_template=chat_template,
     image_processor=image_processor,
     pad_img_to_squared=True,
     max_length=max_length,
-    pack_to_max_length=True,
-    num_workers=dataloader_num_workers,
+    pack_to_max_length=False,
+    num_workers=4,
     mappings=[
         llava_to_openai,
         openai_to_raw_training,
@@ -120,7 +144,7 @@
 
 train_dataloader = dict(
     batch_size=batch_size,
-    num_workers=dataloader_num_workers,
+    num_workers=4,
     dataset=llava_dataset,
     sampler=dict(type=DefaultSampler, shuffle=True),
     collate_fn=dict(type=hybrid_collate_fn))
@@ -182,7 +206,7 @@
     # record the time of every iteration.
     timer=dict(type=IterTimerHook),
     # print log every 10 iterations.
-    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
+    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=1),
     # enable the parameter scheduler.
     param_scheduler=dict(type=ParamSchedulerHook),
     # save checkpoint per `save_steps`.

diff --git a/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/multi_modal.json b/xtuner/configs/internlm/internlm2_chat_1_8b/hybrid/multi_modal.json
@@ -13,7 +13,7 @@
                         "image_url": "image2.jpg"
                     },
                     {
-                        "type": "text", 
+                        "type": "text",
                         "text": "What are the colors of the bus in the first image?"
                     }
                 ]
@@ -37,5 +37,3 @@
         ]
     }
 ]
-
-
diff --git a/xtuner/dataset/hybrid/dataset.py b/xtuner/dataset/hybrid/dataset.py
@@ -287,7 +287,6 @@ def img_sample_counter(item):
         def img_counter(item):
             return len(item['image_urls'])
 
-
         with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
             images = list(
                 tqdm(
@@ -403,8 +402,10 @@ def __getitem__(self, item: int) -> Dict[str, List]:
         assistant='{assistant}<|im_end|>\n',
         stop_words=['<|im_end|>'],
         image_token='<image>',
-        function_call='{assistant}<|action_start|><|plugin|>\n{function_call}<|action_end|><|im_end|>\n',  # noqa: E501, E251
-        function_result='<|im_start|>environment name=<|plugin|>\n{function_result}<|im_end|>\n<|im_start|>assistant\n',  # noqa: E501, E251
+        function_call=
+        '{assistant}<|action_start|><|plugin|>\n{function_call}<|action_end|><|im_end|>\n',  # noqa: E501, E251
+        function_result=
+        '<|im_start|>environment name=<|plugin|>\n{function_result}<|im_end|>\n<|im_start|>assistant\n',  # noqa: E501, E251
         functions='<|im_start|>system name=<|plugin|>\n{functions}<|im_end|>\n'
     )