support InternVL2-2B
HarmonyHu committed Sep 16, 2024
1 parent a3f66f0 commit 9e90034
Showing 21 changed files with 2,499 additions and 140 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -52,6 +52,7 @@
|Yi-34B-chat |:white_check_mark:|:white_check_mark:| |[LINK](https://huggingface.co/01-ai/Yi-34B-Chat) |
|Qwen-VL-Chat |:white_check_mark:|:white_check_mark:| |[LINK](https://huggingface.co/Qwen/Qwen-VL-Chat) |
|InternVL2-4B |:white_check_mark:|:white_check_mark:| |[LINK](https://huggingface.co/OpenGVLab/InternVL2-4B) |
|InternVL2-2B |:white_check_mark:|:white_check_mark:| |[LINK](https://huggingface.co/OpenGVLab/InternVL2-2B) |


If you want to learn about the conversion details and source code, see the [models](./models) subdirectory of this project for the deployment details of each model.
@@ -86,7 +87,7 @@ git clone https://github.com/sophgo/LLM-TPU.git
| Qwen1.5-1.8B | ./run.sh --model qwen1.5-1.8b --arch soc | ./run.sh --model qwen1.5-1.8b --arch pcie |
| LWM-Text-Chat | ./run.sh --model lwm-text-chat --arch soc | ./run.sh --model lwm-text-chat --arch pcie |
| WizardCoder-15B | ./run.sh --model wizardcoder-15b --arch soc | ./run.sh --model wizardcoder-15b --arch pcie |

| InternVL2-4B | ./run.sh --model internvl2-4b --arch soc | ./run.sh --model internvl2-4b --arch pcie |

## Advanced Features
Notes on the advanced features:
14 changes: 12 additions & 2 deletions models/InternVL2/README.md
@@ -83,10 +83,20 @@ cd build && cmake .. && make && cp *cpython* .. && cd ..
* python demo

```
python3 pipeline.py --model_path internvl2-4b_bm1684x_int4.bmodel --tokenizer ../support/token_config/ --devid 0
python3 pipeline.py --model_path internvl2-4b_bm1684x_int4.bmodel --tokenizer ../support/token_config_4b --devid 0
```
`--model_path` is the actual path where the bmodel is stored; `--tokenizer` is the actual path where the tokenizer configuration is stored.

* Example output

![](../../assets/internvl2-4b.png)

## FAQ

#### Is InternVL2-2B supported?

Yes, it is supported, and the steps are essentially the same (see the consolidated sketch after this list):
1. Replace the corresponding files in `InternVL2-2B` with the ones under `files/InternVL2-2B`;
2. Run `export_onnx.py` to export the ONNX models;
3. Run `./compile.sh --name internvl2-2b` to generate the model `internvl2-2b_bm1684x_int4.bmodel`;
4. Running the demo is the same, except that `token_config_2b` must be specified: `python3 pipeline.py --model_path internvl2-2b_bm1684x_int4.bmodel --tokenizer ../support/token_config_2b --devid 0`
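
For reference, the four steps can be chained as below. This is a minimal sketch: the placeholder path to the downloaded `InternVL2-2B` checkpoint, the working directories, and the `--model_path` flag of `export_onnx.py` are assumptions inferred from the 4B flow and should be adapted to your local checkout.

```
# minimal sketch; adjust the assumed paths to your local layout
cp ./compile/files/InternVL2-2B/* /path/to/InternVL2-2B/                 # 1. overwrite the original model files
python3 ./compile/export_onnx.py --model_path /path/to/InternVL2-2B     # 2. export the ONNX models
cd compile && ./compile.sh --name internvl2-2b && cd ..                  # 3. build internvl2-2b_bm1684x_int4.bmodel
python3 pipeline.py --model_path internvl2-2b_bm1684x_int4.bmodel \
        --tokenizer ../support/token_config_2b --devid 0                 # 4. run the demo
```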
2 changes: 1 addition & 1 deletion models/InternVL2/compile/compile.sh
@@ -54,7 +54,7 @@ fi

if [ x$mode == x"int8" ]; then
quantize_args="--quantize W8BF16"
elif [ x$mode == x"f16" ]; then
elif [ x$mode == x"bf16" ]; then
quantize_args="--quantize BF16"
elif [ x$mode == x"int4" ]; then
quantize_args="--quantize W4BF16 --q_group_size 64"
95 changes: 25 additions & 70 deletions models/InternVL2/compile/export_onnx.py
@@ -29,6 +29,7 @@
args = parser.parse_args()

model_path = args.model_path
is_4B = "InternVL2-4B" in model_path
folder = f"./tmp/onnx"

origin_model = AutoModelForCausalLM.from_pretrained(
@@ -48,7 +49,7 @@
HEAD_DIM = HIDDEN_SIZE // NUM_ATTENTION_HEADS
VOCAB_SIZE = config.llm_config.vocab_size
DOWNSAMPLE_RATIO = config.downsample_ratio
EOS_TOKEN_ID = config.llm_config.eos_token_id
ID_EOS = config.llm_config.eos_token_id
print(f'Layers: {NUM_LAYERS}\nHidden size: {HIDDEN_SIZE}\n')

vit = origin_model.vision_model
@@ -63,9 +64,10 @@ class Embedding(torch.nn.Module):

def __init__(self):
super().__init__()
self.embed = transformer.get_input_embeddings()

def forward(self, input_ids):
hidden_states = transformer.embed_tokens(input_ids)
hidden_states = self.embed(input_ids)
return hidden_states


@@ -75,13 +77,18 @@ def __init__(self, layer_id):
super().__init__()
self.layer_id = layer_id
self.layer = layers[layer_id]
self.rotary_emb = self.layer.self_attn.rotary_emb

position_ids = torch.tensor(
[range(SEQ_LENGTH)], dtype=torch.long).cuda()
value_states = torch.randn(
(1, SEQ_LENGTH, config.llm_config.num_key_value_heads, HEAD_DIM)).bfloat16().cuda()
self.cos, self.sin = self.rotary_emb(
value_states, position_ids, SEQ_LENGTH)
if is_4B:
self.rotary_emb = self.layer.self_attn.rotary_emb
self.cos, self.sin = self.rotary_emb(
value_states, position_ids, SEQ_LENGTH)
else:
self.rotary_emb = self.layer.attention.rotary_emb
self.cos, self.sin = self.rotary_emb(value_states, SEQ_LENGTH)
self.cos = self.cos.view(SEQ_LENGTH, HEAD_DIM)
self.sin = self.sin.view(SEQ_LENGTH, HEAD_DIM)

@@ -105,13 +112,17 @@ def __init__(self, layer_id):
super().__init__()
self.layer_id = layer_id
self.layer = layers[layer_id]
self.rotary_emb = self.layer.self_attn.rotary_emb
position_ids = torch.tensor(
[range(SEQ_LENGTH)], dtype=torch.long).cuda()
value_states = torch.randn(
(1, SEQ_LENGTH, config.llm_config.num_key_value_heads, HEAD_DIM)).bfloat16().cuda()
self.cos, self.sin = self.rotary_emb(
value_states, position_ids, SEQ_LENGTH)
if is_4B:
self.rotary_emb = self.layer.self_attn.rotary_emb
self.cos, self.sin = self.rotary_emb(
value_states, position_ids, SEQ_LENGTH)
else:
self.rotary_emb = self.layer.attention.rotary_emb
self.cos, self.sin = self.rotary_emb(value_states, SEQ_LENGTH)
self.cos = self.cos.view(SEQ_LENGTH, HEAD_DIM)
self.sin = self.sin.view(SEQ_LENGTH, HEAD_DIM)

@@ -134,10 +145,11 @@ class LmHead(torch.nn.Module):

def __init__(self):
super().__init__()
self.lm_head = origin_model.language_model.get_output_embeddings()

def forward(self, hidden_states):
hidden_states = transformer.norm(hidden_states)
m_logits = origin_model.language_model.lm_head(hidden_states)
m_logits = self.lm_head(hidden_states)
_, token = torch.topk(m_logits.float(), 1)
return token

@@ -251,68 +263,10 @@ def build_transform(input_size):
return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height

# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)

# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images


def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(
image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
pixel_values = transform(image)
return pixel_values


@@ -332,7 +286,8 @@ def test_net_with_mask():
pixel_values = load_image(jpg, max_num=1).to(
torch.bfloat16).cuda() # [1, 3, 448, 448]
vit_embeds = vit_infer(pixel_values) # [1, 256, 3072]

ID_IM_END = tokenizer.convert_tokens_to_ids("<|im_end|>")
ID_END = tokenizer.convert_tokens_to_ids("<|end|>")
token_len = len(ids)
ids = ids + (SEQ_LENGTH - token_len) * [0]
input_ids = torch.tensor(ids).view(SEQ_LENGTH).cuda()
@@ -362,7 +317,7 @@ def test_net_with_mask():
lm = LmHead()
token = lm(out.bfloat16()).view(1)
out_ids = [int(token)]
while int(token) < EOS_TOKEN_ID and token_len < SEQ_LENGTH:
while int(token) not in [ID_EOS, ID_IM_END, ID_END] and token_len < SEQ_LENGTH:
token_len += 1
input_ids = torch.tensor([token]).cuda()
out = embed(input_ids).view(1, 1, HIDDEN_SIZE)
143 changes: 143 additions & 0 deletions models/InternVL2/compile/files/InternVL2-2B/config.json
@@ -0,0 +1,143 @@
{
"_commit_hash": null,
"architectures": [
"InternVLChatModel"
],
"auto_map": {
"AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
"AutoModel": "modeling_internvl_chat.InternVLChatModel",
"AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
},
"downsample_ratio": 0.5,
"dynamic_image_size": true,
"force_image_size": 448,
"llm_config": {
"_name_or_path": "internlm/internlm2-chat-1_8b",
"add_cross_attention": false,
"architectures": [
"InternLM2ForCausalLM"
],
"attn_implementation": "flash_attention_2",
"auto_map": {
"AutoConfig": "configuration_internlm2.InternLM2Config",
"AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
"AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
},
"bad_words_ids": null,
"begin_suppress_tokens": null,
"bias": false,
"bos_token_id": 1,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 2,
"exponential_decay_length_penalty": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "silu",
"hidden_size": 2048,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_range": 0.02,
"intermediate_size": 8192,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 512,
"min_length": 0,
"model_type": "internlm2",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 24,
"num_key_value_heads": 8,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 2,
"prefix": null,
"problem_type": null,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 2.0,
"type": "dynamic"
},
"rope_theta": 1000000,
"sep_token_id": null,
"suppress_tokens": null,
"task_specific_params": null,
"temperature": 1.0,
"tf_legacy_loss": false,
"tie_encoder_decoder": false,
"tie_word_embeddings": false,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": "bfloat16",
"torchscript": false,
"transformers_version": "4.37.2",
"typical_p": 1.0,
"use_bfloat16": true,
"use_cache": true,
"vocab_size": 92553
},
"max_dynamic_patch": 12,
"min_dynamic_patch": 1,
"model_type": "internvl_chat",
"ps_version": "v2",
"select_layer": -1,
"template": "internlm2-chat",
"torch_dtype": "bfloat16",
"use_backbone_lora": 0,
"use_llm_lora": 0,
"use_thumbnail": true,
"vision_config": {
"architectures": [
"InternVisionModel"
],
"attention_dropout": 0.0,
"drop_path_rate": 0.0,
"dropout": 0.0,
"hidden_act": "gelu",
"hidden_size": 1024,
"image_size": 448,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-06,
"model_type": "intern_vit_6b",
"norm_type": "layer_norm",
"num_attention_heads": 16,
"num_channels": 3,
"num_hidden_layers": 24,
"output_attentions": false,
"output_hidden_states": false,
"patch_size": 14,
"qk_normalization": false,
"qkv_bias": true,
"return_dict": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.37.2",
"use_bfloat16": true,
"use_flash_attn": true
}
}