huggingface · akshayballal95 · Oct 21, 2024 · Oct 21, 2024
diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
@@ -73,6 +73,7 @@
 from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME
 from .model_patcher import (
  CLIPModelPatcher,
+ ColPaliModelPatcher,
  FalconModelPatcher,
  MistralModelPatcher,
  MusicgenModelPatcher,
@@ -2310,3 +2311,59 @@ def overwrite_shape_and_generate_input(
 
 class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig):
  NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig
+
+
+class PaliGemmaOnnxConfig(GemmaOnnxConfig):
+    """ONNX export configuration for PaliGemma checkpoints.
+
+    For the ``feature-extraction`` task the declared inputs are ``input_ids``,
+    ``attention_mask`` and ``pixel_values``; for ``text-generation`` only the
+    two text inputs are declared.
+    """
+
+    DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyVisionInputGenerator)
+
+    NORMALIZED_CONFIG_CLASS = NormalizedTextAndVisionConfig.with_args(
+        text_config="text_config", vision_config="vision_config"
+    )
+
+    # Length of the image-token prefix prepended to ``input_ids`` for feature extraction.
+    # NOTE(review): presumably the number of image patches of the targeted checkpoints
+    # ((image_size / patch_size) ** 2) -- confirm before exporting other resolutions.
+    NUM_IMAGE_TOKENS = 1024
+
+    @property
+    def inputs(self) -> Dict[str, Dict[int, str]]:
+        """Return the dynamic axes of every model input for the current task.
+
+        Returns:
+            A mapping from input name to ``{axis_index: axis_name}``.
+
+        Raises:
+            ValueError: If ``self.task`` is neither ``feature-extraction`` nor
+                ``text-generation``.
+        """
+        dynamic_axis = {0: "batch_size", 1: "sequence_length"}
+        text_inputs = {"input_ids": dynamic_axis, "attention_mask": dynamic_axis}
+
+        if self.task == "feature-extraction":
+            # Key order is kept as input_ids, attention_mask, pixel_values.
+            return {
+                **text_inputs,
+                "pixel_values": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"},
+            }
+        if self.task == "text-generation":
+            return text_inputs
+
+        # Fail loudly instead of implicitly returning None, which contradicts the annotation.
+        raise ValueError(
+            f"Unsupported task {self.task!r} for {type(self).__name__}; "
+            "expected 'feature-extraction' or 'text-generation'."
+        )
+
+    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
+        """Generate dummy inputs, adding an image-token prefix for feature extraction.
+
+        Args:
+            framework: Framework identifier, forwarded to the parent implementation
+                and to the tensor helpers.
+            **kwargs: Forwarded unchanged to the parent ``generate_dummy_inputs``.
+
+        Returns:
+            The dummy inputs built by the parent class. When ``framework`` is
+            ``"pt"`` and the task is ``feature-extraction``, ``input_ids`` is
+            prefixed with ``NUM_IMAGE_TOKENS`` copies of ``image_token_index``
+            and ``attention_mask`` is regenerated with the same shape.
+        """
+        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
+
+        # Only the PyTorch feature-extraction path needs the image-token prefix.
+        if framework != "pt" or self.task != "feature-extraction":
+            return dummy_inputs
+
+        # Helper instance used for its tensor utilities and its padding side. It is
+        # built without ``kwargs``, so its own batch size / sequence length may differ
+        # from the ones the parent call used.
+        generator = self.DUMMY_INPUT_GENERATOR_CLASSES[0](self.task, self._normalized_config)
+
+        text_ids = dummy_inputs["input_ids"]
+        prefix_tensor = generator.constant_tensor(
+            shape=[text_ids.shape[0], self.NUM_IMAGE_TOKENS],
+            value=self._normalized_config.image_token_index,
+            framework=framework,
+        )
+        input_ids = generator.concat_inputs([prefix_tensor, text_ids], dim=1)
+        dummy_inputs["input_ids"] = input_ids
+
+        # Size the mask from the concatenated ids (not from the helper generator) so
+        # both tensors always agree, even when ``kwargs`` overrode the input shapes.
+        dummy_inputs["attention_mask"] = generator.random_mask_tensor(
+            shape=list(input_ids.shape),
+            padding_side=generator.padding_side,
+            framework=framework,
+            dtype="int64",
+        )
+        return dummy_inputs
+
+    def patch_model_for_export(
+        self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None
+    ) -> "ModelPatcher":
+        """Return the patcher to apply to ``model`` during export.
+
+        ``feature-extraction`` uses ``ColPaliModelPatcher``; every other task
+        defers to the parent class.
+        """
+        if self.task == "feature-extraction":
+            return ColPaliModelPatcher(self, model, model_kwargs=model_kwargs)
+        return super().patch_model_for_export(model, model_kwargs=model_kwargs)
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py
@@ -510,6 +510,24 @@ def patched_forward(*args, **kwargs):
  self.patched_forward = patched_forward
 
 
+class ColPaliModelPatcher(ModelPatcher):
+    """Model patcher whose forward relays its inputs to the original forward by keyword."""
+
+    def __init__(
+        self,
+        config: "OnnxConfig",
+        model: Union["PreTrainedModel", "TFPreTrainedModel"],
+        model_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        """Initialise the base patcher, then install the relay as ``patched_forward``.
+
+        Args:
+            config: Export configuration, passed to the base patcher.
+            model: Model to patch, passed to the base patcher.
+            model_kwargs: Optional extra keyword arguments, passed to the base patcher.
+        """
+        super().__init__(config, model, model_kwargs)
+
+        def patched_forward(input_ids=None, pixel_values=None, attention_mask=None, **kwargs):
+            # Pass the three named arguments (plus any extras) to the original
+            # forward as keywords and return its result untouched.
+            return self.orig_forward(
+                input_ids=input_ids,
+                pixel_values=pixel_values,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+
+        self.patched_forward = patched_forward
+
+
 class SAMModelPatcher(ModelPatcher):
  def __init__(
  self,

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
@@ -915,6 +915,10 @@ class TasksManager:
  "text-classification",
  onnx="LlamaOnnxConfig",
  ),
+ "paligemma": supported_tasks_mapping(
+ "feature-extraction",
+ onnx="PaliGemmaOnnxConfig",
+ ),
  "pegasus": supported_tasks_mapping(
  "feature-extraction",
  "feature-extraction-with-past",