[Doc] Move guide for multimodal model and other improvements #6168

Merged (11 commits) on Jul 6, 2024
6 changes: 3 additions & 3 deletions docs/source/dev/input_processing/model_inputs_index.rst
@@ -5,10 +5,10 @@ Input Processing

.. currentmodule:: vllm.inputs

vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
in :class:`~vllm.LLMEngine` before they are passed to model executors.
Each model can override parts of vLLM's :ref:`input processing pipeline <input_processing_pipeline>` via
:data:`~vllm.inputs.INPUT_REGISTRY` and :data:`~vllm.multimodal.MULTIMODAL_REGISTRY`.

- Currently, this mechanism is only utilized in :ref:`multi-modal models <multi_modality>` for preprocessing multi-modal input
+ Currently, this mechanism is only utilized in :ref:`multi-modal <multi_modality>` models for preprocessing multi-modal input
data in addition to input prompt, but it can be extended to text-only language models when needed.
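
For orientation, a model might opt into this pipeline roughly as follows. This is a minimal sketch rather than the documented API: the decorator-style registration mirrors the multi-modal models in the vLLM source tree of this era, and ``MyModelForCausalLM`` and ``input_processor_for_mymodel`` are hypothetical names.

.. code-block:: python

    from torch import nn

    from vllm.inputs import INPUT_REGISTRY
    from vllm.multimodal import MULTIMODAL_REGISTRY


    def input_processor_for_mymodel(ctx, llm_inputs):
        # Hypothetical hook: inspect or rewrite the prompt/token IDs before
        # they reach the model (e.g. insert image placeholder tokens).
        return llm_inputs


    @MULTIMODAL_REGISTRY.register_image_input_mapper()  # fall back to the default image mapper
    @INPUT_REGISTRY.register_input_processor(input_processor_for_mymodel)
    class MyModelForCausalLM(nn.Module):
        ...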

Guides
20 changes: 6 additions & 14 deletions docs/source/dev/multimodal/multimodal_index.rst
@@ -7,25 +7,17 @@ Multi-Modality

vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.

- :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
- which allows you to pass in multi-modal input alongside text and token prompts.
+ Multi-modal input can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
+ via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptStrictInputs`.
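
As a usage sketch (hedged: the model name and prompt format are illustrative, and whether ``multi_modal_data`` takes a raw PIL image or a wrapper type depends on the vLLM version):

.. code-block:: python

    from PIL import Image

    from vllm import LLM, SamplingParams

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
    image = Image.open("example.jpg")

    outputs = llm.generate(
        {
            "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
            "multi_modal_data": {"image": image},
        },
        SamplingParams(temperature=0.0, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)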

.. note::
``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
- :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+ the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`.

- By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.
+ To implement a new multi-modal model in vLLM, please follow :ref:`this guide <enabling_multimodal_inputs>`.


- # TODO: Add more instructions on how to do that once embeddings is in.
-
- Guides
- ++++++
-
- .. toctree::
-    :maxdepth: 1
-
-    adding_multimodal_model
+ ..
+     TODO: Add more instructions on how to add new plugins once embeddings is in.

Module Contents
+++++++++++++++
2 changes: 2 additions & 0 deletions docs/source/index.rst
@@ -92,6 +92,7 @@ Documentation

models/supported_models
models/adding_model
+ models/enabling_multimodal_inputs
models/engine_args
models/lora
models/vlm
@@ -116,6 +117,7 @@ Documentation
automatic_prefix_caching/details

.. toctree::
+ :maxdepth: 2
:caption: Developer Documentation

dev/sampling_params
38 changes: 21 additions & 17 deletions docs/source/models/adding_model.rst
@@ -10,6 +10,10 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.

+ .. note::
+     By default, vLLM models do not support multi-modal inputs. To enable multi-modal support,
+     please follow :ref:`this guide <enabling_multimodal_inputs>` after implementing the model here.

.. tip::
If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
We will be happy to help you out!
@@ -44,23 +48,23 @@ Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your mo

.. code-block:: diff

def forward(
    self,
    input_ids: torch.Tensor,
-   attention_mask: Optional[torch.Tensor] = None,
-   position_ids: Optional[torch.LongTensor] = None,
-   past_key_values: Optional[List[torch.FloatTensor]] = None,
-   inputs_embeds: Optional[torch.FloatTensor] = None,
-   labels: Optional[torch.LongTensor] = None,
-   use_cache: Optional[bool] = None,
-   output_attentions: Optional[bool] = None,
-   output_hidden_states: Optional[bool] = None,
-   return_dict: Optional[bool] = None,
-) -> Union[Tuple, CausalLMOutputWithPast]:
+   positions: torch.Tensor,
+   kv_caches: List[torch.Tensor],
+   attn_metadata: AttentionMetadata,
+) -> Optional[SamplerOutput]:

1. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors.
2. Replace the attention operation with either :code:`PagedAttention`, :code:`PagedAttentionWithRoPE`, or :code:`PagedAttentionWithALiBi` depending on the model's architecture.
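
To make step 1 concrete, here is a purely conceptual illustration (made-up token IDs) of what "flattened" means: vLLM concatenates the tokens of all sequences in a batch into one 1-D tensor instead of padding them into a ``[batch, seq_len]`` matrix.

.. code-block:: python

    import torch

    # Two sequences of different lengths in the same batch:
    seq_a = [101, 7592, 2088]        # 3 tokens
    seq_b = [101, 2129, 2024, 2017]  # 4 tokens

    # HuggingFace-style batched inputs would be padded to shape [2, 4];
    # vLLM instead passes a single flattened tensor of shape [7]:
    input_ids = torch.tensor(seq_a + seq_b)
    positions = torch.tensor([0, 1, 2, 0, 1, 2, 3])

    assert input_ids.shape == positions.shape == torch.Size([7])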

docs/source/models/enabling_multimodal_inputs.rst (renamed from docs/source/dev/multimodal/adding_multimodal_model.rst)
@@ -1,26 +1,21 @@
- .. _adding_a_new_multimodal_model:
+ .. _enabling_multimodal_inputs:

- Adding a New Multimodal Model
- =============================
+ Enabling Multimodal Inputs
+ ==========================

- This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.
+ This document walks you through the steps to extend a vLLM model so that it accepts :ref:`multi-modal <multi_modality>` inputs.

- .. note::
-     The complexity of adding a new model depends heavily on the model's architecture.
-     The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-     However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-
- .. tip::
-     If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
-     We will be happy to help you out!
+ .. seealso::
Comment on lines -8 to -15

Member: Maybe keep some of this?

Member Author: I've already linked users to the base "Adding a New Model" which includes the same content.

+     :ref:`adding_a_new_model`


- 1. Set up the base vLLM model
+ 1. Update the base vLLM model
-----------------------------

- As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
+ It is assumed that you have already implemented the model in vLLM according to :ref:`these steps <adding_a_new_model>`.
+ Further update the model as follows:

- - You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
+ - Implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.

.. code-block:: diff

@@ -33,19 +28,19 @@ As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model
The model class does not have to be named :code:`*ForCausalLM`.
Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
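
A minimal sketch of that change (the class name is hypothetical): implementing the interface usually amounts to adding it as a mixin to the model class.

.. code-block:: diff

    + from vllm.model_executor.models.interfaces import SupportsVision

    - class MyModelForConditionalGeneration(nn.Module):
    + class MyModelForConditionalGeneration(nn.Module, SupportsVision):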

- - While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
+ - If you haven't already done so, reserve a keyword parameter in :meth:`~torch.nn.Module.forward`
for each input tensor that corresponds to a multi-modal input, as shown in the following example:

.. code-block:: diff

def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    kv_caches: List[torch.Tensor],
    attn_metadata: AttentionMetadata,
+   pixel_values: torch.Tensor,
) -> SamplerOutput:


2. Register input mappers
@@ -68,8 +63,8 @@ A default mapper is available for each modality in the core vLLM library. This i
:ref:`input_processing_pipeline`
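
As an illustrative sketch (``MyModelForConditionalGeneration`` is a hypothetical class; passing no argument to the decorator falls back to the default mapper, while a custom callable taking the input context and the raw data and returning the keyword-argument tensors for ``forward`` can be passed instead):

.. code-block:: python

    from torch import nn

    from vllm.model_executor.models.interfaces import SupportsVision
    from vllm.multimodal import MULTIMODAL_REGISTRY


    @MULTIMODAL_REGISTRY.register_image_input_mapper()  # no argument -> default image mapper
    class MyModelForConditionalGeneration(nn.Module, SupportsVision):
        ...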


- 3. Register maximum number of multimodal tokens
- ----------------------------------------------------------
+ 3. Register maximum number of multi-modal tokens
+ ------------------------------------------------

For each modality type that the model accepts as input, calculate the maximum possible number of tokens
and register it via :meth:`MULTIMODAL_REGISTRY.register_max_multimodal_tokens <vllm.multimodal.MultiModalRegistry.register_max_multimodal_tokens>`.
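
For example, an image encoder that produces a fixed number of patch embeddings could be registered as follows. This is a hedged sketch: ``get_max_mymodel_image_tokens`` is a hypothetical helper, the 336/14 geometry is illustrative, ``ctx`` is the input context passed in by the registry, and the image-specific wrapper ``register_max_image_tokens`` is assumed to be available alongside the generic method.

.. code-block:: python

    from torch import nn

    from vllm.model_executor.models.interfaces import SupportsVision
    from vllm.multimodal import MULTIMODAL_REGISTRY


    def get_max_mymodel_image_tokens(ctx) -> int:
        # Hypothetical: a 336x336 image split into 14x14 patches
        # yields (336 // 14) ** 2 = 576 image tokens.
        vision_config = ctx.get_hf_config().vision_config
        return (vision_config.image_size // vision_config.patch_size) ** 2


    @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_mymodel_image_tokens)
    class MyModelForConditionalGeneration(nn.Module, SupportsVision):
        ...
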
2 changes: 1 addition & 1 deletion docs/source/models/supported_models.rst
@@ -192,7 +192,7 @@ Vision Language Models
-

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
- Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Adding a New Multimodal Model <adding_a_new_multimodal_model>`
+ Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
for instructions on how to implement support for your model.
Alternatively, you can raise an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ project.

2 changes: 1 addition & 1 deletion vllm/inputs/registry.py
@@ -141,7 +141,7 @@ def dummy_data_for_profiling(self, model_config: "ModelConfig",
The model is identified by ``model_config``.

See also:
- :ref:`adding_a_new_multimodal_model`
+ :ref:`enabling_multimodal_inputs`
"""
# Avoid circular import
from vllm.model_executor.model_loader import get_model_architecture
11 changes: 6 additions & 5 deletions vllm/multimodal/base.py
@@ -162,8 +162,8 @@ def register_input_mapper(
If `None` is provided, then the default input mapper is used instead.

See also:
:ref:`input_processing_pipeline`
:ref:`adding_a_new_multimodal_model`
- :ref:`input_processing_pipeline`
- :ref:`enabling_multimodal_inputs`
"""

def wrapper(model_cls: N) -> N:
@@ -192,7 +192,8 @@ def map_input(self, model_config: ModelConfig,
TypeError: If the data type is not supported.

See also:
:ref:`adding_a_new_multimodal_model`
- :ref:`input_processing_pipeline`
- :ref:`enabling_multimodal_inputs`
"""
# Avoid circular import
from vllm.model_executor.model_loader import get_model_architecture
@@ -230,7 +231,7 @@ def register_max_multimodal_tokens(
If `None` is provided, then the default calculation is used instead.

See also:
- :ref:`adding_a_new_multimodal_model`
+ :ref:`enabling_multimodal_inputs`
"""

def wrapper(model_cls: N) -> N:
@@ -260,7 +261,7 @@ def get_max_multimodal_tokens(self, model_config: ModelConfig) -> int:
The model is identified by ``model_config``.

See also:
- :ref:`adding_a_new_multimodal_model`
+ :ref:`enabling_multimodal_inputs`
"""
# Avoid circular import
from vllm.model_executor.model_loader import get_model_architecture