Skip to content

Commit

Permalink
Fix VLM generation issues (#32836)
Browse files Browse the repository at this point in the history
* fix in one commit

* add parameterized

* fix tests

* fix test flakiness

* maybe that's why flaky

* style

* flakiness...

---------

Co-authored-by: raushan <[email protected]>
  • Loading branch information
ArthurZucker and zucchini-nlp committed Aug 20, 2024
1 parent fff9be1 commit 4fd0f48
Show file tree
Hide file tree
Showing 13 changed files with 211 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ def forward(
inputs_embeds = self.get_input_embeddings()(input_ids)

# Merge text and images in prefill stage
if past_key_values is None:
if input_ids is not None and inputs_embeds.shape[1] != 1:
# First merge image tokens if there are any
if pixel_values is not None and pixel_values.size(0) > 0:
image_features = self._get_image_features(pixel_values, image_sizes)
Expand Down Expand Up @@ -910,7 +910,7 @@ def forward(
pass

# generation with cache, decoding stage
elif past_key_values is not None and (pixel_values is not None or pixel_values_videos is not None):
elif pixel_values is not None or pixel_values_videos is not None:
# Retrieve the first layer to inspect the logits and mask out the hidden states that are set to 0
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
# Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
Expand Down
3 changes: 0 additions & 3 deletions src/transformers/models/video_llava/modeling_video_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -653,9 +653,6 @@ def prepare_inputs_for_generation(
if cache_length < past_length and attention_mask is not None:
attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]

pixel_values_videos = None
pixel_values_images = None

position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
Expand Down
14 changes: 14 additions & 0 deletions tests/models/blip/test_modeling_blip.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import numpy as np
import requests
from parameterized import parameterized

from transformers import BlipConfig, BlipTextConfig, BlipVisionConfig
from transformers.testing_utils import (
Expand Down Expand Up @@ -1106,6 +1107,7 @@ def test_model_from_pretrained(self):
@require_torch
class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
Expand All @@ -1116,6 +1118,18 @@ class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
def setUp(self):
self.model_tester = BlipTextImageModelsModelTester(self)

@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
    """Check the length of sequences returned by `generate`.

    Runs once per value of `use_cache` and asks for exactly 20 new tokens
    from every class in `all_generative_model_classes`.

    Args:
        use_cache: Forwarded unchanged to `generate` as `use_cache`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
        out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
        # NOTE(review): the expected length is prompt length + 19, not + 20 -- presumably BLIP's
        # `generate` drops the last prompt token before decoding; confirm against the model code.
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], inputs_dict["input_ids"].shape[1] + 19)

def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
Expand Down
18 changes: 16 additions & 2 deletions tests/models/blip_2/test_modeling_blip_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import numpy as np
import requests
from parameterized import parameterized

from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig
from transformers.testing_utils import (
Expand Down Expand Up @@ -314,7 +315,7 @@ def __init__(
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
Expand Down Expand Up @@ -436,8 +437,9 @@ def prepare_config_and_inputs_for_common(self):


@require_torch
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
Expand All @@ -448,6 +450,18 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
def setUp(self):
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)

@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
    """Check the length of sequences returned by `generate`.

    Runs once per value of `use_cache` and asks for exactly 20 new tokens
    from every class in `all_generative_model_classes`.

    Args:
        use_cache: Forwarded unchanged to `generate` as `use_cache`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
        out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
        # The expected length is a constant 21 and does not depend on the prompt length.
        # NOTE(review): presumably the 20 new tokens plus one prepended BOS token -- confirm
        # against the model's `generate`.
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], 21)

def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
Expand Down
20 changes: 17 additions & 3 deletions tests/models/instructblip/test_modeling_instructblip.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import numpy as np
import requests
from parameterized import parameterized

from transformers import (
CONFIG_MAPPING,
Expand All @@ -38,7 +39,6 @@
)
from transformers.utils import is_torch_available, is_vision_available

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
Expand Down Expand Up @@ -319,7 +319,7 @@ def __init__(
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=20,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
Expand Down Expand Up @@ -452,8 +452,9 @@ def prepare_config_and_inputs_for_common(self):


@require_torch
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
Expand All @@ -464,6 +465,19 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
def setUp(self):
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)

@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
    """Check the length of sequences returned by `generate`.

    Runs once per value of `use_cache` and asks for exactly 20 new tokens
    from every class in `all_generative_model_classes`.

    Args:
        use_cache: Forwarded unchanged to `generate` as `use_cache`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()
        # NOTE(review): the text config's `architectures` is overridden before generating;
        # presumably the model's `generate` inspects it -- confirm against the model code.
        model.config.text_config.architectures = ["OptForCausalLM"]

        # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
        out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
        # The expected length is a constant 21 and does not depend on the prompt length.
        # NOTE(review): presumably the 20 new tokens plus one prepended BOS token -- confirm.
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], 21)

def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
Expand Down
22 changes: 17 additions & 5 deletions tests/models/instructblipvideo/test_modeling_instructblipvideo.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

import numpy as np
from huggingface_hub import hf_hub_download
from parameterized import parameterized

from transformers import (
CONFIG_MAPPING,
Expand All @@ -38,7 +39,6 @@
)
from transformers.utils import is_torch_available, is_vision_available

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
Expand Down Expand Up @@ -333,7 +333,7 @@ def __init__(
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=100,
max_position_embeddings=256,
eos_token_id=2,
pad_token_id=1,
bos_token_id=0,
Expand Down Expand Up @@ -471,10 +471,9 @@ def prepare_config_and_inputs_for_common(self):


@require_torch
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
):
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
fx_compatible = False
test_head_masking = False
test_pruning = False
Expand All @@ -485,6 +484,19 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
def setUp(self):
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)

@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
    """Check the length of sequences returned by `generate`.

    Runs once per value of `use_cache` and asks for exactly 20 new tokens
    from every class in `all_generative_model_classes`.

    Args:
        use_cache: Forwarded unchanged to `generate` as `use_cache`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()
        # NOTE(review): the text config's `architectures` is overridden before generating;
        # presumably the model's `generate` inspects it -- confirm against the model code.
        model.config.text_config.architectures = ["OptForCausalLM"]

        # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
        out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20, use_cache=use_cache)
        # The expected length is a constant 21 and does not depend on the prompt length.
        # NOTE(review): presumably the 20 new tokens plus one prepended BOS token -- confirm.
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], 21)

def test_for_conditional_generation(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs)
Expand Down
11 changes: 11 additions & 0 deletions tests/models/kosmos2/test_modeling_kosmos2.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,17 @@ def setUp(self):
self.model_tester = Kosmos2ModelTester(self)
self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37)

def test_greedy_generation(self):
    """Check that `generate` appends exactly 20 tokens to the prompt.

    Runs for every class in `all_generative_model_classes`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
        out = model.generate(**inputs_dict, min_new_tokens=20, max_new_tokens=20)
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], inputs_dict["input_ids"].shape[1] + 20)

# overwrite from common to skip `image_to_text_projection.latent_query`
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
Expand Down
24 changes: 23 additions & 1 deletion tests/models/llava/test_modeling_llava.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import unittest

import requests
from parameterized import parameterized

from transformers import (
AutoProcessor,
Expand Down Expand Up @@ -80,7 +81,7 @@ def __init__(
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
Expand Down Expand Up @@ -148,6 +149,8 @@ def prepare_config_and_inputs_for_common(self):
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
attention_mask = input_ids.ne(1).to(torch_device)
# replace any randomly drawn image tokens with a fixed non-image token id to prevent flakiness
input_ids[input_ids == config.image_token_index] = 1
# we are giving 3 images let's make sure we pass in 3 image tokens
input_ids[:, 1] = config.image_token_index
inputs_dict = {
Expand Down Expand Up @@ -178,6 +181,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase
"""

all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
Expand All @@ -186,6 +190,24 @@ def setUp(self):
self.model_tester = LlavaVisionText2TextModelTester(self)
self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)

@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
    """Check that `generate` appends exactly 20 tokens to the prompt.

    Runs once per value of `use_cache` for every class in
    `all_generative_model_classes`.

    Args:
        use_cache: Forwarded unchanged to `generate` as `use_cache`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        out = model.generate(
            **inputs_dict,
            # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
            min_new_tokens=20,
            max_new_tokens=20,
            use_cache=use_cache,
            # Ban the image token from being generated.
            # NOTE(review): presumably a generated image token would break the image/text
            # merge on later decoding steps -- confirm against the model's forward.
            bad_words_ids=[[config.image_token_index]],
        )
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
Expand Down
27 changes: 24 additions & 3 deletions tests/models/llava_next/test_modeling_llava_next.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

import requests
from huggingface_hub import hf_hub_download
from parameterized import parameterized

from transformers import (
AutoProcessor,
Expand All @@ -34,7 +35,6 @@
torch_device,
)

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
Expand Down Expand Up @@ -86,7 +86,7 @@ def __init__(
"initializer_range": 0.02,
"num_labels": 3,
"num_choices": 4,
"pad_token_id": 0,
"pad_token_id": 1,
},
is_training=True,
vision_config={
Expand Down Expand Up @@ -157,6 +157,8 @@ def prepare_config_and_inputs_for_common(self):
config, pixel_values = config_and_inputs
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2
attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device)
# set to random non-image token to prevent flakiness
input_ids[input_ids == config.image_token_index] = 2
# we are giving 3 images let's make sure we pass in 3 image tokens
input_ids[:, 1] = config.image_token_index
labels = torch.zeros((self.batch_size, self.seq_length), dtype=torch.long, device=torch_device)
Expand Down Expand Up @@ -208,12 +210,13 @@ def create_and_check_llava_next_model_fp16_autocast_forward(


@require_torch
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
"""
Model tester for `LlavaNextForConditionalGeneration`.
"""

all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False

Expand All @@ -237,6 +240,24 @@ def test_initialization(self):
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)

@parameterized.expand([(True,), (False,)])
def test_greedy_generation(self, use_cache: bool):
    """Check that `generate` appends exactly 20 tokens to the prompt.

    Runs once per value of `use_cache` for every class in
    `all_generative_model_classes`.

    Args:
        use_cache: Forwarded unchanged to `generate` as `use_cache`.
    """
    config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

    for model_class in self.all_generative_model_classes:
        model = model_class(config)
        model.to(torch_device)
        model.eval()

        out = model.generate(
            **inputs_dict,
            # min_new_tokens and max_new_tokens are both 20 so the amount of newly generated tokens is fixed
            min_new_tokens=20,
            max_new_tokens=20,
            use_cache=use_cache,
            # Ban the image token from being generated.
            # NOTE(review): presumably a generated image token would break the image/text
            # merge on later decoding steps -- confirm against the model's forward.
            bad_words_ids=[[config.image_token_index]],
        )
        # assertEqual reports both lengths on failure, unlike assertTrue(a == b)
        self.assertEqual(out.shape[1], inputs_dict["input_ids"].shape[1] + 20)

@unittest.skip(
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
)
Expand Down
Loading

0 comments on commit 4fd0f48

Please sign in to comment.