From 50f82e12823ef8844b45a0dd864a78eea80de879 Mon Sep 17 00:00:00 2001
From: Matt
Date: Wed, 12 Apr 2023 17:46:41 +0100
Subject: [PATCH] Fix docstrings for TF BLIP (#22618)

* Fix docstrings for TFBLIP

* Fix missing line in TF port!

* Use values from torch tests now other bugs fixed

* Use values from torch tests now other bugs fixed

* Fix doctest string
---
 src/transformers/models/blip/modeling_tf_blip.py      | 7 +++----
 src/transformers/models/blip/modeling_tf_blip_text.py | 1 +
 tests/models/blip/test_modeling_tf_blip.py            | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/transformers/models/blip/modeling_tf_blip.py b/src/transformers/models/blip/modeling_tf_blip.py
index bce6b524a69..dcb5509ed43 100644
--- a/src/transformers/models/blip/modeling_tf_blip.py
+++ b/src/transformers/models/blip/modeling_tf_blip.py
@@ -1020,7 +1020,7 @@ def get_text_features(
         )

         pooled_output = text_outputs[1]
-        text_features = self.text_projection(pooled_output)
+        text_features = self.blip.text_projection(pooled_output)

         return text_features

@@ -1057,7 +1057,7 @@ def get_image_features(
         vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict)

         pooled_output = vision_outputs[1]  # pooled_output
-        image_features = self.visual_projection(pooled_output)
+        image_features = self.blip.visual_projection(pooled_output)

         return image_features

@@ -1238,7 +1238,7 @@ def generate(
         >>> outputs = model.generate(**inputs)

         >>> print(processor.decode(outputs[0], skip_special_tokens=True))
-        two cats are laying on a couch
+        two cats sleeping on a couch
         ```
         """

@@ -1410,7 +1410,6 @@ def call(
         >>> inputs["labels"] = labels
         >>> outputs = model(**inputs)
         >>> loss = outputs.loss
-        >>> loss.backward()

         >>> # inference
         >>> text = "How many cats are in the picture?"
diff --git a/src/transformers/models/blip/modeling_tf_blip_text.py b/src/transformers/models/blip/modeling_tf_blip_text.py
index 3ddf8539948..262b2cb2796 100644
--- a/src/transformers/models/blip/modeling_tf_blip_text.py
+++ b/src/transformers/models/blip/modeling_tf_blip_text.py
@@ -462,6 +462,7 @@ def call(
                 next_decoder_cache += (layer_outputs[-1],)
             if output_attentions:
                 all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)
diff --git a/tests/models/blip/test_modeling_tf_blip.py b/tests/models/blip/test_modeling_tf_blip.py
index 31630b17f94..b8fd916ec13 100644
--- a/tests/models/blip/test_modeling_tf_blip.py
+++ b/tests/models/blip/test_modeling_tf_blip.py
@@ -783,7 +783,7 @@ def test_inference_image_captioning(self):
         # Test output
         self.assertEqual(
             predictions[0].numpy().tolist(),
-            [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102],
+            [30522, 1037, 3861, 1997, 1037, 2450, 1998, 2014, 3899, 2006, 1996, 3509, 102],
         )

     def test_inference_vqa(self):
@@ -810,6 +810,6 @@ def test_inference_itm(self):
         out_itm = model(**inputs)
         out = model(**inputs, use_itm_head=False, training=False)

-        expected_scores = tf.convert_to_tensor([[0.9798, 0.0202]])
+        expected_scores = tf.convert_to_tensor([[0.0029, 0.9971]])
         self.assertTrue(np.allclose(tf.nn.softmax(out_itm[0]).numpy(), expected_scores, rtol=1e-3, atol=1e-3))
-        self.assertTrue(np.allclose(out[0], tf.convert_to_tensor([[0.5053]]), rtol=1e-3, atol=1e-3))
+        self.assertTrue(np.allclose(out[0], tf.convert_to_tensor([[0.5162]]), rtol=1e-3, atol=1e-3))
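
Note on the two projection fixes: in the TF port, `text_projection` and `visual_projection` live on the inner `blip` main layer rather than on `TFBlipModel` itself, so the old lookups would fail with an `AttributeError`. A minimal sketch exercising the repaired feature-extraction paths; the checkpoint name and demo image URL are assumptions borrowed from the usual BLIP doctests, not part of this patch:

```python
import requests
from PIL import Image
from transformers import BlipProcessor, TFBlipModel

# Assumed checkpoint; any BLIP checkpoint with TF weights should behave the same.
checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(checkpoint)
model = TFBlipModel.from_pretrained(checkpoint)

# Assumed demo image (the two-cats COCO picture referenced by the doctests).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Both calls now route through model.blip, where the projection heads live.
text_features = model.get_text_features(**processor(text="two cats sleeping on a couch", return_tensors="tf"))
image_features = model.get_image_features(pixel_values=processor(images=image, return_tensors="tf").pixel_values)
print(text_features.shape, image_features.shape)
```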
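
Note on the dropped `>>> loss.backward()` line: that call is PyTorch-only API carried over during the port; in TensorFlow, gradients come from a `tf.GradientTape` instead. A minimal sketch of the equivalent TF training step, assuming the same VQA setup as the patched doctest (the checkpoint name, label text, and image URL are assumptions):

```python
import requests
import tensorflow as tf
from PIL import Image
from transformers import BlipProcessor, TFBlipForQuestionAnswering

checkpoint = "Salesforce/blip-vqa-base"  # assumed, matching the doctest's VQA model
processor = BlipProcessor.from_pretrained(checkpoint)
model = TFBlipForQuestionAnswering.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # assumed demo image
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, text="How many cats are in the picture?", return_tensors="tf")
inputs["labels"] = processor(text="2", return_tensors="tf").input_ids

# TF has no loss.backward(): record the forward pass on a tape,
# then ask the tape for gradients explicitly.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
with tf.GradientTape() as tape:
    outputs = model(**inputs, training=True)
    loss = outputs.loss
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
```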