From 329b645d3e02eebdac7f7de0d508c6f951130fa2 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 07:58:42 +0100 Subject: [PATCH 01/37] add outlines 0.1.0 support --- .../models/llms/huggingface/transformers.py | 24 +++++-- .../tasks/structured_outputs/outlines.py | 70 ++++++++++++++----- 2 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index a4f9de95ab..2fb99f1b45 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -27,7 +27,7 @@ from distilabel.utils.huggingface import HF_TOKEN_ENV_VAR if TYPE_CHECKING: - from transformers import Pipeline + from transformers import LogitsProcessorList, Pipeline from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils import PreTrainedTokenizer @@ -111,6 +111,7 @@ class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin): _pipeline: Optional["Pipeline"] = PrivateAttr(...) _prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None) + _logits_processor: Optional["LogitsProcessorList"] = PrivateAttr(default=None) def load(self) -> None: """Loads the model and tokenizer and creates the text generation pipeline. In addition, @@ -119,7 +120,7 @@ def load(self) -> None: CudaDevicePlacementMixin.load(self) try: - from transformers import pipeline + from transformers import LogitsProcessorList, pipeline except ImportError as ie: raise ImportError( "Transformers is not installed. Please install it using `pip install transformers`." @@ -149,10 +150,20 @@ def load(self) -> None: self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore if self.structured_output: - self._prefix_allowed_tokens_fn = self._prepare_structured_output( - self.structured_output + from distilabel.steps.tasks.structured_outputs.outlines import ( + outlines_below_0_1_0, ) + if outlines_below_0_1_0: + self._prefix_allowed_tokens_fn = self._prepare_structured_output( + self.structured_output + ) + else: + logits_processor = self._prepare_structured_output( + self.structured_output + ) + self._logits_processor = LogitsProcessorList([logits_processor]) + super().load() def unload(self) -> None: @@ -222,7 +233,7 @@ def generate( # type: ignore """ prepared_inputs = [self.prepare_input(input=input) for input in inputs] - outputs: List[List[Dict[str, str]]] = self._pipeline( # type: ignore + outputs: List[List[Dict[str, str]]] = self._pipeline( prepared_inputs, max_new_tokens=max_new_tokens, temperature=temperature, @@ -232,7 +243,8 @@ def generate( # type: ignore do_sample=do_sample, num_return_sequences=num_generations, prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, - pad_token_id=self._pipeline.tokenizer.eos_token_id, # type: ignore + logits_processor=self._logits_processor, + pad_token_id=self._pipeline.tokenizer.eos_token_id, ) llm_output = [ [generation["generated_text"] for generation in output] diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index fe561d11af..bda692f442 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -27,6 +27,7 @@ get_args, ) +import pkg_resources from pydantic import BaseModel from distilabel.errors import DistilabelUserError @@ -36,7 +37,11 @@ from 
distilabel.steps.tasks.typing import OutlinesStructuredOutputType Frameworks = Literal["transformers", "llamacpp", "vllm"] -"""Available frameworks for the structured output configuration. """ +# Available frameworks for the structured output configuration. +_outlines_version = pkg_resources.get_distribution("outlines").version +outlines_below_0_1_0 = pkg_resources.parse_version( + _outlines_version +) < pkg_resources.parse_version("0.1.0") def model_to_schema(schema: Type[BaseModel]) -> Dict[str, Any]: @@ -46,31 +51,56 @@ def model_to_schema(schema: Type[BaseModel]) -> Dict[str, Any]: def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: """Helper function to return the appropriate logits processor for the given framework.""" - if framework == "transformers": - from outlines.integrations.transformers import ( - JSONPrefixAllowedTokens, - RegexPrefixAllowedTokens, + if framework not in Frameworks.__args__: + raise DistilabelUserError( + f"Invalid framework '{framework}'. Must be one of {get_args(Frameworks)}", + page="sections/how_to_guides/advanced/structured_generation/", ) - return JSONPrefixAllowedTokens, RegexPrefixAllowedTokens + if outlines_below_0_1_0: + if framework == "transformers": + from outlines.integrations.transformers import ( + JSONPrefixAllowedTokens, + RegexPrefixAllowedTokens, + ) - if framework == "llamacpp": - from outlines.integrations.llamacpp import ( - JSONLogitsProcessor, - RegexLogitsProcessor, - ) + return JSONPrefixAllowedTokens, RegexPrefixAllowedTokens - return JSONLogitsProcessor, RegexLogitsProcessor + if framework == "llamacpp": + from outlines.integrations.llamacpp import ( + JSONLogitsProcessor, + RegexLogitsProcessor, + ) + + return JSONLogitsProcessor, RegexLogitsProcessor - if framework == "vllm": - from outlines.integrations.vllm import JSONLogitsProcessor, RegexLogitsProcessor + if framework == "vllm": + from outlines.integrations.vllm import ( + JSONLogitsProcessor, + RegexLogitsProcessor, + ) + + return JSONLogitsProcessor, RegexLogitsProcessor + else: + from outlines.processors import JSONLogitsProcessor, RegexLogitsProcessor return JSONLogitsProcessor, RegexLogitsProcessor - raise DistilabelUserError( - f"Invalid framework '{framework}'. 
Must be one of {get_args(Frameworks)}", - page="sections/how_to_guides/advanced/structured_generation/", - ) + +def _get_outlines_tokenizer_or_model(llm: Any, framework: Frameworks) -> Callable: + if not outlines_below_0_1_0: + if framework == "llamacpp": + from outlines.models.llamacpp import LlamaCppTokenizer + + return LlamaCppTokenizer(llm) + elif framework == "transformers": + from outlines.models.transformers import TransformerTokenizer + + return TransformerTokenizer(llm.tokenizer) + elif framework == "vllm": + return llm.get_tokenizer() + else: + return llm def prepare_guided_output( @@ -104,6 +134,8 @@ def prepare_guided_output( json_processor, regex_processor = _get_logits_processor(framework) + tokenizer_or_model = _get_outlines_tokenizer_or_model(llm, framework) + format = structured_output.get("format") schema = structured_output.get("schema") @@ -120,7 +152,7 @@ def prepare_guided_output( return { "processor": json_processor( schema, - llm, + tokenizer_or_model, whitespace_pattern=structured_output.get("whitespace_pattern"), ), "schema": schema_as_dict(schema), From 9dd4be972c220917c7769c5a1bf75c767540e41e Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 07:58:50 +0100 Subject: [PATCH 02/37] update tests --- tests/unit/steps/tasks/structured_outputs/test_outlines.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unit/steps/tasks/structured_outputs/test_outlines.py b/tests/unit/steps/tasks/structured_outputs/test_outlines.py index e4eb2025c8..fc6f9a2f7c 100644 --- a/tests/unit/steps/tasks/structured_outputs/test_outlines.py +++ b/tests/unit/steps/tasks/structured_outputs/test_outlines.py @@ -100,9 +100,6 @@ class DummyUserTest(BaseModel): } -@pytest.mark.skip( - reason="won't work until we update our code to work with `outlines>0.1.0`" -) class TestOutlinesIntegration: @pytest.mark.parametrize( "format, schema, prompt", From 3ce1ff3b0a22824a2e3c54f228b36e153020e5f9 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 09:07:50 +0100 Subject: [PATCH 03/37] fix passing tokenizer to regex processor as well --- .../steps/tasks/structured_outputs/outlines.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index bda692f442..52707dc720 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -88,7 +88,9 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: def _get_outlines_tokenizer_or_model(llm: Any, framework: Frameworks) -> Callable: - if not outlines_below_0_1_0: + if outlines_below_0_1_0: + return llm + else: if framework == "llamacpp": from outlines.models.llamacpp import LlamaCppTokenizer @@ -99,8 +101,6 @@ def _get_outlines_tokenizer_or_model(llm: Any, framework: Frameworks) -> Callabl return TransformerTokenizer(llm.tokenizer) elif framework == "vllm": return llm.get_tokenizer() - else: - return llm def prepare_guided_output( @@ -127,6 +127,7 @@ def prepare_guided_output( case of "json" will also include the schema as a dict, to simplify serialization and deserialization. """ + if not importlib.util.find_spec("outlines"): raise ImportError( "Outlines is not installed. Please install it using `pip install outlines`." 
@@ -159,7 +160,7 @@ def prepare_guided_output( } if format == "regex": - return {"processor": regex_processor(schema, llm)} + return {"processor": regex_processor(schema, tokenizer_or_model)} raise DistilabelUserError( f"Invalid format '{format}'. Must be either 'json' or 'regex'.", From d8d7b35ea9e39f572757d25ee9b7a555fe08d7bd Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 09:08:08 +0100 Subject: [PATCH 04/37] fix test by specifically passing None as token to transformersllm --- .../unit/steps/tasks/structured_outputs/test_outlines.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/unit/steps/tasks/structured_outputs/test_outlines.py b/tests/unit/steps/tasks/structured_outputs/test_outlines.py index fc6f9a2f7c..236c0954ba 100644 --- a/tests/unit/steps/tasks/structured_outputs/test_outlines.py +++ b/tests/unit/steps/tasks/structured_outputs/test_outlines.py @@ -20,6 +20,7 @@ from distilabel.models.llms.huggingface.transformers import TransformersLLM from distilabel.steps.tasks.structured_outputs.outlines import ( model_to_schema, + outlines_below_0_1_0, ) from distilabel.steps.tasks.typing import OutlinesStructuredOutputType @@ -171,6 +172,7 @@ def test_serialization( structured_output=OutlinesStructuredOutputType( format=format, schema=schema ), + token=None, ) llm.load() assert llm.dump() == dump @@ -179,4 +181,9 @@ def test_load_from_dict(self) -> None: llm = TransformersLLM.from_dict(DUMP_JSON) assert isinstance(llm, TransformersLLM) llm.load() - assert llm._prefix_allowed_tokens_fn is not None + if outlines_below_0_1_0: + assert llm._prefix_allowed_tokens_fn is not None + assert llm._logits_processor is None + else: + assert llm._prefix_allowed_tokens_fn is None + assert llm._logits_processor is not None From 2e0b42cade251f221d548336d28876d6a73d6bb5 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 09:21:54 +0100 Subject: [PATCH 05/37] fix tests by increeasing the temperature to avoid exploding beam search logic --- tests/unit/steps/tasks/structured_outputs/test_outlines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/steps/tasks/structured_outputs/test_outlines.py b/tests/unit/steps/tasks/structured_outputs/test_outlines.py index 236c0954ba..446967a2d5 100644 --- a/tests/unit/steps/tasks/structured_outputs/test_outlines.py +++ b/tests/unit/steps/tasks/structured_outputs/test_outlines.py @@ -136,7 +136,7 @@ def test_generation( prompt = [ [{"role": "system", "content": ""}, {"role": "user", "content": prompt}] ] - result = llm.generate(prompt, max_new_tokens=30) + result = llm.generate(prompt, max_new_tokens=30, temperature=0.7) assert isinstance(result, list) assert isinstance(result[0], dict) assert "generations" in result[0] and "statistics" in result[0] From 5ee7dce349a5b9abc91b814807721c23004e5873 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 13:17:37 +0100 Subject: [PATCH 06/37] fix logit processor assignment during generation --- src/distilabel/models/llms/llamacpp.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/distilabel/models/llms/llamacpp.py b/src/distilabel/models/llms/llamacpp.py index 822e5cea77..71d29aecb4 100644 --- a/src/distilabel/models/llms/llamacpp.py +++ b/src/distilabel/models/llms/llamacpp.py @@ -194,9 +194,7 @@ def load(self) -> None: ) if self.structured_output: - self._logits_processor = self._prepare_structured_output( - self.structured_output - ) + 
self._set_logits_processor(self.structured_output) if self.use_magpie_template or self.magpie_pre_query_template: if not self.tokenizer_id: @@ -223,6 +221,19 @@ def load(self) -> None: # out of the model name, which won't be available until the `Llama` instance is created. super().load() + def _set_logits_processor( + self, structured_output: Optional[OutlinesStructuredOutputType] = None + ) -> None: + from distilabel.steps.tasks.structured_outputs.outlines import ( + outlines_below_0_1_0, + ) + + processor = self._prepare_structured_output(structured_output) + if outlines_below_0_1_0: + self._logits_processor = processor + else: + self._logits_processor = [processor] + @property def model_name(self) -> str: """Returns the model name used for the LLM.""" @@ -341,9 +352,8 @@ def generate( # type: ignore # after each generation, so subsequent calls yield nothing. This is a workaround # until is fixed in the `llama_cpp` or `outlines` libraries. if structured_output: - self._logits_processor = self._prepare_structured_output( - structured_output - ) + self._set_logits_processor(structured_output) + if self.tokenizer_id is None: completion = self._generate_chat_completion( input, From 0d26a1e4b48d0a58578e94a4beece52ca77581d6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 12:30:08 +0000 Subject: [PATCH 07/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/distilabel/models/embeddings/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/distilabel/models/embeddings/__init__.py b/src/distilabel/models/embeddings/__init__.py index 9177298748..8d4dce0f7d 100644 --- a/src/distilabel/models/embeddings/__init__.py +++ b/src/distilabel/models/embeddings/__init__.py @@ -20,6 +20,7 @@ __all__ = [ "Embeddings", + "LlamaCppEmbeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", ] From 47e38dc0b36cfaf63e5e062fdca8946c2f2e7df1 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 17:42:17 +0100 Subject: [PATCH 08/37] add support transformers --- .../models/llms/huggingface/transformers.py | 151 ++++++++++++------ .../tasks/structured_outputs/outlines.py | 6 +- 2 files changed, 110 insertions(+), 47 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index 2fb99f1b45..faa29cb310 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -23,11 +23,15 @@ from distilabel.models.llms.utils import compute_tokens, prepare_output from distilabel.models.mixins.cuda_device_placement import CudaDevicePlacementMixin from distilabel.models.mixins.magpie import MagpieChatTemplateMixin +from distilabel.steps.tasks.structured_outputs.outlines import ( + outlines_below_0_1_0, +) from distilabel.steps.tasks.typing import OutlinesStructuredOutputType, StandardInput from distilabel.utils.huggingface import HF_TOKEN_ENV_VAR if TYPE_CHECKING: - from transformers import LogitsProcessorList, Pipeline + from outlines.models.transformers import Transformers + from transformers import LogitsProcessor, LogitsProcessorList, Pipeline from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils import PreTrainedTokenizer @@ -109,24 +113,38 @@ class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin): description="The structured output format to use 
across all the generations.", ) - _pipeline: Optional["Pipeline"] = PrivateAttr(...) + _pipeline: Optional[Union["Pipeline", "Transformers"]] = PrivateAttr(...) _prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None) - _logits_processor: Optional["LogitsProcessorList"] = PrivateAttr(default=None) - - def load(self) -> None: - """Loads the model and tokenizer and creates the text generation pipeline. In addition, - it will configure the tokenizer chat template.""" - if self.device == "cuda": - CudaDevicePlacementMixin.load(self) + _logits_processor: Optional[Union["LogitsProcessor", "LogitsProcessorList"]] = ( + PrivateAttr(default=None) + ) - try: - from transformers import LogitsProcessorList, pipeline - except ImportError as ie: - raise ImportError( - "Transformers is not installed. Please install it using `pip install transformers`." - ) from ie + def _set_outlines_pipeline(self): + from outlines.models.transformers import Transformers + from transformers import AutoModelForCausalLM, AutoTokenizer token = self.token.get_secret_value() if self.token is not None else self.token + model = AutoModelForCausalLM.from_pretrained( + self.model, + output_attentions=True, + token=token, + revision=self.revision, + torch_dtype=self.torch_dtype, + trust_remote_code=self.trust_remote_code, + device_map=self.device_map, + **(self.model_kwargs or {}), + ).to(self.device) + tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer or self.model, + token=token, + use_fast=self.use_fast, + revision=self.revision, + trust_remote_code=self.trust_remote_code, + ) + self._pipeline = Transformers(model, tokenizer) + + def _set_native_tf_pipeline(self): + from transformers import pipeline self._pipeline = pipeline( "text-generation", @@ -139,30 +157,44 @@ def load(self) -> None: use_fast=self.use_fast, device=self.device, device_map=self.device_map, - token=token, + token=self.token.get_secret_value() + if self.token is not None + else self.token, return_full_text=False, ) if self.chat_template is not None: - self._pipeline.tokenizer.chat_template = self.chat_template # type: ignore + self._pipeline.tokenizer.chat_template = self.chat_template - if self._pipeline.tokenizer.pad_token is None: # type: ignore - self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore + if self._pipeline.tokenizer.pad_token is None: + self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token - if self.structured_output: - from distilabel.steps.tasks.structured_outputs.outlines import ( - outlines_below_0_1_0, - ) + def load(self) -> None: + """Loads the model and tokenizer and creates the text generation pipeline. In addition, + it will configure the tokenizer chat template.""" + if self.device == "cuda": + CudaDevicePlacementMixin.load(self) + try: + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline # noqa + except ImportError as ie: + raise ImportError( + "Transformers is not installed. Please install it using `pip install transformers`." 
+ ) from ie + + if self.structured_output: if outlines_below_0_1_0: + self._set_native_tf_pipeline() self._prefix_allowed_tokens_fn = self._prepare_structured_output( self.structured_output ) else: - logits_processor = self._prepare_structured_output( + self._set_outlines_pipeline() + self._logits_processor = self._prepare_structured_output( self.structured_output ) - self._logits_processor = LogitsProcessorList([logits_processor]) + else: + self._set_native_tf_pipeline() super().load() @@ -186,12 +218,9 @@ def prepare_input(self, input: "StandardInput") -> str: Returns: The prompt to send to the LLM. """ - if self._pipeline.tokenizer.chat_template is None: # type: ignore - return input[0]["content"] - prompt: str = ( - self._pipeline.tokenizer.apply_chat_template( # type: ignore - input, # type: ignore + self._pipeline.tokenizer.tokenizer.apply_chat_template( + input, tokenize=False, add_generation_prompt=True, ) @@ -201,10 +230,10 @@ def prepare_input(self, input: "StandardInput") -> str: return super().apply_magpie_pre_query_template(prompt, input) @validate_call - def generate( # type: ignore + def generate( self, inputs: List[StandardInput], - num_generations: int = 1, + num_generations: int = 2, max_new_tokens: int = 128, temperature: float = 0.1, repetition_penalty: float = 1.1, @@ -231,21 +260,51 @@ def generate( # type: ignore Returns: A list of lists of strings containing the generated responses for each input. """ + prepared_inputs = [self.prepare_input(input=input) for input in inputs] - outputs: List[List[Dict[str, str]]] = self._pipeline( - prepared_inputs, - max_new_tokens=max_new_tokens, - temperature=temperature, - repetition_penalty=repetition_penalty, - top_p=top_p, - top_k=top_k, - do_sample=do_sample, - num_return_sequences=num_generations, - prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, - logits_processor=self._logits_processor, - pad_token_id=self._pipeline.tokenizer.eos_token_id, - ) + if self.structured_output and not outlines_below_0_1_0: + from outlines.models.transformers import ( + GenerationParameters, + SamplingParameters, + ) + + outputs = [ + [[] for _ in range(num_generations)] + for _ in range(len(prepared_inputs)) + ] + for idx_generation in range(num_generations): + generations = self._pipeline.generate( + prepared_inputs, + generation_parameters=GenerationParameters( + max_tokens=max_new_tokens, + stop_at=None, + seed=None, + ), + logits_processor=self._logits_processor, + sampling_parameters=SamplingParameters( + sampler="multinomial", + top_p=top_p, + top_k=top_k, + temperature=temperature, + ), + ) + for idx_sample, generation in enumerate(generations): + outputs[idx_sample][idx_generation] = {"generated_text": generation} + else: + outputs: List[List[Dict[str, str]]] = self._pipeline( + prepared_inputs, + max_new_tokens=max_new_tokens, + temperature=temperature, + repetition_penalty=repetition_penalty, + top_p=top_p, + top_k=top_k, + do_sample=do_sample, + num_return_sequences=num_generations, + prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, + pad_token_id=self._pipeline.tokenizer.eos_token_id, + ) + llm_output = [ [generation["generated_text"] for generation in output] for output in outputs @@ -295,7 +354,7 @@ def get_last_hidden_states( last_hidden_states = model(**input_ids)["last_hidden_state"] return [ - seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy() + seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy() # type: ignore for seq_last_hidden_state, attention_mask in zip( 
last_hidden_states, input_ids["attention_mask"], # type: ignore diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 52707dc720..1772761e4c 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -96,8 +96,12 @@ def _get_outlines_tokenizer_or_model(llm: Any, framework: Frameworks) -> Callabl return LlamaCppTokenizer(llm) elif framework == "transformers": - from outlines.models.transformers import TransformerTokenizer + from outlines.models.transformers import Transformers, TransformerTokenizer + if isinstance(llm, Transformers): + return llm.tokenizer + else: + return TransformerTokenizer(llm.tokenizer) return TransformerTokenizer(llm.tokenizer) elif framework == "vllm": return llm.get_tokenizer() From 66ac934bcd449b26dcbabff18f32f3656cd61733 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:42:33 +0000 Subject: [PATCH 09/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/distilabel/models/embeddings/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/distilabel/models/embeddings/__init__.py b/src/distilabel/models/embeddings/__init__.py index 8d4dce0f7d..a37b3d0985 100644 --- a/src/distilabel/models/embeddings/__init__.py +++ b/src/distilabel/models/embeddings/__init__.py @@ -21,6 +21,7 @@ __all__ = [ "Embeddings", "LlamaCppEmbeddings", + "LlamaCppEmbeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", ] From 61c353864692e92d2332bc1107905ea94903c201 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 17:49:21 +0100 Subject: [PATCH 10/37] remove duplicate import --- src/distilabel/models/embeddings/__init__.py | 1 - vllm | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) create mode 160000 vllm diff --git a/src/distilabel/models/embeddings/__init__.py b/src/distilabel/models/embeddings/__init__.py index a37b3d0985..8d4dce0f7d 100644 --- a/src/distilabel/models/embeddings/__init__.py +++ b/src/distilabel/models/embeddings/__init__.py @@ -21,7 +21,6 @@ __all__ = [ "Embeddings", "LlamaCppEmbeddings", - "LlamaCppEmbeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", ] diff --git a/vllm b/vllm new file mode 160000 index 0000000000..65097ca0af --- /dev/null +++ b/vllm @@ -0,0 +1 @@ +Subproject commit 65097ca0af5c1d7caa3d9d8224fa8b4790a5f7bc From 0738b27093cd0d2cffdc48064f61d45e3c76ddcb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:50:27 +0000 Subject: [PATCH 11/37] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/distilabel/models/embeddings/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/distilabel/models/embeddings/__init__.py b/src/distilabel/models/embeddings/__init__.py index 8d4dce0f7d..a37b3d0985 100644 --- a/src/distilabel/models/embeddings/__init__.py +++ b/src/distilabel/models/embeddings/__init__.py @@ -21,6 +21,7 @@ __all__ = [ "Embeddings", "LlamaCppEmbeddings", + "LlamaCppEmbeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", ] From 8e6613b825a31e57410e395a4d1dc8b0b4ec689d Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 17:50:38 +0100 Subject: [PATCH 12/37] remove duplicate --- src/distilabel/models/embeddings/__init__.py | 1 - 1 
file changed, 1 deletion(-) diff --git a/src/distilabel/models/embeddings/__init__.py b/src/distilabel/models/embeddings/__init__.py index 56e994390a..65eb00c469 100644 --- a/src/distilabel/models/embeddings/__init__.py +++ b/src/distilabel/models/embeddings/__init__.py @@ -22,7 +22,6 @@ __all__ = [ "Embeddings", "LlamaCppEmbeddings", - "LlamaCppEmbeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", ] From cb4c2ce42b25bdded64ab01340da160109670419 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 17:51:00 +0100 Subject: [PATCH 13/37] remove duplicate import --- src/distilabel/models/embeddings/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/distilabel/models/embeddings/__init__.py b/src/distilabel/models/embeddings/__init__.py index 56e994390a..65eb00c469 100644 --- a/src/distilabel/models/embeddings/__init__.py +++ b/src/distilabel/models/embeddings/__init__.py @@ -22,7 +22,6 @@ __all__ = [ "Embeddings", "LlamaCppEmbeddings", - "LlamaCppEmbeddings", "SentenceTransformerEmbeddings", "vLLMEmbeddings", ] From 7f20d9fcb3dd77db4a57c1b9d4df8f8504cab3e9 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 18:36:02 +0100 Subject: [PATCH 14/37] return content when nog chat template is present --- .../models/llms/huggingface/transformers.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index faa29cb310..16da2a1b9f 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -143,7 +143,7 @@ def _set_outlines_pipeline(self): ) self._pipeline = Transformers(model, tokenizer) - def _set_native_tf_pipeline(self): + def _set_transformers_pipeline(self): from transformers import pipeline self._pipeline = pipeline( @@ -182,9 +182,9 @@ def load(self) -> None: "Transformers is not installed. Please install it using `pip install transformers`." ) from ie - if self.structured_output: + if self.structured_output is not None: if outlines_below_0_1_0: - self._set_native_tf_pipeline() + self._set_transformers_pipeline() self._prefix_allowed_tokens_fn = self._prepare_structured_output( self.structured_output ) @@ -194,7 +194,7 @@ def load(self) -> None: self.structured_output ) else: - self._set_native_tf_pipeline() + self._set_transformers_pipeline() super().load() @@ -218,8 +218,16 @@ def prepare_input(self, input: "StandardInput") -> str: Returns: The prompt to send to the LLM. 
""" + if self._pipeline.tokenizer.chat_template is None: # type: ignore + return input[0]["content"] + + if self.structured_output and not outlines_below_0_1_0: + tokenizer = self._pipeline.tokenizer.tokenizer + else: + tokenizer = self._pipeline.tokenizer + prompt: str = ( - self._pipeline.tokenizer.tokenizer.apply_chat_template( + tokenizer.apply_chat_template( input, tokenize=False, add_generation_prompt=True, @@ -233,7 +241,7 @@ def prepare_input(self, input: "StandardInput") -> str: def generate( self, inputs: List[StandardInput], - num_generations: int = 2, + num_generations: int = 1, max_new_tokens: int = 128, temperature: float = 0.1, repetition_penalty: float = 1.1, @@ -263,7 +271,7 @@ def generate( prepared_inputs = [self.prepare_input(input=input) for input in inputs] - if self.structured_output and not outlines_below_0_1_0: + if self.structured_output is not None and not outlines_below_0_1_0: from outlines.models.transformers import ( GenerationParameters, SamplingParameters, From 61aa597fc251da43ec56f24dfe7f4b4f621691cf Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 18:39:52 +0100 Subject: [PATCH 15/37] refactor clean code --- src/distilabel/models/llms/huggingface/transformers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index 16da2a1b9f..a3bc6bd456 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -142,6 +142,8 @@ def _set_outlines_pipeline(self): trust_remote_code=self.trust_remote_code, ) self._pipeline = Transformers(model, tokenizer) + self._pipeline.tokenizer.chat_template = tokenizer.chat_template + self._pipeline.tokenizer.apply_chat_template = tokenizer.apply_chat_template def _set_transformers_pipeline(self): from transformers import pipeline @@ -221,13 +223,8 @@ def prepare_input(self, input: "StandardInput") -> str: if self._pipeline.tokenizer.chat_template is None: # type: ignore return input[0]["content"] - if self.structured_output and not outlines_below_0_1_0: - tokenizer = self._pipeline.tokenizer.tokenizer - else: - tokenizer = self._pipeline.tokenizer - prompt: str = ( - tokenizer.apply_chat_template( + self._pipeline.tokenizer.apply_chat_template( input, tokenize=False, add_generation_prompt=True, From b994f064eeba40bfa00b4a7dc540c44cafa1554a Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 18:41:36 +0100 Subject: [PATCH 16/37] chore refactor --- .../models/llms/huggingface/transformers.py | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index a3bc6bd456..b2a3c8a706 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -119,6 +119,35 @@ class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin): PrivateAttr(default=None) ) + def load(self) -> None: + """Loads the model and tokenizer and creates the text generation pipeline. In addition, + it will configure the tokenizer chat template.""" + if self.device == "cuda": + CudaDevicePlacementMixin.load(self) + + try: + from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline # noqa + except ImportError as ie: + raise ImportError( + "Transformers is not installed. 
Please install it using `pip install transformers`." + ) from ie + + if self.structured_output is not None: + if outlines_below_0_1_0: + self._set_transformers_pipeline() + self._prefix_allowed_tokens_fn = self._prepare_structured_output( + self.structured_output + ) + else: + self._set_outlines_pipeline() + self._logits_processor = self._prepare_structured_output( + self.structured_output + ) + else: + self._set_transformers_pipeline() + + super().load() + def _set_outlines_pipeline(self): from outlines.models.transformers import Transformers from transformers import AutoModelForCausalLM, AutoTokenizer @@ -171,35 +200,6 @@ def _set_transformers_pipeline(self): if self._pipeline.tokenizer.pad_token is None: self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token - def load(self) -> None: - """Loads the model and tokenizer and creates the text generation pipeline. In addition, - it will configure the tokenizer chat template.""" - if self.device == "cuda": - CudaDevicePlacementMixin.load(self) - - try: - from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline # noqa - except ImportError as ie: - raise ImportError( - "Transformers is not installed. Please install it using `pip install transformers`." - ) from ie - - if self.structured_output is not None: - if outlines_below_0_1_0: - self._set_transformers_pipeline() - self._prefix_allowed_tokens_fn = self._prepare_structured_output( - self.structured_output - ) - else: - self._set_outlines_pipeline() - self._logits_processor = self._prepare_structured_output( - self.structured_output - ) - else: - self._set_transformers_pipeline() - - super().load() - def unload(self) -> None: """Unloads the `vLLM` model.""" CudaDevicePlacementMixin.unload(self) From a47963d5460e69f292eb4b5d0060ff30c032afb0 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 18:45:06 +0100 Subject: [PATCH 17/37] refactor logic if else statement --- .../models/llms/huggingface/transformers.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index b2a3c8a706..d35a531c1b 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -268,7 +268,22 @@ def generate( prepared_inputs = [self.prepare_input(input=input) for input in inputs] - if self.structured_output is not None and not outlines_below_0_1_0: + if self.structured_output is None or ( + self.structured_output and outlines_below_0_1_0 + ): + outputs: List[List[Dict[str, str]]] = self._pipeline( + prepared_inputs, + max_new_tokens=max_new_tokens, + temperature=temperature, + repetition_penalty=repetition_penalty, + top_p=top_p, + top_k=top_k, + do_sample=do_sample, + num_return_sequences=num_generations, + prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, + pad_token_id=self._pipeline.tokenizer.eos_token_id, + ) + else: from outlines.models.transformers import ( GenerationParameters, SamplingParameters, @@ -296,19 +311,6 @@ def generate( ) for idx_sample, generation in enumerate(generations): outputs[idx_sample][idx_generation] = {"generated_text": generation} - else: - outputs: List[List[Dict[str, str]]] = self._pipeline( - prepared_inputs, - max_new_tokens=max_new_tokens, - temperature=temperature, - repetition_penalty=repetition_penalty, - top_p=top_p, - top_k=top_k, - do_sample=do_sample, - num_return_sequences=num_generations, - 
prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, - pad_token_id=self._pipeline.tokenizer.eos_token_id, - ) llm_output = [ [generation["generated_text"] for generation in output] From a0f8acd0e87dc08de66d70a27b6878bbd1965931 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 18:58:43 +0100 Subject: [PATCH 18/37] fix import when outlines is not present --- .../steps/tasks/structured_outputs/outlines.py | 12 +++++++----- vllm | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 1772761e4c..7a48f226e4 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -37,11 +37,13 @@ from distilabel.steps.tasks.typing import OutlinesStructuredOutputType Frameworks = Literal["transformers", "llamacpp", "vllm"] -# Available frameworks for the structured output configuration. -_outlines_version = pkg_resources.get_distribution("outlines").version -outlines_below_0_1_0 = pkg_resources.parse_version( - _outlines_version -) < pkg_resources.parse_version("0.1.0") + +if importlib.util.find_spec("outlines"): + outlines_below_0_1_0 = pkg_resources.parse_version( + pkg_resources.get_distribution("outlines").version + ) < pkg_resources.parse_version("0.1.0") +else: + outlines_below_0_1_0 = True def model_to_schema(schema: Type[BaseModel]) -> Dict[str, Any]: diff --git a/vllm b/vllm index 65097ca0af..9a228348d2 160000 --- a/vllm +++ b/vllm @@ -1 +1 @@ -Subproject commit 65097ca0af5c1d7caa3d9d8224fa8b4790a5f7bc +Subproject commit 9a228348d2f9a2c85dfc67d6b9fe883bf10a4680 From b41d6f07e8fea9f25b3d9f87999a9050b629bad3 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 19:46:34 +0100 Subject: [PATCH 19/37] chore pin transformers version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b203f7edf5..8b1d950c33 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ argilla = ["argilla >= 2.0.0", "ipython"] cohere = ["cohere >= 5.2.0"] groq = ["groq >= 0.4.1"] hf-inference-endpoints = ["huggingface_hub >= 0.22.0"] -hf-transformers = ["transformers >= 4.34.1", "torch >= 2.0.0"] +hf-transformers = ["transformers >= 4.34.1, < 4.45.0", "torch >= 2.0.0"] instructor = ["instructor >= 1.2.3"] litellm = ["litellm >= 1.30.0"] llama-cpp = ["llama-cpp-python >= 0.2.0"] From d2fdd4c6eecb6861480e723ea13a145a35187bb2 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 19:47:17 +0100 Subject: [PATCH 20/37] chore add context w.r.t. 
logit processor --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 8b1d950c33..1a4e48cb30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ argilla = ["argilla >= 2.0.0", "ipython"] cohere = ["cohere >= 5.2.0"] groq = ["groq >= 0.4.1"] hf-inference-endpoints = ["huggingface_hub >= 0.22.0"] +# logit processor breaks in transformers 4.45.0 hf-transformers = ["transformers >= 4.34.1, < 4.45.0", "torch >= 2.0.0"] instructor = ["instructor >= 1.2.3"] litellm = ["litellm >= 1.30.0"] From 2b8f634ee9e80c5742e77e175c2db4d2b34203f6 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 19:48:59 +0100 Subject: [PATCH 21/37] chore bump version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1a4e48cb30..2757a461e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,8 +78,8 @@ argilla = ["argilla >= 2.0.0", "ipython"] cohere = ["cohere >= 5.2.0"] groq = ["groq >= 0.4.1"] hf-inference-endpoints = ["huggingface_hub >= 0.22.0"] -# logit processor breaks in transformers 4.45.0 -hf-transformers = ["transformers >= 4.34.1, < 4.45.0", "torch >= 2.0.0"] +# logit processor breaks in transformers 4.47.0 +hf-transformers = ["transformers >= 4.34.1, < 4.47.0", "torch >= 2.0.0"] instructor = ["instructor >= 1.2.3"] litellm = ["litellm >= 1.30.0"] llama-cpp = ["llama-cpp-python >= 0.2.0"] From ed5f00f17012847f34aa5332bea0c317f79ec43a Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Thu, 9 Jan 2025 19:58:21 +0100 Subject: [PATCH 22/37] add simplification of transformers implementation --- .../models/llms/huggingface/transformers.py | 143 +++++------------- .../tasks/structured_outputs/outlines.py | 6 +- 2 files changed, 36 insertions(+), 113 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index d35a531c1b..ec178796ee 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -23,15 +23,12 @@ from distilabel.models.llms.utils import compute_tokens, prepare_output from distilabel.models.mixins.cuda_device_placement import CudaDevicePlacementMixin from distilabel.models.mixins.magpie import MagpieChatTemplateMixin -from distilabel.steps.tasks.structured_outputs.outlines import ( - outlines_below_0_1_0, -) +from distilabel.steps.tasks.structured_outputs.outlines import outlines_below_0_1_0 from distilabel.steps.tasks.typing import OutlinesStructuredOutputType, StandardInput from distilabel.utils.huggingface import HF_TOKEN_ENV_VAR if TYPE_CHECKING: - from outlines.models.transformers import Transformers - from transformers import LogitsProcessor, LogitsProcessorList, Pipeline + from transformers import Pipeline from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils import PreTrainedTokenizer @@ -113,11 +110,9 @@ class TransformersLLM(LLM, MagpieChatTemplateMixin, CudaDevicePlacementMixin): description="The structured output format to use across all the generations.", ) - _pipeline: Optional[Union["Pipeline", "Transformers"]] = PrivateAttr(...) + _pipeline: Optional["Pipeline"] = PrivateAttr(...) 
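    # Reviewer annotation, not part of the patch: `load()` populates exactly one of the two
    # structured-output hooks below: `_prefix_allowed_tokens_fn` when the installed outlines is
    # < 0.1.0, or `_logits_processor` when it is >= 0.1.0. Both are forwarded to the
    # `transformers` pipeline call, and the unused one stays `None`.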
_prefix_allowed_tokens_fn: Union[Callable, None] = PrivateAttr(default=None) - _logits_processor: Optional[Union["LogitsProcessor", "LogitsProcessorList"]] = ( - PrivateAttr(default=None) - ) + _logits_processor: Union[Callable, None] = PrivateAttr(default=None) def load(self) -> None: """Loads the model and tokenizer and creates the text generation pipeline. In addition, @@ -126,56 +121,13 @@ def load(self) -> None: CudaDevicePlacementMixin.load(self) try: - from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline # noqa + from transformers import pipeline except ImportError as ie: raise ImportError( "Transformers is not installed. Please install it using `pip install transformers`." ) from ie - if self.structured_output is not None: - if outlines_below_0_1_0: - self._set_transformers_pipeline() - self._prefix_allowed_tokens_fn = self._prepare_structured_output( - self.structured_output - ) - else: - self._set_outlines_pipeline() - self._logits_processor = self._prepare_structured_output( - self.structured_output - ) - else: - self._set_transformers_pipeline() - - super().load() - - def _set_outlines_pipeline(self): - from outlines.models.transformers import Transformers - from transformers import AutoModelForCausalLM, AutoTokenizer - token = self.token.get_secret_value() if self.token is not None else self.token - model = AutoModelForCausalLM.from_pretrained( - self.model, - output_attentions=True, - token=token, - revision=self.revision, - torch_dtype=self.torch_dtype, - trust_remote_code=self.trust_remote_code, - device_map=self.device_map, - **(self.model_kwargs or {}), - ).to(self.device) - tokenizer = AutoTokenizer.from_pretrained( - self.tokenizer or self.model, - token=token, - use_fast=self.use_fast, - revision=self.revision, - trust_remote_code=self.trust_remote_code, - ) - self._pipeline = Transformers(model, tokenizer) - self._pipeline.tokenizer.chat_template = tokenizer.chat_template - self._pipeline.tokenizer.apply_chat_template = tokenizer.apply_chat_template - - def _set_transformers_pipeline(self): - from transformers import pipeline self._pipeline = pipeline( "text-generation", @@ -188,17 +140,24 @@ def _set_transformers_pipeline(self): use_fast=self.use_fast, device=self.device, device_map=self.device_map, - token=self.token.get_secret_value() - if self.token is not None - else self.token, + token=token, return_full_text=False, ) if self.chat_template is not None: - self._pipeline.tokenizer.chat_template = self.chat_template + self._pipeline.tokenizer.chat_template = self.chat_template # type: ignore + + if self._pipeline.tokenizer.pad_token is None: # type: ignore + self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token # type: ignore + + if self.structured_output: + processor = self._prepare_structured_output(self.structured_output) + if outlines_below_0_1_0: + self._prefix_allowed_tokens_fn = processor + else: + self._logits_processor = [processor] - if self._pipeline.tokenizer.pad_token is None: - self._pipeline.tokenizer.pad_token = self._pipeline.tokenizer.eos_token + super().load() def unload(self) -> None: """Unloads the `vLLM` model.""" @@ -224,8 +183,8 @@ def prepare_input(self, input: "StandardInput") -> str: return input[0]["content"] prompt: str = ( - self._pipeline.tokenizer.apply_chat_template( - input, + self._pipeline.tokenizer.apply_chat_template( # type: ignore + input, # type: ignore tokenize=False, add_generation_prompt=True, ) @@ -235,7 +194,7 @@ def prepare_input(self, input: "StandardInput") -> str: return 
super().apply_magpie_pre_query_template(prompt, input) @validate_call - def generate( + def generate( # type: ignore self, inputs: List[StandardInput], num_generations: int = 1, @@ -265,53 +224,21 @@ def generate( Returns: A list of lists of strings containing the generated responses for each input. """ - prepared_inputs = [self.prepare_input(input=input) for input in inputs] - if self.structured_output is None or ( - self.structured_output and outlines_below_0_1_0 - ): - outputs: List[List[Dict[str, str]]] = self._pipeline( - prepared_inputs, - max_new_tokens=max_new_tokens, - temperature=temperature, - repetition_penalty=repetition_penalty, - top_p=top_p, - top_k=top_k, - do_sample=do_sample, - num_return_sequences=num_generations, - prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, - pad_token_id=self._pipeline.tokenizer.eos_token_id, - ) - else: - from outlines.models.transformers import ( - GenerationParameters, - SamplingParameters, - ) - - outputs = [ - [[] for _ in range(num_generations)] - for _ in range(len(prepared_inputs)) - ] - for idx_generation in range(num_generations): - generations = self._pipeline.generate( - prepared_inputs, - generation_parameters=GenerationParameters( - max_tokens=max_new_tokens, - stop_at=None, - seed=None, - ), - logits_processor=self._logits_processor, - sampling_parameters=SamplingParameters( - sampler="multinomial", - top_p=top_p, - top_k=top_k, - temperature=temperature, - ), - ) - for idx_sample, generation in enumerate(generations): - outputs[idx_sample][idx_generation] = {"generated_text": generation} - + outputs: List[List[Dict[str, str]]] = self._pipeline( # type: ignore + prepared_inputs, + max_new_tokens=max_new_tokens, + temperature=temperature, + repetition_penalty=repetition_penalty, + top_p=top_p, + top_k=top_k, + do_sample=do_sample, + num_return_sequences=num_generations, + prefix_allowed_tokens_fn=self._prefix_allowed_tokens_fn, + pad_token_id=self._pipeline.tokenizer.eos_token_id, + logits_processor=self._logits_processor, + ) llm_output = [ [generation["generated_text"] for generation in output] for output in outputs @@ -361,7 +288,7 @@ def get_last_hidden_states( last_hidden_states = model(**input_ids)["last_hidden_state"] return [ - seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy() # type: ignore + seq_last_hidden_state[attention_mask.bool(), :].detach().cpu().numpy() for seq_last_hidden_state, attention_mask in zip( last_hidden_states, input_ids["attention_mask"], # type: ignore diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 7a48f226e4..b0466766b2 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -98,12 +98,8 @@ def _get_outlines_tokenizer_or_model(llm: Any, framework: Frameworks) -> Callabl return LlamaCppTokenizer(llm) elif framework == "transformers": - from outlines.models.transformers import Transformers, TransformerTokenizer + from outlines.models.transformers import TransformerTokenizer - if isinstance(llm, Transformers): - return llm.tokenizer - else: - return TransformerTokenizer(llm.tokenizer) return TransformerTokenizer(llm.tokenizer) elif framework == "vllm": return llm.get_tokenizer() From 473de031a98bf66db34e24b16fea0bedb55ff993 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 10:12:01 +0100 Subject: [PATCH 23/37] Update .gitignore to exclude .DS_Store files and remove vllm subproject; 
delete unnecessary .DS_Store files from unit tests --- .gitignore | 1 + tests/unit/.DS_Store | Bin 6148 -> 0 bytes tests/unit/pipeline/.DS_Store | Bin 6148 -> 0 bytes vllm | 1 - 4 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 tests/unit/.DS_Store delete mode 100644 tests/unit/pipeline/.DS_Store delete mode 160000 vllm diff --git a/.gitignore b/.gitignore index d8337200af..1aab313fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,4 @@ venv.bak/ # Other *.log *.swp +.DS_Store diff --git a/tests/unit/.DS_Store b/tests/unit/.DS_Store deleted file mode 100644 index 213c7078d0b2be0d3b5775898e4f42658e5c59de..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKy-vh147TAwm2SC_F=Auv%o0vzWa27-YfU;uZvNHsSM9Sj5m!N8sYIUf?5V0Ii0_2_`o5&$UAXcgE}OGr*~%#NcW zJP@{2prx{x7;NbnPac;YM?*^|_Tqzm<&WZpb#=_2G@Ljah7JaTfhhxr)?LW`e~DkF zSmaMrq8AJV1OJQxo;7W=#7Ftv`sMTFu1#pyXd)6fiUEN=c?6&%=g5gV&7Z_ZTy`7{ UWfmFNbYMILl#o!tz%MZH4qEmw6aWAK diff --git a/tests/unit/pipeline/.DS_Store b/tests/unit/pipeline/.DS_Store deleted file mode 100644 index 37a1397976ae5b949966572accd4de98bb796f20..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeHKOHKnZ47J-XM(UXWziS@}?$DjOid0{^xb5eKWY#4no5DXkLFt*`L z>i;QzrNK-7VMvUEfneaDF~F00SugNXURyuBp48fec7Y}$enk`rbS=d|EyWnfK60d! cHlIYte0D5`vWnO>92f@yB_z6F;1?Kp24W;LTmS$7 diff --git a/vllm b/vllm deleted file mode 160000 index 9a228348d2..0000000000 --- a/vllm +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9a228348d2f9a2c85dfc67d6b9fe883bf10a4680 From 995e4d41b3cf8c050d0a7c3d119f0c22c517a59f Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 10:23:24 +0100 Subject: [PATCH 24/37] Refactor outlines version check and logits processor handling - Introduced a helper function to check if the 'outlines' package is installed and its version. - Updated the logic in `_get_logits_processor` to use the new version check, simplifying the processor selection based on the outlines version. - Adjusted the handling of tokenizers in `_get_tokenizer_from_model` to streamline the integration with different frameworks. - Modified `prepare_guided_output` to differentiate processing based on the outlines version, ensuring compatibility with both pre-0.1.0 and post-0.1.0 versions of the outlines package. --- .../tasks/structured_outputs/outlines.py | 162 ++++++++++-------- 1 file changed, 95 insertions(+), 67 deletions(-) diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index b0466766b2..9575b82ba1 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -38,12 +38,19 @@ Frameworks = Literal["transformers", "llamacpp", "vllm"] -if importlib.util.find_spec("outlines"): - outlines_below_0_1_0 = pkg_resources.parse_version( - pkg_resources.get_distribution("outlines").version - ) < pkg_resources.parse_version("0.1.0") -else: - outlines_below_0_1_0 = True + +def _outlines_version_below_0_1_0() -> bool: + """Helper function to check outlines availability and version. + + Returns: + bool: True if outlines is not installed or version is below 0.1.0 + """ + if not importlib.util.find_spec("outlines"): + raise ImportError( + "Outlines is not installed. Please install it using `pip install outlines`." 
+ ) + version = pkg_resources.get_distribution("outlines").version + return pkg_resources.parse_version(version) < pkg_resources.parse_version("0.1.0") def model_to_schema(schema: Type[BaseModel]) -> Dict[str, Any]: @@ -52,57 +59,66 @@ def model_to_schema(schema: Type[BaseModel]) -> Dict[str, Any]: def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: - """Helper function to return the appropriate logits processor for the given framework.""" - if framework not in Frameworks.__args__: + """Helper function to return the appropriate logits processors for the given framework.""" + if _outlines_version_below_0_1_0(): + processors = { + "transformers": ( + "outlines.integrations.transformers", + "JSONPrefixAllowedTokens", + "RegexPrefixAllowedTokens", + ), + "llamacpp": ( + "outlines.integrations.llamacpp", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), + "vllm": ( + "outlines.integrations.vllm", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), + } + else: + processors = { + "transformers": ( + "outlines.processors", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), + "llamacpp": ( + "outlines.processors", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), + "vllm": ( + "outlines.processors", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), + } + + if framework not in processors: raise DistilabelUserError( f"Invalid framework '{framework}'. Must be one of {get_args(Frameworks)}", page="sections/how_to_guides/advanced/structured_generation/", ) - if outlines_below_0_1_0: - if framework == "transformers": - from outlines.integrations.transformers import ( - JSONPrefixAllowedTokens, - RegexPrefixAllowedTokens, - ) + module_path, json_cls, regex_cls = processors[framework] + module = importlib.import_module(module_path) + return getattr(module, json_cls), getattr(module, regex_cls) - return JSONPrefixAllowedTokens, RegexPrefixAllowedTokens - if framework == "llamacpp": - from outlines.integrations.llamacpp import ( - JSONLogitsProcessor, - RegexLogitsProcessor, - ) +def _get_tokenizer_from_model(llm: Any, framework: Frameworks) -> Callable: + if framework == "llamacpp": + from outlines.models.llamacpp import LlamaCppTokenizer - return JSONLogitsProcessor, RegexLogitsProcessor + return LlamaCppTokenizer(llm) + elif framework == "transformers": + from outlines.models.transformers import TransformerTokenizer - if framework == "vllm": - from outlines.integrations.vllm import ( - JSONLogitsProcessor, - RegexLogitsProcessor, - ) - - return JSONLogitsProcessor, RegexLogitsProcessor - else: - from outlines.processors import JSONLogitsProcessor, RegexLogitsProcessor - - return JSONLogitsProcessor, RegexLogitsProcessor - - -def _get_outlines_tokenizer_or_model(llm: Any, framework: Frameworks) -> Callable: - if outlines_below_0_1_0: - return llm - else: - if framework == "llamacpp": - from outlines.models.llamacpp import LlamaCppTokenizer - - return LlamaCppTokenizer(llm) - elif framework == "transformers": - from outlines.models.transformers import TransformerTokenizer - - return TransformerTokenizer(llm.tokenizer) - elif framework == "vllm": - return llm.get_tokenizer() + return TransformerTokenizer(llm.tokenizer) + elif framework == "vllm": + return llm.get_tokenizer() def prepare_guided_output( @@ -130,15 +146,8 @@ def prepare_guided_output( and deserialization. """ - if not importlib.util.find_spec("outlines"): - raise ImportError( - "Outlines is not installed. Please install it using `pip install outlines`." 
- ) - json_processor, regex_processor = _get_logits_processor(framework) - tokenizer_or_model = _get_outlines_tokenizer_or_model(llm, framework) - format = structured_output.get("format") schema = structured_output.get("schema") @@ -151,18 +160,37 @@ def prepare_guided_output( elif isinstance(schema, str): format = "regex" - if format == "json": - return { - "processor": json_processor( - schema, - tokenizer_or_model, - whitespace_pattern=structured_output.get("whitespace_pattern"), - ), - "schema": schema_as_dict(schema), - } - - if format == "regex": - return {"processor": regex_processor(schema, tokenizer_or_model)} + if _outlines_version_below_0_1_0(): + # use the model/llm, processor is NOT a list + if format == "json": + return { + "processor": json_processor( + schema, + llm, + whitespace_pattern=structured_output.get("whitespace_pattern"), + ), + "schema": schema_as_dict(schema), + } + + if format == "regex": + return {"processor": regex_processor(schema, llm)} + else: + # use tokenizer, processor is a list + tokenizer = _get_tokenizer_from_model(llm, framework) + if format == "json": + return { + "processor": [ + json_processor( + schema, + tokenizer, + whitespace_pattern=structured_output.get("whitespace_pattern"), + ) + ], + "schema": schema_as_dict(schema), + } + + if format == "regex": + return {"processor": [regex_processor(schema, tokenizer)]} raise DistilabelUserError( f"Invalid format '{format}'. Must be either 'json' or 'regex'.", From 59604413fc16d4d8992752571dbba2e7ce5493bd Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 10:23:33 +0100 Subject: [PATCH 25/37] Refactor logits processor handling in LlamaCppLLM - Replaced the `_set_logits_processor` method with direct assignment of `_logits_processor` using `_prepare_structured_output`. - Simplified the logic for setting the logits processor in both the `load` and generation methods, enhancing code clarity and maintainability. --- src/distilabel/models/llms/llamacpp.py | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/distilabel/models/llms/llamacpp.py b/src/distilabel/models/llms/llamacpp.py index 71d29aecb4..822e5cea77 100644 --- a/src/distilabel/models/llms/llamacpp.py +++ b/src/distilabel/models/llms/llamacpp.py @@ -194,7 +194,9 @@ def load(self) -> None: ) if self.structured_output: - self._set_logits_processor(self.structured_output) + self._logits_processor = self._prepare_structured_output( + self.structured_output + ) if self.use_magpie_template or self.magpie_pre_query_template: if not self.tokenizer_id: @@ -221,19 +223,6 @@ def load(self) -> None: # out of the model name, which won't be available until the `Llama` instance is created. super().load() - def _set_logits_processor( - self, structured_output: Optional[OutlinesStructuredOutputType] = None - ) -> None: - from distilabel.steps.tasks.structured_outputs.outlines import ( - outlines_below_0_1_0, - ) - - processor = self._prepare_structured_output(structured_output) - if outlines_below_0_1_0: - self._logits_processor = processor - else: - self._logits_processor = [processor] - @property def model_name(self) -> str: """Returns the model name used for the LLM.""" @@ -352,8 +341,9 @@ def generate( # type: ignore # after each generation, so subsequent calls yield nothing. This is a workaround # until is fixed in the `llama_cpp` or `outlines` libraries. 
if structured_output: - self._set_logits_processor(structured_output) - + self._logits_processor = self._prepare_structured_output( + structured_output + ) if self.tokenizer_id is None: completion = self._generate_chat_completion( input, From cfac5743ded71d5f73e1f6ca598bec2c5c4f3c76 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 10:25:38 +0100 Subject: [PATCH 26/37] Refactor outlines import and logits processor handling in TransformersLLM - Updated the import statement for outlines to use the new helper function `_outlines_version_below_0_1_0`. - Simplified the logic for setting the `_logits_processor` based on the outlines version check, enhancing code clarity and maintainability. --- src/distilabel/models/llms/huggingface/transformers.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index ec178796ee..976534dbc7 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -23,7 +23,9 @@ from distilabel.models.llms.utils import compute_tokens, prepare_output from distilabel.models.mixins.cuda_device_placement import CudaDevicePlacementMixin from distilabel.models.mixins.magpie import MagpieChatTemplateMixin -from distilabel.steps.tasks.structured_outputs.outlines import outlines_below_0_1_0 +from distilabel.steps.tasks.structured_outputs.outlines import ( + _outlines_version_below_0_1_0, +) from distilabel.steps.tasks.typing import OutlinesStructuredOutputType, StandardInput from distilabel.utils.huggingface import HF_TOKEN_ENV_VAR @@ -152,10 +154,10 @@ def load(self) -> None: if self.structured_output: processor = self._prepare_structured_output(self.structured_output) - if outlines_below_0_1_0: + if _outlines_version_below_0_1_0(): self._prefix_allowed_tokens_fn = processor else: - self._logits_processor = [processor] + self._logits_processor = processor super().load() From 337876928449fe777b9027d94e9174f794eae4e9 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 10:29:10 +0100 Subject: [PATCH 27/37] Refactor outlines version check and update function naming - Renamed the helper function from `_outlines_version_below_0_1_0` to `_is_outlines_version_below_0_1_0` for clarity. - Updated all references to the renamed function across the codebase, ensuring consistent usage in the `TransformersLLM` class and related functions. - Enhanced code readability and maintainability by standardizing function naming conventions. 
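For quick reference, a minimal, self-contained sketch of the check being renamed here; it assumes `outlines` is already installed, while the shipped helper also reports availability, as its docstring in the diff below states:

import pkg_resources

def _is_outlines_version_below_0_1_0() -> bool:
    # True when the installed outlines predates the 0.1.0 API reorganisation.
    version = pkg_resources.get_distribution("outlines").version
    return pkg_resources.parse_version(version) < pkg_resources.parse_version("0.1.0")

Callers branch on this flag to pick between a `prefix_allowed_tokens_fn` (outlines < 0.1.0) and a logits-processor list (outlines >= 0.1.0), as the transformers diff below shows.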
--- src/distilabel/models/llms/huggingface/transformers.py | 4 ++-- src/distilabel/steps/tasks/structured_outputs/outlines.py | 6 +++--- tests/unit/steps/tasks/structured_outputs/test_outlines.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index 976534dbc7..61e3d09906 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -24,7 +24,7 @@ from distilabel.models.mixins.cuda_device_placement import CudaDevicePlacementMixin from distilabel.models.mixins.magpie import MagpieChatTemplateMixin from distilabel.steps.tasks.structured_outputs.outlines import ( - _outlines_version_below_0_1_0, + _is_outlines_version_below_0_1_0, ) from distilabel.steps.tasks.typing import OutlinesStructuredOutputType, StandardInput from distilabel.utils.huggingface import HF_TOKEN_ENV_VAR @@ -154,7 +154,7 @@ def load(self) -> None: if self.structured_output: processor = self._prepare_structured_output(self.structured_output) - if _outlines_version_below_0_1_0(): + if _is_outlines_version_below_0_1_0(): self._prefix_allowed_tokens_fn = processor else: self._logits_processor = processor diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 9575b82ba1..432717140e 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -39,7 +39,7 @@ Frameworks = Literal["transformers", "llamacpp", "vllm"] -def _outlines_version_below_0_1_0() -> bool: +def _is_outlines_version_below_0_1_0() -> bool: """Helper function to check outlines availability and version. 
Returns: @@ -60,7 +60,7 @@ def model_to_schema(schema: Type[BaseModel]) -> Dict[str, Any]: def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: """Helper function to return the appropriate logits processors for the given framework.""" - if _outlines_version_below_0_1_0(): + if _is_outlines_version_below_0_1_0(): processors = { "transformers": ( "outlines.integrations.transformers", @@ -160,7 +160,7 @@ def prepare_guided_output( elif isinstance(schema, str): format = "regex" - if _outlines_version_below_0_1_0(): + if _is_outlines_version_below_0_1_0(): # use the model/llm, processor is NOT a list if format == "json": return { diff --git a/tests/unit/steps/tasks/structured_outputs/test_outlines.py b/tests/unit/steps/tasks/structured_outputs/test_outlines.py index 446967a2d5..2812c2e48b 100644 --- a/tests/unit/steps/tasks/structured_outputs/test_outlines.py +++ b/tests/unit/steps/tasks/structured_outputs/test_outlines.py @@ -19,8 +19,8 @@ from distilabel.models.llms.huggingface.transformers import TransformersLLM from distilabel.steps.tasks.structured_outputs.outlines import ( + _is_outlines_version_below_0_1_0, model_to_schema, - outlines_below_0_1_0, ) from distilabel.steps.tasks.typing import OutlinesStructuredOutputType @@ -181,7 +181,7 @@ def test_load_from_dict(self) -> None: llm = TransformersLLM.from_dict(DUMP_JSON) assert isinstance(llm, TransformersLLM) llm.load() - if outlines_below_0_1_0: + if _is_outlines_version_below_0_1_0(): assert llm._prefix_allowed_tokens_fn is not None assert llm._logits_processor is None else: From d56b6bcc7d66de4c188797c107a5c6c08d6df416 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 11:02:55 +0100 Subject: [PATCH 28/37] Refactor processor handling in LlamaCppLLM and TransformersLLM based on outlines version - Introduced version check for outlines in both LlamaCppLLM and TransformersLLM to determine processor return type. - Updated `prepare_guided_output` to handle processor initialization differently for outlines versions below and above 0.1.0. - Enhanced tokenizer handling in `_get_tokenizer_from_model` to support multiple frameworks, ensuring compatibility and improved functionality. --- .../models/llms/huggingface/transformers.py | 6 +- src/distilabel/models/llms/llamacpp.py | 6 +- .../tasks/structured_outputs/outlines.py | 59 +++++++++---------- 3 files changed, 39 insertions(+), 32 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index 61e3d09906..ad79a0d936 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -309,6 +309,7 @@ def _prepare_structured_output( The callable that will be used to guide the generation of the model. 
""" from distilabel.steps.tasks.structured_outputs.outlines import ( + _is_outlines_version_below_0_1_0, prepare_guided_output, ) @@ -317,4 +318,7 @@ def _prepare_structured_output( ) if schema := result.get("schema"): self.structured_output["schema"] = schema - return result["processor"] + if _is_outlines_version_below_0_1_0(): + return result["processor"] + else: + return [result["processor"]] diff --git a/src/distilabel/models/llms/llamacpp.py b/src/distilabel/models/llms/llamacpp.py index 822e5cea77..19715836f1 100644 --- a/src/distilabel/models/llms/llamacpp.py +++ b/src/distilabel/models/llms/llamacpp.py @@ -393,10 +393,14 @@ def _prepare_structured_output( The callable that will be used to guide the generation of the model. """ from distilabel.steps.tasks.structured_outputs.outlines import ( + _is_outlines_version_below_0_1_0, prepare_guided_output, ) result = prepare_guided_output(structured_output, "llamacpp", self._model) if (schema := result.get("schema")) and self.structured_output: self.structured_output["schema"] = schema - return result["processor"] + if _is_outlines_version_below_0_1_0(): + return result["processor"] + else: + return [result["processor"]] diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 432717140e..52139a1e80 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -24,6 +24,7 @@ Literal, Tuple, Type, + Union, get_args, ) @@ -34,6 +35,10 @@ from distilabel.steps.tasks.structured_outputs.utils import schema_as_dict if TYPE_CHECKING: + from llama_cpp import Llama + from transformers import Pipeline + from vllm import LLM + from distilabel.steps.tasks.typing import OutlinesStructuredOutputType Frameworks = Literal["transformers", "llamacpp", "vllm"] @@ -108,7 +113,9 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: return getattr(module, json_cls), getattr(module, regex_cls) -def _get_tokenizer_from_model(llm: Any, framework: Frameworks) -> Callable: +def _get_tokenizer_from_model( + llm: Union["LLM", "Pipeline", "Llama"], framework: Frameworks +) -> Callable: if framework == "llamacpp": from outlines.models.llamacpp import LlamaCppTokenizer @@ -118,7 +125,9 @@ def _get_tokenizer_from_model(llm: Any, framework: Frameworks) -> Callable: return TransformerTokenizer(llm.tokenizer) elif framework == "vllm": - return llm.get_tokenizer() + from outlines.models.vllm import adapt_tokenizer + + return adapt_tokenizer(llm.get_tokenizer()) def prepare_guided_output( @@ -161,36 +170,26 @@ def prepare_guided_output( format = "regex" if _is_outlines_version_below_0_1_0(): - # use the model/llm, processor is NOT a list - if format == "json": - return { - "processor": json_processor( - schema, - llm, - whitespace_pattern=structured_output.get("whitespace_pattern"), - ), - "schema": schema_as_dict(schema), - } - - if format == "regex": - return {"processor": regex_processor(schema, llm)} + # use the llm for processor initialization + model = llm + tokenizer = None else: - # use tokenizer, processor is a list + # use the tokenizer for processor initialization + model = None tokenizer = _get_tokenizer_from_model(llm, framework) - if format == "json": - return { - "processor": [ - json_processor( - schema, - tokenizer, - whitespace_pattern=structured_output.get("whitespace_pattern"), - ) - ], - "schema": schema_as_dict(schema), - } - - if format == "regex": - return {"processor": 
[regex_processor(schema, tokenizer)]} + + if format == "json": + return { + "processor": json_processor( + schema, + model or tokenizer, + whitespace_pattern=structured_output.get("whitespace_pattern"), + ), + "schema": schema_as_dict(schema), + } + + if format == "regex": + return {"processor": regex_processor(schema, llm)} raise DistilabelUserError( f"Invalid format '{format}'. Must be either 'json' or 'regex'.", From 4056f0898efa184a3588e837f888aa10bf8edf62 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 12:16:10 +0100 Subject: [PATCH 29/37] Refactor structured output return types in LlamaCppLLM, MlxLLM, and TransformersLLM - Updated return types of `_prepare_structured_output` methods to reflect changes in processor handling. - Changed return type in LlamaCppLLM from `Union["LogitsProcessorList", None]` to `Union["LogitsProcessorList", "LogitsProcessor"]`. - Modified MlxLLM and TransformersLLM to return `Union[List[Callable], Callable>` instead of `Union[Callable, None]`, ensuring consistency across implementations. - Enhanced code clarity and maintainability by standardizing output handling in structured output preparation. --- src/distilabel/models/llms/huggingface/transformers.py | 2 +- src/distilabel/models/llms/llamacpp.py | 9 +++++++-- src/distilabel/models/llms/mlx.py | 4 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index ad79a0d936..d84290fd3b 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -299,7 +299,7 @@ def get_last_hidden_states( def _prepare_structured_output( self, structured_output: Optional[OutlinesStructuredOutputType] = None - ) -> Union[Callable, None]: + ) -> Union[Callable, List[Callable]]: """Creates the appropriate function to filter tokens to generate structured outputs. Args: diff --git a/src/distilabel/models/llms/llamacpp.py b/src/distilabel/models/llms/llamacpp.py index 19715836f1..8d90502b93 100644 --- a/src/distilabel/models/llms/llamacpp.py +++ b/src/distilabel/models/llms/llamacpp.py @@ -24,7 +24,12 @@ from distilabel.steps.tasks.typing import FormattedInput, OutlinesStructuredOutputType if TYPE_CHECKING: - from llama_cpp import CreateChatCompletionResponse, Llama, LogitsProcessorList + from llama_cpp import ( + CreateChatCompletionResponse, + Llama, + LogitsProcessor, + LogitsProcessorList, + ) from distilabel.steps.tasks.typing import FormattedInput, StandardInput @@ -383,7 +388,7 @@ def generate( # type: ignore def _prepare_structured_output( self, structured_output: Optional[OutlinesStructuredOutputType] = None - ) -> Union["LogitsProcessorList", None]: + ) -> Union["LogitsProcessorList", "LogitsProcessor"]: """Creates the appropriate function to filter tokens to generate structured outputs. Args: diff --git a/src/distilabel/models/llms/mlx.py b/src/distilabel/models/llms/mlx.py index 4ffcceddab..c754f40d5f 100644 --- a/src/distilabel/models/llms/mlx.py +++ b/src/distilabel/models/llms/mlx.py @@ -267,7 +267,7 @@ def generate( def _prepare_structured_output( self, structured_output: Optional[OutlinesStructuredOutputType] = None - ) -> Union[Callable, None]: + ) -> Union[List[Callable], Callable]: """Creates the appropriate function to filter tokens to generate structured outputs. 
Args: @@ -285,4 +285,4 @@ def _prepare_structured_output( ) if schema := result.get("schema"): self.structured_output["schema"] = schema - return result["processor"] + return [result["processor"]] From 11a7957c81a489588e2c0403ff46c18026a58b04 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 13:07:02 +0100 Subject: [PATCH 30/37] Enhance MlxLLM integration and expand framework support - Added support for the 'mlx' framework in the outlines processing logic. - Updated the `prepare_guided_output` function to utilize `TransformerTokenizer` for 'mlx' framework. - Modified the `_get_logits_processor` and `_get_tokenizer_from_model` functions to include 'mlx' as a valid framework option, ensuring consistent handling across different frameworks. - Improved code clarity and maintainability by standardizing framework handling in the structured output preparation process. --- src/distilabel/models/llms/mlx.py | 4 +++- .../tasks/structured_outputs/outlines.py | 24 +++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/distilabel/models/llms/mlx.py b/src/distilabel/models/llms/mlx.py index c754f40d5f..d78ce0b5c8 100644 --- a/src/distilabel/models/llms/mlx.py +++ b/src/distilabel/models/llms/mlx.py @@ -276,12 +276,14 @@ def _prepare_structured_output( Returns: The callable that will be used to guide the generation of the model. """ + from outlines.models.mlxlm import TransformerTokenizer + from distilabel.steps.tasks.structured_outputs.outlines import ( prepare_guided_output, ) result = prepare_guided_output( - structured_output, "transformers", self._pipeline + structured_output, "mlx", TransformerTokenizer(self._tokenizer._tokenizer) ) if schema := result.get("schema"): self.structured_output["schema"] = schema diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 52139a1e80..e78f1f13e8 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -36,12 +36,13 @@ if TYPE_CHECKING: from llama_cpp import Llama + from outlines.models.mlxlm import TransformerTokenizer from transformers import Pipeline from vllm import LLM from distilabel.steps.tasks.typing import OutlinesStructuredOutputType -Frameworks = Literal["transformers", "llamacpp", "vllm"] +Frameworks = Literal["transformers", "llamacpp", "vllm", "mlx"] def _is_outlines_version_below_0_1_0() -> bool: @@ -82,6 +83,11 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: "JSONLogitsProcessor", "RegexLogitsProcessor", ), + "mlx": ( + "outlines.processors.mlxlm", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), } else: processors = { @@ -100,6 +106,11 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: "JSONLogitsProcessor", "RegexLogitsProcessor", ), + "mlx": ( + "outlines.processors", + "JSONLogitsProcessor", + "RegexLogitsProcessor", + ), } if framework not in processors: @@ -114,26 +125,29 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: def _get_tokenizer_from_model( - llm: Union["LLM", "Pipeline", "Llama"], framework: Frameworks + llm: Union["LLM", "Pipeline", "Llama", "TransformerTokenizer"], + framework: Frameworks, ) -> Callable: if framework == "llamacpp": from outlines.models.llamacpp import LlamaCppTokenizer return LlamaCppTokenizer(llm) - elif framework == "transformers": + if framework == "transformers": from 
outlines.models.transformers import TransformerTokenizer return TransformerTokenizer(llm.tokenizer) - elif framework == "vllm": + if framework == "vllm": from outlines.models.vllm import adapt_tokenizer return adapt_tokenizer(llm.get_tokenizer()) + if framework == "mlx": + return llm def prepare_guided_output( structured_output: "OutlinesStructuredOutputType", framework: Frameworks, - llm: Any, + llm: Union["LLM", "Pipeline", "Llama", "TransformerTokenizer"], ) -> Dict[str, Any]: """Prepares the `LLM` to generate guided output using `outlines`. From e9fefc4c0553e441890757085a246ad9195c1195 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 15:10:40 +0100 Subject: [PATCH 31/37] Refactor structured output handling in LlamaCppLLM and MlxLLM - Simplified return types in LlamaCppLLM and MlxLLM by removing version checks and directly returning the processor. - Enhanced code clarity and maintainability by standardizing the output structure across both classes. - Updated `prepare_guided_output` usage to ensure consistent handling of structured outputs. --- src/distilabel/models/llms/llamacpp.py | 6 +----- src/distilabel/models/llms/mlx.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/distilabel/models/llms/llamacpp.py b/src/distilabel/models/llms/llamacpp.py index 8d90502b93..a754f6b84f 100644 --- a/src/distilabel/models/llms/llamacpp.py +++ b/src/distilabel/models/llms/llamacpp.py @@ -398,14 +398,10 @@ def _prepare_structured_output( The callable that will be used to guide the generation of the model. """ from distilabel.steps.tasks.structured_outputs.outlines import ( - _is_outlines_version_below_0_1_0, prepare_guided_output, ) result = prepare_guided_output(structured_output, "llamacpp", self._model) if (schema := result.get("schema")) and self.structured_output: self.structured_output["schema"] = schema - if _is_outlines_version_below_0_1_0(): - return result["processor"] - else: - return [result["processor"]] + return [result["processor"]] diff --git a/src/distilabel/models/llms/mlx.py b/src/distilabel/models/llms/mlx.py index d78ce0b5c8..5d510771d7 100644 --- a/src/distilabel/models/llms/mlx.py +++ b/src/distilabel/models/llms/mlx.py @@ -287,4 +287,4 @@ def _prepare_structured_output( ) if schema := result.get("schema"): self.structured_output["schema"] = schema - return [result["processor"]] + return result["processor"] From df24685d2332f8fe17a71d452703a2a08b0dad9b Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 15:52:49 +0100 Subject: [PATCH 32/37] Refactor MlxLLM structured output handling and remove unused components - Removed the `structured_output` attribute and related processing logic from MlxLLM to simplify the class structure. - Updated the `load` and generation methods to eliminate references to structured output, enhancing clarity and maintainability. - Adjusted imports and type hints in `outlines.py` to reflect the removal of 'mlx' framework support, streamlining the framework handling. - Improved code readability by cleaning up unnecessary complexity in structured output preparation. 
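For orientation, with the 'mlx' branch from PATCH 30 removed again, the tokenizer adapter in outlines.py reads roughly as follows (reconstructed from the context lines of the diff below, type hints omitted; not itself part of this patch):

def _get_tokenizer_from_model(llm, framework):
    # Adapt each supported backend to the tokenizer interface expected by outlines >= 0.1.0.
    if framework == "llamacpp":
        from outlines.models.llamacpp import LlamaCppTokenizer

        return LlamaCppTokenizer(llm)
    if framework == "transformers":
        from outlines.models.transformers import TransformerTokenizer

        return TransformerTokenizer(llm.tokenizer)
    if framework == "vllm":
        from outlines.models.vllm import adapt_tokenizer

        return adapt_tokenizer(llm.get_tokenizer())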
--- src/distilabel/models/llms/mlx.py | 54 +------------------ .../tasks/structured_outputs/outlines.py | 19 ++----- 2 files changed, 5 insertions(+), 68 deletions(-) diff --git a/src/distilabel/models/llms/mlx.py b/src/distilabel/models/llms/mlx.py index 5d510771d7..1f8c9b8c65 100644 --- a/src/distilabel/models/llms/mlx.py +++ b/src/distilabel/models/llms/mlx.py @@ -19,22 +19,18 @@ Dict, List, Optional, - Union, ) from pydantic import ( - Field, PrivateAttr, validate_call, ) -from distilabel.mixins.runtime_parameters import RuntimeParameter from distilabel.models.llms.base import LLM from distilabel.models.llms.typing import GenerateOutput from distilabel.models.llms.utils import compute_tokens, prepare_output from distilabel.models.mixins.magpie import MagpieChatTemplateMixin from distilabel.steps.tasks.typing import ( - OutlinesStructuredOutputType, StandardInput, ) @@ -51,8 +47,6 @@ class MlxLLM(LLM, MagpieChatTemplateMixin): tokenizer_config: the tokenizer configuration. model_config: the model configuration. adapter_path: the path to the adapter. - structured_output: a dictionary containing the structured output configuration or if more - fine-grained control is needed, an instance of `OutlinesStructuredOutput`. Defaults to None. use_magpie_template: a flag used to enable/disable applying the Magpie pre-query template. Defaults to `False`. magpie_pre_query_template: the pre-query template to be applied to the prompt or @@ -82,17 +76,10 @@ class MlxLLM(LLM, MagpieChatTemplateMixin): tokenizer_config: Dict[str, Any] = {} model_config: Dict[str, Any] = {} adapter_path: Optional[str] = None - structured_output: Optional[RuntimeParameter[OutlinesStructuredOutputType]] = Field( - default=None, - description="The structured output format to use across all the generations.", - ) _mlx_generate: Optional[Callable] = PrivateAttr(default=None) _model: Optional["nn.Module"] = PrivateAttr(...) _tokenizer: Optional["TokenizerWrapper"] = PrivateAttr(...) - _structured_output_logits_processor: Union[Callable, None] = PrivateAttr( - default=None - ) def load(self) -> None: """Loads the model and tokenizer and creates the text generation pipeline. In addition, @@ -112,11 +99,6 @@ def load(self) -> None: adapter_path=self.adapter_path, ) - if self.structured_output: - self._structured_output_logits_processor = self._prepare_structured_output( - self.structured_output - ) - if self._tokenizer.pad_token is None: self._tokenizer.pad_token = self._tokenizer.eos_token @@ -207,10 +189,6 @@ def generate( Returns: A list of lists of strings containing the generated responses for each input. 
""" - logits_processors = [] - if self._structured_output_logits_processor: - logits_processors.append(self._structured_output_logits_processor) - structured_output = None result = [] for input in inputs: @@ -219,13 +197,9 @@ def generate( output: List[str] = [] for _ in range(num_generations): - if structured_output: - additional_logits_processors = self._prepare_structured_output( - structured_output - ) - logits_processors.append(additional_logits_processors) + if structured_output: # will raise a NotImplementedError + self._prepare_structured_output(structured_output) prompt = self.prepare_input(input) - generation = self._mlx_generate( prompt=prompt, model=self._model, @@ -264,27 +238,3 @@ def generate( ) ) return result - - def _prepare_structured_output( - self, structured_output: Optional[OutlinesStructuredOutputType] = None - ) -> Union[List[Callable], Callable]: - """Creates the appropriate function to filter tokens to generate structured outputs. - - Args: - structured_output: the configuration dict to prepare the structured output. - - Returns: - The callable that will be used to guide the generation of the model. - """ - from outlines.models.mlxlm import TransformerTokenizer - - from distilabel.steps.tasks.structured_outputs.outlines import ( - prepare_guided_output, - ) - - result = prepare_guided_output( - structured_output, "mlx", TransformerTokenizer(self._tokenizer._tokenizer) - ) - if schema := result.get("schema"): - self.structured_output["schema"] = schema - return result["processor"] diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index e78f1f13e8..71d88a41cf 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -36,13 +36,12 @@ if TYPE_CHECKING: from llama_cpp import Llama - from outlines.models.mlxlm import TransformerTokenizer from transformers import Pipeline from vllm import LLM from distilabel.steps.tasks.typing import OutlinesStructuredOutputType -Frameworks = Literal["transformers", "llamacpp", "vllm", "mlx"] +Frameworks = Literal["transformers", "llamacpp", "vllm"] def _is_outlines_version_below_0_1_0() -> bool: @@ -83,11 +82,6 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: "JSONLogitsProcessor", "RegexLogitsProcessor", ), - "mlx": ( - "outlines.processors.mlxlm", - "JSONLogitsProcessor", - "RegexLogitsProcessor", - ), } else: processors = { @@ -106,11 +100,6 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: "JSONLogitsProcessor", "RegexLogitsProcessor", ), - "mlx": ( - "outlines.processors", - "JSONLogitsProcessor", - "RegexLogitsProcessor", - ), } if framework not in processors: @@ -125,7 +114,7 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: def _get_tokenizer_from_model( - llm: Union["LLM", "Pipeline", "Llama", "TransformerTokenizer"], + llm: Union["LLM", "Pipeline", "Llama"], framework: Frameworks, ) -> Callable: if framework == "llamacpp": @@ -140,14 +129,12 @@ def _get_tokenizer_from_model( from outlines.models.vllm import adapt_tokenizer return adapt_tokenizer(llm.get_tokenizer()) - if framework == "mlx": - return llm def prepare_guided_output( structured_output: "OutlinesStructuredOutputType", framework: Frameworks, - llm: Union["LLM", "Pipeline", "Llama", "TransformerTokenizer"], + llm: Union["LLM", "Pipeline", "Llama"], ) -> Dict[str, Any]: """Prepares the `LLM` to generate guided 
output using `outlines`. From 65272bd894591ff79d807fd9b65a917beb3cff17 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 16:02:43 +0100 Subject: [PATCH 33/37] Refactor logits processor handling in TransformersLLM - Changed the assignment of `_logits_processor` to always use a list, ensuring consistent handling across different outlines versions. - Removed the version check for outlines in the `load` method, simplifying the logic and enhancing maintainability. - Updated the return type in the structured output preparation to directly return the processor, improving code clarity. --- src/distilabel/models/llms/huggingface/transformers.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/distilabel/models/llms/huggingface/transformers.py b/src/distilabel/models/llms/huggingface/transformers.py index d84290fd3b..f4475dc1b4 100644 --- a/src/distilabel/models/llms/huggingface/transformers.py +++ b/src/distilabel/models/llms/huggingface/transformers.py @@ -157,7 +157,7 @@ def load(self) -> None: if _is_outlines_version_below_0_1_0(): self._prefix_allowed_tokens_fn = processor else: - self._logits_processor = processor + self._logits_processor = [processor] super().load() @@ -309,7 +309,6 @@ def _prepare_structured_output( The callable that will be used to guide the generation of the model. """ from distilabel.steps.tasks.structured_outputs.outlines import ( - _is_outlines_version_below_0_1_0, prepare_guided_output, ) @@ -318,7 +317,4 @@ def _prepare_structured_output( ) if schema := result.get("schema"): self.structured_output["schema"] = schema - if _is_outlines_version_below_0_1_0(): - return result["processor"] - else: - return [result["processor"]] + return result["processor"] From d2eda4ee8853d6a98e6b884969df5d3b62c8bfe3 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 16:23:10 +0100 Subject: [PATCH 34/37] Refactor type hints in outlines.py for improved clarity - Updated type hints for the `llm` parameter in `_get_tokenizer_from_model` and `prepare_guided_output` functions to use `_vLLM` instead of `LLM`, enhancing code readability. - Adjusted imports to reflect the new alias for `LLM`, streamlining the code structure. --- src/distilabel/steps/tasks/structured_outputs/outlines.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 71d88a41cf..5bc16ae9df 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -37,7 +37,7 @@ if TYPE_CHECKING: from llama_cpp import Llama from transformers import Pipeline - from vllm import LLM + from vllm import LLM as _vLLM from distilabel.steps.tasks.typing import OutlinesStructuredOutputType @@ -114,7 +114,7 @@ def _get_logits_processor(framework: Frameworks) -> Tuple[Callable, Callable]: def _get_tokenizer_from_model( - llm: Union["LLM", "Pipeline", "Llama"], + llm: Union["_vLLM", "Pipeline", "Llama"], framework: Frameworks, ) -> Callable: if framework == "llamacpp": @@ -134,7 +134,7 @@ def _get_tokenizer_from_model( def prepare_guided_output( structured_output: "OutlinesStructuredOutputType", framework: Frameworks, - llm: Union["LLM", "Pipeline", "Llama"], + llm: Union["_vLLM", "Pipeline", "Llama"], ) -> Dict[str, Any]: """Prepares the `LLM` to generate guided output using `outlines`. 
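Before the final cleanups, an illustrative end-to-end call into this helper may be useful; the pipeline, model id, and schema below are placeholders and not taken from the patches:

from pydantic import BaseModel
from transformers import pipeline
from distilabel.steps.tasks.structured_outputs.outlines import prepare_guided_output

class User(BaseModel):
    name: str
    age: int

# Placeholder model; any text-generation pipeline with a tokenizer works the same way.
pipe = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-135M-Instruct")

result = prepare_guided_output({"format": "json", "schema": User}, "transformers", pipe)
# result["schema"] is the JSON schema dict; result["processor"] is a prefix-allowed-tokens
# callable on outlines < 0.1.0 and a logits processor on outlines >= 0.1.0, which
# TransformersLLM wraps in a list before handing it to the pipeline.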
From 85494c46d2997a40d7fafbdc857e2256cd1ae665 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 16:29:22 +0100 Subject: [PATCH 35/37] Refactor type hint imports in outlines.py for improved clarity - Updated type hint imports to include `# noqa` comments, enhancing code readability and maintaining consistency with type checking. - No functional changes were made; this commit focuses on code structure and clarity. --- .../steps/tasks/structured_outputs/outlines.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index 5bc16ae9df..ab62d63f56 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -34,12 +34,12 @@ from distilabel.errors import DistilabelUserError from distilabel.steps.tasks.structured_outputs.utils import schema_as_dict -if TYPE_CHECKING: - from llama_cpp import Llama - from transformers import Pipeline - from vllm import LLM as _vLLM +if TYPE_CHECKING: # noqa + from llama_cpp import Llama # noqa + from transformers import Pipeline # noqa + from vllm import LLM as _vLLM # noqa - from distilabel.steps.tasks.typing import OutlinesStructuredOutputType + from distilabel.steps.tasks.typing import OutlinesStructuredOutputType # noqa Frameworks = Literal["transformers", "llamacpp", "vllm"] From 01ea5f1d99e57a6b7d34267c2bd3c260c2d27d17 Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 16:54:35 +0100 Subject: [PATCH 36/37] Refactor regex processor handling in prepare_guided_output function - Updated the return statement in the `prepare_guided_output` function to use `model or tokenizer` instead of `llm`, improving clarity and consistency in processor assignment. - This change enhances the function's flexibility in handling different input types while maintaining existing functionality. --- src/distilabel/steps/tasks/structured_outputs/outlines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/distilabel/steps/tasks/structured_outputs/outlines.py b/src/distilabel/steps/tasks/structured_outputs/outlines.py index ab62d63f56..a5aceacb3b 100644 --- a/src/distilabel/steps/tasks/structured_outputs/outlines.py +++ b/src/distilabel/steps/tasks/structured_outputs/outlines.py @@ -190,7 +190,7 @@ def prepare_guided_output( } if format == "regex": - return {"processor": regex_processor(schema, llm)} + return {"processor": regex_processor(schema, model or tokenizer)} raise DistilabelUserError( f"Invalid format '{format}'. Must be either 'json' or 'regex'.", From 399154e690738386f84ebfea50a37a3ef5c9847f Mon Sep 17 00:00:00 2001 From: davidberenstein1957 Date: Fri, 10 Jan 2025 17:58:31 +0100 Subject: [PATCH 37/37] Update transformer dependency constraints in pyproject.toml - Removed the upper version limit for the `transformers` package, allowing for updates beyond version 4.47.0. 
--- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 48aded2d78..3123d56b55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,8 +78,7 @@ argilla = ["argilla >= 2.0.0", "ipython"] cohere = ["cohere >= 5.2.0"] groq = ["groq >= 0.4.1"] hf-inference-endpoints = ["huggingface_hub >= 0.22.0"] -# logit processor breaks in transformers 4.47.0 -hf-transformers = ["transformers >= 4.34.1, < 4.47.0", "torch >= 2.0.0"] +hf-transformers = ["transformers >= 4.34.1", "torch >= 2.0.0"] instructor = ["instructor >= 1.2.3"] litellm = ["litellm >= 1.30.0"] llama-cpp = ["llama-cpp-python >= 0.2.0"]
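With the upper bound removed, the structured-output path can be exercised against current transformers releases on either side of the outlines 0.1.0 split. A rough end-to-end sketch; the model id and schema are illustrative and not taken from the patches:

from pydantic import BaseModel
from distilabel.models.llms.huggingface.transformers import TransformersLLM

class Answer(BaseModel):
    reasoning: str
    answer: str

llm = TransformersLLM(
    model="HuggingFaceTB/SmolLM2-135M-Instruct",  # placeholder model id
    structured_output={"format": "json", "schema": Answer},
)
# load() selects prefix_allowed_tokens_fn or a logits-processor list depending on the
# installed outlines version, as implemented in the patches above.
llm.load()
outputs = llm.generate(inputs=[[{"role": "user", "content": "Who are you? Answer in JSON."}]])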