-
Notifications
You must be signed in to change notification settings - Fork 209
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
## Linked issues closes: #1066 ## Details 1. Implement tokenizers for Python based on the JS SDK 2. Change the underlying encoding to `cl100k_base`, which is used by GPT-4 and GPT-3.5. JS is using `r50k_base`, and I have created #1171 to track this issue. 3. Rename `GPT3Tokenizer` to `GPTTokenizer`, which seems to make more sense for its functionality, as both GPT-4 and GPT-3.5 can use this tokenizer. 4. Add unit tests for the code 5. Add docstrings for the code ## Attestation Checklist - [x] My code follows the style guidelines of this project - I have checked for/fixed spelling, linting, and other errors - I have commented my code for clarity - I have made corresponding changes to the documentation (we use [TypeDoc](https://typedoc.org/) to document our code) - My changes generate no new warnings - I have added tests that validate my changes and provide sufficient test coverage. I have tested with: - Local testing - E2E testing in Teams - New and existing unit tests pass locally with my changes
- Loading branch information
1 parent
7ce1a23
commit 917925a
Showing
4 changed files
with
115 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
from .gpt_tokenizer import GPTTokenizer | ||
from .tokenizer import Tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
from typing import List | ||
|
||
from tiktoken import Encoding, get_encoding | ||
|
||
from .tokenizer import Tokenizer | ||
|
||
|
||
class GPTTokenizer(Tokenizer):
    """Used to encode and decode text for GPT-3.5/GPT-4 models.

    Backed by tiktoken. Defaults to the ``cl100k_base`` encoding, which is
    the encoding used by the gpt-3.5-turbo and gpt-4 model families.
    """

    _encoding: Encoding

    def __init__(self, encoding_name: str = "cl100k_base"):
        """Initializes the GPTTokenizer object.

        Args:
            encoding_name (str): Name of the tiktoken encoding to load.
                Defaults to "cl100k_base" (gpt-3.5-turbo / gpt-4); pass a
                different name (e.g. "r50k_base") for older models.
        """
        self._encoding = get_encoding(encoding_name)

    def decode(self, tokens: List[int]) -> str:
        """Decodes a list of tokens into a string.

        Args:
            tokens (List[int]): The list of tokens to be decoded.

        Returns:
            str: The decoded string.
        """
        return self._encoding.decode(tokens)

    def encode(self, text: str) -> List[int]:
        """Encodes a string into a list of tokens.

        Args:
            text (str): The string to be encoded.

        Returns:
            List[int]: The list of encoded tokens.
        """
        return self._encoding.encode(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import List | ||
|
||
|
||
class Tokenizer(ABC):
    """Interface for text tokenizers.

    A concrete tokenizer must be able to turn a string into a list of
    integer tokens, and turn such a list of integers back into a string.
    """

    @abstractmethod
    def decode(self, tokens: List[int]) -> str:
        """Converts a list of integer tokens back into text.

        Args:
            tokens (List[int]): A list of integers representing tokens.

        Returns:
            str: The decoded string.
        """

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Converts text into a list of integer tokens.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: A list of integers representing the encoded text.
        """
29 changes: 29 additions & 0 deletions
29
python/packages/ai/tests/ai/tokenizers/test_gpt_tokenizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
import unittest | ||
|
||
from teams.ai.tokenizers import GPTTokenizer | ||
|
||
|
||
class TestGPTTokenizer(unittest.TestCase):
    """Round-trip tests for GPTTokenizer against known cl100k_base values."""

    # Shared sample: "tiktoken is great!" encodes to these cl100k_base tokens.
    SAMPLE_TEXT = "tiktoken is great!"
    SAMPLE_TOKENS = [83, 1609, 5963, 374, 2294, 0]

    def setUp(self):
        self.tokenizer = GPTTokenizer()

    def test_encode(self):
        encoded = self.tokenizer.encode(self.SAMPLE_TEXT)
        self.assertEqual(
            encoded,
            self.SAMPLE_TOKENS,
            "Expected result does not match the encoded result",
        )

    def test_decode(self):
        decoded = self.tokenizer.decode(self.SAMPLE_TOKENS)
        self.assertEqual(
            decoded,
            self.SAMPLE_TEXT,
            "Expected result does not match the decoded result",
        )