-
Notifications
You must be signed in to change notification settings - Fork 209
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
## Linked issues closes: #1066 ## Details 1. Implement tokenizers for Python based on the JS SDK 2. Change the underlying encoding to `cl100k_base`, which is used by GPT-4 and GPT-3.5. JS is using `r50k_base`, and I have created #1171 to track this issue. 3. Rename `GPT3Tokenizer` to `GPTTokenizer`, which seems to make more sense for its functionality, as both GPT-4 and GPT-3.5 can use this tokenizer. 4. Add unit tests for the code 5. Add docstrings for the code ## Attestation Checklist - [x] My code follows the style guidelines of this project - I have checked for/fixed spelling, linting, and other errors - I have commented my code for clarity - I have made corresponding changes to the documentation (we use [TypeDoc](https://typedoc.org/) to document our code) - My changes generate no new warnings - I have added tests that validate my changes and provide sufficient test coverage. I have tested with: - Local testing - E2E testing in Teams - New and existing unit tests pass locally with my changes
- Loading branch information
1 parent
7ce1a23
commit 917925a
Showing
4 changed files
with
115 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
from .gpt_tokenizer import GPTTokenizer | ||
from .tokenizer import Tokenizer |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
from typing import List | ||
|
||
from tiktoken import Encoding, get_encoding | ||
|
||
from .tokenizer import Tokenizer | ||
|
||
|
||
class GPTTokenizer(Tokenizer):
    """Used to encode and decode text for GPT-3.5/GPT-4 models.

    Backed by tiktoken. Defaults to the ``cl100k_base`` encoding, which is
    the encoding used by the gpt-3.5-turbo and gpt-4 model families.
    """

    _encoding: Encoding

    def __init__(self, encoding_name: str = "cl100k_base"):
        """Initializes the GPTTokenizer object.

        Args:
            encoding_name (str): Name of the tiktoken encoding to load.
                Defaults to "cl100k_base" (gpt-3.5-turbo / gpt-4); pass a
                different name (e.g. "r50k_base") for older models.
        """
        self._encoding = get_encoding(encoding_name)

    def decode(self, tokens: List[int]) -> str:
        """Decodes a list of tokens into a string.

        Args:
            tokens (List[int]): The list of tokens to be decoded.

        Returns:
            str: The decoded string.
        """
        return self._encoding.decode(tokens)

    def encode(self, text: str) -> List[int]:
        """Encodes a string into a list of tokens.

        Args:
            text (str): The string to be encoded.

        Returns:
            List[int]: The list of encoded tokens.
        """
        return self._encoding.encode(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
from abc import ABC, abstractmethod | ||
from typing import List | ||
|
||
|
||
class Tokenizer(ABC):
    """Interface for text tokenizers.

    A concrete tokenizer must be able to turn a string into a list of
    integer tokens, and turn such a list of integers back into a string.
    """

    @abstractmethod
    def decode(self, tokens: List[int]) -> str:
        """Converts a list of integer tokens back into text.

        Args:
            tokens (List[int]): A list of integers representing tokens.

        Returns:
            str: The decoded string.
        """

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Converts text into a list of integer tokens.

        Args:
            text (str): The text to encode.

        Returns:
            List[int]: A list of integers representing the encoded text.
        """
29 changes: 29 additions & 0 deletions
29
python/packages/ai/tests/ai/tokenizers/test_gpt_tokenizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
""" | ||
Copyright (c) Microsoft Corporation. All rights reserved. | ||
Licensed under the MIT License. | ||
""" | ||
|
||
import unittest | ||
|
||
from teams.ai.tokenizers import GPTTokenizer | ||
|
||
|
||
class TestGPTTokenizer(unittest.TestCase):
    """Round-trip tests for GPTTokenizer against known cl100k_base values."""

    # Shared sample: "tiktoken is great!" encodes to these cl100k_base tokens.
    SAMPLE_TEXT = "tiktoken is great!"
    SAMPLE_TOKENS = [83, 1609, 5963, 374, 2294, 0]

    def setUp(self):
        self.tokenizer = GPTTokenizer()

    def test_encode(self):
        encoded = self.tokenizer.encode(self.SAMPLE_TEXT)
        self.assertEqual(
            encoded,
            self.SAMPLE_TOKENS,
            "Expected result does not match the encoded result",
        )

    def test_decode(self):
        decoded = self.tokenizer.decode(self.SAMPLE_TOKENS)
        self.assertEqual(
            decoded,
            self.SAMPLE_TEXT,
            "Expected result does not match the decoded result",
        )