From 730d1e6147a1f879e46cb3b6d266986a4cd55604 Mon Sep 17 00:00:00 2001 From: goncalomoita Date: Fri, 16 Jun 2023 21:46:03 +0100 Subject: [PATCH 1/2] fixed chat parsing that produces codefiles #35 --- gpt_engineer/chat_to_files.py | 85 +++++++++++++++++++--- tests/test_chat_parser.py | 132 ++++++++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+), 11 deletions(-) create mode 100644 tests/test_chat_parser.py diff --git a/gpt_engineer/chat_to_files.py b/gpt_engineer/chat_to_files.py index 890ba33621..66be939be2 100644 --- a/gpt_engineer/chat_to_files.py +++ b/gpt_engineer/chat_to_files.py @@ -1,24 +1,87 @@ import re +from typing import List, Tuple +# Amount of lines within the code block to consider for filename discovery +N_CODELINES_FOR_FILENAME_TA = 5 -def parse_chat(chat): # -> List[Tuple[str, str]]: - # Get all ``` blocks - regex = r"```(.*?)```" +# Default path to use if no filename is found +DEFAULT_PATH = 'unknown.txt' - matches = re.finditer(regex, chat, re.DOTALL) +def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]: + ''' + Parses a chat message and returns a list of tuples containing + the file path and the code content for each file. + ''' + code_regex = r'```(.*?)```' + filename_regex = r'\b[\w-]+\.[\w]{1,6}\b' + + # Get all ``` (code) blocks + code_matches = re.finditer(code_regex, chat, re.DOTALL) + + prev_code_y_end = 0 files = [] - for match in matches: - path = match.group(1).split("\n")[0] - # Get the code - code = match.group(1).split("\n")[1:] - code = "\n".join(code) - # Add the file to the list - files.append((path, code)) + for match in code_matches: + lines = match.group(1).split('\n') + code_y_start = match.start() + code_y_end = match.end() + + # Now, we need to get the filename associated with this code block. + # We will look for the filename somewhere near the code block start. + # + # This "somewhere near" is referred to as the "filename_ta", to + # resemble a sort-of target area (ta). 
+ # + # The target area includes the text preceding the code block that + # does not belong to previous code blocks ("no_code"). + # Additionally, as sometimes the filename is defined within + # the code block itself, we will also include the first few lines + # of the code block in the filename_ta. + # + # Example: + # ```python + # # File: entrypoint.py + # import pygame + # ``` + # + # The number of lines to consider within the code block is set by + # the constant 'N_CODELINES_FOR_FILENAME_TA'. + # + # Get the "preceding" text, which is located between codeblocks + no_code = chat[prev_code_y_end:code_y_start].strip() + within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA]) + filename_ta = no_code + '\n' + within_code + + # The path is the filename itself which we greedily match + filename = re.search(filename_regex, filename_ta) + path = filename.group(0) if filename else DEFAULT_PATH + + # Visualize the filename_ta if verbose + if verbose: + print('-' * 20) + print(f'Path: {path}') + print('-' * 20) + print(filename_ta) + print('-' * 20) + + # Check that it's not a false positive + # + # For instance, the match with ```main.py``` should not be considered. + # ```main.py``` + # ```python + # ... 
+ # ``` + if not re.fullmatch(filename_regex, '\n'.join(lines)): + # Update the previous code block end + prev_code_y_end = code_y_end + + # File and code have been matched, add them to the list + files.append((path, '\n'.join(lines[1:]))) return files + def to_files(chat, workspace): workspace["all_output.txt"] = chat diff --git a/tests/test_chat_parser.py b/tests/test_chat_parser.py new file mode 100644 index 0000000000..f3de069796 --- /dev/null +++ b/tests/test_chat_parser.py @@ -0,0 +1,132 @@ +import unittest +from gpt_engineer.chat_to_files import parse_chat + +CODE_FORMATS = ''' +(1) +File: main.py + +```python +import pygame +```` + +(2) +entry.py +```python +import pygame +``` + +(3) +```python +# File: rickroll.py +import pygame +``` + +(4) +```python + +# File: engineer.py +import pygame +``` + +(5) +```adastra.py +import pygame +``` + +(6) +```python bird.py +import pygame +``` + +(7) +```obstacle.py python +import pygame +``` + +(8) +```major1.py```` +```python +import pygame +``` + +(9) +```major2.py```` +```python +import pygame +``` + +(10) +```js +// File: bruh.js +const a = 1; +``` + +(11) +```swag.tsx +// File: swag.tsx +const a: number = 1; +``` + +(12) +```gmoita.ts +// File: gmoita.tsx +const a: number = 1; +``` + +(13) +** file1.py ** +```python +import pygame +``` + +(13) +**file2.py** +```python +import pygame +``` + +(14) +#### `gm.py` +```python +import pygame +''' + +class TestChatParsing(unittest.TestCase): + + def setUp(self): + self.expected_files = [ + 'main.py', + 'entry.py', + 'rickroll.py', + 'engineer.py', + 'adastra.py', + 'bird.py', + 'obstacle.py', + 'major1.py', + 'major2.py', + 'bruh.js', + 'swag.tsx', + 'gmoita.ts', + 'file1.py', + 'file2.py', + 'gm.py', + ] + self.chat = CODE_FORMATS + + def test_parsing(self): + files_and_content = parse_chat(self.chat) + + # Check that the number of extracted files matches the expected number + self.assertEqual(len(files_and_content), len(self.expected_files)) + + # Iterate over the 
expected files and check if they match the parsed files + for i, expected_file in enumerate(self.expected_files): + self.assertEqual(files_and_content[i][0], expected_file) + + # Check that the content of each file is not empty + for file in files_and_content: + self.assertNotEqual(file[1], '') + +if __name__ == '__main__': + unittest.main() + From 51b3520e886e5d37ed25dee0a48e06c318dc1ccc Mon Sep 17 00:00:00 2001 From: goncalomoita Date: Sat, 17 Jun 2023 18:33:10 +0100 Subject: [PATCH 2/2] improved chat parsing tests --- tests/test_chat_parser.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/test_chat_parser.py b/tests/test_chat_parser.py index f3de069796..40b224c204 100644 --- a/tests/test_chat_parser.py +++ b/tests/test_chat_parser.py @@ -79,22 +79,23 @@ import pygame ``` -(13) +(14) **file2.py** ```python import pygame ``` -(14) +(15) #### `gm.py` ```python import pygame +``` ''' class TestChatParsing(unittest.TestCase): def setUp(self): - self.expected_files = [ + self._expected_filenames = ( 'main.py', 'entry.py', 'rickroll.py', @@ -110,22 +111,18 @@ def setUp(self): 'file1.py', 'file2.py', 'gm.py', - ] + ) self.chat = CODE_FORMATS def test_parsing(self): - files_and_content = parse_chat(self.chat) - - # Check that the number of extracted files matches the expected number - self.assertEqual(len(files_and_content), len(self.expected_files)) - - # Iterate over the expected files and check if they match the parsed files - for i, expected_file in enumerate(self.expected_files): - self.assertEqual(files_and_content[i][0], expected_file) - - # Check that the content of each file is not empty - for file in files_and_content: - self.assertNotEqual(file[1], '') + codefiles = parse_chat(self.chat) + + self.assertEqual(len(codefiles), len(self._expected_filenames)) + for i, cf in enumerate(codefiles): + filename, content = cf + + self.assertEqual(filename, self._expected_filenames[i]) + 
self.assertNotEqual(content, '') if __name__ == '__main__': unittest.main()