diff --git a/gpt_engineer/chat_to_files.py b/gpt_engineer/chat_to_files.py index 40d5b7027b..9c8a9e6ec8 100644 --- a/gpt_engineer/chat_to_files.py +++ b/gpt_engineer/chat_to_files.py @@ -2,82 +2,21 @@ from typing import List, Tuple from gpt_engineer.db import DB -# Amount of lines within the code block to consider for filename discovery -N_CODELINES_FOR_FILENAME_TA = 5 -# Default path to use if no filename is found -DEFAULT_PATH = 'unknown.txt' +def parse_chat(chat) -> List[Tuple[str, str]]: + # Get all ``` blocks + regex = r"```(.*?)```" + matches = re.finditer(regex, chat, re.DOTALL) -def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]: - ''' - Parses a chat message and returns a list of tuples containing - the file path and the code content for each file. - ''' - code_regex = r'```(.*?)```' - filename_regex = r'\b[\w-]+\.[\w]{1,6}\b' - - # Get all ``` (code) blocks - code_matches = re.finditer(code_regex, chat, re.DOTALL) - - prev_code_y_end = 0 files = [] - for match in code_matches: - lines = match.group(1).split('\n') - code_y_start = match.start() - code_y_end = match.end() - - # Now, we need to get the filename associated with this code block. - # We will look for the filename somewhere near the code block start. - # - # This "somewhere near" is referred to as the "filename_ta", to - # resemble a sort-of target area (ta). - # - # The target area includes the text preceding the code block that - # does not belong to previous code blocks ("no_code"). - # Additionally, as sometimes the filename is defined within - # the code block itself, we will also include the first few lines - # of the code block in the filename_ta. - # - # Example: - # ```python - # # File: entrypoint.py - # import pygame - # ``` - # - # The amount of lines to consider within the code block is set by - # the constant 'N_CODELINES_FOR_FILENAME_TA'. - # - # Get the "preceding" text, which is located between codeblocks - no_code = chat[prev_code_y_end:code_y_start].strip() - within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA]) - filename_ta = no_code + '\n' + within_code - - # The path is the filename itself which we greedily match - filename = re.search(filename_regex, filename_ta) - path = filename.group(0) if filename else DEFAULT_PATH - - # Visualize the filename_ta if verbose - if verbose: - print('-' * 20) - print(f'Path: {path}') - print('-' * 20) - print(filename_ta) - print('-' * 20) - - # Check if its not a false positive - # - # For instance, the match with ```main.py``` should not be considered. - # ```main.py``` - # ```python - # ... - # ``` - if not re.fullmatch(filename_regex, '\n'.join(lines)): - # Update the previous code block end - prev_code_y_end = code_y_end - - # File and code have been matched, add them to the list - files.append((path, '\n'.join(lines[1:]))) + for match in matches: + path = match.group(1).split("\n")[0] + # Get the code + code = match.group(1).split("\n")[1:] + code = "\n".join(code) + # Add the file to the list + files.append((path, code)) return files diff --git a/tests/test_chat_parser.py b/tests/test_chat_parser.py deleted file mode 100644 index 40b224c204..0000000000 --- a/tests/test_chat_parser.py +++ /dev/null @@ -1,129 +0,0 @@ -import unittest -from gpt_engineer.chat_to_files import parse_chat - -CODE_FORMATS = ''' -(1) -File: main.py - -```python -import pygame -```` - -(2) -entry.py -```python -import pygame -``` - -(3) -```python -# File: rickroll.py -import pygame -``` - -(4) -```python - -# File: engineer.py -import pygame -``` - -(5) -```adastra.py -import pygame -``` - -(6) -```python bird.py -import pygame -``` - -(7) -```obstacle.py python -import pygame -``` - -(8) -```major1.py```` -```python -import pygame -``` - -(9) -```major2.py```` -```python -import pygame -``` - -(10) -```js -// File: bruh.js -const a = 1; -``` - -(11) -```swag.tsx -// File: swag.tsx -const a: number = 1; -``` - -(12) -```gmoita.ts -// File: gmoita.tsx -const a: number = 1; -``` - -(13) -** file1.py ** -```python -import pygame -``` - -(14) -**file2.py** -```python -import pygame -``` - -(15) -#### `gm.py` -```python -import pygame -``` -''' - -class TestChatParsing(unittest.TestCase): - - def setUp(self): - self._expected_filenames = ( - 'main.py', - 'entry.py', - 'rickroll.py', - 'engineer.py', - 'adastra.py', - 'bird.py', - 'obstacle.py', - 'major1.py', - 'major2.py', - 'bruh.js', - 'swag.tsx', - 'gmoita.ts', - 'file1.py', - 'file2.py', - 'gm.py', - ) - self.chat = CODE_FORMATS - - def test_parsing(self): - codefiles = parse_chat(self.chat) - - self.assertEqual(len(codefiles), len(self._expected_filenames)) - for i, cf in enumerate(codefiles): - filename, content = cf - - self.assertEqual(filename, self._expected_filenames[i]) - self.assertNotEqual(content, '') - -if __name__ == '__main__': - unittest.main() -