From 730d1e6147a1f879e46cb3b6d266986a4cd55604 Mon Sep 17 00:00:00 2001
From: goncalomoita <goncalo.moita@brainform.co>
Date: Fri, 16 Jun 2023 21:46:03 +0100
Subject: [PATCH 1/2] fixed chat parsing that produces codefiles #35

---
 gpt_engineer/chat_to_files.py |  85 +++++++++++++++++++---
 tests/test_chat_parser.py     | 132 ++++++++++++++++++++++++++++++++++
 2 files changed, 206 insertions(+), 11 deletions(-)
 create mode 100644 tests/test_chat_parser.py

diff --git a/gpt_engineer/chat_to_files.py b/gpt_engineer/chat_to_files.py
index 890ba33621..66be939be2 100644
--- a/gpt_engineer/chat_to_files.py
+++ b/gpt_engineer/chat_to_files.py
@@ -1,24 +1,87 @@
 import re
+from typing import List, Tuple
 
+# Number of lines within the code block to consider for filename discovery
+N_CODELINES_FOR_FILENAME_TA = 5
 
-def parse_chat(chat):  # -> List[Tuple[str, str]]:
-    # Get all ``` blocks
-    regex = r"```(.*?)```"
+# Default path to use if no filename is found
+DEFAULT_PATH = 'unknown.txt'
 
-    matches = re.finditer(regex, chat, re.DOTALL)
 
+def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
+    '''
+    Parses a chat message and returns a list of tuples containing
+    the file path and the code content for each file.
+    '''
+    code_regex = r'```(.*?)```'
+    filename_regex = r'\b[\w-]+\.[\w]{1,6}\b'
+
+    # Get all ``` (code) blocks
+    code_matches = re.finditer(code_regex, chat, re.DOTALL)
+    
+    prev_code_y_end = 0
     files = []
-    for match in matches:
-        path = match.group(1).split("\n")[0]
-        # Get the code
-        code = match.group(1).split("\n")[1:]
-        code = "\n".join(code)
-        # Add the file to the list
-        files.append((path, code))
+    for match in code_matches:
+        lines = match.group(1).split('\n')
+        code_y_start = match.start()
+        code_y_end = match.end()
+
+        # Now, we need to get the filename associated with this code block.
+        # We will look for the filename somewhere near the code block start.
+        #
+        # This "somewhere near" is referred to as the "filename_ta", to
+        # resemble a sort-of target area (ta).
+        #
+        # The target area includes the text preceding the code block that
+        # does not belong to previous code blocks ("no_code").
+        # Additionally, as sometimes the filename is defined within
+        # the code block itself, we will also include the first few lines
+        # of the code block in the filename_ta.
+        #
+        # Example:
+        # ```python
+        # # File: entrypoint.py
+        # import pygame
+        # ```
+        #
+        # The amount of lines to consider within the code block is set by
+        # the constant 'N_CODELINES_FOR_FILENAME_TA'.
+        #
+        # Get the "preceding" text, which is located between codeblocks
+        no_code = chat[prev_code_y_end:code_y_start].strip()
+        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
+        filename_ta = no_code + '\n' + within_code
+        
+        # The path is the first filename-like match found in the target area
+        filename = re.search(filename_regex, filename_ta)
+        path = filename.group(0) if filename else DEFAULT_PATH
+
+        # Visualize the filename_ta if verbose
+        if verbose:
+            print('-' * 20)
+            print(f'Path: {path}')
+            print('-' * 20)
+            print(filename_ta)
+            print('-' * 20)
+        
+        # Check that it's not a false positive
+        #
+        # For instance, the match with ```main.py``` should not be considered.
+        # ```main.py```
+        # ```python
+        # ...
+        # ```
+        if not re.fullmatch(filename_regex, '\n'.join(lines)):
+            # Update the previous code block end
+            prev_code_y_end = code_y_end
+
+            # File and code have been matched, add them to the list
+            files.append((path, '\n'.join(lines[1:])))
 
     return files
 
 
+
 def to_files(chat, workspace):
     workspace["all_output.txt"] = chat
 
diff --git a/tests/test_chat_parser.py b/tests/test_chat_parser.py
new file mode 100644
index 0000000000..f3de069796
--- /dev/null
+++ b/tests/test_chat_parser.py
@@ -0,0 +1,132 @@
+import unittest
+from gpt_engineer.chat_to_files import parse_chat
+
+CODE_FORMATS = '''
+(1)
+File: main.py
+
+```python
+import pygame
+````
+
+(2)
+entry.py
+```python
+import pygame
+```
+
+(3)
+```python
+# File: rickroll.py
+import pygame
+```
+
+(4)
+```python
+
+# File: engineer.py
+import pygame
+```
+
+(5)
+```adastra.py
+import pygame
+```
+
+(6)
+```python bird.py
+import pygame
+```
+
+(7)
+```obstacle.py python
+import pygame
+```
+
+(8)
+```major1.py````
+```python
+import pygame
+```
+
+(9)
+```major2.py````
+```python
+import pygame
+```
+
+(10)
+```js
+// File: bruh.js
+const a = 1;
+```
+
+(11)
+```swag.tsx
+// File: swag.tsx
+const a: number = 1;
+```
+
+(12)
+```gmoita.ts
+// File: gmoita.tsx
+const a: number = 1;
+```
+
+(13)
+** file1.py **
+```python
+import pygame
+```
+
+(13)
+**file2.py**
+```python
+import pygame
+```
+
+(14)
+#### `gm.py`
+```python
+import pygame
+'''
+
+class TestChatParsing(unittest.TestCase):
+    
+    def setUp(self):
+        self.expected_files = [
+            'main.py',
+            'entry.py',
+            'rickroll.py',
+            'engineer.py',
+            'adastra.py',
+            'bird.py',
+            'obstacle.py',
+            'major1.py',
+            'major2.py',
+            'bruh.js',
+            'swag.tsx',
+            'gmoita.ts',
+            'file1.py',
+            'file2.py',
+            'gm.py',
+        ]
+        self.chat = CODE_FORMATS
+
+    def test_parsing(self):
+        files_and_content = parse_chat(self.chat)
+
+        # Check that the number of extracted files matches the expected number
+        self.assertEqual(len(files_and_content), len(self.expected_files))
+
+        # Iterate over the expected files and check if they match the parsed files
+        for i, expected_file in enumerate(self.expected_files):
+            self.assertEqual(files_and_content[i][0], expected_file)
+
+        # Check that the content of each file is not empty
+        for file in files_and_content:
+            self.assertNotEqual(file[1], '')
+
+if __name__ == '__main__':
+    unittest.main()
+

From 51b3520e886e5d37ed25dee0a48e06c318dc1ccc Mon Sep 17 00:00:00 2001
From: goncalomoita <goncalo.moita@brainform.co>
Date: Sat, 17 Jun 2023 18:33:10 +0100
Subject: [PATCH 2/2] improved chat parsing tests

---
 tests/test_chat_parser.py | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/tests/test_chat_parser.py b/tests/test_chat_parser.py
index f3de069796..40b224c204 100644
--- a/tests/test_chat_parser.py
+++ b/tests/test_chat_parser.py
@@ -79,22 +79,23 @@
 import pygame
 ```
 
-(13)
+(14)
 **file2.py**
 ```python
 import pygame
 ```
 
-(14)
+(15)
 #### `gm.py`
 ```python
 import pygame
+```
 '''
 
 class TestChatParsing(unittest.TestCase):
     
     def setUp(self):
-        self.expected_files = [
+        self._expected_filenames = (
             'main.py',
             'entry.py',
             'rickroll.py',
@@ -110,22 +111,18 @@ def setUp(self):
             'file1.py',
             'file2.py',
             'gm.py',
-        ]
+        )
         self.chat = CODE_FORMATS
 
     def test_parsing(self):
-        files_and_content = parse_chat(self.chat)
-
-        # Check that the number of extracted files matches the expected number
-        self.assertEqual(len(files_and_content), len(self.expected_files))
-
-        # Iterate over the expected files and check if they match the parsed files
-        for i, expected_file in enumerate(self.expected_files):
-            self.assertEqual(files_and_content[i][0], expected_file)
-
-        # Check that the content of each file is not empty
-        for file in files_and_content:
-            self.assertNotEqual(file[1], '')
+        codefiles = parse_chat(self.chat)
+
+        self.assertEqual(len(codefiles), len(self._expected_filenames))
+        for i, cf in enumerate(codefiles):
+            filename, content = cf
+            
+            self.assertEqual(filename, self._expected_filenames[i])
+            self.assertNotEqual(content, '')
 
 if __name__ == '__main__':
     unittest.main()