Skip to content

Commit

Permalink
Improved chat parsing with no AI logic (AntonOsika#120)
Browse files Browse the repository at this point in the history
  • Loading branch information
goncalomoita authored Jun 18, 2023
1 parent 9cc9cf7 commit 8facedd
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 11 deletions.
83 changes: 72 additions & 11 deletions gpt_engineer/chat_to_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,82 @@
from typing import List, Tuple
from gpt_engineer.db import DB

# Amount of lines within the code block to consider for filename discovery
N_CODELINES_FOR_FILENAME_TA = 5

# Default path to use if no filename is found
DEFAULT_PATH = 'unknown.txt'

# Anything enclosed by a pair of ``` fences; DOTALL lets a block span lines.
_CODE_BLOCK_RE = re.compile(r'```(.*?)```', re.DOTALL)

# A bare filename such as "main.py": word characters or dashes, a dot and an
# extension of 1 to 6 word characters. Directory separators are not matched.
_FILENAME_RE = re.compile(r'\b[\w-]+\.[\w]{1,6}\b')


def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
    '''
    Parses a chat message and returns a list of tuples containing
    the file path and the code content for each file.

    Every ``` fenced block is treated as one file. Its filename is searched
    in a "target area" made of:

    1. the text that precedes the block and does not belong to a previously
       accepted block, followed by
    2. the first N_CODELINES_FOR_FILENAME_TA lines of the block itself, as
       the filename is sometimes given on the fence line or in a leading
       comment (e.g. "# File: entrypoint.py").

    The first filename-looking token of the target area is used; when there
    is none the path falls back to DEFAULT_PATH.

    Args:
        chat: The full chat text to parse.
        verbose: When true, print the chosen path and the target area of
            every fenced block.

    Returns:
        One (path, code) tuple per accepted block, in order of appearance.
        The code is the block content without its first (fence info) line.
    '''
    files = []

    # End offset of the last block that was accepted as a file; the text
    # after it is the "preceding text" of the next block.
    prev_code_y_end = 0

    for match in _CODE_BLOCK_RE.finditer(chat):
        block = match.group(1)
        lines = block.split('\n')

        # Build the target area: preceding non-code text + first code lines
        no_code = chat[prev_code_y_end:match.start()].strip()
        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
        filename_ta = no_code + '\n' + within_code

        # NOTE: the first match wins, so text before the block takes
        # precedence over anything found inside the block.
        filename = _FILENAME_RE.search(filename_ta)
        path = filename.group(0) if filename else DEFAULT_PATH

        # Visualize the filename_ta if verbose
        if verbose:
            print('-' * 20)
            print(f'Path: {path}')
            print('-' * 20)
            print(filename_ta)
            print('-' * 20)

        # A block holding nothing but a filename (e.g. ```main.py```) is a
        # label for the next block, not a file. Skip it WITHOUT moving
        # prev_code_y_end so that the label stays in the next target area.
        # Spaces/tabs around the name are ignored so that "``` main.py ```"
        # is recognized as a label too.
        if _FILENAME_RE.fullmatch(block.strip(' \t')):
            continue

        # File and code have been matched: remember where this block ends
        # and add the file to the list (dropping the fence info line).
        prev_code_y_end = match.end()
        files.append((path, '\n'.join(lines[1:])))

    return files

Expand Down
129 changes: 129 additions & 0 deletions tests/test_chat_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import unittest
from gpt_engineer.chat_to_files import parse_chat

# Sample chat text with 15 numbered ways of attaching a filename to a fenced
# code block (label before the block, comment inside it, name on the fence
# line, filename-only block preceding the real one, markdown emphasis, ...).
# It is the input fed to parse_chat by the test below; the irregular fences
# (e.g. four backticks) are part of the fixture and must be kept as they are.
CODE_FORMATS = '''
(1)
File: main.py
```python
import pygame
````
(2)
entry.py
```python
import pygame
```
(3)
```python
# File: rickroll.py
import pygame
```
(4)
```python
# File: engineer.py
import pygame
```
(5)
```adastra.py
import pygame
```
(6)
```python bird.py
import pygame
```
(7)
```obstacle.py python
import pygame
```
(8)
```major1.py````
```python
import pygame
```
(9)
```major2.py````
```python
import pygame
```
(10)
```js
// File: bruh.js
const a = 1;
```
(11)
```swag.tsx
// File: swag.tsx
const a: number = 1;
```
(12)
```gmoita.ts
// File: gmoita.tsx
const a: number = 1;
```
(13)
** file1.py **
```python
import pygame
```
(14)
**file2.py**
```python
import pygame
```
(15)
#### `gm.py`
```python
import pygame
```
'''

class TestChatParsing(unittest.TestCase):
    '''Checks that parse_chat finds the filename of every sample format.'''

    def setUp(self):
        '''Store the sample chat and the filenames expected from it.'''
        self.chat = CODE_FORMATS
        # One entry per numbered sample in the chat, in the same order
        self._expected_filenames = (
            'main.py',
            'entry.py',
            'rickroll.py',
            'engineer.py',
            'adastra.py',
            'bird.py',
            'obstacle.py',
            'major1.py',
            'major2.py',
            'bruh.js',
            'swag.tsx',
            'gmoita.ts',
            'file1.py',
            'file2.py',
            'gm.py',
        )

    def test_parsing(self):
        '''Each parsed file has the expected name and non-empty content.'''
        parsed = parse_chat(self.chat)

        # Same amount of files first, so the pairing below is one-to-one
        self.assertEqual(len(parsed), len(self._expected_filenames))

        for (filename, content), expected in zip(
            parsed, self._expected_filenames
        ):
            self.assertEqual(filename, expected)
            self.assertNotEqual(content, '')

# Run the test suite when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 8facedd

Please sign in to comment.