Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved chat parsing with no AI logic #120

Merged
merged 4 commits into from
Jun 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 72 additions & 11 deletions gpt_engineer/chat_to_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,82 @@
from typing import List, Tuple
from gpt_engineer.db import DB

# Amount of lines within the code block to consider for filename discovery
N_CODELINES_FOR_FILENAME_TA = 5

# Default path to use if no filename is found
DEFAULT_PATH = 'unknown.txt'

# Both patterns are compiled once at import time rather than on every call.
# A code block is anything enclosed by a pair of ``` fences; DOTALL lets the
# block span several lines.
_CODE_BLOCK_RE = re.compile(r'```(.*?)```', re.DOTALL)
# A filename is a run of word characters or '-', a dot, and an extension of
# 1 to 6 word characters, e.g. "main.py" or "swag.tsx".
_FILENAME_RE = re.compile(r'\b[\w-]+\.[\w]{1,6}\b')


def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
    '''
    Parses a chat message and returns a list of tuples containing
    the file path and the code content for each file.

    Every ``` fenced block in `chat` is treated as one file. Its path is
    looked up in a "target area" (filename_ta) made of:

    1. the text preceding the block that does not belong to an already
       accepted code block; the match closest to the block wins, and
    2. if that text holds no filename, the first
       N_CODELINES_FOR_FILENAME_TA lines of the block itself, e.g.

           ```python
           # File: entrypoint.py
           import pygame
           ```

    When neither holds a filename, DEFAULT_PATH is used.

    Parameters:
        chat: the raw chat text to parse.
        verbose: when True, print the chosen path and the target area of
            every fenced block to stdout.

    Returns:
        A list of (path, code) tuples in order of appearance. `code` is the
        block content without its first line (the fence line that usually
        carries the language tag or the filename).
    '''
    files = []
    # End offset of the last accepted code block; text after it is "no_code".
    prev_code_y_end = 0

    for match in _CODE_BLOCK_RE.finditer(chat):
        block = match.group(1)
        lines = block.split('\n')

        # Get the "preceding" text, which is located between codeblocks
        no_code = chat[prev_code_y_end:match.start()].strip()
        within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])

        # The filename written closest to the block is the most likely label
        # for it, so take the LAST match of the preceding text. Taking the
        # first one would let an earlier mention in prose win over the
        # filename placed directly above the fence.
        preceding_names = _FILENAME_RE.findall(no_code)
        if preceding_names:
            path = preceding_names[-1]
        else:
            inner_name = _FILENAME_RE.search(within_code)
            path = inner_name.group(0) if inner_name else DEFAULT_PATH

        # Visualize the filename_ta if verbose
        if verbose:
            filename_ta = no_code + '\n' + within_code
            print('-' * 20)
            print(f'Path: {path}')
            print('-' * 20)
            print(filename_ta)
            print('-' * 20)

        # Check if it's not a false positive
        #
        # For instance, the match with ```main.py``` should not be considered.
        #   ```main.py```
        #   ```python
        #   ...
        #   ```
        # Such a block is skipped WITHOUT moving prev_code_y_end, so its text
        # stays in the preceding text of the next block and names it.
        if _FILENAME_RE.fullmatch(block):
            continue

        # Update the previous code block end
        prev_code_y_end = match.end()

        # File and code have been matched, add them to the list
        files.append((path, '\n'.join(lines[1:])))

    return files

Expand Down
129 changes: 129 additions & 0 deletions tests/test_chat_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import unittest
from gpt_engineer.chat_to_files import parse_chat

# Sample chat covering the ways a filename can accompany a fenced code block:
#   - in the text right before the block (1, 2, 13, 14, 15),
#   - in a comment within the first lines of the block (3, 4, 10),
#   - on the opening fence line itself (5, 6, 7, 11, 12),
#   - in a separate one-line fence preceding the real block (8, 9).
# In (12) the fence line and the comment disagree (.ts vs .tsx).
# NOTE(review): the four-backtick runs in (1), (8) and (9) look like
# deliberately malformed fences - confirm before normalizing them.
CODE_FORMATS = '''
(1)
File: main.py

```python
import pygame
````

(2)
entry.py
```python
import pygame
```

(3)
```python
# File: rickroll.py
import pygame
```

(4)
```python

# File: engineer.py
import pygame
```

(5)
```adastra.py
import pygame
```

(6)
```python bird.py
import pygame
```

(7)
```obstacle.py python
import pygame
```

(8)
```major1.py````
```python
import pygame
```

(9)
```major2.py````
```python
import pygame
```

(10)
```js
// File: bruh.js
const a = 1;
```

(11)
```swag.tsx
// File: swag.tsx
const a: number = 1;
```

(12)
```gmoita.ts
// File: gmoita.tsx
const a: number = 1;
```

(13)
** file1.py **
```python
import pygame
```

(14)
**file2.py**
```python
import pygame
```

(15)
#### `gm.py`
```python
import pygame
```
'''

class TestChatParsing(unittest.TestCase):
    '''Checks that parse_chat names every code block in CODE_FORMATS.'''

    def setUp(self):
        # Expected filenames, in the same order as the numbered cases of
        # CODE_FORMATS.
        self._expected_filenames = (
            'main.py',
            'entry.py',
            'rickroll.py',
            'engineer.py',
            'adastra.py',
            'bird.py',
            'obstacle.py',
            'major1.py',
            'major2.py',
            'bruh.js',
            'swag.tsx',
            'gmoita.ts',
            'file1.py',
            'file2.py',
            'gm.py',
        )
        self.chat = CODE_FORMATS

    def test_parsing(self):
        '''Each parsed file has the expected name and non-empty content.'''
        codefiles = parse_chat(self.chat)

        # Checked first because zip() below would silently ignore a surplus
        # or missing entry.
        self.assertEqual(len(codefiles), len(self._expected_filenames))

        pairs = zip(codefiles, self._expected_filenames)
        for (filename, content), expected in pairs:
            # subTest reports the failing format by name and keeps checking
            # the remaining ones instead of stopping at the first mismatch.
            with self.subTest(expected=expected):
                self.assertEqual(filename, expected)
                self.assertNotEqual(content, '')

# Run the tests when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()