Skip to content

Commit

Permalink
Revert "Improved chat parsing with no AI logic (AntonOsika#120)"
Browse files Browse the repository at this point in the history
This reverts commit 8facedd.
  • Loading branch information
AntonOsika authored and 70ziko committed Oct 25, 2023
1 parent 6ce02cf commit 03ac9c5
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 201 deletions.
83 changes: 11 additions & 72 deletions gpt_engineer/chat_to_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,82 +2,21 @@
from typing import List, Tuple
from gpt_engineer.db import DB

# Amount of lines within the code block to consider for filename discovery
N_CODELINES_FOR_FILENAME_TA = 5

# Default path to use if no filename is found
DEFAULT_PATH = 'unknown.txt'
def parse_chat(chat) -> List[Tuple[str, str]]:
# Get all ``` blocks
regex = r"```(.*?)```"

matches = re.finditer(regex, chat, re.DOTALL)

def parse_chat(chat: str, verbose: bool = False) -> List[Tuple[str, str]]:
'''
Parses a chat message and returns a list of tuples containing
the file path and the code content for each file.
'''
code_regex = r'```(.*?)```'
filename_regex = r'\b[\w-]+\.[\w]{1,6}\b'

# Get all ``` (code) blocks
code_matches = re.finditer(code_regex, chat, re.DOTALL)

prev_code_y_end = 0
files = []
for match in code_matches:
lines = match.group(1).split('\n')
code_y_start = match.start()
code_y_end = match.end()

# Now, we need to get the filename associated with this code block.
# We will look for the filename somewhere near the code block start.
#
# This "somewhere near" is referred to as the "filename_ta", to
# resemble a sort-of target area (ta).
#
# The target area includes the text preceding the code block that
# does not belong to previous code blocks ("no_code").
# Additionally, as sometimes the filename is defined within
# the code block itself, we will also include the first few lines
# of the code block in the filename_ta.
#
# Example:
# ```python
# # File: entrypoint.py
# import pygame
# ```
#
# The amount of lines to consider within the code block is set by
# the constant 'N_CODELINES_FOR_FILENAME_TA'.
#
# Get the "preceding" text, which is located between codeblocks
no_code = chat[prev_code_y_end:code_y_start].strip()
within_code = '\n'.join(lines[:N_CODELINES_FOR_FILENAME_TA])
filename_ta = no_code + '\n' + within_code

# The path is the filename itself which we greedily match
filename = re.search(filename_regex, filename_ta)
path = filename.group(0) if filename else DEFAULT_PATH

# Visualize the filename_ta if verbose
if verbose:
print('-' * 20)
print(f'Path: {path}')
print('-' * 20)
print(filename_ta)
print('-' * 20)

# Check if its not a false positive
#
# For instance, the match with ```main.py``` should not be considered.
# ```main.py```
# ```python
# ...
# ```
if not re.fullmatch(filename_regex, '\n'.join(lines)):
# Update the previous code block end
prev_code_y_end = code_y_end

# File and code have been matched, add them to the list
files.append((path, '\n'.join(lines[1:])))
for match in matches:
path = match.group(1).split("\n")[0]
# Get the code
code = match.group(1).split("\n")[1:]
code = "\n".join(code)
# Add the file to the list
files.append((path, code))

return files

Expand Down
129 changes: 0 additions & 129 deletions tests/test_chat_parser.py

This file was deleted.

0 comments on commit 03ac9c5

Please sign in to comment.