diff --git a/examples/cairo-snippets/python-strings.cairo b/examples/cairo-snippets/python-strings.cairo
new file mode 100644
index 0000000..887cba6
--- /dev/null
+++ b/examples/cairo-snippets/python-strings.cairo
@@ -0,0 +1,28 @@
+const a = 1;
+
+%{
+    print('hi %{ foobar %}')
+    print(bye)
+%}
+
+%{
+    print("hi %{ foobar %}")
+    print(bye)
+%}
+
+%{
+    print('''hi %{ foobar %}''')
+    print(bye)
+%}
+
+%{
+    print("""hi %{ foobar %}""")
+    print(bye)
+%}
+
+%{
+    print(f"hi %{ foobar %}")
+    print(bye)
+%}
+
+const b = 2;
diff --git a/examples/cairo-snippets/simple.cairo b/examples/cairo-snippets/simple.cairo
new file mode 100644
index 0000000..467b8e4
--- /dev/null
+++ b/examples/cairo-snippets/simple.cairo
@@ -0,0 +1,4 @@
+%{
+    print(hi)
+    print(bye)
+%}
diff --git a/examples/cairo-snippets/string-repeat.cairo b/examples/cairo-snippets/string-repeat.cairo
new file mode 100644
index 0000000..9115205
--- /dev/null
+++ b/examples/cairo-snippets/string-repeat.cairo
@@ -0,0 +1,4 @@
+%{
+    print('hi %{ foobar %}'"%}"'''%}'''"""%}""")
+    print(bye)
+%}
diff --git a/grammar.js b/grammar.js
index bbb05c7..a66a179 100644
--- a/grammar.js
+++ b/grammar.js
@@ -33,7 +33,9 @@ module.exports = grammar({
   name: 'cairo',
 
   externals: $ => [
-    $.hint,
+    '%{',
+    $.code_line,
+    $._failure,
   ],
 
   extras: $ => [/\s/, $.comment],
@@ -297,8 +299,8 @@ module.exports = grammar({
     non_identifier_type: $ => choice(
       'felt',
       'codeoffset',
-      seq( $.type, '*'),
-      seq( $.type, '**'),
+      seq($.type, '*'),
+      seq($.type, '**'),
       seq('(', commaSep1($.named_type), ')'),
       $.hint,
     ),
@@ -374,6 +376,14 @@ module.exports = grammar({
 
     hint_expression: $ => seq('nondet', $.hint),
 
+    hint: $ => seq(
+      '%{',
+      optional($.python_code),
+      '%}',
+    ),
+
+    python_code: $ => repeat1($.code_line),
+
     register: _ => choice(
       'ap',
       'fp',
@@ -416,7 +426,7 @@ module.exports = grammar({
 
     _ref_binding: $ => choice(
       $.typed_identifier,
-      seq( '(', commaSep($.typed_identifier), ')'),
+      seq('(', commaSep($.typed_identifier), ')'),
     ),
 
call_instruction: $ => choice( diff --git a/src/scanner.c b/src/scanner.c index d9d793f..4d1b758 100644 --- a/src/scanner.c +++ b/src/scanner.c @@ -1,51 +1,239 @@ -#include +#include "tree_sitter/parser.h" +#include +#include +#include +#include #include enum TokenType { - HINT, + HINT_START, + PYTHON_CODE_LINE, + FAILURE, }; -void *tree_sitter_cairo_external_scanner_create() { return NULL; } -void tree_sitter_cairo_external_scanner_destroy(void *payload) {} -void tree_sitter_cairo_external_scanner_reset(void *payload) {} -unsigned tree_sitter_cairo_external_scanner_serialize(void *payload, - char *buffer) { - return 0; -} -void tree_sitter_cairo_external_scanner_deserialize(void *payload, - const char *buffer, - unsigned length) {} +enum Context { + C_NONE, + C_PYTHON_CODE, + C_PYTHON_STRING, + C_PYTHON_COMMENT, +}; -static void advance(TSLexer *lexer) { lexer->advance(lexer, false); } +enum PythonStringType { + PST_NONE, + PST_1_SQ_STRING, + PST_3_SQ_STRING, + PST_1_DQ_STRING, + PST_3_DQ_STRING, +}; -bool tree_sitter_cairo_external_scanner_scan(void *payload, TSLexer *lexer, - const bool *valid_symbols) { +typedef struct { + uint32_t ws_count; + uint8_t context; + uint8_t pst; +} Scanner; + +bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + if (valid_symbols[FAILURE]) { + return false; + } + + // In Cairo, hints start with %{ and end with %} and can contain anything + // including %s in between and start / end tokens inside of Python strings + if (valid_symbols[HINT_START]) { + if (lexer->lookahead == '%') { + lexer->advance(lexer, true); + if (lexer->lookahead == '{') { + scanner->context = C_PYTHON_CODE; + // Fallback to a built-in lexer + return false; + } + } + } - while (iswspace(lexer->lookahead)) - lexer->advance(lexer, true); - - // In Cairo, hints start with %{ and end with %} and can contain anything - // including %s in between - if (valid_symbols[HINT]) { - if (lexer->lookahead == '%') { - advance(lexer); - if 
(lexer->lookahead == '{') { - advance(lexer); - lexer->result_symbol = HINT; - while (lexer->lookahead != 0) { - if (lexer->lookahead == '%') { - advance(lexer); + if ((valid_symbols[PYTHON_CODE_LINE])) { + // Skip the first \n after `%{` token, + // all trailing \n after code lines will be included to themselves + if (lexer->lookahead == '\n') { + lexer->advance(lexer, true); + } + + // There is a standalone hint close on line, don't consume it, + // it's a job of a built-in lexer + if (lexer->lookahead == '%') { + lexer->mark_end(lexer); + lexer->advance(lexer, false); if (lexer->lookahead == '}') { - advance(lexer); - return true; + if (scanner->context == C_PYTHON_STRING) { + lexer->result_symbol = FAILURE; + return true; + } + + scanner->context = C_NONE; + return false; + } + } + + // Skip whitespaces before the hint content + // and count them to be able to restore the position + // after every line + uint32_t ws_count = 0; + while (!lexer->eof(lexer)) { + if (lexer->lookahead == '\n') { + lexer->advance(lexer, false); + lexer->mark_end(lexer); + lexer->result_symbol = PYTHON_CODE_LINE; + return true; + } + if (iswspace(lexer->lookahead)) { + ws_count += lexer->lookahead == '\t' ? 8 : 1; + lexer->advance(lexer, true); + if (scanner->ws_count > 0 && ws_count == scanner->ws_count) { + break; + } + } else { + // Make parsing redundant to improperly formated python code. + if (scanner->ws_count == 0 || ws_count < scanner->ws_count) { + scanner->ws_count = ws_count; + } + break; + } + } + + uint32_t content_len = 0; + while (!lexer->eof(lexer)) { + switch (lexer->lookahead) { + case '\'': + case '"': { + const char chr = (char)lexer->lookahead; + lexer->advance(lexer, false); + content_len++; + if (scanner->context == C_PYTHON_STRING) { + unsigned iter = scanner->pst == PST_1_DQ_STRING || + scanner->pst == PST_1_SQ_STRING + ? 
0 + : 2; + if (iter > 0) { + do { + if (lexer->lookahead != chr) { + scanner->context = C_PYTHON_CODE; + scanner->pst = PST_NONE; + return false; + } + lexer->advance(lexer, false); + content_len++; + } while (--iter); + } + scanner->context = C_PYTHON_CODE; + scanner->pst = PST_NONE; + continue; + } + if (lexer->lookahead == chr) { + lexer->advance(lexer, false); + content_len++; + if (lexer->lookahead == chr) { + lexer->advance(lexer, false); + content_len++; + scanner->context = C_PYTHON_STRING; + scanner->pst = + chr == '"' ? PST_3_DQ_STRING : PST_3_SQ_STRING; + } else { + // single/double string ended, '' or "" + scanner->context = C_PYTHON_CODE; + scanner->pst = PST_NONE; + } + } else { + scanner->context = C_PYTHON_STRING; + scanner->pst = + chr == '"' ? PST_1_DQ_STRING : PST_1_SQ_STRING; + } + + continue; + } + case '%': + if (scanner->context == C_PYTHON_STRING) { + lexer->advance(lexer, false); + content_len++; + continue; + } + + lexer->mark_end(lexer); + lexer->advance(lexer, false); + if (lexer->lookahead == '}') { + if (scanner->context == C_PYTHON_STRING) { + lexer->result_symbol = FAILURE; + return true; + } + + scanner->context = C_NONE; + // Don't produce an empty node before a hint close token + if (content_len > 0) { + lexer->result_symbol = PYTHON_CODE_LINE; + return true; + } + return false; + } + + case '\n': + lexer->advance(lexer, false); + lexer->mark_end(lexer); + lexer->result_symbol = PYTHON_CODE_LINE; + return true; + + case '#': + if (scanner->context == C_PYTHON_STRING) { + lexer->advance(lexer, false); + content_len++; + continue; + } else { + scanner->context = C_PYTHON_COMMENT; + while (lexer->lookahead != '\n' && !lexer->eof(lexer)) { + lexer->advance(lexer, false); + content_len++; + } + scanner->context = C_NONE; + continue; + } + + default: + lexer->advance(lexer, false); + content_len++; } - } else { - advance(lexer); - } } - } } - } - return false; + return false; +} + +void *tree_sitter_cairo_external_scanner_create() { + 
Scanner *scanner = (Scanner *)calloc(1, sizeof(Scanner)); + assert(scanner != NULL && "Failed to allocate memory for scanner"); + return scanner; +} + +bool tree_sitter_cairo_external_scanner_scan(void *payload, TSLexer *lexer, + const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(payload, lexer, valid_symbols); +} + +unsigned tree_sitter_cairo_external_scanner_serialize(void *payload, + char *buffer) { + unsigned len = sizeof(Scanner); + memcpy(buffer, payload, len); + return len; +} + +void tree_sitter_cairo_external_scanner_deserialize(void *payload, + const char *buffer, + unsigned length) { + Scanner *scanner = (Scanner *)payload; + if (length > 0) { + assert(sizeof(Scanner) == length && "sizeof(Scanner) != length"); + memcpy(scanner, buffer, sizeof(Scanner)); + } +} + +void tree_sitter_cairo_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + free(scanner); }