Skip to content

Commit

Permalink
feat: implement hints while being indentation-aware
Browse files Browse the repository at this point in the history
* grammar.js - formatting

* Simple version of structured hint rule implementation

* Implement sequencing of hints' python code lines with indent cutting

* Implement python strings context tracking

* Add a $._failure token

* examples/cairo-snippets

* refactor(scanner): remove macros & fix some bugs

---------

Co-authored-by: Amaan Qureshi <[email protected]>
  • Loading branch information
ahlinc and amaanq authored Jul 6, 2023
1 parent 406efbc commit 9c6888b
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 41 deletions.
28 changes: 28 additions & 0 deletions examples/cairo-snippets/python-strings.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
const a = 1;

%{
print('hi %{ foobar %}')
print(bye)
%}

%{
print("hi %{ foobar %}")
print(bye)
%}

%{
print('''hi %{ foobar %}''')
print(bye)
%}

%{
print("""hi %{ foobar %}""")
print(bye)
%}

%{
print(f"hi %{ foobar %}")
print(bye)
%}

const b = 2;
4 changes: 4 additions & 0 deletions examples/cairo-snippets/simple.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
%{
print(hi)
print(bye)
%}
4 changes: 4 additions & 0 deletions examples/cairo-snippets/string-repeat.cairo
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
%{
print('hi %{ foobar %}'"%}"'''%}'''"""%}""")
print(bye)
%}
18 changes: 14 additions & 4 deletions grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ module.exports = grammar({
name: 'cairo',

externals: $ => [
$.hint,
'%{',
$.code_line,
$._failure,
],

extras: $ => [/\s/, $.comment],
Expand Down Expand Up @@ -297,8 +299,8 @@ module.exports = grammar({
non_identifier_type: $ => choice(
'felt',
'codeoffset',
seq( $.type, '*'),
seq( $.type, '**'),
seq($.type, '*'),
seq($.type, '**'),
seq('(', commaSep1($.named_type), ')'),
$.hint,
),
Expand Down Expand Up @@ -374,6 +376,14 @@ module.exports = grammar({

hint_expression: $ => seq('nondet', $.hint),

hint: $ => seq(
'%{',
optional($.python_code),
'%}',
),

python_code: $ => repeat1($.code_line),

register: _ => choice(
'ap',
'fp',
Expand Down Expand Up @@ -416,7 +426,7 @@ module.exports = grammar({

_ref_binding: $ => choice(
$.typed_identifier,
seq( '(', commaSep($.typed_identifier), ')'),
seq('(', commaSep($.typed_identifier), ')'),
),

call_instruction: $ => choice(
Expand Down
262 changes: 225 additions & 37 deletions src/scanner.c
Original file line number Diff line number Diff line change
@@ -1,51 +1,239 @@
#include <tree_sitter/parser.h>
#include "tree_sitter/parser.h"
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>

// Token kinds handled by the external scanner. Each enumerator is used as an
// index into the `valid_symbols` array passed to scan() and as a value for
// `lexer->result_symbol`.
// NOTE(review): the order presumably has to mirror the `externals` list in
// grammar.js ('%{', $.code_line, $._failure); HINT looks like the entry of the
// previous single-token implementation — confirm it is really still present.
enum TokenType {
HINT,
HINT_START, // the `%{` that opens a hint
PYTHON_CODE_LINE, // one line of Python code inside a hint
FAILURE, // emitted when `%}` is met while a Python string is still open
};

// Stateless variant of the create hook: no payload is allocated, NULL is returned.
void *tree_sitter_cairo_external_scanner_create() { return NULL; }
// Stateless variant of the destroy hook: nothing to release, so the body is empty.
void tree_sitter_cairo_external_scanner_destroy(void *payload) {}
// Reset hook: intentionally a no-op, `payload` is ignored.
void tree_sitter_cairo_external_scanner_reset(void *payload) {}
// Stateless variant of the serialize hook: writes nothing into `buffer` and
// reports a serialized length of 0 bytes.
unsigned tree_sitter_cairo_external_scanner_serialize(void *payload,
char *buffer) {
return 0;
}
// Stateless variant of the deserialize hook: there is no state to restore, so
// `buffer` and `length` are ignored.
void tree_sitter_cairo_external_scanner_deserialize(void *payload,
const char *buffer,
unsigned length) {}
// Lexical context tracked across scan() calls; stored in Scanner.context.
enum Context {
C_NONE, // no context (value 0); also set after a closing `%}` and after a `#` comment
C_PYTHON_CODE, // inside a hint, in plain Python code
C_PYTHON_STRING, // inside a Python string literal (its delimiter is in Scanner.pst)
C_PYTHON_COMMENT, // inside a `#` comment, until the end of the line
};

/* Consume the current lookahead character without skipping it (skip = false). */
static void advance(TSLexer *lexer) {
  lexer->advance(lexer, false);
}
// Delimiter of the Python string literal that is currently open; stored in
// Scanner.pst.
enum PythonStringType {
PST_NONE, // no string is open
PST_1_SQ_STRING, // '...'
PST_3_SQ_STRING, // '''...'''
PST_1_DQ_STRING, // "..."
PST_3_DQ_STRING, // """..."""
};

bool tree_sitter_cairo_external_scanner_scan(void *payload, TSLexer *lexer,
const bool *valid_symbols) {
// State kept between scan() calls. The serialize/deserialize hooks copy this
// struct byte-for-byte.
typedef struct {
uint32_t ws_count; // smallest indentation width seen so far (a tab counts as 8); 0 = not known yet
uint8_t context; // an `enum Context` value
uint8_t pst; // an `enum PythonStringType` value
} Scanner;

// Core scanning routine behind tree_sitter_cairo_external_scanner_scan().
//
// scanner       - state carried between calls (indentation width, string context)
// lexer         - tree-sitter lexer used to inspect and consume characters
// valid_symbols - indexed by `enum TokenType`; true for tokens acceptable here
//
// Returns true after setting lexer->result_symbol to the recognised token and
// false when no external token is produced, which leaves the input to the
// built-in lexer.
//
// NOTE(review): this listing comes from a rendered diff. Some lines (for
// example the `valid_symbols[HINT]` branch and the bare `advance(lexer)`
// calls) appear to belong to the removed implementation — confirm against the
// real file before relying on the control flow shown here.
bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) {
// Produce nothing while FAILURE itself is acceptable.
// NOTE(review): presumably this is tree-sitter's error-recovery mode, in which
// every external token is marked valid — confirm.
if (valid_symbols[FAILURE]) {
return false;
}

// In Cairo, hints start with %{ and end with %}. They can contain anything in
// between, including `%` characters and the start / end tokens themselves when
// those appear inside Python strings.
if (valid_symbols[HINT_START]) {
if (lexer->lookahead == '%') {
// skip = true: the character is passed over without becoming part of a token
lexer->advance(lexer, true);
if (lexer->lookahead == '{') {
scanner->context = C_PYTHON_CODE;
// Only the context is recorded here; fall back to the built-in lexer for the
// `%{` token itself
return false;
}
}
}

while (iswspace(lexer->lookahead))
lexer->advance(lexer, true);

// In Cairo, hints start with %{ and end with %} and can contain anything,
// including `%` characters, in between
if (valid_symbols[HINT]) {
if (lexer->lookahead == '%') {
advance(lexer);
if (lexer->lookahead == '{') {
advance(lexer);
lexer->result_symbol = HINT;
while (lexer->lookahead != 0) {
if (lexer->lookahead == '%') {
advance(lexer);
if ((valid_symbols[PYTHON_CODE_LINE])) {
// Skip the first \n after the `%{` token; the trailing \n of every code
// line is included in that line's own token
if (lexer->lookahead == '\n') {
lexer->advance(lexer, true);
}

// A hint close (`%}`) stands alone on this line: don't consume it,
// that is the job of the built-in lexer
if (lexer->lookahead == '%') {
lexer->mark_end(lexer);
lexer->advance(lexer, false);
if (lexer->lookahead == '}') {
advance(lexer);
return true;
// `%}` reached while a Python string is still open: report FAILURE
if (scanner->context == C_PYTHON_STRING) {
lexer->result_symbol = FAILURE;
return true;
}

scanner->context = C_NONE;
return false;
}
}

// Skip the whitespace in front of the hint content and count it, so the
// same indentation width can be cut from every following line
uint32_t ws_count = 0;
while (!lexer->eof(lexer)) {
// A line made of whitespace only still yields a code-line token
if (lexer->lookahead == '\n') {
lexer->advance(lexer, false);
lexer->mark_end(lexer);
lexer->result_symbol = PYTHON_CODE_LINE;
return true;
}
if (iswspace(lexer->lookahead)) {
// A tab counts as 8 columns, any other whitespace character as 1
ws_count += lexer->lookahead == '\t' ? 8 : 1;
lexer->advance(lexer, true);
// Stop skipping once the remembered indentation width is reached; deeper
// indentation is consumed below as part of the token
if (scanner->ws_count > 0 && ws_count == scanner->ws_count) {
break;
}
} else {
// Be tolerant of improperly formatted Python code: remember the smallest
// indentation width seen so far.
if (scanner->ws_count == 0 || ws_count < scanner->ws_count) {
scanner->ws_count = ws_count;
}
break;
}
}

// Number of characters consumed into the current token (line content)
uint32_t content_len = 0;
while (!lexer->eof(lexer)) {
switch (lexer->lookahead) {
case '\'':
case '"': {
const char chr = (char)lexer->lookahead;
lexer->advance(lexer, false);
content_len++;
// Already inside a string: this quote is treated as its closing delimiter.
// NOTE(review): `chr` is not compared with the quote kind stored in
// scanner->pst, so either quote character appears to close a single-quoted
// string — confirm this is intended.
if (scanner->context == C_PYTHON_STRING) {
// Triple-quoted strings need two more quote characters to close
unsigned iter = scanner->pst == PST_1_DQ_STRING ||
scanner->pst == PST_1_SQ_STRING
? 0
: 2;
if (iter > 0) {
do {
// NOTE(review): a lone quote inside a triple-quoted string leaves the
// string context and returns without a token — confirm this is intended.
if (lexer->lookahead != chr) {
scanner->context = C_PYTHON_CODE;
scanner->pst = PST_NONE;
return false;
}
lexer->advance(lexer, false);
content_len++;
} while (--iter);
}
scanner->context = C_PYTHON_CODE;
scanner->pst = PST_NONE;
continue;
}
// Not inside a string: decide between an empty string, a triple-quoted
// string and a single-quoted string
if (lexer->lookahead == chr) {
lexer->advance(lexer, false);
content_len++;
if (lexer->lookahead == chr) {
lexer->advance(lexer, false);
content_len++;
scanner->context = C_PYTHON_STRING;
scanner->pst =
chr == '"' ? PST_3_DQ_STRING : PST_3_SQ_STRING;
} else {
// empty single- or double-quoted string: '' or ""
scanner->context = C_PYTHON_CODE;
scanner->pst = PST_NONE;
}
} else {
scanner->context = C_PYTHON_STRING;
scanner->pst =
chr == '"' ? PST_1_DQ_STRING : PST_1_SQ_STRING;
}

continue;
}
case '%':
// Inside a string a `%` is ordinary content
if (scanner->context == C_PYTHON_STRING) {
lexer->advance(lexer, false);
content_len++;
continue;
}

// Mark the token end before the `%` so a following `%}` stays unconsumed
lexer->mark_end(lexer);
lexer->advance(lexer, false);
if (lexer->lookahead == '}') {
// NOTE(review): this condition cannot be true here, the string context was
// already handled at the top of this case.
if (scanner->context == C_PYTHON_STRING) {
lexer->result_symbol = FAILURE;
return true;
}

scanner->context = C_NONE;
// Don't produce an empty node before a hint close token
if (content_len > 0) {
lexer->result_symbol = PYTHON_CODE_LINE;
return true;
}
return false;
}
// NOTE(review): there is no break/continue here, so a `%` that is not
// followed by `}` falls through into the '\n' case below, which consumes one
// more character and ends the token — confirm this is intended.

case '\n':
// End of line: the newline is included in the code-line token
lexer->advance(lexer, false);
lexer->mark_end(lexer);
lexer->result_symbol = PYTHON_CODE_LINE;
return true;

case '#':
// Inside a string a `#` is ordinary content
if (scanner->context == C_PYTHON_STRING) {
lexer->advance(lexer, false);
content_len++;
continue;
} else {
// Consume the comment up to, but not including, the end of the line
scanner->context = C_PYTHON_COMMENT;
while (lexer->lookahead != '\n' && !lexer->eof(lexer)) {
lexer->advance(lexer, false);
content_len++;
}
scanner->context = C_NONE;
continue;
}

default:
// Any other character is plain line content
lexer->advance(lexer, false);
content_len++;
}
} else {
advance(lexer);
}
}
}
}
}

return false;
return false;
}

/**
 * Allocates the zero-initialised scanner state used as the `payload` of all
 * other external-scanner hooks.
 *
 * Allocation failure is checked explicitly rather than with assert(): with
 * NDEBUG defined an assert() compiles to nothing, and a NULL payload would
 * then be dereferenced by the scan/serialize/deserialize hooks.
 *
 * @return a heap-allocated Scanner, released by
 *         tree_sitter_cairo_external_scanner_destroy(). Never NULL; the
 *         process is aborted when memory cannot be allocated.
 */
void *tree_sitter_cairo_external_scanner_create(void) {
  Scanner *scanner = calloc(1, sizeof *scanner);
  if (scanner == NULL) {
    fprintf(stderr, "Failed to allocate memory for scanner\n");
    abort();
  }
  return scanner;
}

/**
 * External-scanner entry point called by tree-sitter; forwards to scan().
 *
 * @param payload       the Scanner returned by the create hook
 * @param lexer         lexer used to inspect and consume input characters
 * @param valid_symbols indexed by `enum TokenType`; true for acceptable tokens
 * @return whatever scan() returns: true when a token was recognised
 */
bool tree_sitter_cairo_external_scanner_scan(void *payload, TSLexer *lexer,
                                             const bool *valid_symbols) {
  /* The previous version declared a typed local and then never used it. */
  return scan((Scanner *)payload, lexer, valid_symbols);
}

unsigned tree_sitter_cairo_external_scanner_serialize(void *payload,
char *buffer) {
unsigned len = sizeof(Scanner);
memcpy(buffer, payload, len);
return len;
}

/**
 * Restores the scanner state from bytes previously produced by the serialize
 * hook.
 *
 * The state is cleared first, so a call with `length == 0` (no saved state)
 * puts the scanner back into its initial state instead of keeping whatever
 * the previous parse left behind. The saved bytes are copied only when
 * `length` matches sizeof(Scanner) exactly; this is a runtime check rather
 * than an assert(), so a mismatching length can never cause a read past the
 * end of `buffer` in builds with NDEBUG defined.
 *
 * @param payload the Scanner to restore into
 * @param buffer  serialized state, may be unused when `length` is 0
 * @param length  number of valid bytes in `buffer`
 */
void tree_sitter_cairo_external_scanner_deserialize(void *payload,
                                                    const char *buffer,
                                                    unsigned length) {
  Scanner *scanner = (Scanner *)payload;
  memset(scanner, 0, sizeof *scanner);
  if (buffer != NULL && length == sizeof *scanner) {
    memcpy(scanner, buffer, sizeof *scanner);
  }
}

/**
 * Releases the scanner state allocated by the create hook.
 *
 * @param payload the Scanner to free
 */
void tree_sitter_cairo_external_scanner_destroy(void *payload) {
  free(payload);
}

0 comments on commit 9c6888b

Please sign in to comment.