Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: ensure tags have the same starting and ending names #9

Merged
merged 4 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "tree-sitter-xml"
description = "XML grammar for tree-sitter"
version = "0.5.1"
version = "0.5.2"
license = "MIT"
readme = "README.md"
keywords = ["incremental", "parsing", "dtd", "xml"]
Expand Down
2 changes: 1 addition & 1 deletion binding.gyp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
],
"cflags_c": [
"-std=c99",
"-Wno-misleading-indentation",
"-Wno-unused-parameter",
"-Wno-unused-but-set-variable",
],
"cflags_cc": [
"-Wno-cast-function-type",
Expand Down
4 changes: 2 additions & 2 deletions bindings/rust/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ fn main() {
let mut config = cc::Build::new();
config.include(&xml_dir);
config
.flag_if_supported("-Wno-misleading-indentation")
.flag_if_supported("-Wno-unused-parameter");
.flag_if_supported("-Wno-unused-parameter")
.flag_if_supported("-Wno-unused-but-set-variable");

for path in &[
xml_dir.join("parser.c"),
Expand Down
84 changes: 37 additions & 47 deletions common/scanner.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "tree_sitter/parser.h"
#include <wctype.h>

enum TokenType {
PI_TARGET,
Expand All @@ -10,37 +11,35 @@ enum TokenType {
CDATA,
XML_MODEL,
XML_STYLESHEET,
START_TAG_NAME,
END_TAG_NAME,
ERRONEOUS_END_NAME,
SELF_CLOSING_TAG_DELIMITER,
};

// BUG: see cursorless-dev/vscode-parse-tree#74

/// Check if the character is a letter
#define isalpha(chr) \
(((chr) >= 'A' && (chr) <= 'Z') || \
((chr) >= 'a' && (chr) <= 'z'))

/// Check if the character is alphanumeric
#define isalnum(chr) \
(isalpha(chr) || ((chr) >= '0' && (chr) <= '9'))

/// Advance the lexer if the next token doesn't match the given character
#define advance_if_not(lexer, chr) \
if ((lexer)->lookahead != (chr)) return false; advance((lexer))
/// Advance the lexer if the next token matches the given character
///
/// Expands to an `if`/`else` statement with no trailing semicolon: when
/// `(lexer)->lookahead` equals `chr` it calls advance(lexer); otherwise it
/// executes `return false` in the function where the macro is expanded.
/// It can therefore only be used inside functions that may return `false`,
/// and the caller supplies the terminating `;`.
#define advance_if_eq(lexer, chr) \
if ((lexer)->lookahead == (chr)) advance((lexer)); else return false

/// Move the lexer forward by one character
///
/// Thin wrapper that forwards to the lexer's own `advance` callback, always
/// passing `false` as the second argument (presumably tree-sitter's `skip`
/// flag — confirm against tree_sitter/parser.h).
///
/// @param lexer the lexer to advance
static inline void advance(TSLexer *lexer) {
    lexer->advance(lexer, false);
}

/// Check if the character is valid in PITarget
/// @private
static inline bool is_valid_pi_char(int32_t chr) {
return isalnum(chr) || chr == '_' || chr == ':' || chr == '.' || chr == '-' || chr == L'·';
/// Check if the character is valid to start a name
///
/// Implements the NameStartChar production of XML 1.1
/// (https://www.w3.org/TR/xml11/#NT-NameStartChar) with explicit code point
/// ranges, so the result does not depend on the active C locale the way
/// iswalpha() does.
///
/// @param chr the code point to classify
/// @return true if `chr` may be the first character of an XML name
static inline bool is_valid_name_start_char(wchar_t chr) {
    // Compare as unsigned so a negative wchar_t falls outside every range.
    const uint32_t c = (uint32_t)chr;
    return c == ':' || c == '_' ||
           (c >= 'A' && c <= 'Z') ||
           (c >= 'a' && c <= 'z') ||
           (c >= 0xC0 && c <= 0xD6) ||
           (c >= 0xD8 && c <= 0xF6) ||
           (c >= 0xF8 && c <= 0x2FF) ||
           (c >= 0x370 && c <= 0x37D) ||
           (c >= 0x37F && c <= 0x1FFF) ||
           (c >= 0x200C && c <= 0x200D) ||
           (c >= 0x2070 && c <= 0x218F) ||
           (c >= 0x2C00 && c <= 0x2FEF) ||
           (c >= 0x3001 && c <= 0xD7FF) ||
           (c >= 0xF900 && c <= 0xFDCF) ||
           (c >= 0xFDF0 && c <= 0xFFFD) ||
           (c >= 0x10000 && c <= 0xEFFFF);
}

/// Check if the character is valid in a name
///
/// Implements the NameChar production of XML 1.1
/// (https://www.w3.org/TR/xml11/#NT-NameChar): every NameStartChar plus
/// '-', '.', the ASCII digits, the middle dot (U+00B7), the combining
/// diacritical marks (U+0300..U+036F) and U+203F..U+2040.
///
/// @param chr the code point to classify
/// @return true if `chr` may appear after the first character of an XML name
static inline bool is_valid_name_char(wchar_t chr) {
    // Compare as unsigned so a negative wchar_t falls outside every range.
    const uint32_t c = (uint32_t)chr;
    return is_valid_name_start_char(chr) ||
           c == '-' || c == '.' ||
           (c >= '0' && c <= '9') ||
           c == 0xB7 || // middle dot, written numerically to avoid source-encoding issues
           (c >= 0x0300 && c <= 0x036F) ||
           (c >= 0x203F && c <= 0x2040);
}

/// Check if the lexer matches the given word
/// @private
static inline bool check_word(TSLexer *lexer, const char *const word) {
for (int j = 0; word[j] != '\0'; ++j) {
advance_if_not(lexer, word[j]);
advance_if_eq(lexer, word[j]);
}
return true;
}
Expand All @@ -49,7 +48,7 @@ static inline bool check_word(TSLexer *lexer, const char *const word) {
static bool scan_pi_target(TSLexer *lexer, const bool *valid_symbols) {
bool advanced_once = false, found_x_first = false;

if (isalpha(lexer->lookahead) || lexer->lookahead == '_') {
if (is_valid_name_start_char(lexer->lookahead)) {
if (lexer->lookahead == 'x' || lexer->lookahead == 'X') {
found_x_first = true;
lexer->mark_end(lexer);
Expand All @@ -59,19 +58,20 @@ static bool scan_pi_target(TSLexer *lexer, const bool *valid_symbols) {
}

if (advanced_once) {
while (is_valid_pi_char(lexer->lookahead)) {
if (found_x_first &&
(lexer->lookahead == 'm' || lexer->lookahead == 'M')) {
while (is_valid_name_char(lexer->lookahead)) {
if (found_x_first && (lexer->lookahead == 'm' || lexer->lookahead == 'M')) {
advance(lexer);
if (lexer->lookahead == 'l' || lexer->lookahead == 'L') {
advance(lexer);
if (is_valid_pi_char(lexer->lookahead)) {
if (is_valid_name_char(lexer->lookahead)) {
found_x_first = false;
bool last_char_hyphen = lexer->lookahead == '-';
advance(lexer);
if (last_char_hyphen) {
if (valid_symbols[XML_MODEL] && check_word(lexer, "model")) return false;
if (valid_symbols[XML_STYLESHEET] && check_word(lexer, "stylesheet")) return false;
if (valid_symbols[XML_MODEL] && check_word(lexer, "model"))
return false;
if (valid_symbols[XML_STYLESHEET] && check_word(lexer, "stylesheet"))
return false;
}
} else {
return false;
Expand All @@ -93,16 +93,20 @@ static bool scan_pi_target(TSLexer *lexer, const bool *valid_symbols) {

/// Scan for the content of a PI node
static bool scan_pi_content(TSLexer *lexer) {
while (!lexer->eof(lexer) && lexer->lookahead != '\n' && lexer->lookahead != '?') advance(lexer);
while (!lexer->eof(lexer) && lexer->lookahead != '\n' && lexer->lookahead != '?')
advance(lexer);

if (lexer->lookahead != '?')
return false;

if (lexer->lookahead != '?') return false;
lexer->mark_end(lexer);
advance(lexer);

if (lexer->lookahead == '>') {
advance(lexer);
while (lexer->lookahead == ' ') advance(lexer);
advance_if_not(lexer, '\n');
while (lexer->lookahead == ' ')
advance(lexer);
advance_if_eq(lexer, '\n');
lexer->result_symbol = PI_CONTENT;
return true;
}
Expand All @@ -112,10 +116,8 @@ static bool scan_pi_content(TSLexer *lexer) {

/// Scan for a Comment node
static bool scan_comment(TSLexer *lexer) {
advance_if_not(lexer, '<');
advance_if_not(lexer, '!');
advance_if_not(lexer, '-');
advance_if_not(lexer, '-');
advance_if_eq(lexer, '-');
advance_if_eq(lexer, '-');

while (!lexer->eof(lexer)) {
if (lexer->lookahead == '-') {
Expand All @@ -138,15 +140,3 @@ static bool scan_comment(TSLexer *lexer) {

return false;
}

/// Define the boilerplate functions of the scanner
#define SCANNER_BOILERPLATE(name) \
void *tree_sitter_##name##_external_scanner_create() { return NULL; } \
\
void tree_sitter_##name##_external_scanner_destroy(void *payload) {} \
\
void tree_sitter_##name##_external_scanner_reset(void *payload) {} \
\
unsigned tree_sitter_##name##_external_scanner_serialize(void *payload, char *buffer) { return 0; } \
\
void tree_sitter_##name##_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {}
64 changes: 32 additions & 32 deletions dtd/src/parser.c

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 18 additions & 5 deletions dtd/src/scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,28 @@ static inline bool in_error_recovery(const bool *valid_symbols) {
}

bool tree_sitter_dtd_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
if (in_error_recovery(valid_symbols)) return false;
if (in_error_recovery(valid_symbols))
return false;

if (valid_symbols[PI_TARGET]) return scan_pi_target(lexer, valid_symbols);
if (valid_symbols[PI_TARGET])
return scan_pi_target(lexer, valid_symbols);

if (valid_symbols[PI_CONTENT]) return scan_pi_content(lexer);
if (valid_symbols[PI_CONTENT])
return scan_pi_content(lexer);

if (valid_symbols[COMMENT]) return scan_comment(lexer);
if (valid_symbols[COMMENT]) {
advance_if_eq(lexer, '<');
advance_if_eq(lexer, '!');
return scan_comment(lexer);
}

return false;
}

SCANNER_BOILERPLATE(dtd)
/// Create the payload of the DTD external scanner
///
/// This scanner keeps no state, so nothing is allocated. The parameter list
/// is spelled `(void)` because an empty `()` in C declares a function without
/// a prototype.
///
/// @return always NULL
void *tree_sitter_dtd_external_scanner_create(void) { return NULL; }

/// Dispose of the payload of the DTD external scanner
///
/// Does nothing with the payload it is given.
///
/// @param payload the scanner payload (ignored)
void tree_sitter_dtd_external_scanner_destroy(void *payload) {
    (void)payload; // intentionally unused
}

/// Serialize the state of the DTD external scanner
///
/// Leaves the buffer untouched and returns 0 (presumably the number of bytes
/// written, per the tree-sitter external scanner API — confirm).
///
/// @param payload the scanner payload (ignored)
/// @param buffer the output buffer (never written)
/// @return always 0
unsigned tree_sitter_dtd_external_scanner_serialize(void *payload, char *buffer) {
    (void)payload; // intentionally unused
    (void)buffer;  // intentionally unused
    return 0U;
}

/// Restore the state of the DTD external scanner
///
/// Ignores all of its arguments and does nothing.
///
/// @param payload the scanner payload (ignored)
/// @param buffer the serialized state (ignored)
/// @param length the size of `buffer` (ignored)
void tree_sitter_dtd_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
    (void)payload; // intentionally unused
    (void)buffer;  // intentionally unused
    (void)length;  // intentionally unused
}
16 changes: 5 additions & 11 deletions dtd/src/tree_sitter/parser.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading