Highlight YQL using common lexer in YDB CLI interactive mode (#9404)
vityaman authored Oct 25, 2024
1 parent fc33a19 commit bd4787a
Showing 24 changed files with 562 additions and 482 deletions.
4 changes: 3 additions & 1 deletion ydb/library/yql/parser/lexer_common/lexer.h
@@ -18,7 +18,9 @@ struct TParsedToken {
// TODO: TStringBuf for Name & Content
TString Name;
TString Content;
// Position of first token symbol
// Position of the first token byte/symbol.
// When the antlr3 lexer is used, LinePos is an offset into the byte array;
// when the antlr4 lexer is used, LinePos is an offset into the symbol array.
ui32 Line = 0; // starts from 1
ui32 LinePos = 0; // starts from 0
};
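The distinction only matters for queries containing non-ASCII characters. A minimal standalone sketch (not part of this change) of how the two conventions diverge for a hypothetical one-line query with a two-byte UTF-8 character:

#include <cstddef>
#include <iostream>
#include <string>

int main() {
    // Hypothetical query: '\xC3\xA9' ("é") is one symbol but two bytes in UTF-8,
    // so every token after it sits one position further in byte terms than in symbol terms.
    std::string query = "SELECT \xC3\xA9 FROM t";
    std::size_t bytePos = query.find("FROM"); // antlr3-style LinePos (byte offset): 10
    std::size_t symbolPos = 0;                // antlr4-style LinePos (symbol offset)
    for (std::size_t i = 0; i < bytePos; ++i) {
        // count only lead bytes, skipping UTF-8 continuation bytes (10xxxxxx)
        if ((static_cast<unsigned char>(query[i]) & 0xC0) != 0x80) {
            ++symbolPos;
        }
    }
    std::cout << bytePos << " vs " << symbolPos << "\n"; // prints "10 vs 9"
    return 0;
}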
2 changes: 0 additions & 2 deletions ydb/library/yql/parser/lexer_common/tokens.cpp
@@ -1,6 +1,5 @@
#include "lexer.h"


namespace NSQLTranslation {

IOutputStream& OutputTokens(IOutputStream& out, TParsedTokenList::const_iterator begin, TParsedTokenList::const_iterator end) {
@@ -18,5 +17,4 @@ bool Tokenize(ILexer& lexer, const TString& query, const TString& queryName, TPa
return lexer.Tokenize(query, queryName, onNextToken, issues, maxErrors);
}


}
2 changes: 2 additions & 0 deletions ydb/library/yql/parser/proto_ast/antlr3/proto_ast_antlr3.h
@@ -60,6 +60,7 @@ namespace NProtoAST {
try {
Lexer.ReportErrors(&errors);
auto src = Lexer.get_tokSource();

for (;;) {
auto token = src->nextToken();
auto type = token->getType();
@@ -69,6 +70,7 @@ namespace NProtoAST {
last.Content = token->getText();
last.Line = token->get_line();
last.LinePos = token->get_charPositionInLine();

onNextToken(std::move(last));
if (isEOF) {
break;
5 changes: 5 additions & 0 deletions ydb/library/yql/parser/proto_ast/antlr4/proto_ast_antlr4.h
@@ -89,6 +89,11 @@ namespace NProtoAST {

void CollectTokens(IErrorCollector& errors, const NSQLTranslation::ILexer::TTokenCallback& onNextToken) {
try {
bool error = false;
typename antlr4::YqlErrorListener listener(&errors, &error);
Lexer.removeErrorListeners();
Lexer.addErrorListener(&listener);

for (;;) {
auto token = Lexer.nextToken();
auto type = token->getType();
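The antlr4 runtime reports lexing failures through error listeners rather than exceptions, which is why CollectTokens now installs its own listener before pulling tokens. A rough sketch of that pattern, assuming only the stock antlr4::BaseErrorListener interface (the real YqlErrorListener is defined elsewhere in the tree and forwards into the IErrorCollector):

#include <antlr4-runtime.h>

#include <string>
#include <vector>

// Sketch only: collects lexer recognition errors instead of printing them.
class TCollectingErrorListener final : public antlr4::BaseErrorListener {
public:
    void syntaxError(antlr4::Recognizer* /*recognizer*/,
                     antlr4::Token* /*offendingSymbol*/,
                     size_t line, size_t charPositionInLine,
                     const std::string& msg, std::exception_ptr /*e*/) override {
        Messages.push_back(std::to_string(line) + ":" +
                           std::to_string(charPositionInLine) + ": " + msg);
    }

    std::vector<std::string> Messages;
};

// Usage sketch (lexer is any generated antlr4 lexer):
//     TCollectingErrorListener listener;
//     lexer.removeErrorListeners();      // drop the default console listener
//     lexer.addErrorListener(&listener);
//     // ...then pull tokens with lexer.nextToken() until EOF...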
9 changes: 5 additions & 4 deletions ydb/library/yql/sql/v1/SQLv1.g.in
@@ -1724,7 +1724,6 @@ MINUS: '-';
TILDA: '~';
ASTERISK: '*';
SLASH: '/';
BACKSLASH: '\\';
PERCENT: '%';
SEMICOLON: ';';
DOT: '.';
@@ -1736,9 +1735,6 @@ COLON: ':';
COMMAT: '@';
DOUBLE_COMMAT: '@@';
DOLLAR: '$';
QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting "
QUOTE_SINGLE: '\'';
BACKTICK: '`';
LBRACE_CURLY: '{';
RBRACE_CURLY: '}';
CARET: '^';
@@ -1747,6 +1743,11 @@ ARROW: '->';
RBRACE_SQUARE: ']';
LBRACE_SQUARE: '['; // pair ]

fragment BACKSLASH: '\\';
fragment QUOTE_DOUBLE: '"';
fragment QUOTE_SINGLE: '\'';
fragment BACKTICK: '`';

// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
fragment A:('a'|'A');
fragment B:('b'|'B');
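Demoting BACKSLASH, QUOTE_DOUBLE, QUOTE_SINGLE and BACKTICK to fragment rules means they can only be referenced from other lexer rules and never surface as standalone tokens, so a stray quote now yields a recognition error instead of a bogus token for the highlighter. A rough sketch of the observable effect, reusing the MakeLexer / GetTokenViews / GetIssueMessages helpers from lexer_ut.cpp further below (expected values taken from the ErrorRecovery and IssuesCollected tests):

// Sketch only: depends on the test helpers defined in lexer_ut.cpp below.
auto lexer = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);
auto views = GetTokenViews(lexer, "\"");      // expected: {"EOF"} - no QUOTE_DOUBLE token
auto issues = GetIssueMessages(lexer, "\"");  // expected: non-empty ("token recognition error ...")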
9 changes: 5 additions & 4 deletions ydb/library/yql/sql/v1/SQLv1Antlr4.g.in
@@ -1723,7 +1723,6 @@ MINUS: '-';
TILDA: '~';
ASTERISK: '*';
SLASH: '/';
BACKSLASH: '\\';
PERCENT: '%';
SEMICOLON: ';';
DOT: '.';
@@ -1735,9 +1734,6 @@ COLON: ':';
COMMAT: '@';
DOUBLE_COMMAT: '@@';
DOLLAR: '$';
QUOTE_DOUBLE: '"'; // This comment for fix syntax highlighting "
QUOTE_SINGLE: '\'';
BACKTICK: '`';
LBRACE_CURLY: '{';
RBRACE_CURLY: '}';
CARET: '^';
@@ -1746,6 +1742,11 @@ ARROW: '->';
RBRACE_SQUARE: ']';
LBRACE_SQUARE: '['; // pair ]

fragment BACKSLASH: '\\';
fragment QUOTE_DOUBLE: '"';
fragment QUOTE_SINGLE: '\'';
fragment BACKTICK: '`';

// http://www.antlr.org/wiki/pages/viewpage.action?pageId=1782
fragment A:('a'|'A');
fragment B:('b'|'B');
3 changes: 2 additions & 1 deletion ydb/library/yql/sql/v1/format/sql_format.cpp
@@ -26,6 +26,7 @@ using namespace NSQLv1Generated;

using NSQLTranslation::TParsedToken;
using NSQLTranslation::TParsedTokenList;
using NSQLTranslationV1::IsProbablyKeyword;
using TTokenIterator = TParsedTokenList::const_iterator;

TTokenIterator SkipWS(TTokenIterator curr, TTokenIterator end) {
@@ -55,7 +56,7 @@ bool Validate(const TParsedTokenList& query, const TParsedTokenList& formattedQu
if (in->Name != out->Name) {
return false;
}
if (AsciiEqualsIgnoreCase(in->Name, in->Content)) {
if (IsProbablyKeyword(*in)) {
if (!AsciiEqualsIgnoreCase(in->Content, out->Content)) {
return false;
}
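The swap to IsProbablyKeyword keeps the intent of the old inline check: tokens that are (probably) keywords may be re-cased by the formatter, while everything else has to round-trip exactly. A compact sketch of that comparison rule in isolation (not the real Validate, which additionally skips whitespace and comment tokens):

#include <util/string/ascii.h>
#include <ydb/library/yql/sql/v1/lexer/lexer.h>

bool TokensMatch(const NSQLTranslation::TParsedToken& in,
                 const NSQLTranslation::TParsedToken& out) {
    if (in.Name != out.Name) {
        return false;
    }
    if (NSQLTranslationV1::IsProbablyKeyword(in)) {
        // keywords may legitimately change case during formatting
        return AsciiEqualsIgnoreCase(in.Content, out.Content);
    }
    // identifiers, literals, etc. must be reproduced verbatim
    return in.Content == out.Content;
}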
6 changes: 6 additions & 0 deletions ydb/library/yql/sql/v1/lexer/lexer.cpp
@@ -9,6 +9,8 @@
#include <ydb/library/yql/parser/proto_ast/gen/v1_antlr4/SQLv1Antlr4Lexer.h>
#include <ydb/library/yql/parser/proto_ast/gen/v1_ansi_antlr4/SQLv1Antlr4Lexer.h>

#include <util/string/ascii.h>

#if defined(_tsan_enabled_)
#include <util/system/mutex.h>
#endif
@@ -74,4 +76,8 @@ NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4) {
return NSQLTranslation::ILexer::TPtr(new TV1Lexer(ansi, antlr4));
}

bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token) {
return AsciiEqualsIgnoreCase(token.Name, token.Content);
}

} // namespace NSQLTranslationV1
6 changes: 6 additions & 0 deletions ydb/library/yql/sql/v1/lexer/lexer.h
@@ -6,4 +6,10 @@ namespace NSQLTranslationV1 {

NSQLTranslation::ILexer::TPtr MakeLexer(bool ansi, bool antlr4);

// "Probably" because YQL keyword can be an identifier
// depending on a query context. For example
// in SELECT * FROM group - group is an identifier, but
// in SELECT * FROM ... GROUP BY ... - group is a keyword.
bool IsProbablyKeyword(const NSQLTranslation::TParsedToken& token);

}
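A small illustration of the heuristic (token names assumed to follow the SQLv1 lexer conventions visible in lexer_ut.cpp below):

#include <ydb/library/yql/sql/v1/lexer/lexer.h>
#include <util/system/yassert.h>

// Sketch only: keyword rules report a token name equal to the token text up to
// case, while identifiers come back as ID_PLAIN, so name and content diverge.
void IsProbablyKeywordExamples() {
    NSQLTranslation::TParsedToken select{.Name = "SELECT", .Content = "select"};
    NSQLTranslation::TParsedToken table{.Name = "ID_PLAIN", .Content = "mytable"};
    NSQLTranslation::TParsedToken group{.Name = "GROUP", .Content = "group"};

    Y_ASSERT(NSQLTranslationV1::IsProbablyKeyword(select));  // true
    Y_ASSERT(!NSQLTranslationV1::IsProbablyKeyword(table));  // false
    Y_ASSERT(NSQLTranslationV1::IsProbablyKeyword(group));   // still true in "SELECT * FROM group",
                                                             // hence only "probably" a keyword
}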
185 changes: 185 additions & 0 deletions ydb/library/yql/sql/v1/lexer/lexer_ut.cpp
@@ -0,0 +1,185 @@
#include "lexer.h"

#include <ydb/library/yql/core/issue/yql_issue.h>
#include <ydb/library/yql/sql/settings/translation_settings.h>

#include <library/cpp/testing/unittest/registar.h>

using namespace NSQLTranslation;
using namespace NSQLTranslationV1;

std::pair<TParsedTokenList, NYql::TIssues> Tokenize(ILexer::TPtr& lexer, TString queryUtf8) {
TParsedTokenList tokens;
NYql::TIssues issues;
Tokenize(*lexer, queryUtf8, "Query", tokens, issues, SQL_MAX_PARSER_ERRORS);
return {tokens, issues};
}

TVector<TString> GetIssueMessages(ILexer::TPtr& lexer, TString queryUtf8) {
TVector<TString> messages;
for (const auto& issue : Tokenize(lexer, queryUtf8).second) {
messages.emplace_back(issue.ToString(/* oneLine = */ true));
}
return messages;
}

TVector<TString> GetTokenViews(ILexer::TPtr& lexer, TString queryUtf8) {
TVector<TString> names;
for (auto& token : Tokenize(lexer, queryUtf8).first) {
TString view = std::move(token.Name);
if (view == "ID_PLAIN" || view == "STRING_VALUE") {
view.append(" (");
view.append(token.Content);
view.append(")");
}
names.emplace_back(std::move(view));
}
return names;
}

void AssertEquivialent(const TParsedToken& lhs, const TParsedToken& rhs) {
if (lhs.Name == "EOF" && rhs.Name == "EOF") {
return;
}

UNIT_ASSERT_VALUES_EQUAL(lhs.Name, rhs.Name);
UNIT_ASSERT_VALUES_EQUAL(lhs.Content, rhs.Content);
UNIT_ASSERT_VALUES_EQUAL(lhs.Line, rhs.Line);
}

void AssertEquivialent(const TParsedTokenList& lhs, const TParsedTokenList& rhs) {
UNIT_ASSERT_VALUES_EQUAL(lhs.size(), rhs.size());
for (size_t i = 0; i < lhs.size(); ++i) {
AssertEquivialent(lhs.at(i), rhs.at(i));
}
}

Y_UNIT_TEST_SUITE(SQLv1Lexer) {
Y_UNIT_TEST(AntlrVersionIndependent) {
const TVector<TString> queriesUtf8 = {
"",
" ",
"SELECT",
"SEL", // identifier
"SELECT FROM test",
"SELECT * FROM",
" SELECT * FROM ",
"SELECT \"\xF0\x9F\x98\x8A\" FROM ydb",
(
"SELECT \"\xF0\x9F\x98\x8A Hello, друзья\", count, name\n"
"FROM table -- главная таблица 数据库 \n"
"WHERE count < 6\n"
" AND name = \"可靠性\"\n"
" AND count > 12"),
"\"select\"select",
};

auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);

for (const auto& query : queriesUtf8) {
auto [tokens3, issues3] = Tokenize(lexer3, query);
auto [tokens4, issues4] = Tokenize(lexer4, query);
AssertEquivialent(tokens3, tokens4);
UNIT_ASSERT(issues3.Empty());
UNIT_ASSERT(issues4.Empty());
}
}

TVector<TString> InvalidQueries();

void TestInvalidTokensSkipped(bool antlr4, const TVector<TVector<TString>>& expected) {
auto lexer = MakeLexer(/* ansi = */ false, antlr4);

auto input = InvalidQueries();
UNIT_ASSERT_VALUES_EQUAL(input.size(), expected.size());

for (size_t i = 0; i < input.size(); ++i) {
UNIT_ASSERT_VALUES_EQUAL(GetTokenViews(lexer, input[i]), expected[i]);
}
}

TVector<TString> InvalidQueries() {
return {
/* 0: */ "\xF0\x9F\x98\x8A",
/* 1: */ "select \"aaaa",
/* 2: */ "\"\\\"",
/* 3: */ "\xF0\x9F\x98\x8A SELECT * FR",
/* 4: */ "! SELECT * from",
/* 5: */ "\xF0\x9F\x98\x8Aselect ! from",
/* 6: */ "\"",
/* 7: */ "!select",
/* 8: */ "SELECT \\\"\xF0\x9F\x98\x8A\\\" FROM test",
};
}

Y_UNIT_TEST(ErrorRecoveryAntlr3) {
TVector<TVector<TString>> actual = {
/* 0: */ {"EOF"},
/* 1: */ {"SELECT", "WS", "EOF"},
/* 2: */ {"EOF"},
/* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
/* 4: */ {"ID_PLAIN (ELECT)", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
/* 5: */ {"SELECT", "WS", "ID_PLAIN (rom)", "EOF"},
/* 6: */ {"EOF"},
/* 7: */ {"ID_PLAIN (lect)", "EOF"},
/* 8: */ {"SELECT", "WS", "EOF"},
};
TestInvalidTokensSkipped(/* antlr4 = */ false, actual);
}

Y_UNIT_TEST(ErrorRecoveryAntlr4) {
TVector<TVector<TString>> actual = {
/* 0: */ {"EOF"},
/* 1: */ {"SELECT", "WS", "EOF"},
/* 2: */ {"EOF"},
/* 3: */ {"WS", "SELECT", "WS", "ASTERISK", "WS", "ID_PLAIN (FR)", "EOF"},
/* 4: */ {"SELECT", "WS", "ASTERISK", "WS", "WS", "FROM", "EOF"},
/* 5: */ {"SELECT", "WS", "FROM", "EOF"},
/* 6: */ {"EOF"},
/* 7: */ {"ID_PLAIN (elect)", "EOF"},
/* 8: */ {"SELECT", "WS", "EOF"},
};
TestInvalidTokensSkipped(/* antlr4 = */ true, actual);
}

Y_UNIT_TEST(IssuesCollected) {
auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);
auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);

for (const auto& query : InvalidQueries()) {
auto issues3 = GetIssueMessages(lexer3, query);
auto issues4 = GetIssueMessages(lexer4, query);

UNIT_ASSERT(!issues3.empty());
UNIT_ASSERT(!issues4.empty());
}
}

Y_UNIT_TEST(IssueMessagesAntlr3) {
auto lexer3 = MakeLexer(/* ansi = */ false, /* antlr4 = */ false);

auto actual = GetIssueMessages(lexer3, "\xF0\x9F\x98\x8A SELECT * FR");

TVector<TString> expected = {
"<main>:1:0: Error: Unexpected character '\xF0\x9F\x98\x8A' (Unicode character <128522>) : cannot match to any predicted input...",
"<main>:1:1: Error: Unexpected character : cannot match to any predicted input...",
"<main>:1:2: Error: Unexpected character : cannot match to any predicted input...",
"<main>:1:3: Error: Unexpected character : cannot match to any predicted input...",
};

UNIT_ASSERT_VALUES_EQUAL(actual, expected);
}

Y_UNIT_TEST(IssueMessagesAntlr4) {
auto lexer4 = MakeLexer(/* ansi = */ false, /* antlr4 = */ true);

auto actual = GetIssueMessages(lexer4, "\xF0\x9F\x98\x8A SELECT * FR");

TVector<TString> expected = {
"<main>:1:0: Error: token recognition error at: '\xF0\x9F\x98\x8A'",
};

UNIT_ASSERT_VALUES_EQUAL(actual, expected);
}
}
12 changes: 12 additions & 0 deletions ydb/library/yql/sql/v1/lexer/ut/ya.make
@@ -0,0 +1,12 @@
UNITTEST_FOR(ydb/library/yql/sql/v1/lexer)

PEERDIR(
ydb/library/yql/core/issue
ydb/library/yql/parser/lexer_common
)

SRCS(
lexer_ut.cpp
)

END()
4 changes: 4 additions & 0 deletions ydb/library/yql/sql/v1/lexer/ya.make
@@ -17,3 +17,7 @@ SUPPRESSIONS(
)

END()

RECURSE_FOR_TESTS(
ut
)
8 changes: 4 additions & 4 deletions ydb/library/yql/sql/v1/sql_ut_antlr4.cpp
@@ -3004,7 +3004,7 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) {
UNIT_ASSERT(!res.Root);

TString a1 = Err2Str(res);
TString a2(R"foo(<main>:1:16: Error: Unknown cluster: edar
TString a2(R"foo(<main>:1:14: Error: token recognition error at: 'с'
)foo");

UNIT_ASSERT_NO_DIFF(a1, a2);
Expand All @@ -3016,8 +3016,8 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) {
UNIT_ASSERT(!res1.Root);
UNIT_ASSERT(!res2.Root);

UNIT_ASSERT_NO_DIFF(Err2Str(res1), "<main>:1:12: Error: mismatched input 'b' expecting {<EOF>, ';'}\n");
UNIT_ASSERT_NO_DIFF(Err2Str(res2), "<main>:1:12: Error: mismatched input 'b' expecting {<EOF>, ';'}\n");
UNIT_ASSERT_NO_DIFF(Err2Str(res1), "<main>:1:13: Error: token recognition error at: '';'\n");
UNIT_ASSERT_NO_DIFF(Err2Str(res2), "<main>:1:13: Error: token recognition error at: '\";'\n");
}

Y_UNIT_TEST(InvalidHexInStringLiteral) {
@@ -3055,7 +3055,7 @@ Y_UNIT_TEST_SUITE(SqlToYQLErrors) {
Y_UNIT_TEST(InvalidStringFromTable) {
NYql::TAstParseResult res = SqlToYql("select \"FOO\"\"BAR from plato.foo");
UNIT_ASSERT(!res.Root);
UNIT_ASSERT_NO_DIFF(Err2Str(res), "<main>:1:12: Error: mismatched input '\"' expecting {<EOF>, ';'}\n");
UNIT_ASSERT_NO_DIFF(Err2Str(res), "<main>:1:12: Error: token recognition error at: '\"BAR from plato.foo'\n");
}

Y_UNIT_TEST(InvalidDoubleAtStringFromTable) {