diff --git a/sqlglot/dialects/clickhouse.py b/sqlglot/dialects/clickhouse.py
index 9e7086f153..911e8c51d2 100644
--- a/sqlglot/dialects/clickhouse.py
+++ b/sqlglot/dialects/clickhouse.py
@@ -166,6 +166,8 @@ class ClickHouse(Dialect):
     LOG_BASE_FIRST: t.Optional[bool] = None
     FORCE_EARLY_ALIAS_REF_EXPANSION = True
     PRESERVE_ORIGINAL_NAMES = True
+    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = True
+    IDENTIFIERS_CAN_START_WITH_DIGIT = True
 
     # https://github.com/ClickHouse/ClickHouse/issues/33935#issue-1112165779
     NORMALIZATION_STRATEGY = NormalizationStrategy.CASE_SENSITIVE
diff --git a/sqlglot/dialects/dialect.py b/sqlglot/dialects/dialect.py
index 58172ef3a6..392a0d86fc 100644
--- a/sqlglot/dialects/dialect.py
+++ b/sqlglot/dialects/dialect.py
@@ -420,6 +420,9 @@ class Dialect(metaclass=_Dialect):
     SUPPORTS_VALUES_DEFAULT = True
     """Whether the DEFAULT keyword is supported in the VALUES clause."""
 
+    NUMBERS_CAN_BE_UNDERSCORE_SEPARATED = False
+    """Whether number literals can include underscores for better readability."""
+
     REGEXP_EXTRACT_DEFAULT_GROUP = 0
     """The default value for the capturing group."""
 
diff --git a/sqlglot/tokens.py b/sqlglot/tokens.py
index ef7abd818f..192c293c97 100644
--- a/sqlglot/tokens.py
+++ b/sqlglot/tokens.py
@@ -997,6 +997,7 @@ def __init__(self, dialect: DialectType = None) -> None:
             self._rs_dialect_settings = RsTokenizerDialectSettings(
                 unescaped_sequences=self.dialect.UNESCAPED_SEQUENCES,
                 identifiers_can_start_with_digit=self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT,
+                numbers_can_be_underscore_separated=self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED,
             )
 
         self.reset()
@@ -1300,8 +1301,17 @@ def _scan_number(self) -> None:
                     self._add(TokenType.NUMBER, number_text)
                     self._add(TokenType.DCOLON, "::")
                     return self._add(token_type, literal)
-                elif self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
-                    return self._add(TokenType.VAR)
+                else:
+                    # Drop readability underscores (e.g. 1_000 -> 1000) before the digit check
+                    replaced = literal.replace("_", "")
+                    if (
+                        self.dialect.NUMBERS_CAN_BE_UNDERSCORE_SEPARATED
+                        and replaced.isascii()
+                        and replaced.isdecimal()
+                    ):
+                        return self._add(TokenType.NUMBER, number_text + replaced)
+                    if self.dialect.IDENTIFIERS_CAN_START_WITH_DIGIT:
+                        return self._add(TokenType.VAR)
 
                 self._advance(-len(literal))
                 return self._add(TokenType.NUMBER, number_text)
diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs
index 7bc4882e3b..d49776d5db 100644
--- a/sqlglotrs/src/settings.rs
+++ b/sqlglotrs/src/settings.rs
@@ -171,6 +171,7 @@ impl TokenizerSettings {
 pub struct TokenizerDialectSettings {
     pub unescaped_sequences: HashMap<String, String>,
     pub identifiers_can_start_with_digit: bool,
+    pub numbers_can_be_underscore_separated: bool,
 }
 
 #[pymethods]
@@ -179,10 +180,12 @@ impl TokenizerDialectSettings {
     pub fn new(
         unescaped_sequences: HashMap<String, String>,
         identifiers_can_start_with_digit: bool,
+        numbers_can_be_underscore_separated: bool,
     ) -> Self {
         TokenizerDialectSettings {
             unescaped_sequences,
             identifiers_can_start_with_digit,
+            numbers_can_be_underscore_separated,
         }
     }
 }
diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs
index 8228b5a8c5..9aec50f677 100644
--- a/sqlglotrs/src/tokenizer.rs
+++ b/sqlglotrs/src/tokenizer.rs
@@ -531,10 +531,15 @@ impl<'a> TokenizerState<'a> {
                     )
                     .map(|x| *x);
 
+                // Underscore-free copy of the literal, used to recognise numbers such as 1_000.
+                let replaced = literal.replace("_", "");
+
                 if let Some(unwrapped_token_type) = token_type {
                     self.add(self.token_types.number, Some(number_text))?;
                     self.add(self.token_types.dcolon, Some("::".to_string()))?;
                     self.add(unwrapped_token_type, Some(literal))?;
+                } else if self.dialect_settings.numbers_can_be_underscore_separated && self.is_numeric(&replaced) {
+                    self.add(self.token_types.number, Some(number_text + &replaced))?;
                 } else if self.dialect_settings.identifiers_can_start_with_digit {
                     self.add(self.token_types.var, None)?;
                 } else {
@@ -706,6 +711,11 @@ impl<'a> TokenizerState<'a> {
         )
     }
 
+    /// Returns true when `s` is non-empty and consists solely of ASCII digits.
+    fn is_numeric(&self, s: &str) -> bool {
+        !s.is_empty() && s.chars().all(|c| c.is_ascii_digit())
+    }
+
     fn extract_value(&mut self) -> Result<String, TokenizerError> {
         loop {
             if !self.peek_char.is_whitespace()
diff --git a/tests/dialects/test_clickhouse.py b/tests/dialects/test_clickhouse.py
index d3d363eac6..49705d51fa 100644
--- a/tests/dialects/test_clickhouse.py
+++ b/tests/dialects/test_clickhouse.py
@@ -549,6 +549,9 @@ def test_clickhouse(self):
             "SELECT name FROM data WHERE NOT ((SELECT DISTINCT name FROM data) IS NULL)",
         )
 
+        self.validate_identity("SELECT 1_2_3_4_5", "SELECT 12345")
+        self.validate_identity("SELECT 1_b")
+
     def test_clickhouse_values(self):
         values = exp.select("*").from_(
             exp.values([exp.tuple_(1, 2, 3)], alias="subq", columns=["a", "b", "c"])
diff --git a/tests/dialects/test_hive.py b/tests/dialects/test_hive.py
index c569d9657f..d0b881e412 100644
--- a/tests/dialects/test_hive.py
+++ b/tests/dialects/test_hive.py
@@ -806,6 +806,8 @@ def test_hive(self):
             },
         )
 
+        self.validate_identity("SELECT 1_2")
+
     def test_escapes(self) -> None:
         self.validate_identity("'\n'", "'\\n'")
         self.validate_identity("'\\n'")