EQL: Allow Unicode escape sequences in strings

Occasionally, it's useful to be able to use non-printable, RTL (right-to-left) or other non-standard Unicode characters in an EQL query. Introducing the standard \uXXXX escape sequence as well as the variable 2-8 char escape sequence \u{XXXXXXXX}, e.g.: ``` \u0023 \u{35} \u{1f2da} \u{002acd1} ``` Closes: elastic#62832
matriv · Mar 17, 2021 · 07c0620 · 07c0620
1 parent 8a9055a
commit 07c0620
Show file tree

Hide file tree

Showing 6 changed files with 507 additions and 392 deletions.
diff --git a/x-pack/plugin/eql/qa/common/src/main/java/org/elasticsearch/test/eql/EqlRestTestCase.java b/x-pack/plugin/eql/qa/common/src/main/java/org/elasticsearch/test/eql/EqlRestTestCase.java
@@ -106,6 +106,45 @@ public void testIndexWildcardPatterns() throws Exception {
         deleteIndex("test2");
     }
 
+    /**
+     * Indexes two documents whose {@code log} field contains a non-ASCII character and checks that each one
+     * is found by an EQL query that spells that character as a unicode escape sequence: first with the
+     * four hex digit form, then with the curly brace form.
+     */
+    @SuppressWarnings("unchecked")
+    public void testUnicodeChars() throws Exception {
+        createIndex("test", Settings.EMPTY, null, null);
+
+        // Bulk payload: an action line followed by the document source, for ids 1 and 2
+        String docs = "{\"index\": {\"_index\": \"test\", \"_id\": 1}}\n"
+            + "{\"event\":{\"category\":\"process\"},\"@timestamp\":\"2020-09-04T12:34:56Z\",\"log\" : \"prefix_ë_suffix\"}\n"
+            + "{\"index\": {\"_index\": \"test\", \"_id\": 2}}\n"
+            + "{\"event\":{\"category\":\"process\"},\"@timestamp\":\"2020-09-05T12:34:57Z\",\"log\" : \"prefix_𖠋_suffix\"}\n";
+        bulkIndex(docs);
+
+        // The same request object is reused for both queries, only its JSON body changes
+        Request search = new Request("GET", "/test/_eql/search");
+
+        // Four hex digit escape form, expected to match document 1 only
+        search.setJsonEntity("{\"query\":\"process where log==\\\"prefix_\\\\u00eb_suffix\\\"\"}");
+        Response firstResponse = client().performRequest(search);
+        Map<String, Object> firstBody;
+        try (InputStream in = firstResponse.getEntity().getContent()) {
+            firstBody = XContentHelper.convertToMap(JsonXContent.jsonXContent, in, false);
+        }
+        Map<String, Object> firstHits = (Map<String, Object>) firstBody.get("hits");
+        List<Map<String, Object>> firstEvents = (List<Map<String, Object>>) firstHits.get("events");
+        assertEquals(1, firstEvents.size());
+        assertEquals("1", firstEvents.get(0).get("_id"));
+
+        // Curly brace escape form, expected to match document 2 only
+        search.setJsonEntity("{\"query\":\"process where log==\\\"prefix_\\\\u{01680b}_suffix\\\"\"}");
+        Response secondResponse = client().performRequest(search);
+        Map<String, Object> secondBody;
+        try (InputStream in = secondResponse.getEntity().getContent()) {
+            secondBody = XContentHelper.convertToMap(JsonXContent.jsonXContent, in, false);
+        }
+        Map<String, Object> secondHits = (Map<String, Object>) secondBody.get("hits");
+        List<Map<String, Object>> secondEvents = (List<Map<String, Object>>) secondHits.get("events");
+        assertEquals(1, secondEvents.size());
+        assertEquals("2", secondEvents.get(0).get("_id"));
+
+        deleteIndex("test");
+    }
+
     private void bulkIndex(String bulk) throws IOException {
         Request bulkRequest = new Request("POST", "/_bulk");
         bulkRequest.setJsonEntity(bulk);

diff --git a/x-pack/plugin/eql/src/main/antlr/EqlBase.g4 b/x-pack/plugin/eql/src/main/antlr/EqlBase.g4
@@ -205,12 +205,30 @@ LP: '(';
 RP: ')';
 PIPE: '|';
 
+// A backslash followed by exactly one of: b, t, n, f, r, a double quote, a single quote or a backslash.
+fragment STRING_ESCAPE
+    : '\\' [btnfr"'\\]
+    ;
+
+// A single hexadecimal digit, in lower or upper case.
+fragment HEX_DIGIT
+    : [0-9abcdefABCDEF]
+    ;
+
+// Unicode escape sequence: backslash-u followed by exactly four hex digits, or by hex digits in curly braces.
+fragment UNICODE_ESCAPE
+    : '\\u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
+    | '\\u' '{' HEX_DIGIT+  '}' // one or more digits accepted here; the 2-8 digit limit is checked in AbstractBuilder
+    ;
+
+// Any single character except carriage return, line feed, double quote and backslash.
+fragment UNESCAPED_CHARS
+    : ~[\r\n"\\]
+    ;
+
+// String literal token. Alternatives, in order:
+//  - double-quoted text built from backslash escapes, unicode escapes and unescaped characters
+//  - triple-double-quoted text without line breaks (non-greedy), optionally followed by up to two extra double quotes
+//  - single-quoted and ?-prefixed forms
 STRING
-    : '\''  ('\\' [btnfr"'\\] | ~[\r\n'\\])* '\''
-    | '"'   ('\\' [btnfr"'\\] | ~[\r\n"\\])* '"'
+    : '"' (STRING_ESCAPE | UNICODE_ESCAPE | UNESCAPED_CHARS)* '"'
+    | '"""' (~[\r\n])*? '"""' '"'? '"'?
+    // Old style quoting of string, handled as errors in AbstractBuilder
+    | '\''  ('\\' [btnfr"'\\] | ~[\r\n'\\])* '\''
     | '?"'  ('\\"' |~["\r\n])* '"'
     | '?\'' ('\\\'' |~['\r\n])* '\''
-    | '"""' (~[\r\n])*? '"""' '"'? '"'?
     ;
 
 INTEGER_VALUE

diff --git a/x-pack/plugin/eql/src/main/java/org/elasticsearch/xpack/eql/parser/AbstractBuilder.java b/x-pack/plugin/eql/src/main/java/org/elasticsearch/xpack/eql/parser/AbstractBuilder.java
@@ -137,7 +137,8 @@ public static String unquoteString(Source source) {
         checkForSingleQuotedString(source, text, 0);
 
         text = text.substring(1, text.length() - 1);
-        StringBuffer resultString = new StringBuffer();
+        text = handleUnicodeChars(source, text);
+        StringBuilder resultString = new StringBuilder();
         Matcher regexMatcher = slashPattern.matcher(text);
 
         while (regexMatcher.find()) {
@@ -183,6 +184,50 @@ public static String unquoteString(Source source) {
         return resultString.toString();
     }
 
+    private static String handleUnicodeChars(Source source, String text) {
+        StringBuilder sb = new StringBuilder();
+
+        int startIdx = 0;
+        int endIdx = 0;
+        int idx = text.indexOf("\\u");
+        while (idx >= 0) {
+            String fullSequence;
+            String unicodeSequence;
+            if (text.charAt(idx + 2) == '{') {
+                endIdx = text.indexOf("}", idx + 1) + 1;
+                unicodeSequence = text.substring(idx + 3, endIdx - 1);
+                int length = unicodeSequence.length();
+                if (length < 2 || length > 8) {
+                    throw new ParsingException(source, "Unicode sequence in curly braces should use [2-8] hex digits, [{}] has [{}]",
+                            text.substring(idx, endIdx), length);
+                }
+                unicodeSequence = text.substring(idx + 3, endIdx - 1);
+            } else {
+                endIdx = idx + 6;
+                unicodeSequence = text.substring(idx + 2, endIdx);
+            }
+            sb.append(text, startIdx, idx).append(hexToUnicode(source, unicodeSequence));
+            idx = text.indexOf("\\u", endIdx);
+            startIdx = endIdx;
+        }
+        if (endIdx < text.length()) {
+            sb.append(text.substring(endIdx));
+        }
+        return sb.toString();
+    }
+
+    private static String hexToUnicode(Source source, String hex) {
+        int code = Integer.parseInt(hex, 16);
+        if (code == 0) {
+            throw new ParsingException(source, "Unicode sequence results in null");
+        }
+        try {
+            return String.valueOf(Character.toChars(code));
+        } catch (IllegalArgumentException e) {
+            throw new ParsingException(source, "Invalid unicode character code [{}]", hex);
+        }
+    }
+
     private static void checkForSingleQuotedString(Source source, String text, int i) {
         if (text.charAt(i) == '\'') {
             throw new ParsingException(source,