From b01426f85840e87db8abbc04ac8cab9d6dd2dc43 Mon Sep 17 00:00:00 2001 From: Juan Carrano Date: Mon, 26 Aug 2019 16:56:33 +0200 Subject: [PATCH 1/6] tests/shell: Test quoting and escaping. Check that single and doule quotes work, along with backslash escaping and that malformed strings are rejected. Right now the test is failing. Then next commit will replace the tokenizer with one that works. --- tests/shell/tests/01-run.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/shell/tests/01-run.py b/tests/shell/tests/01-run.py index be99c9bbab38..29da032dcf4a 100755 --- a/tests/shell/tests/01-run.py +++ b/tests/shell/tests/01-run.py @@ -49,6 +49,11 @@ ('unknown_command', ('shell: command not found: unknown_command')), ('help', EXPECTED_HELP), ('echo a string', ('\"echo\"\"a\"\"string\"')), + ('echo "t\e st" "\\"" '"'\\'' a\ b", ('"echo""te st"""""\'""a b"')), + ('echo a\\', ('shell: incorrect quoting')), + ('echo "', ('shell: incorrect quoting')), + ("echo '", ('shell: incorrect quoting')), + ('echo "\'" \'"\'', ('"echo""\'""""')), ('ps', EXPECTED_PS), ('garbage1234'+CONTROL_C, ('>')), # test cancelling a line ('help', EXPECTED_HELP), From 872d3bb70a737c785102c721b42d62a740d46209 Mon Sep 17 00:00:00 2001 From: Juan Carrano Date: Mon, 26 Aug 2019 16:01:22 +0200 Subject: [PATCH 2/6] sys/shell: refactor tokenizer code. The tokenizer (the code that breaks up the line given to the shell into strings to create argv) was quite a messy piece of code. This commit refactors it into a more traditional state-machine based parser. This fixes the issues with quote handling exposed by the recently introduced test. Not only is the code clearer now, it is also smaller: In the SAMR-21, before: ``` text data bss dec hex filename 10292 140 2612 13044 32f4 tests/shell/bin/samr21-xpro/tests_shell.elf ``` After: ``` text data bss dec hex filename 10268 140 2612 13020 32dc tests/shell/bin/samr21-xpro/tests_shell.elf ``` Further commits can bring this number down even further, but for now the focus in on clarity. --- sys/shell/shell.c | 182 ++++++++++++++++++++++++---------------------- 1 file changed, 94 insertions(+), 88 deletions(-) diff --git a/sys/shell/shell.c b/sys/shell/shell.c index f7915253942d..86a470923d64 100644 --- a/sys/shell/shell.c +++ b/sys/shell/shell.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "shell.h" #include "shell_commands.h" @@ -109,107 +110,112 @@ static void print_help(const shell_command_t *command_list) } } +enum PARSE_STATE { + PARSE_SPACE, + PARSE_UNQUOTED, + PARSE_SINGLEQUOTE, + PARSE_DOUBLEQUOTE, + PARSE_ESCAPE_MASK, + PARSE_UNQUOTED_ESC, + PARSE_SINGLEQUOTE_ESC, + PARSE_DOUBLEQUOTE_ESC, +}; + +static enum PARSE_STATE escape_toggle(enum PARSE_STATE s) +{ + return s ^ PARSE_ESCAPE_MASK; +} + +#define SQUOTE '\'' +#define DQUOTE '"' +#define ESCAPECHAR '\\' +#define BLANK ' ' + static void handle_input_line(const shell_command_t *command_list, char *line) { static const char *INCORRECT_QUOTING = "shell: incorrect quoting"; /* first we need to calculate the number of arguments */ - unsigned argc = 0; - char *pos = line; - int contains_esc_seq = 0; - while (1) { - if ((unsigned char) *pos > ' ') { - /* found an argument */ - if (*pos == '"' || *pos == '\'') { - /* it's a quoted argument */ - const char quote_char = *pos; - do { - ++pos; - if (!*pos) { - puts(INCORRECT_QUOTING); - return; - } - else if (*pos == '\\') { - /* skip over the next character */ - ++contains_esc_seq; - ++pos; - if (!*pos) { - puts(INCORRECT_QUOTING); - return; - } - continue; - } - } while (*pos != quote_char); - if ((unsigned char) pos[1] > ' ') { - puts(INCORRECT_QUOTING); - return; + int argc = 0; + char *readpos = line; + char *writepos = readpos; + enum PARSE_STATE pstate = PARSE_SPACE; + + while (*readpos != '\0') { + switch (pstate) { + case PARSE_SPACE: + if (*readpos != BLANK) { + argc++; } - } - else { - /* it's an unquoted argument */ - do { - if (*pos == '\\') { - /* skip over the next character */ - ++contains_esc_seq; - ++pos; - if (!*pos) { - puts(INCORRECT_QUOTING); - return; - } - } - ++pos; - if (*pos == '"') { - puts(INCORRECT_QUOTING); - return; - } - } while ((unsigned char) *pos > ' '); - } - - /* count the number of arguments we got */ - ++argc; + if (*readpos == SQUOTE) { + pstate = PARSE_SINGLEQUOTE; + } else if (*readpos == DQUOTE) { + pstate = PARSE_DOUBLEQUOTE; + } else if (*readpos == ESCAPECHAR) { + pstate = PARSE_UNQUOTED_ESC; + } else if (*readpos != BLANK) { + pstate = PARSE_UNQUOTED; + break; + } + goto parse_end; + case PARSE_UNQUOTED: + if (*readpos == BLANK) { + pstate = PARSE_SPACE; + *writepos++ = '\0'; + goto parse_end; + } else if (*readpos == ESCAPECHAR) { + pstate = escape_toggle(pstate); + goto parse_end; + } + break; + case PARSE_SINGLEQUOTE: + if (*readpos == SQUOTE) { + pstate = PARSE_SPACE; + *writepos++ = '\0'; + goto parse_end; + } else if (*readpos == ESCAPECHAR) { + pstate = escape_toggle(pstate); + goto parse_end; + } + break; + case PARSE_DOUBLEQUOTE: + if (*readpos == DQUOTE) { + pstate = PARSE_SPACE; + *writepos++ = '\0'; + goto parse_end; + } else if (*readpos == ESCAPECHAR) { + pstate = escape_toggle(pstate); + goto parse_end; + } + break; + default: /* QUOTED state */ + pstate = escape_toggle(pstate); + break; } + *writepos++ = *readpos; +parse_end: + readpos++; + } - /* zero out the current position (space or quotation mark) and advance */ - if (*pos > 0) { - *pos = 0; - ++pos; - } - else { - break; - } + *writepos = '\0'; + + if (pstate != PARSE_SPACE && pstate != PARSE_UNQUOTED) { + puts(INCORRECT_QUOTING); + return; } - if (!argc) { + + if (argc == 0) { return; } /* then we fill the argv array */ - char *argv[argc + 1]; - argv[argc] = NULL; - pos = line; - for (unsigned i = 0; i < argc; ++i) { - while (!*pos) { - ++pos; - } - if (*pos == '"' || *pos == '\'') { - ++pos; - } - argv[i] = pos; - while (*pos) { - ++pos; - } - } - for (char **arg = argv; contains_esc_seq && *arg; ++arg) { - for (char *c = *arg; *c; ++c) { - if (*c != '\\') { - continue; - } - for (char *d = c; *d; ++d) { - *d = d[1]; - } - if (--contains_esc_seq == 0) { - break; - } - } + int collected; + char *argv[argc]; + + readpos = line; + for (collected = 0; collected < argc; collected++) { + argv[collected] = readpos; + readpos += strlen(readpos) + 1; } /* then we call the appropriate handler */ From 87194e4cc9231a824965c4937ba1eb4df656fba6 Mon Sep 17 00:00:00 2001 From: Juan Carrano Date: Mon, 26 Aug 2019 16:28:35 +0200 Subject: [PATCH 3/6] sys/shell: further refactor tokenizer. Factor out common code for quoted and unquoted tokens. This makes the code slighly less clear, but it also eliminates repetition (which may improve clarity). The binary size is decreased again (for samr21): Before: ``` text data bss dec hex filename 10268 140 2612 13020 32dc tests/shell/bin/samr21-xpro/tests_shell.elf ``` After: ``` text data bss dec hex filename 10256 140 2612 13008 32d0 tests/shell/bin/samr21-xpro/tests_shell.elf ``` --- sys/shell/shell.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/sys/shell/shell.c b/sys/shell/shell.c index 86a470923d64..c6140730c3eb 100644 --- a/sys/shell/shell.c +++ b/sys/shell/shell.c @@ -142,6 +142,8 @@ static void handle_input_line(const shell_command_t *command_list, char *line) enum PARSE_STATE pstate = PARSE_SPACE; while (*readpos != '\0') { + char wordbreak; + switch (pstate) { case PARSE_SPACE: if (*readpos != BLANK) { @@ -159,35 +161,23 @@ static void handle_input_line(const shell_command_t *command_list, char *line) } goto parse_end; case PARSE_UNQUOTED: - if (*readpos == BLANK) { - pstate = PARSE_SPACE; - *writepos++ = '\0'; - goto parse_end; - } else if (*readpos == ESCAPECHAR) { - pstate = escape_toggle(pstate); - goto parse_end; - } - break; + wordbreak = BLANK; + goto break_word; case PARSE_SINGLEQUOTE: - if (*readpos == SQUOTE) { - pstate = PARSE_SPACE; - *writepos++ = '\0'; - goto parse_end; - } else if (*readpos == ESCAPECHAR) { - pstate = escape_toggle(pstate); - goto parse_end; - } - break; + wordbreak = SQUOTE; + goto break_word; case PARSE_DOUBLEQUOTE: - if (*readpos == DQUOTE) { + wordbreak = DQUOTE; +break_word: + if (*readpos == wordbreak) { pstate = PARSE_SPACE; *writepos++ = '\0'; - goto parse_end; } else if (*readpos == ESCAPECHAR) { pstate = escape_toggle(pstate); - goto parse_end; + } else { + break; } - break; + goto parse_end; default: /* QUOTED state */ pstate = escape_toggle(pstate); break; From a01620eaa69d9cdc5a681d926316666a88684ff3 Mon Sep 17 00:00:00 2001 From: Juan Carrano Date: Tue, 27 Aug 2019 13:10:18 +0200 Subject: [PATCH 4/6] sys/shell: simplify array traversal code. The code for traversing arrays of shell commands (used to print help messages and to search for commmand handlers) was needlessly complex. The new code is both cleaner and results in smaller binaries (at least for samr21). A substantial part of the improvement in size is due to the following change ``` - printf("%-20s %s\n", "Command", "Description"); - puts("---------------------------------------"); + puts("Command Description\n---------------------------------------"); ``` The way the code is written now means that ifdefs in the middle of the code can be eliminated, further enhancing clarity. Size before: ``` text data bss dec hex filename 10176 140 2612 12928 3280 tests/shell/bin/samr21-xpro/tests_shell.elf ``` After: ``` text data bss dec hex filename 10164 140 2612 12916 3274 tests/shell/bin/samr21-xpro/tests_shell.elf ``` --- sys/shell/shell.c | 79 ++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/sys/shell/shell.c b/sys/shell/shell.c index c6140730c3eb..42700b03ba48 100644 --- a/sys/shell/shell.c +++ b/sys/shell/shell.c @@ -46,6 +46,12 @@ static void _putchar(int c) { #endif #endif +#ifdef MODULE_SHELL_COMMANDS +#define _use_builtin_cmds true +#else +#define _use_builtin_cmds false +#endif + static void flush_if_needed(void) { #ifdef MODULE_NEWLIB @@ -53,60 +59,49 @@ static void flush_if_needed(void) #endif } -static shell_command_handler_t find_handler(const shell_command_t *command_list, char *command) +static shell_command_handler_t search_commands(const shell_command_t *entry, + char *command) { - const shell_command_t *command_lists[] = { - command_list, -#ifdef MODULE_SHELL_COMMANDS - _shell_command_list, -#endif - }; + for (; entry->name != NULL; entry++) { + if (strcmp(entry->name, command) == 0) { + return entry->handler; + } + } + return NULL; +} - /* iterating over command_lists */ - for (unsigned int i = 0; i < ARRAY_SIZE(command_lists); i++) { +static shell_command_handler_t find_handler(const shell_command_t *command_list, char *command) +{ + shell_command_handler_t handler = NULL; - const shell_command_t *entry; + if (command_list != NULL) { + handler = search_commands(command_list, command); + } - if ((entry = command_lists[i])) { - /* iterating over commands in command_lists entry */ - while (entry->name != NULL) { - if (strcmp(entry->name, command) == 0) { - return entry->handler; - } - else { - entry++; - } - } - } + if (handler == NULL && _use_builtin_cmds) { + handler = search_commands(_shell_command_list, command); } - return NULL; + return handler; } -static void print_help(const shell_command_t *command_list) +static void print_commands(const shell_command_t *entry) { - printf("%-20s %s\n", "Command", "Description"); - puts("---------------------------------------"); - - const shell_command_t *command_lists[] = { - command_list, -#ifdef MODULE_SHELL_COMMANDS - _shell_command_list, -#endif - }; + for (; entry->name != NULL; entry++) { + printf("%-20s %s\n", entry->name, entry->desc); + } +} - /* iterating over command_lists */ - for (unsigned int i = 0; i < ARRAY_SIZE(command_lists); i++) { +static void print_help(const shell_command_t *command_list) +{ + puts("Command Description\n---------------------------------------"); - const shell_command_t *entry; + if (command_list != NULL) { + print_commands(command_list); + } - if ((entry = command_lists[i])) { - /* iterating over commands in command_lists entry */ - while (entry->name != NULL) { - printf("%-20s %s\n", entry->name, entry->desc); - entry++; - } - } + if (_use_builtin_cmds) { + print_commands(_shell_command_list); } } From eb38d0167d0e62dab8f9878afc1d4c741adcdbc9 Mon Sep 17 00:00:00 2001 From: Juan Carrano Date: Tue, 27 Aug 2019 18:08:49 +0200 Subject: [PATCH 5/6] fixup! llvm is not that smart in native. I would have expected that the access to _shell_command_list would be elided when the branch was inaccessible, but it did not happen. --- sys/shell/shell.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sys/shell/shell.c b/sys/shell/shell.c index 42700b03ba48..18ca842b8942 100644 --- a/sys/shell/shell.c +++ b/sys/shell/shell.c @@ -47,9 +47,9 @@ static void _putchar(int c) { #endif #ifdef MODULE_SHELL_COMMANDS -#define _use_builtin_cmds true +#define _builtin_cmds _shell_command_list #else -#define _use_builtin_cmds false +#define _builtin_cmds NULL #endif static void flush_if_needed(void) @@ -78,8 +78,8 @@ static shell_command_handler_t find_handler(const shell_command_t *command_list, handler = search_commands(command_list, command); } - if (handler == NULL && _use_builtin_cmds) { - handler = search_commands(_shell_command_list, command); + if (handler == NULL && _builtin_cmds != NULL) { + handler = search_commands(_builtin_cmds, command); } return handler; @@ -100,8 +100,8 @@ static void print_help(const shell_command_t *command_list) print_commands(command_list); } - if (_use_builtin_cmds) { - print_commands(_shell_command_list); + if (_builtin_cmds != NULL) { + print_commands(_builtin_cmds); } } From 1be6a1340ff719a8fdca2e58c62e30529b913054 Mon Sep 17 00:00:00 2001 From: Juan Carrano Date: Wed, 28 Aug 2019 14:26:04 +0200 Subject: [PATCH 6/6] fixup! include state diagram --- sys/shell/shell.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sys/shell/shell.c b/sys/shell/shell.c index 18ca842b8942..c3a1599dde0b 100644 --- a/sys/shell/shell.c +++ b/sys/shell/shell.c @@ -126,6 +126,36 @@ static enum PARSE_STATE escape_toggle(enum PARSE_STATE s) #define ESCAPECHAR '\\' #define BLANK ' ' +/** + * Break input line into words, create argv and call the command handler. + * + * Words are broken up at spaces. A backslash escaped the character that comes + * after (meaning it it taken literally and if it is a space it does not break + * the word). Spaces can also be protected by quoting with double or single + * quotes. + * + State diagram for the tokenizer: +``` + + + ┌───[\]────┐ ┌─────["]────┐ ┌───[']─────┐ ┌───[\]────┐ + ↓ │ ↓ │ │ ↓ │ ↓ + ┏━━━━━━━━━━┓ ┏━┷━━━━━┓ ┏━┷━━━┷━┓ ┏━━━━┷━━┓ ┏━━━━━━━━━━┓ + ┃DQUOTE ESC┃ ┃DQUOTE ┠───["]─>┃SPACE ┃<─[']──┨SQUOTE ┃ ┃SQUOTE ESC┃ + ┗━━━━━━━━┯━┛ ┗━━━━━━┯┛ ┗┯━━━━┯━┛ ┗━┯━━━━━┛ ┗━━━┯━━━━━━┛ + │ ↑ │ │ │ │ ↑(store) │ + │ (store)│ │ ┌─[\]──┘ └──[*]────┐ │ │ │ + └────[*]──▶┴◀[*]┘ │ │ └[*]▶┴◀──────[*]┘ + ↓ ┏━━━━━━━┓ ↓ + ├◀─[\]┨NOQUOTE┃◀──────┼◀─┐ + │ ┗━━━━━┯━┛(store)↑ │ + │ │ │ │ + │ └─[*]─────┘ │ + │ ┏━━━━━━━━━━━┓ │ + └────▶┃NOQUOTE ESC┠──[*]─┘ + ┗━━━━━━━━━━━┛ +``` +*/ static void handle_input_line(const shell_command_t *command_list, char *line) { static const char *INCORRECT_QUOTING = "shell: incorrect quoting";