From 55db94f7959237133268cb72d68139a80112a0c5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 17 May 2022 15:12:36 -0400 Subject: [PATCH 1/5] Cleanup regex_compiler operators and operands --- cpp/src/strings/regex/regcomp.cpp | 298 +++++++++++++++--------------- 1 file changed, 147 insertions(+), 151 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index dd4b4116994..613b2205dbd 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -45,6 +45,7 @@ enum OperatorType { COUNTED_LAZY = 0215, NOP = 0302, // No operation, internal use only }; +#define ITEM_MASK 0300 static reclass ccls_w(CCLASS_W); // \w static reclass ccls_s(CCLASS_S); // \s @@ -541,182 +542,185 @@ class regex_parser { class regex_compiler { reprog& m_prog; - struct Node { + struct and_node { int id_first; int id_last; }; - int cursubid; - int pushsubid; - std::vector andstack; + std::stack and_stack; - struct Ator { + struct re_operator { int t; int subid; }; - std::vector atorstack; + std::stack operator_stack; - bool lastwasand; - int nbra; + bool last_was_and; + int bracket_count; regex_flags flags; - inline void pushand(int f, int l) { andstack.push_back({f, l}); } + inline void push_and(int first, int last) { and_stack.push({first, last}); } - inline Node popand(int op) + inline and_node pop_and(int op) { - if (andstack.size() < 1) { + if (and_stack.empty()) { // missing operand for op - int inst_id = m_prog.add_inst(NOP); - pushand(inst_id, inst_id); + auto const inst_id = m_prog.add_inst(NOP); + push_and(inst_id, inst_id); } - Node node = andstack[andstack.size() - 1]; - andstack.pop_back(); + auto const node = and_stack.top(); + and_stack.pop(); return node; } - inline void pushator(int t) + inline void push_operator(int token, int subid = 0) { - Ator ator; - ator.t = t; - ator.subid = pushsubid; - atorstack.push_back(ator); + operator_stack.push(re_operator{token, subid}); } - inline Ator popator() + inline re_operator const pop_operator() { - Ator ator = atorstack[atorstack.size() - 1]; - atorstack.pop_back(); + auto const ator = operator_stack.top(); + operator_stack.pop(); return ator; } - void evaluntil(int pri) + void eval_until(int min_token) { - Node op1; - Node op2; - int id_inst1 = -1; - int id_inst2 = -1; - while (pri == RBRA || atorstack[atorstack.size() - 1].t >= pri) { - Ator ator = popator(); + while (min_token == RBRA || operator_stack.top().t >= min_token) { + auto const ator = pop_operator(); switch (ator.t) { default: - // unknown operator in evaluntil + // unknown operator break; - case LBRA: /* must have been RBRA */ - op1 = popand('('); - id_inst2 = m_prog.add_inst(RBRA); - m_prog.inst_at(id_inst2).u1.subid = ator.subid; - m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; - id_inst1 = m_prog.add_inst(LBRA); - m_prog.inst_at(id_inst1).u1.subid = ator.subid; - m_prog.inst_at(id_inst1).u2.next_id = op1.id_first; - pushand(id_inst1, id_inst2); + case LBRA: // expects matching RBRA + { + auto const op = pop_and('('); + auto const id_inst2 = m_prog.add_inst(RBRA); + m_prog.inst_at(id_inst2).u1.subid = ator.subid; + m_prog.inst_at(op.id_last).u2.next_id = id_inst2; + auto const id_inst1 = m_prog.add_inst(LBRA); + m_prog.inst_at(id_inst1).u1.subid = ator.subid; + m_prog.inst_at(id_inst1).u2.next_id = op.id_first; + push_and(id_inst1, id_inst2); return; - case OR: - op2 = popand('|'); - op1 = popand('|'); - id_inst2 = m_prog.add_inst(NOP); + } + case OR: { + auto const op2 = pop_and('|'); + auto const op1 = pop_and('|'); + auto const id_inst2 = m_prog.add_inst(NOP); m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; - id_inst1 = m_prog.add_inst(OR); + auto const id_inst1 = m_prog.add_inst(OR); m_prog.inst_at(id_inst1).u1.right_id = op1.id_first; m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - pushand(id_inst1, id_inst2); + push_and(id_inst1, id_inst2); break; - case CAT: - op2 = popand(0); - op1 = popand(0); + } + case CAT: { + auto const op2 = pop_and(0); + auto const op1 = pop_and(0); m_prog.inst_at(op1.id_last).u2.next_id = op2.id_first; - pushand(op1.id_first, op2.id_last); + push_and(op1.id_first, op2.id_last); break; - case STAR: - op2 = popand('*'); - id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u1.right_id = op2.id_first; - pushand(id_inst1, id_inst1); + } + case STAR: { + auto const op = pop_and('*'); + auto const id_inst1 = m_prog.add_inst(OR); + m_prog.inst_at(op.id_last).u2.next_id = id_inst1; + m_prog.inst_at(id_inst1).u1.right_id = op.id_first; + push_and(id_inst1, id_inst1); break; - case STAR_LAZY: - op2 = popand('*'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - pushand(id_inst1, id_inst2); + } + case STAR_LAZY: { + auto const op = pop_and('*'); + auto const id_inst1 = m_prog.add_inst(OR); + auto const id_inst2 = m_prog.add_inst(NOP); + m_prog.inst_at(op.id_last).u2.next_id = id_inst1; + m_prog.inst_at(id_inst1).u2.left_id = op.id_first; + m_prog.inst_at(id_inst1).u1.right_id = id_inst2; + push_and(id_inst1, id_inst2); break; - case PLUS: - op2 = popand('+'); - id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u1.right_id = op2.id_first; - pushand(op2.id_first, id_inst1); + } + case PLUS: { + auto const op = pop_and('+'); + auto const id_inst1 = m_prog.add_inst(OR); + m_prog.inst_at(op.id_last).u2.next_id = id_inst1; + m_prog.inst_at(id_inst1).u1.right_id = op.id_first; + push_and(op.id_first, id_inst1); break; - case PLUS_LAZY: - op2 = popand('+'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - pushand(op2.id_first, id_inst2); + } + case PLUS_LAZY: { + auto const op = pop_and('+'); + auto const id_inst1 = m_prog.add_inst(OR); + auto const id_inst2 = m_prog.add_inst(NOP); + m_prog.inst_at(op.id_last).u2.next_id = id_inst1; + m_prog.inst_at(id_inst1).u2.left_id = op.id_first; + m_prog.inst_at(id_inst1).u1.right_id = id_inst2; + push_and(op.id_first, id_inst2); break; - case QUEST: - op2 = popand('?'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(id_inst1).u2.left_id = id_inst2; - m_prog.inst_at(id_inst1).u1.right_id = op2.id_first; - m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; - pushand(id_inst1, id_inst2); + } + case QUEST: { + auto const op = pop_and('?'); + auto const id_inst1 = m_prog.add_inst(OR); + auto const id_inst2 = m_prog.add_inst(NOP); + m_prog.inst_at(id_inst1).u2.left_id = id_inst2; + m_prog.inst_at(id_inst1).u1.right_id = op.id_first; + m_prog.inst_at(op.id_last).u2.next_id = id_inst2; + push_and(id_inst1, id_inst2); break; - case QUEST_LAZY: - op2 = popand('?'); - id_inst1 = m_prog.add_inst(OR); - id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; - pushand(id_inst1, id_inst2); + } + case QUEST_LAZY: { + auto const op = pop_and('?'); + auto const id_inst1 = m_prog.add_inst(OR); + auto const id_inst2 = m_prog.add_inst(NOP); + m_prog.inst_at(id_inst1).u2.left_id = op.id_first; + m_prog.inst_at(id_inst1).u1.right_id = id_inst2; + m_prog.inst_at(op.id_last).u2.next_id = id_inst2; + push_and(id_inst1, id_inst2); break; + } } } } - void Operator(int t) + void handle_operator(int token, int subid = 0) { - if (t == RBRA && --nbra < 0) + if (token == RBRA && --bracket_count < 0) { // unmatched right paren return; - if (t == LBRA) { - nbra++; - if (lastwasand) Operator(CAT); - } else - evaluntil(t); - if (t != RBRA) pushator(t); - lastwasand = (t == STAR || t == QUEST || t == PLUS || t == STAR_LAZY || t == QUEST_LAZY || - t == PLUS_LAZY || t == RBRA); + } + if (token == LBRA) { + bracket_count++; + if (last_was_and) { handle_operator(CAT, subid); } + } else { + eval_until(token); + } + if (token != RBRA) { push_operator(token, subid); } + + static std::vector tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; + last_was_and = + std::any_of(tokens.begin(), tokens.end(), [token](auto t) { return t == token; }); } - void Operand(int t) + void handle_operand(int token, int subid = 0, char32_t yy = 0, int class_id = 0) { - if (lastwasand) Operator(CAT); /* catenate is implicit */ - int inst_id = m_prog.add_inst(t); - if (t == CCLASS || t == NCCLASS) { - m_prog.inst_at(inst_id).u1.cls_id = yyclass_id; - } else if (t == CHAR) { + if (last_was_and) { handle_operator(CAT, subid); } // catenate is implicit + + auto const inst_id = m_prog.add_inst(token); + if (token == CCLASS || token == NCCLASS) { + m_prog.inst_at(inst_id).u1.cls_id = class_id; + } else if (token == CHAR) { m_prog.inst_at(inst_id).u1.c = yy; - } else if (t == BOL || t == EOL) { + } else if (token == BOL || token == EOL) { m_prog.inst_at(inst_id).u1.c = is_multiline(flags) ? yy : '\n'; } - pushand(inst_id, inst_id); - lastwasand = true; + push_and(inst_id, inst_id); + last_was_and = true; } - char32_t yy; - int yyclass_id; - void expand_counted(const std::vector& in, std::vector& out) { @@ -809,14 +813,7 @@ class regex_compiler { public: regex_compiler(const char32_t* pattern, regex_flags const flags, reprog& prog) - : m_prog(prog), - cursubid(0), - pushsubid(0), - lastwasand(false), - nbra(0), - flags(flags), - yy(0), - yyclass_id(0) + : m_prog(prog), last_was_and(false), bracket_count(0), flags(flags) { // Parse std::vector items; @@ -830,43 +827,42 @@ class regex_compiler { items = parser.m_items; } - /* Start with a low priority operator to prime parser */ - pushator(START - 1); + int cur_subid{}; + int push_subid{}; - for (int i = 0; i < static_cast(items.size()); i++) { - regex_parser::Item item = items[i]; - int token = item.t; - if (token == CCLASS || token == NCCLASS) - yyclass_id = item.d.yyclass_id; - else - yy = item.d.yy; + // Start with a low priority operator + push_operator(START - 1); + + for (auto const item : items) { + auto token = item.t; if (token == LBRA) { - ++cursubid; - pushsubid = cursubid; + ++cur_subid; + push_subid = cur_subid; } else if (token == LBRA_NC) { - pushsubid = 0; - token = LBRA; + push_subid = 0; + token = LBRA; } - if ((token & 0300) == OPERATOR_MASK) - Operator(token); - else - Operand(token); + if ((token & ITEM_MASK) == OPERATOR_MASK) { + handle_operator(token, push_subid); + } else { + handle_operand(token, push_subid, item.d.yy, item.d.yyclass_id); + } } - /* Close with a low priority operator */ - evaluntil(START); - /* Force END */ - Operand(END); - evaluntil(START); - if (nbra) - ; // "unmatched left paren"; - /* points to first and only operand */ - m_prog.set_start_inst(andstack[andstack.size() - 1].id_first); + // Close with a low priority operator + eval_until(START); + // Force END + handle_operand(END, push_subid); + eval_until(START); + + CUDF_EXPECTS(bracket_count == 0, "unmatched left parenthesis"); + + m_prog.set_start_inst(and_stack.top().id_first); m_prog.finalize(); m_prog.check_for_errors(); - m_prog.set_groups_count(cursubid); + m_prog.set_groups_count(cur_subid); } }; From 9f44578d34e89fc7ee8b97b8ed76aecc887c2fe2 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 May 2022 13:08:53 -0400 Subject: [PATCH 2/5] group member variables --- cpp/src/strings/regex/regcomp.cpp | 168 +++++++++++++++--------------- 1 file changed, 82 insertions(+), 86 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 16718c1faeb..85ce83001d0 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -570,56 +570,52 @@ class regex_parser { * @brief The compiler converts class list into instructions. */ class regex_compiler { - reprog& m_prog; - struct and_node { int id_first; int id_last; }; - std::stack and_stack; - struct re_operator { int t; int subid; }; - std::stack operator_stack; - - bool last_was_and; - int bracket_count; - - regex_flags flags; + reprog& _prog; + std::stack _and_stack; + std::stack _operator_stack; + bool _last_was_and; + int _bracket_count; + regex_flags _flags; - inline void push_and(int first, int last) { and_stack.push({first, last}); } + inline void push_and(int first, int last) { _and_stack.push({first, last}); } inline and_node pop_and(int op) { - if (and_stack.empty()) { + if (_and_stack.empty()) { // missing operand for op - auto const inst_id = m_prog.add_inst(NOP); + auto const inst_id = _prog.add_inst(NOP); push_and(inst_id, inst_id); } - auto const node = and_stack.top(); - and_stack.pop(); + auto const node = _and_stack.top(); + _and_stack.pop(); return node; } inline void push_operator(int token, int subid = 0) { - operator_stack.push(re_operator{token, subid}); + _operator_stack.push(re_operator{token, subid}); } inline re_operator const pop_operator() { - auto const ator = operator_stack.top(); - operator_stack.pop(); + auto const ator = _operator_stack.top(); + _operator_stack.pop(); return ator; } void eval_until(int min_token) { - while (min_token == RBRA || operator_stack.top().t >= min_token) { + while (min_token == RBRA || _operator_stack.top().t >= min_token) { auto const ator = pop_operator(); switch (ator.t) { default: @@ -627,88 +623,88 @@ class regex_compiler { break; case LBRA: // expects matching RBRA { - auto const op = pop_and('('); - auto const id_inst2 = m_prog.add_inst(RBRA); - m_prog.inst_at(id_inst2).u1.subid = ator.subid; - m_prog.inst_at(op.id_last).u2.next_id = id_inst2; - auto const id_inst1 = m_prog.add_inst(LBRA); - m_prog.inst_at(id_inst1).u1.subid = ator.subid; - m_prog.inst_at(id_inst1).u2.next_id = op.id_first; + auto const op = pop_and('('); + auto const id_inst2 = _prog.add_inst(RBRA); + _prog.inst_at(id_inst2).u1.subid = ator.subid; + _prog.inst_at(op.id_last).u2.next_id = id_inst2; + auto const id_inst1 = _prog.add_inst(LBRA); + _prog.inst_at(id_inst1).u1.subid = ator.subid; + _prog.inst_at(id_inst1).u2.next_id = op.id_first; push_and(id_inst1, id_inst2); return; } case OR: { - auto const op2 = pop_and('|'); - auto const op1 = pop_and('|'); - auto const id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op2.id_last).u2.next_id = id_inst2; - m_prog.inst_at(op1.id_last).u2.next_id = id_inst2; - auto const id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(id_inst1).u1.right_id = op1.id_first; - m_prog.inst_at(id_inst1).u2.left_id = op2.id_first; + auto const op2 = pop_and('|'); + auto const op1 = pop_and('|'); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(op2.id_last).u2.next_id = id_inst2; + _prog.inst_at(op1.id_last).u2.next_id = id_inst2; + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(id_inst1).u1.right_id = op1.id_first; + _prog.inst_at(id_inst1).u2.left_id = op2.id_first; push_and(id_inst1, id_inst2); break; } case CAT: { - auto const op2 = pop_and(0); - auto const op1 = pop_and(0); - m_prog.inst_at(op1.id_last).u2.next_id = op2.id_first; + auto const op2 = pop_and(0); + auto const op1 = pop_and(0); + _prog.inst_at(op1.id_last).u2.next_id = op2.id_first; push_and(op1.id_first, op2.id_last); break; } case STAR: { - auto const op = pop_and('*'); - auto const id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(op.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u1.right_id = op.id_first; + auto const op = pop_and('*'); + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(op.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u1.right_id = op.id_first; push_and(id_inst1, id_inst1); break; } case STAR_LAZY: { - auto const op = pop_and('*'); - auto const id_inst1 = m_prog.add_inst(OR); - auto const id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u2.left_id = op.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; + auto const op = pop_and('*'); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(op.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u2.left_id = op.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; push_and(id_inst1, id_inst2); break; } case PLUS: { - auto const op = pop_and('+'); - auto const id_inst1 = m_prog.add_inst(OR); - m_prog.inst_at(op.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u1.right_id = op.id_first; + auto const op = pop_and('+'); + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(op.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u1.right_id = op.id_first; push_and(op.id_first, id_inst1); break; } case PLUS_LAZY: { - auto const op = pop_and('+'); - auto const id_inst1 = m_prog.add_inst(OR); - auto const id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(op.id_last).u2.next_id = id_inst1; - m_prog.inst_at(id_inst1).u2.left_id = op.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; + auto const op = pop_and('+'); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(op.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u2.left_id = op.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; push_and(op.id_first, id_inst2); break; } case QUEST: { - auto const op = pop_and('?'); - auto const id_inst1 = m_prog.add_inst(OR); - auto const id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(id_inst1).u2.left_id = id_inst2; - m_prog.inst_at(id_inst1).u1.right_id = op.id_first; - m_prog.inst_at(op.id_last).u2.next_id = id_inst2; + auto const op = pop_and('?'); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(id_inst1).u2.left_id = id_inst2; + _prog.inst_at(id_inst1).u1.right_id = op.id_first; + _prog.inst_at(op.id_last).u2.next_id = id_inst2; push_and(id_inst1, id_inst2); break; } case QUEST_LAZY: { - auto const op = pop_and('?'); - auto const id_inst1 = m_prog.add_inst(OR); - auto const id_inst2 = m_prog.add_inst(NOP); - m_prog.inst_at(id_inst1).u2.left_id = op.id_first; - m_prog.inst_at(id_inst1).u1.right_id = id_inst2; - m_prog.inst_at(op.id_last).u2.next_id = id_inst2; + auto const op = pop_and('?'); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(id_inst1).u2.left_id = op.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; + _prog.inst_at(op.id_last).u2.next_id = id_inst2; push_and(id_inst1, id_inst2); break; } @@ -718,37 +714,37 @@ class regex_compiler { void handle_operator(int token, int subid = 0) { - if (token == RBRA && --bracket_count < 0) { + if (token == RBRA && --_bracket_count < 0) { // unmatched right paren return; } if (token == LBRA) { - bracket_count++; - if (last_was_and) { handle_operator(CAT, subid); } + _bracket_count++; + if (_last_was_and) { handle_operator(CAT, subid); } } else { eval_until(token); } if (token != RBRA) { push_operator(token, subid); } static std::vector tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; - last_was_and = + _last_was_and = std::any_of(tokens.begin(), tokens.end(), [token](auto t) { return t == token; }); } void handle_operand(int token, int subid = 0, char32_t yy = 0, int class_id = 0) { - if (last_was_and) { handle_operator(CAT, subid); } // catenate is implicit + if (_last_was_and) { handle_operator(CAT, subid); } // catenate is implicit - auto const inst_id = m_prog.add_inst(token); + auto const inst_id = _prog.add_inst(token); if (token == CCLASS || token == NCCLASS) { - m_prog.inst_at(inst_id).u1.cls_id = class_id; + _prog.inst_at(inst_id).u1.cls_id = class_id; } else if (token == CHAR) { - m_prog.inst_at(inst_id).u1.c = yy; + _prog.inst_at(inst_id).u1.c = yy; } else if (token == BOL || token == EOL) { - m_prog.inst_at(inst_id).u1.c = is_multiline(flags) ? yy : '\n'; + _prog.inst_at(inst_id).u1.c = is_multiline(_flags) ? yy : '\n'; } push_and(inst_id, inst_id); - last_was_and = true; + _last_was_and = true; } std::vector expand_counted(std::vector const& in) @@ -819,11 +815,11 @@ class regex_compiler { public: regex_compiler(const char32_t* pattern, regex_flags const flags, reprog& prog) - : m_prog(prog), last_was_and(false), bracket_count(0), flags(flags) + : _prog(prog), _last_was_and(false), _bracket_count(0), _flags(flags) { // Parse std::vector const items = [&] { - regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, m_prog); + regex_parser parser(pattern, is_dotall(flags) ? ANYNL : ANY, _prog); return parser.m_has_counted ? expand_counted(parser.m_items) : parser.m_items; }(); @@ -857,12 +853,12 @@ class regex_compiler { handle_operand(END, push_subid); eval_until(START); - CUDF_EXPECTS(bracket_count == 0, "unmatched left parenthesis"); + CUDF_EXPECTS(_bracket_count == 0, "unmatched left parenthesis"); - m_prog.set_start_inst(and_stack.top().id_first); - m_prog.finalize(); - m_prog.check_for_errors(); - m_prog.set_groups_count(cur_subid); + _prog.set_start_inst(_and_stack.top().id_first); + _prog.finalize(); + _prog.check_for_errors(); + _prog.set_groups_count(cur_subid); } }; From 5d53290f1e8d1977b870ee5eba0e3a071cc54bd4 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 May 2022 15:28:58 -0400 Subject: [PATCH 3/5] removed unused parameter from pop_and() --- cpp/src/strings/regex/regcomp.cpp | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 85ce83001d0..3d82471445a 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -589,10 +589,9 @@ class regex_compiler { inline void push_and(int first, int last) { _and_stack.push({first, last}); } - inline and_node pop_and(int op) + inline and_node pop_and() { if (_and_stack.empty()) { - // missing operand for op auto const inst_id = _prog.add_inst(NOP); push_and(inst_id, inst_id); } @@ -623,7 +622,7 @@ class regex_compiler { break; case LBRA: // expects matching RBRA { - auto const op = pop_and('('); + auto const op = pop_and(); auto const id_inst2 = _prog.add_inst(RBRA); _prog.inst_at(id_inst2).u1.subid = ator.subid; _prog.inst_at(op.id_last).u2.next_id = id_inst2; @@ -634,8 +633,8 @@ class regex_compiler { return; } case OR: { - auto const op2 = pop_and('|'); - auto const op1 = pop_and('|'); + auto const op2 = pop_and(); + auto const op1 = pop_and(); auto const id_inst2 = _prog.add_inst(NOP); _prog.inst_at(op2.id_last).u2.next_id = id_inst2; _prog.inst_at(op1.id_last).u2.next_id = id_inst2; @@ -646,14 +645,14 @@ class regex_compiler { break; } case CAT: { - auto const op2 = pop_and(0); - auto const op1 = pop_and(0); + auto const op2 = pop_and(); + auto const op1 = pop_and(); _prog.inst_at(op1.id_last).u2.next_id = op2.id_first; push_and(op1.id_first, op2.id_last); break; } case STAR: { - auto const op = pop_and('*'); + auto const op = pop_and(); auto const id_inst1 = _prog.add_inst(OR); _prog.inst_at(op.id_last).u2.next_id = id_inst1; _prog.inst_at(id_inst1).u1.right_id = op.id_first; @@ -661,7 +660,7 @@ class regex_compiler { break; } case STAR_LAZY: { - auto const op = pop_and('*'); + auto const op = pop_and(); auto const id_inst1 = _prog.add_inst(OR); auto const id_inst2 = _prog.add_inst(NOP); _prog.inst_at(op.id_last).u2.next_id = id_inst1; @@ -671,7 +670,7 @@ class regex_compiler { break; } case PLUS: { - auto const op = pop_and('+'); + auto const op = pop_and(); auto const id_inst1 = _prog.add_inst(OR); _prog.inst_at(op.id_last).u2.next_id = id_inst1; _prog.inst_at(id_inst1).u1.right_id = op.id_first; @@ -679,7 +678,7 @@ class regex_compiler { break; } case PLUS_LAZY: { - auto const op = pop_and('+'); + auto const op = pop_and(); auto const id_inst1 = _prog.add_inst(OR); auto const id_inst2 = _prog.add_inst(NOP); _prog.inst_at(op.id_last).u2.next_id = id_inst1; @@ -689,7 +688,7 @@ class regex_compiler { break; } case QUEST: { - auto const op = pop_and('?'); + auto const op = pop_and(); auto const id_inst1 = _prog.add_inst(OR); auto const id_inst2 = _prog.add_inst(NOP); _prog.inst_at(id_inst1).u2.left_id = id_inst2; @@ -699,7 +698,7 @@ class regex_compiler { break; } case QUEST_LAZY: { - auto const op = pop_and('?'); + auto const op = pop_and(); auto const id_inst1 = _prog.add_inst(OR); auto const id_inst2 = _prog.add_inst(NOP); _prog.inst_at(id_inst1).u2.left_id = op.id_first; From 203f9e9c9299c98f9fa6b1dcd85e31df294391c9 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 25 May 2022 15:59:53 -0400 Subject: [PATCH 4/5] use cbegin/cend for any_of call --- cpp/src/strings/regex/regcomp.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 3d82471445a..48533ff2ae1 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -727,7 +727,7 @@ class regex_compiler { static std::vector tokens{STAR, STAR_LAZY, QUEST, QUEST_LAZY, PLUS, PLUS_LAZY, RBRA}; _last_was_and = - std::any_of(tokens.begin(), tokens.end(), [token](auto t) { return t == token; }); + std::any_of(tokens.cbegin(), tokens.cend(), [token](auto t) { return t == token; }); } void handle_operand(int token, int subid = 0, char32_t yy = 0, int class_id = 0) From 848c64c485623be04bfbf5afcb7dacbee397c3f8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 26 May 2022 13:03:59 -0400 Subject: [PATCH 5/5] remove ator name --- cpp/src/strings/regex/regcomp.cpp | 114 +++++++++++++++--------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp index 48533ff2ae1..fdf4609e336 100644 --- a/cpp/src/strings/regex/regcomp.cpp +++ b/cpp/src/strings/regex/regcomp.cpp @@ -607,103 +607,103 @@ class regex_compiler { inline re_operator const pop_operator() { - auto const ator = _operator_stack.top(); + auto const op = _operator_stack.top(); _operator_stack.pop(); - return ator; + return op; } void eval_until(int min_token) { while (min_token == RBRA || _operator_stack.top().t >= min_token) { - auto const ator = pop_operator(); - switch (ator.t) { + auto const op = pop_operator(); + switch (op.t) { default: // unknown operator break; case LBRA: // expects matching RBRA { - auto const op = pop_and(); - auto const id_inst2 = _prog.add_inst(RBRA); - _prog.inst_at(id_inst2).u1.subid = ator.subid; - _prog.inst_at(op.id_last).u2.next_id = id_inst2; - auto const id_inst1 = _prog.add_inst(LBRA); - _prog.inst_at(id_inst1).u1.subid = ator.subid; - _prog.inst_at(id_inst1).u2.next_id = op.id_first; + auto const operand = pop_and(); + auto const id_inst2 = _prog.add_inst(RBRA); + _prog.inst_at(id_inst2).u1.subid = op.subid; + _prog.inst_at(operand.id_last).u2.next_id = id_inst2; + auto const id_inst1 = _prog.add_inst(LBRA); + _prog.inst_at(id_inst1).u1.subid = op.subid; + _prog.inst_at(id_inst1).u2.next_id = operand.id_first; push_and(id_inst1, id_inst2); return; } case OR: { - auto const op2 = pop_and(); - auto const op1 = pop_and(); - auto const id_inst2 = _prog.add_inst(NOP); - _prog.inst_at(op2.id_last).u2.next_id = id_inst2; - _prog.inst_at(op1.id_last).u2.next_id = id_inst2; - auto const id_inst1 = _prog.add_inst(OR); - _prog.inst_at(id_inst1).u1.right_id = op1.id_first; - _prog.inst_at(id_inst1).u2.left_id = op2.id_first; + auto const operand2 = pop_and(); + auto const operand1 = pop_and(); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(operand2.id_last).u2.next_id = id_inst2; + _prog.inst_at(operand1.id_last).u2.next_id = id_inst2; + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(id_inst1).u1.right_id = operand1.id_first; + _prog.inst_at(id_inst1).u2.left_id = operand2.id_first; push_and(id_inst1, id_inst2); break; } case CAT: { - auto const op2 = pop_and(); - auto const op1 = pop_and(); - _prog.inst_at(op1.id_last).u2.next_id = op2.id_first; - push_and(op1.id_first, op2.id_last); + auto const operand2 = pop_and(); + auto const operand1 = pop_and(); + _prog.inst_at(operand1.id_last).u2.next_id = operand2.id_first; + push_and(operand1.id_first, operand2.id_last); break; } case STAR: { - auto const op = pop_and(); - auto const id_inst1 = _prog.add_inst(OR); - _prog.inst_at(op.id_last).u2.next_id = id_inst1; - _prog.inst_at(id_inst1).u1.right_id = op.id_first; + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u1.right_id = operand.id_first; push_and(id_inst1, id_inst1); break; } case STAR_LAZY: { - auto const op = pop_and(); - auto const id_inst1 = _prog.add_inst(OR); - auto const id_inst2 = _prog.add_inst(NOP); - _prog.inst_at(op.id_last).u2.next_id = id_inst1; - _prog.inst_at(id_inst1).u2.left_id = op.id_first; - _prog.inst_at(id_inst1).u1.right_id = id_inst2; + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u2.left_id = operand.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; push_and(id_inst1, id_inst2); break; } case PLUS: { - auto const op = pop_and(); - auto const id_inst1 = _prog.add_inst(OR); - _prog.inst_at(op.id_last).u2.next_id = id_inst1; - _prog.inst_at(id_inst1).u1.right_id = op.id_first; - push_and(op.id_first, id_inst1); + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u1.right_id = operand.id_first; + push_and(operand.id_first, id_inst1); break; } case PLUS_LAZY: { - auto const op = pop_and(); - auto const id_inst1 = _prog.add_inst(OR); - auto const id_inst2 = _prog.add_inst(NOP); - _prog.inst_at(op.id_last).u2.next_id = id_inst1; - _prog.inst_at(id_inst1).u2.left_id = op.id_first; - _prog.inst_at(id_inst1).u1.right_id = id_inst2; - push_and(op.id_first, id_inst2); + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(operand.id_last).u2.next_id = id_inst1; + _prog.inst_at(id_inst1).u2.left_id = operand.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; + push_and(operand.id_first, id_inst2); break; } case QUEST: { - auto const op = pop_and(); - auto const id_inst1 = _prog.add_inst(OR); - auto const id_inst2 = _prog.add_inst(NOP); - _prog.inst_at(id_inst1).u2.left_id = id_inst2; - _prog.inst_at(id_inst1).u1.right_id = op.id_first; - _prog.inst_at(op.id_last).u2.next_id = id_inst2; + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(id_inst1).u2.left_id = id_inst2; + _prog.inst_at(id_inst1).u1.right_id = operand.id_first; + _prog.inst_at(operand.id_last).u2.next_id = id_inst2; push_and(id_inst1, id_inst2); break; } case QUEST_LAZY: { - auto const op = pop_and(); - auto const id_inst1 = _prog.add_inst(OR); - auto const id_inst2 = _prog.add_inst(NOP); - _prog.inst_at(id_inst1).u2.left_id = op.id_first; - _prog.inst_at(id_inst1).u1.right_id = id_inst2; - _prog.inst_at(op.id_last).u2.next_id = id_inst2; + auto const operand = pop_and(); + auto const id_inst1 = _prog.add_inst(OR); + auto const id_inst2 = _prog.add_inst(NOP); + _prog.inst_at(id_inst1).u2.left_id = operand.id_first; + _prog.inst_at(id_inst1).u1.right_id = id_inst2; + _prog.inst_at(operand.id_last).u2.next_id = id_inst2; push_and(id_inst1, id_inst2); break; }