From c8d726879d46cb0e63652a504e2471238207f4a2 Mon Sep 17 00:00:00 2001 From: keyuchang Date: Mon, 21 Nov 2022 14:39:07 +0800 Subject: [PATCH] *: refactor the yaccgo --- Parser/Lex.go | 45 +++++++++++++++----- Parser/Parser.go | 80 ++++++++++++++++++++++++++++++---- Parser/Parser_test.go | 99 +++++++++++++++++++++++++++++++++++++++++++ Parser/Vistor.go | 6 +-- proposal/202211.md | 30 +++++++++++++ 5 files changed, 237 insertions(+), 23 deletions(-) create mode 100644 proposal/202211.md diff --git a/Parser/Lex.go b/Parser/Lex.go index bbd6956..cfca811 100644 --- a/Parser/Lex.go +++ b/Parser/Lex.go @@ -239,9 +239,9 @@ func ActionState(l *lexer) stateFn { case unicode.IsDigit(r): l.acceptRun("0123456789") l.emit(ActionN) - case l.acceptWord("accept"): //$accept + case l.acceptOnlyAlphaWord("accept"): //$accept l.emit(ActionAccept) - case l.acceptWord("end"): //$end + case l.acceptOnlyAlphaWord("end"): //$end l.emit(ActionEnd) default: l.error("Action lexer error") @@ -265,31 +265,31 @@ func DirectiveState(l *lexer) stateFn { } func DirectiveOtherState(l *lexer) stateFn { - if l.acceptWord("type") { + if l.acceptOnlyAlphaWord("type") { l.emit(TypeDirective) } - if l.acceptWord("token") { + if l.acceptOnlyAlphaWord("token") { l.emit(TokenDirective) } - if l.acceptWord("union") { + if l.acceptOnlyAlphaWord("union") { return DirectiveUnionState } - if l.acceptWord("left") { + if l.acceptOnlyAlphaWord("left") { l.emit(LeftAssoc) } - if l.acceptWord("right") { + if l.acceptOnlyAlphaWord("right") { l.emit(RightAssoc) } - if l.acceptWord("nonassoc") { + if l.acceptOnlyAlphaWord("nonassoc") { l.emit(NoneAssoc) } - if l.acceptWord("prec") { + if l.acceptOnlyAlphaWord("prec") { l.emit(PrecDirective) } - if l.acceptWord("precedence") { + if l.acceptOnlyAlphaWord("precedence") { l.emit(Precedence) } - if l.acceptWord("start") { + if l.acceptOnlyAlphaWord("start") { l.emit(StartDirective) } return rootState @@ -455,6 +455,29 @@ func (l *lexer) acceptRun(valid string) { l.backup() } 
+func (l *lexer) acceptOnlyAlphaWord(word string) bool { + pos, loc, prev := l.end, l.loc, l.prev + + // Skip spaces (U+0020) if any + r := l.peek() + for ; r == ' '; r = l.peek() { + l.next() + } + + for _, ch := range word { + if l.next() != ch { + l.end, l.loc, l.prev = pos, loc, prev + return false + } + } + if r = l.peek(); unicode.IsLetter(r) { + l.end, l.loc, l.prev = pos, loc, prev + return false + } + + return true +} + func (l *lexer) acceptWord(word string) bool { pos, loc, prev := l.end, l.loc, l.prev diff --git a/Parser/Parser.go b/Parser/Parser.go index 0195bcb..0252f26 100644 --- a/Parser/Parser.go +++ b/Parser/Parser.go @@ -16,6 +16,9 @@ type parser struct { peekCount int pos int err error + + // map Def + TokenDefMap map[string]bool } type Node interface { @@ -130,11 +133,12 @@ func Parse(input string) (*RootNode, error) { lex: Lex(input), pos: 0, } + p.TokenDefMap = make(map[string]bool) var nodeDeclare Node if nodeDeclare = p.parseDeclare(); nodeDeclare == nil { return nil, fmt.Errorf("do not has declare %s", p.err) } - + decl, _ := nodeDeclare.(*DeclareNode) if !p.current.Is(Section) { return nil, fmt.Errorf( fmt.Sprintf("parser error! %s", p.current.Value), @@ -143,14 +147,14 @@ func Parse(input string) (*RootNode, error) { RuDlist := make([]RuleDef, 0) p.next() // get the first identify for { - if ruleslice := p.parseRule(); ruleslice == nil { + if ruleslice := p.parseRule(&decl.TokenDefList); ruleslice == nil { break } else { RuDlist = append(RuDlist, ruleslice...) 
} } - if !p.current.Is(Section) { + if !p.current.Is(Section) && !p.current.Is(EOF) { return nil, fmt.Errorf(fmt.Sprintf("parser err :%s", p.current.Value)) } restcode := p.lex.input[p.current.EndAt:] @@ -250,6 +254,7 @@ func (p *parser) parseTokendef() *TokenDef { p.backup() } id.Value = value + p.TokenDefMap[id.Name] = true Tokdef.IdentifyList = append(Tokdef.IdentifyList, id) } else if p.current.Is(Charater) { id := Idendity{ @@ -260,6 +265,7 @@ func (p *parser) parseTokendef() *TokenDef { IDTyp: TERMID, Alias: p.current.Value, } + p.TokenDefMap[id.Name] = true Tokdef.IdentifyList = append(Tokdef.IdentifyList, id) } else { break @@ -270,8 +276,16 @@ func (p *parser) parseTokendef() *TokenDef { return &Tokdef } -func (p *parser) parsePrecList() []PrecDef { +// The Same as token, +/* + +%left symbols… + +%left symbols… +*/ +func (p *parser) parsePrecList(Tklist *[]TokenDef) []PrecDef { var assocTy PrecAssocType + var Tokdef TokenDef var IdName string res := make([]PrecDef, 0) if p.current.Is(LeftAssoc) { @@ -282,13 +296,38 @@ func (p *parser) parsePrecList() []PrecDef { assocTy = NonAssocType } + Tag := "" + p.next() + // match < + if p.current.Is(LeftAngleBracket) { + // get Tag + p.next() + Tag = p.current.Value + p.next() + p.expect(RightAngleBracket) // match > + } + p.backup() for { p.next() if p.current.Is(Identifier) || p.current.Is(Charater) { // make loop get id or alias IdName = p.current.Value + idvalue := 0 if p.current.Is(Charater) { IdName = genTempName(IdName) + idvalue = int(p.current.Value[0]) + } + if !p.TokenDefMap[IdName] { + id := Idendity{ + Tag: Tag, + // noname need do for sepical. 
+ Name: IdName, + Value: idvalue, + IDTyp: TERMID, + Alias: "", + } + p.TokenDefMap[IdName] = true + Tokdef.IdentifyList = append(Tokdef.IdentifyList, id) } node := PrecDef{ IdName: IdName, @@ -300,6 +339,9 @@ func (p *parser) parsePrecList() []PrecDef { break } } + if len(Tokdef.IdentifyList) != 0 { + *Tklist = append(*Tklist, Tokdef) + } return res } @@ -339,7 +381,9 @@ func (p *parser) parseTypeList() []TypeDef { return TypedefList } -// startsymbol +// startsymbol indicatation the rules start from +// %start cmds it means cmds is the start symbols +// or else , the rules start symbol must be `start` func (p *parser) parseStartSymbol() string { p.next() if p.current.Is(Identifier) { @@ -357,7 +401,7 @@ func (p *parser) parseDeclare() Node { var TokDefList []TokenDef var PreDefList [][]PrecDef var TypeDefList []TypeDef - var StartSym string + var StartSym string = "start" p.next() for !(p.current.Is(EOF) || p.current.Is(Section)) { if p.current.Is(tokenError) { @@ -381,7 +425,7 @@ func (p *parser) parseDeclare() Node { p.current.Is(NoneAssoc) || // precDirective is just used to rules p.current.Is(Precedence) { - PreDefList = append(PreDefList, p.parsePrecList()) + PreDefList = append(PreDefList, p.parsePrecList(&TokDefList)) //Do not need call p.next continue } @@ -415,8 +459,9 @@ func (p *parser) parseDeclare() Node { ID RuleDefine {id/char %prec terminal-symbol |ActionQuote}* | RuleOR {id/char |ActionQuote}* */ -func (p *parser) parseRule() []RuleDef { +func (p *parser) parseRule(toklst *[]TokenDef) []RuleDef { var Leftpart string + var Tokdef TokenDef if p.current.Is(Identifier) { Leftpart = p.current.Value p.next() @@ -449,6 +494,18 @@ func (p *parser) parseRule() []RuleDef { ElemType: RightSyType, Element: genTempName(p.current.Value), }) + if !p.TokenDefMap[genTempName(p.current.Value)] { + id := Idendity{ + Tag: "", + // noname need do for sepical. 
+ Name: genTempName(p.current.Value), + Value: int(p.current.Value[0]), + IDTyp: TERMID, + Alias: "", + } + p.TokenDefMap[genTempName(p.current.Value)] = true + Tokdef.IdentifyList = append(Tokdef.IdentifyList, id) + } case Identifier: rightpart = append(rightpart, RightSymOrAction{ ElemType: RightSyType, @@ -475,10 +532,15 @@ func (p *parser) parseRule() []RuleDef { } default: res = append(res, rule) - return res + goto out + } rule.RightPart = rightpart p.next() } +out: + if len(Tokdef.IdentifyList) != 0 { + *toklst = append(*toklst, Tokdef) + } return res } diff --git a/Parser/Parser_test.go b/Parser/Parser_test.go index 383ebb1..b150136 100644 --- a/Parser/Parser_test.go +++ b/Parser/Parser_test.go @@ -132,3 +132,102 @@ expr3: root.LALR1 = lalr } } + +func TestParser2(t *testing.T) { + str := + `// Copyright 2013 The Go Authors. All rights reserved. + // Use of this source code is governed by a BSD-style + // license that can be found in the LICENSE file. + + // This is an example of a goyacc program. 
+ // To build it: + // goyacc -p "expr" expr.y (produces y.go) + // go build -o expr y.go + // expr + // > + + %{ + + package main + + import ( + "bufio" + "bytes" + "fmt" + "io" + "log" + "math/big" + "os" + "unicode/utf8" + ) + + %} + +%left A B +%left C +%start s +%% +s : A +` + if tr, err := Parse(str); err != nil { + t.Error(err) + } else { + // work in test + var node Node = tr + w := DoWalker(&node, &RootVistor{}) + lalr := w.BuildLALR1() + fmt.Println(lalr) + root := w.VistorNode.(*RootVistor) + root.LALR1 = lalr + } +} + +func TestParser3(t *testing.T) { + str := ` + %{ + package main + %} + + %union{ + String string + Expr expr + } + + + %token IDENTIFIER + %token NUMBER 100 + %type expr assignment + + %left '+' '-' + %left '*' '/' + %% + start: expr {yylex.(*interpreter).parseResult = &astRoot{$1}} + | assignment {yylex.(*interpreter).parseResult = $1} + + expr: + NUMBER {$$ = &number{$1} } + | IDENTIFIER { $$ = &variable{$1}} + | expr '+' expr { $$ = &binaryExpr{Op: '+', lhs: $1, rhs: $3} } + | expr '-' expr { $$ = &binaryExpr{Op: '-', lhs: $1, rhs: $3} } + | expr '*' expr { $$ = &binaryExpr{Op: '*', lhs: $1, rhs: $3} } + | expr '/' expr { $$ = &binaryExpr{Op: '/', lhs: $1, rhs: $3} } + | '(' expr ')' { $$ = &parenExpr{$2}} + | '-' expr %prec '*' { $$ = &unaryExpr{$2} } + + + assignment: + IDENTIFIER '=' expr {$$ = &assignment{$1, $3}} + %% +` + if tr, err := Parse(str); err != nil { + t.Error(err) + } else { + // work in test + var node Node = tr + w := DoWalker(&node, &RootVistor{}) + lalr := w.BuildLALR1() + fmt.Println(lalr) + root := w.VistorNode.(*RootVistor) + root.LALR1 = lalr + } +} diff --git a/Parser/Vistor.go b/Parser/Vistor.go index b536db6..759b08f 100644 --- a/Parser/Vistor.go +++ b/Parser/Vistor.go @@ -113,7 +113,7 @@ func (v *astDeclareVistor) Process(node *Node) { } } // 4. 
start symbol
-	if v.idsymtabl[n.StartSym] == nil {
+	if len(n.StartSym) != 0 && v.idsymtabl[n.StartSym] == nil {
 		//Append new name
 		id := &Idendity{
 			Name: n.StartSym,
@@ -122,9 +122,9 @@ func (v *astDeclareVistor) Process(node *Node) {
 			Value: 0,
 		}
 		v.idsymtabl[n.StartSym] = id
-
+		v.startSym = v.idsymtabl[n.StartSym]
 	}
-	v.startSym = v.idsymtabl[n.StartSym]
+
 	//set other value
 	v.code = n.CodeList
 	v.union = n.Union
diff --git a/proposal/202211.md b/proposal/202211.md
new file mode 100644
index 0000000..bb622e7
--- /dev/null
+++ b/proposal/202211.md
@@ -0,0 +1,30 @@
+# Proposal: a single char should be treated as a default token
+
+ 'char' anywhere in the file should be added to the tokens
+for example:
+```
+%type expr expr1 expr2 expr3
+
+%token '+' NUMBER
+...
+E : NUMBER
+E : E '+' E
+ | E '-' E
+```
+when the `rule` contains '+', '+' does not need to be defined in `%token`
+it can be used as follows:
+
+```
+%type expr expr1 expr2 expr3
+
+%token NUMBER
+...
+E : NUMBER
+E : E '+' E
+ | E '-' E
+```
+
+if you define ` %token NUMBER 43 ` and '+' ascii code is 43,
+it should report error `cannot have 2 different Ts with same value`
+
+# Proposal: ` %left/right X Y` should add `X Y` to tokens
\ No newline at end of file