Skip to content

Commit

Permalink
feat(fixer_v2): State Transition Machine for Metavariable Parser (#114)
Browse files Browse the repository at this point in the history
# Description

~~~mermaid
stateDiagram-v2
    direction LR
    
    GO --> CL: "COLON"
    CL --> OB: "["
    OB --> DB: "["
    OB --> NM: "ident"
    DB --> NM: "ident"
    
    %% Simple Pattern :[name]
    NM --> NM: "ident"
    NM --> CB: "]"
    CB --> OK: "COLON" or "whitespace"
    CB --> QB: "]"
    CB --> QT: "*+?"
    
    %% Type Pattern :[[name:type]]
    NM --> ID: "COLON"
    ID --> ID: "ident"
    ID --> CB: "]"
    
    %% Post Quantifier
    QB --> QT: "*+?"
    QB --> OK: "COLON" or "whitespace"
    QT --> OK: "COLON" or "whitespace"
    
    %% Text handling
    GO --> TX: "other"
    GO --> WS: "whitespace"
    GO --> BR: "{}"
    
    %% Whitespace loop
    WS --> WS: "whitespace"
    WS --> CL: "COLON"
    WS --> TX: "other"
    WS --> BR: "{}"
    
    %% Block handling
    BR --> BR: "{"
    BR --> OK: "}"
    BR --> CL: "COLON"
    BR --> WS: "whitespace"
    
    note right of GO
        Initial state
    end note
    
    note right of CL
        After colon
    end note
    
    note right of NM
        Reading name
    end note
    
    note right of ID
        Reading type
    end note
    
    note right of QT
        Reading quantifier
    end note
~~~

The initial implementation of the state transition machine, which will
replace the current naive lexer implementation. This is a drop-in
replacement for the existing lexer. No changes are required in the parser or
other components for now.
  • Loading branch information
notJoon authored Jan 19, 2025
1 parent c4f86e3 commit ac21b4c
Show file tree
Hide file tree
Showing 4 changed files with 319 additions and 1 deletion.
7 changes: 7 additions & 0 deletions .github/golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,10 @@ issues:
- path: _\.gno
linters:
- errorlint # Disabled linting of error comparisons, because of lacking std lib support

- path: fixer_v2/query/internal\.go$
linters:
- gofmt
- gofumpt
- goimports
- whitespace
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,6 @@ lint:
golangci-lint run

fmt:
go fmt ./...
find . -name "*.go" ! -path "./fixer_v2/query/internal.go" -exec go fmt {} \;

.PHONY: all build test clean run deps build-linux build-windows build-mac build-all install-linter lint
252 changes: 252 additions & 0 deletions fixer_v2/query/internal.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,252 @@
package query

import (
"fmt"
"strings"
)

/*
State Transition Machine Design Rationale
This lexer uses a state transition machine approach instead of a traditional
hand-coded switch-case lexer for several performance-critical reasons:
1. Branch Prediction Optimization
- Traditional lexers with multiple if/switch statements suffer from branch
misprediction penalties. Each token type check causes a branch, and modern
CPUs struggle to predict these branches effectively.
- State machine approach consolidates branching into a single, predictable
loop with table-driven transitions, reducing branch mispredictions.
2. Token Processing Efficiency
- The state machine processes input character-by-character in a tight loop
using lookup tables, rather than repeatedly examining characters with
conditional logic.
- Token length tracking is integrated into the state machine loop via the
'in_token' table, eliminating the need for separate length calculations.
3. Memory Access Patterns
- The transition table, while larger than hand-coded logic, provides more
predictable memory access patterns that modern CPU caches can handle efficiently.
- Character class equivalence is used to reduce the transition table size
while maintaining performance (e.g., most alphabetic characters behave similarly).
4. Unified Whitespace and Token Processing
- The state machine handles both whitespace skipping and token recognition
in the same loop, eliminating additional branch mispredictions that would
occur when switching between these modes.
5. Extensibility and Maintainability
- Adding new token types only requires updating the transition table rather
than modifying complex branching logic.
- The state machine structure makes it easier to verify and maintain the lexer's
behavior compared to nested conditional logic.
Implementation Notes:
1. States are arranged so that final states have lower numbers, allowing for a single
comparison to detect when token recognition is complete.
2. The transition table is structured for efficient CPU cache usage by minimizing
the table size through character equivalence classes.
3. The design supports both simple tokens (like operators) and complex tokens
(like identifiers) while maintaining consistent performance characteristics.
Reference:
[1] https://nothings.org/computer/lexing.html
*/

// States identifies one state of the metavariable lexer's state machine.
// It is used as the row index of the transition table.
type States int8

// Classes identifies the equivalence class of an input character.
// It is used as the column index of the transition table.
type Classes int8

// Lexer states, numbered consecutively from zero so that each value can be
// used directly as a row index into the transition table.
//
// The order matters: GO and OK occupy the two lowest values, which lets a
// single `state <= OK` comparison recognise the final states.
//
//	GO (0)  - initial state, ready to start processing input
//	OK (1)  - accept state, token successfully recognized
//	CL (2)  - a colon was seen; a bracket or identifier is expected
//	OB (3)  - first opening bracket seen; a second one may follow
//	DB (4)  - double opening bracket seen; an identifier is expected
//	NM (5)  - reading the name part of a metavariable
//	ID (6)  - reading the type identifier (after the colon inside a name)
//	CB (7)  - first closing bracket seen
//	QB (8)  - second closing bracket seen
//	QT (9)  - processing a quantifier (*, +, ?)
//	TX (10) - processing regular text
//	WS (11) - processing whitespace
//	BR (12) - processing block delimiters ({, })
const (
	GO = States(iota) // initial
	OK                // accept (successful parse)
	CL                // after ':'
	OB                // after '['
	DB                // after '[['
	NM                // inside a name
	ID                // inside a type identifier
	CB                // after ']'
	QB                // after ']]'
	QT                // inside a quantifier (*, +, ?)
	TX                // inside plain text
	WS                // inside whitespace
	BR                // inside a block ({, })
)

// Character classes, numbered consecutively from zero so that each value can
// be used directly as a column index into the transition table.
const (
	C_COLON  = Classes(iota) // ':'
	C_LBRACK                 // '['
	C_RBRACK                 // ']'
	C_LBRACE                 // '{'
	C_RBRACE                 // '}'
	C_SPACE                  // whitespace, as decided by isWhitespace
	C_IDENT                  // identifier characters (alphanumeric, '_', '-')
	C_QUANT                  // quantifiers ('*', '+', '?')
	C_OTHER                  // everything else
)

// StateTransitionTable maps (current state, character class) to the next
// state. Rows are keyed by state constant and columns are ordered by
// character class value (C_COLON through C_OTHER).
//
// Points worth knowing when reading the rows:
//  1. In NM, a colon switches to ID so that a type specification can follow.
//  2. CB and QB accept whitespace (moving to WS) for better error recovery.
//  3. After a quantifier (QT) any valid pattern start may follow.
//  4. TX (text) can transition back into pattern parsing.
var StateTransitionTable = [13][9]States{
	//   COLON  LBRACK RBRACK LBRACE RBRACE SPACE  IDENT  QUANT  OTHER
	GO: {CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX},
	OK: {CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX},
	CL: {TX,    OB,    TX,    TX,    TX,    TX,    ID,    TX,    TX},
	OB: {TX,    DB,    TX,    TX,    TX,    TX,    NM,    TX,    TX},
	DB: {TX,    TX,    TX,    TX,    TX,    TX,    NM,    TX,    TX},
	NM: {ID,    TX,    CB,    TX,    TX,    TX,    NM,    TX,    TX}, // colon starts the type identifier
	ID: {TX,    TX,    CB,    TX,    TX,    TX,    ID,    TX,    TX},
	CB: {OK,    TX,    QB,    TX,    TX,    WS,    TX,    QT,    TX}, // whitespace accepted for error recovery
	QB: {OK,    TX,    TX,    TX,    TX,    WS,    TX,    QT,    TX}, // whitespace accepted for error recovery
	QT: {CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX},
	TX: {CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX},
	WS: {CL,    TX,    TX,    BR,    BR,    WS,    TX,    TX,    TX},
	BR: {CL,    TX,    TX,    BR,    OK,    WS,    TX,    TX,    TX},
}

// String returns the upper-case name of the character class, or "UNKNOWN"
// for a value that is not one of the declared classes.
func (c Classes) String() string {
	// Indexed by class constant so each name stays tied to its class.
	names := [...]string{
		C_COLON:  "COLON",
		C_LBRACK: "LBRACK",
		C_RBRACK: "RBRACK",
		C_LBRACE: "LBRACE",
		C_RBRACE: "RBRACE",
		C_SPACE:  "SPACE",
		C_IDENT:  "IDENT",
		C_QUANT:  "QUANT",
		C_OTHER:  "OTHER",
	}

	// Classes is a signed type, so both ends of the range must be checked.
	if c < 0 || int(c) >= len(names) {
		return "UNKNOWN"
	}
	return names[c]
}

// StateMachine holds what is needed to walk an input pattern through
// StateTransitionTable: the current state, the pattern text, and a cursor
// into that text.
type StateMachine struct {
state States // Current state; NewStateMachine starts it at GO
input string // Input pattern to parse
position int // Byte index into input of the next character to process
}

// NewStateMachine returns a state machine positioned at the start of input
// and in the initial state GO.
func NewStateMachine(input string) *StateMachine {
	sm := new(StateMachine)
	sm.input = input
	sm.state = GO
	sm.position = 0
	return sm
}

// Transition records a single step of the state machine: the input byte
// that was read, the state before the step, the character class assigned to
// that byte, and the state after the step.
type Transition struct {
char byte // Input byte that triggered the step
fromState States // State before the byte was processed
class Classes // Character class of char
toState States // State after the byte was processed
}

// recordTransitions runs the state machine over the rest of the input,
// starting at sm.position, and returns one Transition per input byte in the
// order the bytes were consumed.
//
// The input is processed byte by byte (not rune by rune). On return,
// sm.position equals len(sm.input) and sm.state is the state reached after
// the last byte. If no input remains, a nil slice is returned and the
// machine is left untouched.
func (sm *StateMachine) recordTransitions() []Transition {
	remaining := len(sm.input) - sm.position
	if remaining <= 0 {
		return nil
	}

	// Exactly one transition is appended per remaining byte, so the final
	// length is known up front and the slice never has to grow.
	transitions := make([]Transition, 0, remaining)

	for ; sm.position < len(sm.input); sm.position++ {
		c := sm.input[sm.position]
		class := getCharacterClass(c)
		from := sm.state
		to := StateTransitionTable[from][class]

		transitions = append(transitions, Transition{
			char:      c,
			fromState: from,
			class:     class,
			toState:   to,
		})

		sm.state = to
	}

	return transitions
}

// visualizeTransitions renders the transitions as text, one per line, in the
// form "<char>: <from> -<class>-> <to>". It is intended for debugging and
// test output.
func visualizeTransitions(transitions []Transition) string {
	var out strings.Builder
	for i := range transitions {
		step := &transitions[i]
		fmt.Fprintf(
			&out,
			"%c: %v -%v-> %v\n",
			step.char,
			step.fromState,
			step.class,
			step.toState,
		)
	}
	return out.String()
}

// getCharacterClass maps an input byte to its character class.
//
// Cases are tried in a fixed order: the punctuation that drives the state
// machine first, then whitespace, then identifier characters. Anything that
// matches none of them is classified as C_OTHER.
func getCharacterClass(c byte) Classes {
	switch {
	case c == ':':
		return C_COLON
	case c == '[':
		return C_LBRACK
	case c == ']':
		return C_RBRACK
	case c == '{':
		return C_LBRACE
	case c == '}':
		return C_RBRACE
	case c == '*' || c == '+' || c == '?':
		return C_QUANT
	case isWhitespace(c):
		return C_SPACE
	case isIdentChar(c):
		return C_IDENT
	default:
		return C_OTHER
	}
}

// isIdentChar reports whether c may appear in an identifier: an ASCII
// letter, an ASCII digit, an underscore, or a hyphen (Comby syntax allows
// hyphens in identifiers).
func isIdentChar(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z',
		c >= 'A' && c <= 'Z',
		c >= '0' && c <= '9':
		return true
	case c == '_', c == '-':
		return true
	}
	return false
}
59 changes: 59 additions & 0 deletions fixer_v2/query/internal_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package query

import (
"reflect"
"testing"
)

// TestSpecialPatterns runs metavariable patterns through the state machine
// and checks that the sequence of states entered (one per input byte)
// matches the expected sequence.
func TestSpecialPatterns(t *testing.T) {
	patterns := []struct {
		input    string
		desc     string
		expected []States
	}{
		{
			input: ":[[var:identifier]]*",
			desc:  "Identifier with zero-or-more",
			expected: []States{
				CL, OB, DB,
				NM, NM, NM,
				ID, ID, ID,
				ID, ID, ID,
				ID, ID, ID,
				ID, ID, CB,
				QB, QT,
			},
		},
		{
			input: ":[var] :[next]",
			desc:  "Multiple holes",
			expected: []States{
				CL, OB, NM,
				NM, NM, CB,
				WS, CL, OB,
				NM, NM, NM,
				NM, CB,
			},
		},
	}

	for _, p := range patterns {
		t.Run(p.desc, func(t *testing.T) {
			sm := NewStateMachine(p.input)
			transitions := sm.recordTransitions()

			states := make([]States, len(transitions))
			for i, tr := range transitions {
				states[i] = tr.toState
			}

			// Log output is shown on failure (and with -v), so the trace
			// is logged once here rather than again in the failure branch.
			t.Logf("\n Input: %s", p.input)
			t.Logf("\nTransitions:\n%s", visualizeTransitions(transitions))

			if !reflect.DeepEqual(states, p.expected) {
				t.Errorf("\nGot: %v\nWant: %v", states, p.expected)
			}
		})
	}
}

0 comments on commit ac21b4c

Please sign in to comment.