feat/schema: add lexer

2026-01-26 19:26:24 -06:00
parent 0991f8d674
commit 2526d34f21
9 changed files with 2026 additions and 0 deletions
--- a/schema/azschema.bnf
+++ b/schema/azschema.bnf
@@ -0,0 +1,18 @@
 /* Lexical tokens of the azalea schema language.
    Productions whose names start with an underscore are helper definitions used by other
    productions; productions whose names start with an exclamation mark are matched and ignored. */
 /* string: any run of characters enclosed in backticks or in double quotes. */
 string: '`' {.} '`' | '"' {.} '"';
 /* Digit classes; each class extends the previous one. */
 _bin_digit: '0' - '1';
 _oct_digit: _bin_digit | '2' - '7';
 _dec_digit: _oct_digit | '8' - '9';
 _hex_digit: _dec_digit | 'A' - 'F' | 'a' - 'f';
 _negative: '-';
 /* number: optionally negative binary (0b), octal (0o), decimal or hexadecimal (0x) digits. */
 number: [_negative] '0' 'b' _bin_digit {_bin_digit}
    | [_negative] '0' 'o' _oct_digit {_oct_digit}
    | [_negative] _dec_digit {_dec_digit}
    | [_negative] '0' 'x' _hex_digit {_hex_digit};
 /* Characters that may start a name. The underscore is listed twice; the second entry is redundant. */
 _name_initial: 'A' - 'Z' | 'a' - 'z' | '_' | '~' | '!' | '@' | '#' | '$' | '%' | '^' | '&' | '*' | '-' | '_' | '+' | '=' | '?' | '/' | '.' | '\'';
 /* Characters that may follow the first character of a name: any initial character or a decimal digit. */
 _name_char: _name_initial | _dec_digit;
 name: _name_initial {_name_char};
 /* Ignored input: blanks, tabs and line breaks. */
 !whitespace: ' ' | '\t' | '\n' | '\r';
 /* Ignored input: a comment starts with a semicolon and ends with a newline. */
 !comment: ';' {.} '\n';
--- a/schema/lexer/acttab.go
+++ b/schema/lexer/acttab.go
@@ -0,0 +1,187 @@
 // Code generated by gocc; DO NOT EDIT.
 package lexer
 import (
 	"fmt"
 	"azalea/schema/token"
 )
 // ActionTable holds one ActionRow per lexer state, indexed by state number.
 type ActionTable [NumStates]ActionRow
 // ActionRow describes what the lexer does when it enters a state.
 type ActionRow struct {
 	// Accept is the token type recorded for the state; -1 marks a state that
 	// produces no token (see Ignore).
 	Accept token.Type
 	// Ignore names the ignored pattern (e.g. "!whitespace") matched by the
 	// state, or is empty when the state does not discard its match.
 	Ignore string
 }
 // String renders the row as "Accept=<type number>, Ignore=<pattern name>".
 func (a ActionRow) String() string {
 	const layout = "Accept=%d, Ignore=%s"
 	text := fmt.Sprintf(layout, a.Accept, a.Ignore)
 	return text
 }
 // ActTab is the generated action table: entry Sn describes state n.
 // Accept holds the token type recorded when the state is entered
 // (0 INVALID, 2 name, 3 number, 4 string, as listed in token.TokMap).
 // Rows with Accept -1 name an ignored pattern in Ignore; Lexer.Scan discards
 // the text matched so far and restarts when it enters such a state.
 var ActTab = ActionTable{
 	ActionRow{ // S0
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S1
 		Accept: -1,
 		Ignore: "!whitespace",
 	},
 	ActionRow{ // S2
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S3
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S4
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S5
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S6
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S7
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S8
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S9
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S10
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S11
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S12
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S13
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S14
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S15
 		Accept: 4,
 		Ignore: "",
 	},
 	ActionRow{ // S16
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S17
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S18
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S19
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S20
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S21
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S22
 		Accept: 0,
 		Ignore: "",
 	},
 	ActionRow{ // S23
 		Accept: -1,
 		Ignore: "!comment",
 	},
 	ActionRow{ // S24
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S25
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S26
 		Accept: 2,
 		Ignore: "",
 	},
 	ActionRow{ // S27
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S28
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S29
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S30
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S31
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S32
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S33
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S34
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S35
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S36
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S37
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S38
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S39
 		Accept: 3,
 		Ignore: "",
 	},
 	ActionRow{ // S40
 		Accept: 3,
 		Ignore: "",
 	},
 }
--- a/schema/lexer/lexer.go
+++ b/schema/lexer/lexer.go
@@ -0,0 +1,175 @@
 // Code generated by gocc; DO NOT EDIT.
 package lexer
 import (
 	"os"
 	"unicode/utf8"
 	"azalea/schema/token"
 )
 const (
 	// NoState is the value (-1) that denotes the absence of a state; Scan
 	// stops its loop when the current state takes this value.
 	NoState    = -1
 	// NumStates is the number of lexer states and the length of ActionTable.
 	NumStates  = 41
 	// NumSymbols is the number of entries in the lexer symbol listing at the
 	// end of this file (symbols 0 through 42).
 	NumSymbols = 43
 )
 // Lexer scans a byte slice and produces tokens one at a time via Scan.
 type Lexer struct {
 	// src is the complete input being scanned.
 	src     []byte
 	// pos is the byte offset in src of the next rune to read.
 	pos     int
 	// line and column are the position matching pos; NewLexer starts both at 1.
 	line    int
 	column  int
 	// Context is copied into the Pos of every token Scan returns; it may be nil.
 	Context token.Context
 }
 // NewLexer returns a Lexer that reads src from its first byte. The position
 // starts at line 1, column 1 and no Context is attached.
 func NewLexer(src []byte) *Lexer {
 	l := new(Lexer)
 	l.src = src
 	l.line, l.column = 1, 1
 	return l
 }
 // SourceContext is a simple instance of a token.Context which
 // contains the name of the source file.
 type SourceContext struct {
 	// Filepath is the path reported by Source; NewLexerFile sets it to the
 	// path it was asked to read.
 	Filepath string
 }
 // Source returns the file path stored in the context.
 func (s *SourceContext) Source() string {
 	return s.Filepath
 }
 // NewLexerFile reads the whole file at fpath and returns a Lexer over its
 // contents whose Context is a SourceContext naming that file. A read failure
 // is returned unchanged together with a nil Lexer.
 func NewLexerFile(fpath string) (*Lexer, error) {
 	data, readErr := os.ReadFile(fpath)
 	if readErr != nil {
 		return nil, readErr
 	}
 	fileLexer := NewLexer(data)
 	fileLexer.Context = &SourceContext{Filepath: fpath}
 	return fileLexer, nil
 }
 // Scan returns the next token from the input. At end of input it returns a
 // token of type token.EOF. Text that matches no token is returned as a token
 // of type token.INVALID. Every returned token carries the offset, line and
 // column where it starts, plus the lexer's Context.
 func (l *Lexer) Scan() (tok *token.Token) {
 	tok = &token.Token{}
 	// Nothing left to read: report EOF at the current position.
 	if l.pos >= len(l.src) {
 		tok.Type = token.EOF
 		tok.Pos.Offset, tok.Pos.Line, tok.Pos.Column = l.pos, l.line, l.column
 		tok.Pos.Context = l.Context
 		return
 	}
 	start, startLine, startColumn, end := l.pos, l.line, l.column, 0
 	tok.Type = token.INVALID
 	state, rune1, size := 0, rune(-1), 0
 	// Run the state machine until no transition is possible (state -1).
 	for state != -1 {
 		// Read the next rune, or use -1 once the input is exhausted.
 		if l.pos >= len(l.src) {
 			rune1 = -1
 		} else {
 			rune1, size = utf8.DecodeRune(l.src[l.pos:])
 			l.pos += size
 		}
 		nextState := -1
 		if rune1 != -1 {
 			// TransTab is defined elsewhere in the package.
 			nextState = TransTab[state](rune1)
 		}
 		state = nextState
 		if state != -1 {
 			// The rune was consumed: advance the line/column bookkeeping.
 			switch rune1 {
 			case '\n':
 				l.line++
 				l.column = 1
 			case '\r':
 				l.column = 1
 			case '\t':
 				// A tab advances the column by a fixed 4.
 				l.column += 4
 			default:
 				l.column++
 			}
 			switch {
 			case ActTab[state].Accept != -1:
 				// Record the token type of this state and how far the match reaches.
 				tok.Type = ActTab[state].Accept
 				end = l.pos
 			case ActTab[state].Ignore != "":
 				// Ignored pattern: drop the text matched so far and start over
 				// from the current position.
 				start, startLine, startColumn = l.pos, l.line, l.column
 				state = 0
 				if start >= len(l.src) {
 					tok.Type = token.EOF
 				}
 			}
 		} else {
 			// No transition. If nothing has been recognised, extend the match up
 			// to the current read position so the INVALID token covers the
 			// input read so far.
 			if tok.Type == token.INVALID {
 				end = l.pos
 			}
 		}
 	}
 	if end > start {
 		// Rewind to the end of the match; Lit is a sub-slice of src, not a copy.
 		l.pos = end
 		tok.Lit = l.src[start:end]
 	} else {
 		tok.Lit = []byte{}
 	}
 	tok.Pos.Offset, tok.Pos.Line, tok.Pos.Column = start, startLine, startColumn
 	tok.Pos.Context = l.Context
 	return
 }
 // Reset rewinds the lexer to the beginning of its input so the same source
 // can be scanned again. The line and column counters are restored to their
 // initial values (1, 1) together with the offset; otherwise tokens scanned
 // after a Reset would report positions continuing from the previous pass.
 func (l *Lexer) Reset() {
 	l.pos = 0
 	l.line = 1
 	l.column = 1
 }
 /*
 Lexer symbols:
 0: '`'
 1: '`'
 2: '"'
 3: '"'
 4: '0'
 5: 'b'
 6: '0'
 7: 'o'
 8: '0'
 9: 'x'
 10: '-'
 11: '_'
 12: '~'
 13: '!'
 14: '@'
 15: '#'
 16: '$'
 17: '%'
 18: '^'
 19: '&'
 20: '*'
 21: '-'
 22: '_'
 23: '+'
 24: '='
 25: '?'
 26: '/'
 27: '.'
 28: '''
 29: ' '
 30: '\t'
 31: '\n'
 32: '\r'
 33: ';'
 34: '\n'
 35: '0'-'1'
 36: '2'-'7'
 37: '8'-'9'
 38: 'A'-'F'
 39: 'a'-'f'
 40: 'A'-'Z'
 41: 'a'-'z'
 42: .
 */
--- a/schema/lexer/transitiontable.go
+++ b/schema/lexer/transitiontable.go
--- a/schema/main.go
+++ b/schema/main.go
@@ -1 +1,2 @@
 //go:generate gocc azschema.bnf
 package schema
--- a/schema/token/context.go
+++ b/schema/token/context.go
@@ -0,0 +1,14 @@
 // Code generated by gocc; DO NOT EDIT.
 package token
 // Context allows user-defined data to be associated with the
 // lexer/scanner and with each token that the lexer produces.
 type Context interface{}
 // Sourcer is a Context interface which presents a Source() method
 // identifying, e.g., the filename for the current code.
 type Sourcer interface {
 	// Source returns the name of the source being scanned.
 	Source() string
 }
--- a/schema/token/token.go
+++ b/schema/token/token.go
@@ -0,0 +1,153 @@
 // Code generated by gocc; DO NOT EDIT.
 package token
 import (
 	"bytes"
 	"fmt"
 	"strconv"
 	"unicode/utf8"
 )
 // Token is a single lexical token: its type, its literal bytes and the
 // position at which it starts.
 type Token struct {
 	// Type is the kind of token (see TokMap for the names of the values).
 	Type
 	// Lit holds the literal bytes of the token.
 	Lit []byte
 	// Pos is the position at which the token starts.
 	Pos
 }
 // Type identifies the kind of a token. TokMap translates between Type values
 // and their names.
 type Type int
 const (
 	// INVALID is the zero Type. TokenMap.Type returns it for unknown names.
 	INVALID Type = iota
 	// EOF is the Type with value 1; TokMap lists it under the name "␚".
 	EOF
 )
 // Pos describes a location in the scanned input.
 type Pos struct {
 	// Offset is the byte offset into the input.
 	Offset  int
 	// Line is the line number.
 	Line    int
 	// Column is the column number.
 	Column  int
 	// Context is optional user data; if it implements Sourcer, String uses
 	// its Source() as the file name.
 	Context Context
 }
 // String formats the position. When the Context implements Sourcer the
 // result is the human-readable "source:line:column"; otherwise it is a
 // "Pos(offset=..., line=..., column=...)" description.
 func (p Pos) String() string {
 	if named, ok := p.Context.(Sourcer); ok {
 		return fmt.Sprintf("%s:%d:%d", named.Source(), p.Line, p.Column)
 	}
 	return fmt.Sprintf("Pos(offset=%d, line=%d, column=%d)", p.Offset, p.Line, p.Column)
 }
 // TokenMap translates between token Type values and their names.
 type TokenMap struct {
 	// typeMap holds the name of each Type, indexed by the Type's value.
 	typeMap []string
 	// idMap is the reverse lookup from name to Type.
 	idMap   map[string]Type
 }
 // Id returns the name of the token type tok, or "unknown" when tok is not a
 // valid index into the map. Negative values are rejected as well as values
 // past the end, so a sentinel such as -1 yields "unknown" instead of an
 // index-out-of-range panic.
 func (m TokenMap) Id(tok Type) string {
 	if tok >= 0 && int(tok) < len(m.typeMap) {
 		return m.typeMap[tok]
 	}
 	return "unknown"
 }
 // Type returns the token type registered under the name tok, or INVALID when
 // the name is not known.
 func (m TokenMap) Type(tok string) Type {
 	typ, known := m.idMap[tok]
 	if !known {
 		return INVALID
 	}
 	return typ
 }
 // TokenString formats tok as "name(type,literal)", using Id for the name.
 func (m TokenMap) TokenString(tok *Token) string {
 	name := m.Id(tok.Type)
 	return fmt.Sprintf("%s(%d,%s)", name, tok.Type, tok.Lit)
 }
 // StringType formats typ as "name(value)", using Id for the name.
 func (m TokenMap) StringType(typ Type) string {
 	label := m.Id(typ)
 	return fmt.Sprintf("%s(%d)", label, typ)
 }
 // Equals reports whether rhs is a *Token whose Type and Lit match those of t.
 // Any other kind of value compares unequal. Two nil tokens are equal to each
 // other; a nil token never equals a non-nil one (the comparison does not
 // dereference a nil pointer).
 func (t *Token) Equals(rhs interface{}) bool {
 	other, ok := rhs.(*Token)
 	if !ok {
 		return false
 	}
 	// Same pointer (including both nil) is trivially equal.
 	if t == other {
 		return true
 	}
 	if t == nil || other == nil {
 		return false
 	}
 	return t.Type == other.Type && bytes.Equal(t.Lit, other.Lit)
 }
 // CharLiteralValue returns the string value of the char literal, i.e. the
 // literal with its first and last byte (the delimiters) removed. A literal
 // shorter than two bytes has no delimited content and yields "" rather than
 // a slice-bounds panic.
 func (t *Token) CharLiteralValue() string {
 	if len(t.Lit) < 2 {
 		return ""
 	}
 	return string(t.Lit[1 : len(t.Lit)-1])
 }
 // Float32Value parses the token literal as a 32-bit floating point number.
 // It returns 0 and the parse error if the literal does not denote a valid
 // float32.
 func (t *Token) Float32Value() (float32, error) {
 	wide, err := strconv.ParseFloat(string(t.Lit), 32)
 	if err != nil {
 		return 0, err
 	}
 	return float32(wide), nil
 }
 // Float64Value parses the token literal as a 64-bit floating point number
 // and returns the result of strconv.ParseFloat unchanged, including its
 // error when the literal does not denote a valid float64.
 func (t *Token) Float64Value() (float64, error) {
 	text := string(t.Lit)
 	return strconv.ParseFloat(text, 64)
 }
 // IDValue returns the string representation of an identifier token, i.e.
 // the token literal converted to a string without modification.
 func (t *Token) IDValue() string {
 	return string(t.Lit)
 }
 // Int32Value returns the int32 value of the token or an error if the token
 // literal does not denote a valid base-10 int32. The literal is parsed with
 // a 32-bit size, so values outside the int32 range produce a range error
 // instead of being silently truncated.
 func (t *Token) Int32Value() (int32, error) {
 	v, err := strconv.ParseInt(string(t.Lit), 10, 32)
 	if err != nil {
 		return 0, err
 	}
 	return int32(v), nil
 }
 // Int64Value returns the int64 value of the token or an error if the token
 // literal does not denote a valid int64.
 // NOTE(review): the literal is parsed as base 10 only, so literals carrying
 // a 0b, 0o or 0x prefix are reported as errors — confirm this is intended.
 func (t *Token) Int64Value() (int64, error) {
 	return strconv.ParseInt(string(t.Lit), 10, 64)
 }
 // UTF8Rune decodes the first UTF-8 rune in the token literal. It returns
 // utf8.RuneError and an error if the literal is empty or does not start with
 // a valid encoding. A correctly encoded U+FFFD (the replacement character)
 // is a valid rune and is returned without an error.
 func (t *Token) UTF8Rune() (rune, error) {
 	r, size := utf8.DecodeRune(t.Lit)
 	// DecodeRune reports failure as (RuneError, 0) for empty input and
 	// (RuneError, 1) for an invalid encoding; a real U+FFFD decodes with size 3.
 	if r == utf8.RuneError && size <= 1 {
 		err := fmt.Errorf("Invalid rune")
 		return r, err
 	}
 	return r, nil
 }
 // StringValue returns the string value of the token literal, i.e. the
 // literal with its first and last byte (the quote characters) removed. A
 // literal shorter than two bytes has no quoted content and yields "" rather
 // than a slice-bounds panic.
 func (t *Token) StringValue() string {
 	if len(t.Lit) < 2 {
 		return ""
 	}
 	return string(t.Lit[1 : len(t.Lit)-1])
 }
 // TokMap is the token map for this grammar. typeMap lists the name of each
 // token type by value and idMap is the matching reverse lookup; the two must
 // stay in step. "␚" is the name of type 1, the value of the EOF constant.
 var TokMap = TokenMap{
 	typeMap: []string{
 		"INVALID",
 		"␚",
 		"name",
 		"number",
 		"string",
 	},
 	idMap: map[string]Type{
 		"INVALID": 0,
 		"␚":       1,
 		"name":    2,
 		"number":  3,
 		"string":  4,
 	},
 }
--- a/schema/util/litconv.go
+++ b/schema/util/litconv.go
@@ -0,0 +1,101 @@
 // Code generated by gocc; DO NOT EDIT.
 package util
 import (
 	"fmt"
 	"strconv"
 	"unicode"
 	"unicode/utf8"
 )
 // Interface.
 // RuneValue will convert the literal value of a scanned token to a rune.
 // The literal is expected to be delimited by one byte on each side. If the
 // byte after the opening delimiter is a backslash the literal is decoded as
 // an escape sequence; otherwise the delimited content must be exactly one
 // UTF-8 encoded rune, and RuneValue panics when it is not.
 func RuneValue(lit []byte) rune {
 	if lit[1] == '\\' {
 		return escapeCharVal(lit)
 	}
 	content := lit[1:]
 	r, size := utf8.DecodeRune(content)
 	if expected := len(lit) - 2; size != expected {
 		panic(fmt.Sprintf("Error decoding rune. Lit: %s, rune: %d, size%d\n", lit, r, size))
 	}
 	return r
 }
 // IntValue will attempt to parse a byte-slice as a signed base-10 64-bit integer.
 func IntValue(lit []byte) (int64, error) {
 	text := string(lit)
 	return strconv.ParseInt(text, 10, 64)
 }
 // UintValue will attempt to parse a byte-slice as an unsigned base-10 64-bit integer.
 func UintValue(lit []byte) (uint64, error) {
 	text := string(lit)
 	return strconv.ParseUint(text, 10, 64)
 }
 // Helpers.
 // escapeCharVal decodes an escaped character literal. The escape selector is
 // read from lit[2] (RuneValue calls this when lit[1] is a backslash), and the
 // last byte of lit is treated as the closing delimiter. Single-character
 // escapes are returned directly; octal, \x, \u and \U escapes are
 // accumulated digit by digit. It panics on an unknown escape, on a digit that
 // is not valid for the base, and on a value that exceeds the escape's maximum
 // or falls in the surrogate range 0xD800-0xDFFF.
 func escapeCharVal(lit []byte) rune {
 	// i is the number of digits still expected, base the radix, and max the
 	// largest value the escape may denote.
 	var i, base, max uint32
 	offset := 2
 	switch lit[offset] {
 	case 'a':
 		return '\a'
 	case 'b':
 		return '\b'
 	case 'f':
 		return '\f'
 	case 'n':
 		return '\n'
 	case 'r':
 		return '\r'
 	case 't':
 		return '\t'
 	case 'v':
 		return '\v'
 	case '\\':
 		return '\\'
 	case '\'':
 		return '\''
 	case '0', '1', '2', '3', '4', '5', '6', '7':
 		// Octal escape: the selector itself is the first digit, so offset stays put.
 		i, base, max = 3, 8, 255
 	case 'x':
 		i, base, max = 2, 16, 255
 		// Skip the 'x'; digits start at the next byte.
 		offset++
 	case 'u':
 		i, base, max = 4, 16, unicode.MaxRune
 		offset++
 	case 'U':
 		i, base, max = 8, 16, unicode.MaxRune
 		offset++
 	default:
 		panic(fmt.Sprintf("Error decoding character literal: %s\n", lit))
 	}
 	var x uint32
 	// Read at most i digits, stopping before the final byte of lit.
 	// NOTE(review): if the literal holds fewer digits than expected the loop
 	// simply ends early and the partial value is returned — confirm this is
 	// acceptable.
 	for ; i > 0 && offset < len(lit)-1; i-- {
 		ch, size := utf8.DecodeRune(lit[offset:])
 		offset += size
 		d := uint32(digitVal(ch))
 		if d >= base {
 			panic(fmt.Sprintf("charVal(%s): illegal character (%c) in escape sequence. size=%d, offset=%d", lit, ch, size, offset))
 		}
 		x = x*base + d
 	}
 	if x > max || 0xD800 <= x && x < 0xE000 {
 		panic(fmt.Sprintf("Error decoding escape char value. Lit:%s, offset:%d, escape sequence is invalid Unicode code point\n", lit, offset))
 	}
 	return rune(x)
 }
 // digitVal returns the numeric value of the hexadecimal digit ch (0-9, a-f
 // or A-F). Any other rune yields 16, which is larger than every legal digit
 // value in the bases used by the caller.
 func digitVal(ch rune) int {
 	if ch >= '0' && ch <= '9' {
 		return int(ch - '0')
 	}
 	if ch >= 'a' && ch <= 'f' {
 		return int(ch-'a') + 10
 	}
 	if ch >= 'A' && ch <= 'F' {
 		return int(ch-'A') + 10
 	}
 	return 16
 }
--- a/schema/util/rune.go
+++ b/schema/util/rune.go
@@ -0,0 +1,39 @@
 // Code generated by gocc; DO NOT EDIT.
 package util
 import (
 	"fmt"
 )
 // RuneToString returns a printable representation of r.
 //
 // Control characters with a short escape, the backslash and the single quote
 // are rendered as quoted escapes ('\n', '\\', '\''). Other printable ASCII
 // characters are rendered in single quotes ('a'). Everything else is
 // rendered as an unquoted \uXXXX escape, or \UXXXXXXXX for values of 0x10000
 // and above.
 func RuneToString(r rune) string {
 	// The escapes are checked before the printable-ASCII range: backslash and
 	// single quote lie inside that range and would otherwise be emitted
 	// unescaped.
 	switch r {
 	case 0x07:
 		return "'\\a'"
 	case 0x08:
 		return "'\\b'"
 	case 0x0C:
 		return "'\\f'"
 	case 0x0A:
 		return "'\\n'"
 	case 0x0D:
 		return "'\\r'"
 	case 0x09:
 		return "'\\t'"
 	case 0x0b:
 		return "'\\v'"
 	case 0x5c:
 		return "'\\\\'"
 	case 0x27:
 		return "'\\''"
 	}
 	// A double quote needs no escape inside single quotes, so it is handled
 	// here together with the other printable characters.
 	if r >= 0x20 && r < 0x7f {
 		return fmt.Sprintf("'%c'", r)
 	}
 	if r < 0x10000 {
 		return fmt.Sprintf("\\u%04x", r)
 	}
 	return fmt.Sprintf("\\U%08x", r)
 }
`@@ -1 +1,2 @@`
		`//go:generate gocc azschema.bnf`
	`package schema`	`package schema`