feat/schema: add lexer

This commit is contained in:
mae
2026-01-26 19:26:24 -06:00
parent 0991f8d674
commit 2526d34f21
9 changed files with 2026 additions and 0 deletions

18
schema/azschema.bnf Normal file
View File

@@ -0,0 +1,18 @@
// Lexical grammar for the azschema format, compiled by gocc.
// Conventions: productions prefixed with '_' are lexical fragments (not
// emitted as tokens); productions prefixed with '!' are scanned but
// discarded by the lexer (the generated action table marks them as
// "!whitespace" / "!comment" ignore entries).

// Strings are delimited by backticks or double quotes; {.} repeats "any character".
string: '`' {.} '`' | '"' {.} '"';

// Digit fragments build on each other: bin ⊂ oct ⊂ dec ⊂ hex.
_bin_digit: '0' - '1';
_oct_digit: _bin_digit | '2' - '7';
_dec_digit: _oct_digit | '8' - '9';
_hex_digit: _dec_digit | 'A' - 'F' | 'a' - 'f';
_negative: '-';

// Numbers: optional leading '-', with 0b / 0o / 0x radix prefixes or plain decimal.
number: [_negative] '0' 'b' _bin_digit {_bin_digit}
| [_negative] '0' 'o' _oct_digit {_oct_digit}
| [_negative] _dec_digit {_dec_digit}
| [_negative] '0' 'x' _hex_digit {_hex_digit};

// NOTE(review): '-' and '_' each appear twice in _name_initial; the
// duplicates are harmless but could be removed on a future regeneration.
_name_initial: 'A' - 'Z' | 'a' - 'z' | '_' | '~' | '!' | '@' | '#' | '$' | '%' | '^' | '&' | '*' | '-' | '_' | '+' | '=' | '?' | '/' | '.' | '\'';
_name_char: _name_initial | _dec_digit;
name: _name_initial {_name_char};

// Whitespace and ';'-to-newline comments are consumed and dropped.
!whitespace: ' ' | '\t' | '\n' | '\r';
!comment: ';' {.} '\n';

187
schema/lexer/acttab.go Normal file
View File

@@ -0,0 +1,187 @@
// Code generated by gocc; DO NOT EDIT.
package lexer
import (
"fmt"
"azalea/schema/token"
)
// ActionTable maps each of the lexer's DFA states to its ActionRow,
// indexed by state number.
type ActionTable [NumStates]ActionRow

// ActionRow records what the lexer does when a scan halts in a state:
// Accept is the token.Type emitted on accept (0 means the state is not
// accepting; -1 means the match is an ignored pattern, named by Ignore,
// e.g. "!whitespace").
type ActionRow struct {
	Accept token.Type
	Ignore string
}
// String renders the row in a compact form for debugging/tracing output.
func (a ActionRow) String() string {
	accept, ignore := a.Accept, a.Ignore
	return fmt.Sprintf("Accept=%d, Ignore=%s", accept, ignore)
}
// ActTab is the generated accept/ignore table for the 41 DFA states.
// Accept values correspond to token.TokMap: 2 = name, 3 = number,
// 4 = string; 0 marks non-accepting states and -1 marks states that
// match an ignored pattern (whitespace or comments).
var ActTab = ActionTable{
	ActionRow{ // S0
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S1  (matched "!whitespace" — token is discarded)
		Accept: -1,
		Ignore: "!whitespace",
	},
	ActionRow{ // S2
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S3
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S4
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S5
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S6
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S7
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S8
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S9
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S10
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S11
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S12
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S13
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S14
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S15  (accepts "string")
		Accept: 4,
		Ignore: "",
	},
	ActionRow{ // S16
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S17
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S18
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S19
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S20
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S21
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S22
		Accept: 0,
		Ignore: "",
	},
	ActionRow{ // S23  (matched "!comment" — token is discarded)
		Accept: -1,
		Ignore: "!comment",
	},
	ActionRow{ // S24
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S25
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S26
		Accept: 2,
		Ignore: "",
	},
	ActionRow{ // S27
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S28
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S29
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S30
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S31
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S32
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S33
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S34
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S35
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S36
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S37
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S38
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S39
		Accept: 3,
		Ignore: "",
	},
	ActionRow{ // S40
		Accept: 3,
		Ignore: "",
	},
}

175
schema/lexer/lexer.go Normal file
View File

@@ -0,0 +1,175 @@
// Code generated by gocc; DO NOT EDIT.
package lexer
import (
"os"
"unicode/utf8"
"azalea/schema/token"
)
// Dimensions of the generated scanner.
const (
	// NoState marks a dead DFA state (no transition possible).
	NoState = -1
	// NumStates is the number of DFA states in ActTab / TransTab.
	NumStates = 41
	// NumSymbols is the number of input symbol classes (enumerated in
	// the "Lexer symbols" listing at the end of this file).
	NumSymbols = 43
)
// Lexer is the gocc-generated scanner for the azschema grammar. It walks
// src rune by rune, tracking line and column so every token carries an
// accurate source position.
type Lexer struct {
	src     []byte // full input being scanned
	pos     int    // byte offset of the next rune to read
	line    int    // 1-based line number of pos
	column  int    // 1-based column of pos (a tab advances it by 4 — see Scan)
	Context token.Context // attached to every emitted token's Pos; may be nil
}
// NewLexer returns a Lexer over src, positioned at the start of the
// input (line 1, column 1) with no Context attached.
func NewLexer(src []byte) *Lexer {
	return &Lexer{
		src:     src,
		pos:     0,
		line:    1,
		column:  1,
		Context: nil,
	}
}
// SourceContext is a minimal token.Context implementation that records
// the path of the file being lexed.
type SourceContext struct {
	Filepath string
}

// Source returns the recorded file path, satisfying token.Sourcer so
// token positions can print as "file:line:column".
func (s *SourceContext) Source() string {
	path := s.Filepath
	return path
}
// NewLexerFile reads the file at fpath into memory and returns a Lexer
// over its contents. The returned lexer's Context names the file, so
// token positions report "fpath:line:column". The read error, if any,
// is returned unchanged.
func NewLexerFile(fpath string) (*Lexer, error) {
	contents, err := os.ReadFile(fpath)
	if err != nil {
		return nil, err
	}
	l := NewLexer(contents)
	l.Context = &SourceContext{Filepath: fpath}
	return l, nil
}
// Scan returns the next token using maximal munch: it runs the DFA until
// no transition applies, then backtracks to the last accepting position.
// Matches of ignored patterns (whitespace, comments) restart the scan in
// place. At end of input it returns a token.EOF token; unrecognized
// input yields token.INVALID with the offending bytes as the literal.
func (l *Lexer) Scan() (tok *token.Token) {
	tok = &token.Token{}
	// Fast path: already at end of input.
	if l.pos >= len(l.src) {
		tok.Type = token.EOF
		tok.Pos.Offset, tok.Pos.Line, tok.Pos.Column = l.pos, l.line, l.column
		tok.Pos.Context = l.Context
		return
	}
	// start/startLine/startColumn mark where the current lexeme begins;
	// end records the position just past the last accepted match (0 = none).
	start, startLine, startColumn, end := l.pos, l.line, l.column, 0
	tok.Type = token.INVALID
	state, rune1, size := 0, rune(-1), 0
	for state != -1 {
		// Read the next rune; rune1 == -1 signals end of input.
		if l.pos >= len(l.src) {
			rune1 = -1
		} else {
			rune1, size = utf8.DecodeRune(l.src[l.pos:])
			l.pos += size
		}
		nextState := -1
		if rune1 != -1 {
			nextState = TransTab[state](rune1)
		}
		state = nextState
		if state != -1 {
			// Update line/column bookkeeping for the consumed rune.
			// A tab is counted as 4 columns; '\r' rewinds to column 1.
			switch rune1 {
			case '\n':
				l.line++
				l.column = 1
			case '\r':
				l.column = 1
			case '\t':
				l.column += 4
			default:
				l.column++
			}
			switch {
			case ActTab[state].Accept != -1:
				// Accepting (or Accept==0 INVALID) state: remember how far
				// the match extends so we can backtrack to it later.
				tok.Type = ActTab[state].Accept
				end = l.pos
			case ActTab[state].Ignore != "":
				// Matched an ignored pattern (!whitespace / !comment):
				// drop it and restart the DFA at the current position.
				start, startLine, startColumn = l.pos, l.line, l.column
				state = 0
				if start >= len(l.src) {
					tok.Type = token.EOF
				}
			}
		} else {
			// Dead state: if nothing was ever accepted, keep the bytes
			// read so far as the INVALID token's literal.
			if tok.Type == token.INVALID {
				end = l.pos
			}
		}
	}
	if end > start {
		// Backtrack to just past the last accepted match.
		l.pos = end
		tok.Lit = l.src[start:end]
	} else {
		tok.Lit = []byte{}
	}
	tok.Pos.Offset, tok.Pos.Line, tok.Pos.Column = start, startLine, startColumn
	tok.Pos.Context = l.Context
	return
}
// Reset rewinds the lexer to the beginning of its source so the input
// can be scanned again from scratch.
func (l *Lexer) Reset() {
	l.pos = 0
	// Bug fix: the generated Reset only rewound pos, leaving line/column
	// at their end-of-input values, so every token after a Reset carried
	// wrong positions. Restore them to the NewLexer starting state.
	l.line = 1
	l.column = 1
}
/*
Lexer symbols:
0: '`'
1: '`'
2: '"'
3: '"'
4: '0'
5: 'b'
6: '0'
7: 'o'
8: '0'
9: 'x'
10: '-'
11: '_'
12: '~'
13: '!'
14: '@'
15: '#'
16: '$'
17: '%'
18: '^'
19: '&'
20: '*'
21: '-'
22: '_'
23: '+'
24: '='
25: '?'
26: '/'
27: '.'
28: '''
29: ' '
30: '\t'
31: '\n'
32: '\r'
33: ';'
34: '\n'
35: '0'-'1'
36: '2'-'7'
37: '8'-'9'
38: 'A'-'F'
39: 'a'-'f'
40: 'A'-'Z'
41: 'a'-'z'
42: .
*/

File diff suppressed because it is too large Load Diff

View File

@@ -1 +1,2 @@
//go:generate gocc azschema.bnf

// Package schema contains the azschema grammar definition; the lexer and
// token support code in its subdirectories is generated from
// azschema.bnf by gocc (run `go generate`).
package schema

14
schema/token/context.go Normal file
View File

@@ -0,0 +1,14 @@
// Code generated by gocc; DO NOT EDIT.
package token
// Context carries arbitrary user-defined data from the lexer/scanner to
// every token it produces (via Pos.Context).
type Context interface{}

// Sourcer is a Context that can name its origin, e.g. the path of the
// file currently being lexed; Pos.String uses it for "file:line:column"
// formatting.
type Sourcer interface {
	Source() string
}

153
schema/token/token.go Normal file
View File

@@ -0,0 +1,153 @@
// Code generated by gocc; DO NOT EDIT.
package token
import (
"bytes"
"fmt"
"strconv"
"unicode/utf8"
)
// Token is a single lexeme produced by the lexer: its Type, the raw
// literal bytes, and its source Pos.
type Token struct {
	Type
	Lit []byte
	Pos
}

// Type identifies a token class. Values beyond EOF are grammar-specific;
// see TokMap for the registered names.
type Type int

const (
	INVALID Type = iota // 0: unrecognized input
	EOF                 // 1: end of input ("␚" in TokMap)
)

// Pos is a source location. Context optionally carries user data such as
// the name of the file being lexed (see Sourcer).
type Pos struct {
	Offset  int // byte offset from the start of the input
	Line    int // 1-based
	Column  int // 1-based
	Context Context
}
// String formats the position as "file:line:column" when the Context can
// name its source, and as an explicit Pos(...) dump otherwise.
func (p Pos) String() string {
	if src, ok := p.Context.(Sourcer); ok {
		return fmt.Sprintf("%s:%d:%d", src.Source(), p.Line, p.Column)
	}
	return fmt.Sprintf("Pos(offset=%d, line=%d, column=%d)", p.Offset, p.Line, p.Column)
}
// TokenMap translates between numeric token Types and their string names.
type TokenMap struct {
	typeMap []string        // index = Type, value = name
	idMap   map[string]Type // name -> Type
}

// Id returns the name registered for tok, or "unknown" when tok is out
// of range.
func (m TokenMap) Id(tok Type) string {
	if int(tok) >= len(m.typeMap) {
		return "unknown"
	}
	return m.typeMap[tok]
}

// Type returns the Type registered for the given name, or INVALID when
// the name is unknown.
func (m TokenMap) Type(tok string) Type {
	typ, ok := m.idMap[tok]
	if !ok {
		return INVALID
	}
	return typ
}

// TokenString renders tok as "name(id,literal)" for diagnostics.
func (m TokenMap) TokenString(tok *Token) string {
	name := m.Id(tok.Type)
	return fmt.Sprintf("%s(%d,%s)", name, tok.Type, tok.Lit)
}

// StringType renders typ as "name(id)".
func (m TokenMap) StringType(typ Type) string {
	name := m.Id(typ)
	return fmt.Sprintf("%s(%d)", name, typ)
}
// Equals reports whether rhs is a *Token with the same Type and Lit (or
// the identical pointer). Any other argument type compares unequal.
func (t *Token) Equals(rhs interface{}) bool {
	other, ok := rhs.(*Token)
	if !ok {
		return false
	}
	if t == other {
		return true
	}
	return t.Type == other.Type && bytes.Equal(t.Lit, other.Lit)
}
// CharLiteralValue returns the contents of the char literal, i.e. the
// token text with its first and last bytes (the quotes) stripped.
func (t *Token) CharLiteralValue() string {
	inner := t.Lit[1 : len(t.Lit)-1]
	return string(inner)
}
// Float32Value returns the float32 value of the token or an error if the
// token literal does not denote a valid float32.
func (t *Token) Float32Value() (float32, error) {
	// Restructured to keep the happy path unindented instead of
	// returning from an else branch.
	v, err := strconv.ParseFloat(string(t.Lit), 32)
	if err != nil {
		return 0, err
	}
	return float32(v), nil
}
// Float64Value returns the float64 value of the token or an error if the
// token literal does not denote a valid float64.
func (t *Token) Float64Value() (float64, error) {
	return strconv.ParseFloat(string(t.Lit), 64)
}

// IDValue returns the string representation of an identifier token
// (the literal bytes, unmodified).
func (t *Token) IDValue() string {
	return string(t.Lit)
}
// Int32Value returns the int32 value of the token or an error if the
// token literal does not denote a valid int32.
func (t *Token) Int32Value() (int32, error) {
	// Bug fix: the generated code parsed with bitSize 64 and then
	// converted with int32(v), silently truncating out-of-range values.
	// Parsing with bitSize 32 makes ParseInt return a range error instead.
	v, err := strconv.ParseInt(string(t.Lit), 10, 32)
	if err != nil {
		return 0, err
	}
	return int32(v), nil
}
// Int64Value returns the int64 value of the token or an error if the
// token literal does not denote a valid int64.
// (The original comment said "float64" — a copy-paste error.)
func (t *Token) Int64Value() (int64, error) {
	return strconv.ParseInt(string(t.Lit), 10, 64)
}
// UTF8Rune decodes the first UTF8 rune in the token literal. It returns
// an error if the literal does not begin with a valid rune.
func (t *Token) UTF8Rune() (rune, error) {
	r, size := utf8.DecodeRune(t.Lit)
	// Bug fix: DecodeRune reports failure as (RuneError, 0) or
	// (RuneError, 1), but a literal U+FFFD in the input decodes as
	// (RuneError, 3) and is perfectly valid. The original code rejected
	// that case; checking the size distinguishes the two.
	if r == utf8.RuneError && size <= 1 {
		err := fmt.Errorf("Invalid rune")
		return r, err
	}
	return r, nil
}
// StringValue returns the contents of the string literal: the token text
// with the surrounding quote bytes removed.
func (t *Token) StringValue() string {
	body := t.Lit[1 : len(t.Lit)-1]
	return string(body)
}
// TokMap registers the token types of the azschema grammar. Index 1
// ("␚") is the end-of-input marker; ids 2-4 (name, number, string) match
// the Accept values used by the generated lexer action table.
var TokMap = TokenMap{
	typeMap: []string{
		"INVALID",
		"␚",
		"name",
		"number",
		"string",
	},
	idMap: map[string]Type{
		"INVALID": 0,
		"␚":       1,
		"name":    2,
		"number":  3,
		"string":  4,
	},
}

101
schema/util/litconv.go Normal file
View File

@@ -0,0 +1,101 @@
// Code generated by gocc; DO NOT EDIT.
package util
import (
"fmt"
"strconv"
"unicode"
"unicode/utf8"
)
// Interface.
// RuneValue will convert the literal value of a scanned token to a rune.
// lit must include its surrounding quotes (e.g. `'a'` or `'\n'`); escape
// sequences are resolved by escapeCharVal. Panics if the unquoted text
// is not exactly one rune.
func RuneValue(lit []byte) rune {
	// lit[0] is the opening quote, so an escape sequence starts at lit[1].
	if lit[1] == '\\' {
		return escapeCharVal(lit)
	}
	// Plain literal: decode the single rune between the quotes and check
	// that it accounts for all the bytes inside them.
	r, size := utf8.DecodeRune(lit[1:])
	if size != len(lit)-2 {
		panic(fmt.Sprintf("Error decoding rune. Lit: %s, rune: %d, size%d\n", lit, r, size))
	}
	return r
}
// IntValue will attempt to parse a byte-slice as a signed base-10 64-bit
// integer. (The generated comment misnamed this function "UintValue".)
func IntValue(lit []byte) (int64, error) {
	s := string(lit)
	return strconv.ParseInt(s, 10, 64)
}
// UintValue will attempt to parse a byte-slice as an unsigned base-10
// 64-bit integer.
func UintValue(lit []byte) (uint64, error) {
	s := string(lit)
	return strconv.ParseUint(s, 10, 64)
}
// Helpers.

// escapeCharVal decodes the escape sequence in a quoted character
// literal (lit[1] == '\\') and returns the rune it denotes. Supported
// forms: single-char escapes (\a \b \f \n \r \t \v \\ \'), octal \nnn,
// and hex \xhh, \uhhhh, \Uhhhhhhhh. Panics on any malformed sequence or
// on a value outside the valid Unicode range.
func escapeCharVal(lit []byte) rune {
	// i = number of digits to read, base = radix, max = largest legal value.
	var i, base, max uint32
	// offset starts at 2: past the opening quote and the backslash.
	offset := 2
	switch lit[offset] {
	case 'a':
		return '\a'
	case 'b':
		return '\b'
	case 'f':
		return '\f'
	case 'n':
		return '\n'
	case 'r':
		return '\r'
	case 't':
		return '\t'
	case 'v':
		return '\v'
	case '\\':
		return '\\'
	case '\'':
		return '\''
	case '0', '1', '2', '3', '4', '5', '6', '7':
		// Octal: the first digit is part of the value, so offset stays put.
		i, base, max = 3, 8, 255
	case 'x':
		i, base, max = 2, 16, 255
		offset++
	case 'u':
		i, base, max = 4, 16, unicode.MaxRune
		offset++
	case 'U':
		i, base, max = 8, 16, unicode.MaxRune
		offset++
	default:
		panic(fmt.Sprintf("Error decoding character literal: %s\n", lit))
	}
	// Accumulate up to i digits, stopping before the closing quote.
	var x uint32
	for ; i > 0 && offset < len(lit)-1; i-- {
		ch, size := utf8.DecodeRune(lit[offset:])
		offset += size
		d := uint32(digitVal(ch))
		if d >= base {
			panic(fmt.Sprintf("charVal(%s): illegal character (%c) in escape sequence. size=%d, offset=%d", lit, ch, size, offset))
		}
		x = x*base + d
	}
	// Reject values above max and the UTF-16 surrogate range D800-DFFF.
	if x > max || 0xD800 <= x && x < 0xE000 {
		panic(fmt.Sprintf("Error decoding escape char value. Lit:%s, offset:%d, escape sequence is invalid Unicode code point\n", lit, offset))
	}
	return rune(x)
}
// digitVal maps a (possibly hex) digit rune to its numeric value,
// returning 16 — larger than any legal digit — for anything else.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch) - '0'
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch) - 'a' + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch) - 'A' + 10
	}
	return 16 // larger than any legal digit val
}

39
schema/util/rune.go Normal file
View File

@@ -0,0 +1,39 @@
// Code generated by gocc; DO NOT EDIT.
package util
import (
"fmt"
)
func RuneToString(r rune) string {
if r >= 0x20 && r < 0x7f {
return fmt.Sprintf("'%c'", r)
}
switch r {
case 0x07:
return "'\\a'"
case 0x08:
return "'\\b'"
case 0x0C:
return "'\\f'"
case 0x0A:
return "'\\n'"
case 0x0D:
return "'\\r'"
case 0x09:
return "'\\t'"
case 0x0b:
return "'\\v'"
case 0x5c:
return "'\\\\\\'"
case 0x27:
return "'\\''"
case 0x22:
return "'\\\"'"
}
if r < 0x10000 {
return fmt.Sprintf("\\u%04x", r)
}
return fmt.Sprintf("\\U%08x", r)
}