Files
azalea/schema/token.go
2026-01-23 03:20:34 -06:00

199 lines
4.7 KiB
Go

package schema
import (
"bytes"
"errors"
"fmt"
"log"
"slices"
"strconv"
"sync"
)
// Token is a single lexical unit of schema source.
type Token struct {
// Type selects which kind of token this is.
Type TokenType
// Number holds the parsed value of a number literal, or the nesting depth
// of an opening/closing paren token.
Number int64
// Value holds the text of a string literal or of a name.
Value string
}
// TokenType identifies the kind of a Token.
type TokenType uintptr
// Token kinds, numbered from zero in declaration order.
const (
// StringLiteralTokenType marks a string literal token.
StringLiteralTokenType TokenType = iota
// NumberLiteralTokenType marks an integer literal token.
NumberLiteralTokenType
// NameTokenType marks a name token.
NameTokenType
// OpenParenTokenType marks an opening parenthesis.
OpenParenTokenType
// CloseParenTokenType marks a closing parenthesis.
CloseParenTokenType
)
// String renders the token in a compact bracketed debug form:
// [l'text'] for string literals, [l42] for number literals, [n'name'] for
// names, [(0] and [0)] for parens at the given depth, and [?'value'] for any
// unrecognised token type.
func (t *Token) String() string {
	// Each kind picks a layout plus the single field that layout prints.
	var (
		layout string
		arg    any
	)
	switch t.Type {
	case StringLiteralTokenType:
		layout, arg = "[l'%s']", t.Value
	case NumberLiteralTokenType:
		layout, arg = "[l%d]", t.Number
	case NameTokenType:
		layout, arg = "[n'%s']", t.Value
	case OpenParenTokenType:
		layout, arg = "[(%d]", t.Number
	case CloseParenTokenType:
		layout, arg = "[%d)]", t.Number
	default:
		layout, arg = "[?'%s']", t.Value
	}
	return fmt.Sprintf(layout, arg)
}
// StringLiteralToken returns a new string-literal token whose Value is the
// given text.
func StringLiteralToken(Value string) *Token {
	tok := new(Token)
	tok.Type = StringLiteralTokenType
	tok.Value = Value
	return tok
}
// NumberLiteralToken returns a new number-literal token whose Number is the
// integer parsed from Value. The base is inferred from the prefix (base 0 in
// strconv.ParseInt) and the result must fit in 64 bits.
//
// It panics, after logging, when Value is not a valid integer.
func NumberLiteralToken(Value string) *Token {
	tok := &Token{Type: NumberLiteralTokenType}

	var err error
	if tok.Number, err = strconv.ParseInt(Value, 0, 64); err != nil {
		log.Panicf("failed to parse '%s' as number: %s", Value, err)
	}
	return tok
}
// NameToken returns a new name token whose Value is Name.
func NameToken(Name string) *Token {
	tok := new(Token)
	tok.Type = NameTokenType
	tok.Value = Name
	return tok
}
// OpenParenToken returns a new opening-paren token that records Depth, the
// nesting level at which the paren was opened, in its Number field.
func OpenParenToken(Depth int) *Token {
	tok := new(Token)
	tok.Type = OpenParenTokenType
	tok.Number = int64(Depth)
	return tok
}
// CloseParenToken returns a new closing-paren token that records Depth, the
// nesting level the paren returns to, in its Number field.
func CloseParenToken(Depth int) *Token {
	tok := new(Token)
	tok.Type = CloseParenTokenType
	tok.Number = int64(Depth)
	return tok
}
// preprocess strips comments from in and joins its lines into one.
//
// A comment starts at the first ';' that is not inside a "double-quoted" or
// `backtick-quoted` string and runs to the end of that line. Quote state is
// tracked per line, so a string is not followed across a line break.
//
// It returns the remaining lines joined by single spaces, and the number of
// lines the input was split into.
func preprocess(in []byte) ([]byte, int) {
	lines := bytes.Split(in, []byte("\n"))
	length := len(lines)

	// Lines are independent, so each one is scanned in its own goroutine.
	// Every goroutine writes only its own slot of lines, so no lock is needed.
	var wg sync.WaitGroup
	wg.Add(length)
	for n, l := range lines {
		go func(n int, l []byte) {
			defer wg.Done()
			lines[n] = stripLineComment(l)
		}(n, l)
	}
	wg.Wait()

	return bytes.Join(lines, []byte(" ")), length
}

// stripLineComment returns line truncated at the first ';' that lies outside
// any quoted string, or line unchanged when there is no such ';'.
func stripLineComment(line []byte) []byte {
	quote := false // inside "..."
	grave := false // inside `...`
	for i, c := range line {
		// Exactly one branch runs per character, so a quote character either
		// opens or closes its string. Running the open and close checks one
		// after the other would close every string on the character that
		// opened it.
		switch c {
		case '"':
			if !grave {
				quote = !quote
			}
		case '`':
			if !quote {
				grave = !grave
			}
		case ';':
			if !quote && !grave {
				return line[:i]
			}
		}
	}
	return line
}
// Sentinel errors wrapped by the errors Tokenize returns; match them with
// errors.Is.
var (
	// ErrUnexpectedCloseParen reports a ')' that has no matching '('.
	ErrUnexpectedCloseParen = errors.New("unexpected closing paren")
	// ErrTokenOutsideStatement reports a name or literal that is not
	// enclosed in any parenthesised statement.
	ErrTokenOutsideStatement = errors.New("token outside of a statement")
)

// Tokenize splits schema source into statements and each statement into
// tokens.
//
// The input is first run through preprocess. A statement is one top-level
// parenthesised group, and the result holds one token slice per statement, in
// source order. Paren tokens carry their nesting depth. Text between "..." or
// `...` becomes a string literal, a run starting with a digit becomes a number
// literal, and a run starting with a character accepted by isAllowedName
// becomes a name. Names and numbers end at whitespace, a paren, or an opening
// quote.
//
// It returns an error, with a [statement,token] position, for a ')' without a
// matching '(', for a name or literal outside every statement, and for a
// number that does not parse as an integer. On error the token slice is nil.
// Input that ends inside a string or with parens still open is not diagnosed;
// the tokens completed so far are returned.
func Tokenize(s []byte) ([][]*Token, error) {
	s, _ = preprocess(s)

	tokens := make([][]*Token, 0)
	statement := 0     // index of the statement currently being built
	token := 0         // running count of emitted tokens, for error positions
	depth := 0         // current paren nesting depth
	literalbegin := -1 // start of the pending number or string, or -1
	namebegin := -1    // start of the pending name, or -1
	quote := false     // inside "..."
	grave := false     // inside `...`

	// emit appends tok to the current statement. At depth zero there is no
	// current statement (statement == len(tokens)), so the token is rejected
	// rather than indexed out of range.
	emit := func(tok *Token) error {
		if statement >= len(tokens) {
			return fmt.Errorf("%w: %s at [%d,%d]", ErrTokenOutsideStatement, tok, statement, token)
		}
		tokens[statement] = append(tokens[statement], tok)
		token++
		return nil
	}

	// flush emits the pending name or number, if any, that ends just before
	// index end.
	flush := func(end int) error {
		switch {
		case namebegin != -1:
			name := string(s[namebegin:end])
			namebegin = -1
			return emit(NameToken(name))
		case literalbegin != -1:
			text := string(s[literalbegin:end])
			literalbegin = -1
			// Parsed here with the same base and size NumberLiteralToken
			// uses, so malformed input becomes an error instead of a panic.
			number, err := strconv.ParseInt(text, 0, 64)
			if err != nil {
				return fmt.Errorf("invalid number literal %q at [%d,%d]: %w", text, statement, token, err)
			}
			return emit(&Token{Type: NumberLiteralTokenType, Number: number})
		}
		return nil
	}

	for i, c := range s {
		if quote || grave {
			// Inside a string only the matching delimiter is significant.
			if (quote && c == '"') || (grave && c == '`') {
				if err := emit(StringLiteralToken(string(s[literalbegin:i]))); err != nil {
					return nil, err
				}
				literalbegin = -1
				quote, grave = false, false
			}
			continue
		}

		switch c {
		case '(':
			// A name or number directly before '(' ends here.
			if err := flush(i); err != nil {
				return nil, err
			}
			if depth == 0 {
				tokens = append(tokens, make([]*Token, 0))
			}
			if err := emit(OpenParenToken(depth)); err != nil {
				return nil, err
			}
			depth++
		case ')':
			if err := flush(i); err != nil {
				return nil, err
			}
			depth--
			if depth < 0 {
				return nil, fmt.Errorf("%w at [%d,%d]", ErrUnexpectedCloseParen, statement, token)
			}
			if err := emit(CloseParenToken(depth)); err != nil {
				return nil, err
			}
			if depth == 0 {
				statement++
				// Reserve room for the next statement. Grow returns the
				// resized slice, so the result must be kept.
				tokens = slices.Grow(tokens, 1)
			}
		case '"', '`':
			// A name or number directly before a string ends here.
			if err := flush(i); err != nil {
				return nil, err
			}
			literalbegin = i + 1
			quote = c == '"'
			grave = c == '`'
		case ' ', '\t', '\r':
			if err := flush(i); err != nil {
				return nil, err
			}
		default:
			// Only the first character decides what kind of token starts;
			// later characters extend the pending token.
			if namebegin == -1 && literalbegin == -1 {
				if isDigit(c) {
					literalbegin = i
				} else if isAllowedName(c) {
					namebegin = i
				}
			}
		}
	}
	return tokens, nil
}
// isDigit reports whether c is an ASCII decimal digit, i.e. a character that
// may start a numeric literal.
func isDigit(c byte) bool {
	// Byte subtraction wraps, so anything below '0' becomes a large value and
	// a single comparison covers both ends of the range.
	return c-'0' < 10
}
// isAllowedName reports whether c may be the first character of a name:
// an ASCII letter, an underscore, or one of the punctuation characters
// * + , - . / : ; < = > ? @. Digits and every other byte are rejected, so a
// name can never begin with a number.
func isAllowedName(c byte) bool {
	switch {
	case 'A' <= c && c <= 'Z', 'a' <= c && c <= 'z':
		return true
	case c == '_':
		return true
	case '*' <= c && c <= '/': // * + , - . /
		return true
	case ':' <= c && c <= '@': // : ; < = > ? @
		return true
	}
	return false
}