package schema

import (
	"bytes"
	"errors"
	"fmt"
	"log"
	"slices"
	"strconv"
	"sync"
)

// Token is one lexical unit of a schema source.
//
// Which payload field is meaningful depends on Type, as set by the
// constructors in this file: Value carries the text of string literals and
// names, Number carries the parsed value of number literals and the nesting
// depth of parentheses.
type Token struct {
	Type   TokenType
	Number int64
	Value  string
}

// TokenType identifies the lexical category of a Token.
type TokenType uintptr

const (
	// StringLiteralTokenType marks a string literal; the text is in Token.Value.
	StringLiteralTokenType TokenType = iota
	// NumberLiteralTokenType marks an integer literal; the value is in Token.Number.
	NumberLiteralTokenType
	// NameTokenType marks a name; the text is in Token.Value.
	NameTokenType
	// OpenParenTokenType marks an opening parenthesis; the depth is in Token.Number.
	OpenParenTokenType
	// CloseParenTokenType marks a closing parenthesis; the depth is in Token.Number.
	CloseParenTokenType
)

// String renders the token in a short bracketed debug form, for example
// [l'text'], [l42], [n'name'], [(0] and [0)]. A token whose Type is not one
// of the declared constants is rendered as [?'value'].
func (t *Token) String() string {
	var (
		format  string
		payload any
	)
	switch t.Type {
	case StringLiteralTokenType:
		format, payload = "[l'%s']", t.Value
	case NumberLiteralTokenType:
		format, payload = "[l%d]", t.Number
	case NameTokenType:
		format, payload = "[n'%s']", t.Value
	case OpenParenTokenType:
		format, payload = "[(%d]", t.Number
	case CloseParenTokenType:
		format, payload = "[%d)]", t.Number
	default:
		format, payload = "[?'%s']", t.Value
	}
	return fmt.Sprintf(format, payload)
}

// StringLiteralToken returns a string-literal token holding Value.
func StringLiteralToken(Value string) *Token {
	tok := new(Token)
	tok.Type = StringLiteralTokenType
	tok.Value = Value
	return tok
}

// NumberLiteralToken returns a number-literal token for the integer written
// in Value. The text is parsed by strconv.ParseInt with base 0, so the base
// is taken from the literal's prefix (for example 0x for hexadecimal).
//
// It panics through log.Panicf when Value is not a valid 64-bit integer.
func NumberLiteralToken(Value string) *Token {
	tok := &Token{Type: NumberLiteralTokenType}
	var err error
	if tok.Number, err = strconv.ParseInt(Value, 0, 64); err != nil {
		log.Panicf("failed to parse '%s' as number: %s", Value, err)
	}
	return tok
}

// NameToken returns a name token holding Name.
func NameToken(Name string) *Token {
	tok := new(Token)
	tok.Type = NameTokenType
	tok.Value = Name
	return tok
}

// OpenParenToken returns an opening-parenthesis token recorded at Depth.
func OpenParenToken(Depth int) *Token {
	tok := new(Token)
	tok.Type = OpenParenTokenType
	tok.Number = int64(Depth)
	return tok
}

// CloseParenToken returns a closing-parenthesis token recorded at Depth.
func CloseParenToken(Depth int) *Token {
	tok := new(Token)
	tok.Type = CloseParenTokenType
	tok.Number = int64(Depth)
	return tok
}

// preprocess removes comments and newlines.
func preprocess(in []byte) ([]byte, int) { lines := bytes.Split(in, []byte("\n")) var wg sync.WaitGroup length := len(lines) wg.Add(length) for n, l := range lines { go func(n int, l []byte) { defer wg.Done() quote := false // " grave := false // ` for i, c := range l { if c == '"' && !quote && !grave { quote = true } if c == '"' && quote && !grave { quote = false } if c == '`' && !quote && !grave { grave = true } if c == '`' && !quote && grave { grave = false } if c == ';' && !(quote || grave) { lines[n] = l[:i] break } } }(n, l) } wg.Wait() return bytes.Join(lines, []byte(" ")), length } func Tokenize(s []byte) ([][]*Token, error) { s, _ = preprocess(s) var tokens = make([][]*Token, 0) statement := 0 token := 0 depth := 0 literalbegin := -1 namebegin := -1 quote := false grave := false for i, c := range s { if !quote && !grave { switch c { case '(': if depth == 0 { tokens = append(tokens, make([]*Token, 0)) } tokens[statement] = append(tokens[statement], OpenParenToken(depth)) depth++ token++ break case ')': if namebegin != -1 { tokens[statement] = append(tokens[statement], NameToken(string(s[namebegin:i]))) namebegin = -1 token++ } else if literalbegin != -1 { tokens[statement] = append(tokens[statement], NumberLiteralToken(string(s[literalbegin:i]))) token++ literalbegin = -1 } depth-- if depth < 0 { return nil, errors.New(fmt.Sprintf("unexpected closing paren at [%d,%d]", statement, token)) } tokens[statement] = append(tokens[statement], CloseParenToken(depth)) token++ if depth == 0 { statement++ if statement >= len(tokens) { slices.Grow(tokens, 1) } } break case '"': literalbegin = i + 1 quote = true break case '`': literalbegin = i + 1 grave = true break case ' ': if namebegin != -1 { tokens[statement] = append(tokens[statement], NameToken(string(s[namebegin:i]))) token++ namebegin = -1 } else if literalbegin != -1 { tokens[statement] = append(tokens[statement], NumberLiteralToken(string(s[literalbegin:i]))) token++ literalbegin = -1 } break default: if 
namebegin == -1 && literalbegin == -1 { if isDigit(c) { literalbegin = i } else if isAllowedName(c) { namebegin = i } } } } else if c == '"' && quote { tokens[statement] = append(tokens[statement], StringLiteralToken(string(s[literalbegin:i]))) literalbegin = -1 quote = false token++ } else if c == '`' && grave { tokens[statement] = append(tokens[statement], StringLiteralToken(string(s[literalbegin:i]))) literalbegin = -1 grave = false token++ } } return tokens, nil } // isDigit checks if a character is a digit and therefore is allowed to be the start of a numeric literal. func isDigit(c byte) bool { return c >= '0' && c <= '9' } // isAllowedName checks if a character is allowed to be the first character of a name. // Variable names beginning with a number or containing any of the reserved characters are forbidden. func isAllowedName(c byte) bool { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_' || (c >= '*' && c <= '/') || (c >= ':' && c <= '@') }