u-tpl/internal/lexer.go

package internal

import (
	"strings"
	"unicode"
)

// TokenType identifies the kind of a Token produced by the Lexer.
type TokenType int

// Token kinds. Values are assigned sequentially from zero by iota, so
// reordering the entries changes their numeric values.
const (
	// TokText is literal text: a run of ordinary characters, a "}" that is
	// not followed by "else", or a single "\n".
	TokText TokenType = iota
	// TokParamStart is the "#{" opener.
	TokParamStart
	// TokRawStart is the "${" opener.
	TokRawStart
	// TokIfStart is the "@if(" directive opener.
	TokIfStart
	// TokForStart is the "@for(" directive opener.
	TokForStart
	// TokTplStart is the `@tpl("` directive opener.
	TokTplStart
	// TokIncludeStart is the `@include("` directive opener.
	TokIncludeStart
	// TokNamespaceStart is the `@namespace("` directive opener.
	TokNamespaceStart
	// TokUseStart is the `@use("` directive opener.
	TokUseStart
	// TokElse is a "}" followed by optional spaces/tabs and the word "else".
	TokElse
	// TokComment is a "#" that is not followed by "{", together with the
	// rest of its line.
	TokComment
	// TokEOF marks the end of input; Tokenize appends it as the last token.
	TokEOF
)

// Token is a single lexical unit produced by Lexer.Tokenize.
type Token struct {
	// Type is the kind of token.
	Type  TokenType
	// Value is the token's text: the literal text for TokText, the matched
	// opener for start tokens (e.g. "#{" or "@if("), "} else" for TokElse,
	// the comment body after '#' with trailing whitespace trimmed for
	// TokComment, and empty for TokEOF.
	Value string
	// Pos is a source position. TokText and TokEOF tokens record the position
	// just past the consumed text; all other tokens record the position of
	// their first rune.
	Pos   Pos
}

// Lexer splits template source into Tokens while tracking line and column.
type Lexer struct {
	// input is the source text, decoded into runes.
	input []rune
	// pos is the index into input of the next rune to consume.
	pos   int
	// line is the line number of the next rune; NewLexer starts it at 1 and
	// advance increments it after each '\n'.
	line  int
	// col is the column of the next rune, counted in runes; NewLexer starts
	// it at 1 and advance resets it to 1 after each '\n'.
	col   int
}

// NewLexer returns a Lexer over input, positioned at the first rune
// (line 1, column 1). The input is decoded into runes once up front.
func NewLexer(input string) *Lexer {
	lx := new(Lexer)
	lx.input = []rune(input)
	lx.line, lx.col = 1, 1
	return lx
}

// Tokenize scans the remaining input and returns the token stream, which
// always ends with a single TokEOF token.
//
// At each position the following constructs are tried in order:
//   - "#{" yields TokParamStart; any other '#' starts a comment that runs
//     through the end of the line (TokComment).
//   - "${" yields TokRawStart.
//   - '@' followed by a known directive prefix yields that directive's start
//     token (see tryDirective).
//   - '}' followed by optional spaces/tabs and the word "else" yields
//     TokElse; any other '}' is emitted as a one-rune TokText.
//   - '\n' is emitted as its own TokText.
//   - Anything else starts a TokText run that extends up to the next
//     '#', '$', '@', '}' or '\n'.
//
// A '$' that is not followed by '{' and an '@' that does not begin a known
// directive are ordinary text and start a new TokText run.
//
// TokText and TokEOF tokens carry the position just past the consumed text;
// all other tokens carry the position of their first rune. The returned
// error is currently always nil.
func (l *Lexer) Tokenize() ([]Token, error) {
	var tokens []Token

	for l.pos < len(l.input) {
		ch := l.input[l.pos]

		if ch == '#' {
			if l.peek(1) == '{' {
				tokens = append(tokens, Token{Type: TokParamStart, Value: "#{", Pos: l.curPos()})
				l.advanceN(2)
				continue
			}
			// Any other '#' is a comment running to the end of the line.
			tokens = append(tokens, l.readComment())
			continue
		}

		if ch == '$' && l.peek(1) == '{' {
			tokens = append(tokens, Token{Type: TokRawStart, Value: "${", Pos: l.curPos()})
			l.advanceN(2)
			continue
		}

		if ch == '@' {
			if tok, ok := l.tryDirective(); ok {
				tokens = append(tokens, tok)
				continue
			}
			// Not a directive: fall through and treat '@' as text.
		}

		if ch == '}' {
			// Look past any spaces/tabs after '}' to check for "else".
			spaceOffset := 1
			for l.peek(spaceOffset) == ' ' || l.peek(spaceOffset) == '\t' {
				spaceOffset++
			}
			if l.peekWord(spaceOffset, "else") {
				pos := l.curPos()
				l.advanceN(spaceOffset) // consume '}' and the spaces/tabs after it
				l.advanceN(len("else"))
				tokens = append(tokens, Token{Type: TokElse, Value: "} else", Pos: pos})
				continue
			}
			l.advance()
			// Pos stores the end position (after '}'), consistent with other TokText tokens.
			tokens = append(tokens, Token{Type: TokText, Value: "}", Pos: l.curPos()})
			continue
		}

		if ch == '\n' {
			l.advance()
			// Pos stores the end position (after '\n'), consistent with other TokText tokens.
			tokens = append(tokens, Token{Type: TokText, Value: "\n", Pos: l.curPos()})
			continue
		}

		// Regular text. The current rune is consumed unconditionally: reaching
		// this point means it opened none of the constructs above, even if it
		// is '$' or '@'. Without this, a lone '$' or '@' would stop the scan
		// below before any progress was made and the outer loop would never
		// terminate.
		start := l.pos
		l.advance()
		for l.pos < len(l.input) {
			c := l.input[l.pos]
			if c == '#' || c == '$' || c == '@' || c == '}' || c == '\n' {
				break
			}
			l.advance()
		}
		tokens = append(tokens, Token{Type: TokText, Value: string(l.input[start:l.pos]), Pos: l.curPos()})
	}

	tokens = append(tokens, Token{Type: TokEOF, Pos: l.curPos()})
	return tokens, nil
}

// curPos reports the lexer's current line and column as a Pos.
func (l *Lexer) curPos() Pos {
	var here Pos
	here.Line = l.line
	here.Col = l.col
	return here
}

// advance consumes one rune and updates the position bookkeeping: a '\n'
// moves to column 1 of the next line, any other rune moves one column to the
// right. It does nothing once the end of input has been reached.
func (l *Lexer) advance() {
	if l.pos >= len(l.input) {
		return
	}
	switch l.input[l.pos] {
	case '\n':
		l.line++
		l.col = 1
	default:
		l.col++
	}
	l.pos++
}

// advanceN consumes up to n runes by calling advance n times. A zero or
// negative n consumes nothing.
func (l *Lexer) advanceN(n int) {
	for ; n > 0; n-- {
		l.advance()
	}
}

// peek returns the rune offset positions away from the current one without
// consuming anything. It returns 0 when that position lies outside the input,
// whether past the end or (for a negative offset) before the start.
func (l *Lexer) peek(offset int) rune {
	idx := l.pos + offset
	if idx < 0 || idx >= len(l.input) {
		return 0
	}
	return l.input[idx]
}

// peekWord reports whether word appears in the input starting offset runes
// from the current position and is followed by a word boundary. A boundary
// is the end of input (peek yields 0 there), a space, tab or newline, or one
// of the braces '{' and '}'. Nothing is consumed.
func (l *Lexer) peekWord(offset int, word string) bool {
	want := []rune(word)
	for i, r := range want {
		if l.peek(offset+i) != r {
			return false
		}
	}

	switch l.peek(offset + len(want)) {
	case 0, ' ', '\n', '\t', '{', '}':
		return true
	default:
		return false
	}
}

// tryDirective checks whether the input at the current position begins with
// one of the known directive openers. On a match it consumes the opener and
// returns the matching start token (positioned at the '@') and true. With no
// match it consumes nothing and returns a zero Token and false.
func (l *Lexer) tryDirective() (Token, bool) {
	// Each entry gives the literal opener, the token kind to emit, and how
	// many runes to consume after a match.
	//
	// NOTE(review): "@tpl(\"" consumes 5 runes, leaving its opening quote in
	// the input, while the other quoted openers consume the quote as well.
	// Presumably the parser relies on this; confirm before changing.
	table := [...]struct {
		text    string
		kind    TokenType
		consume int
	}{
		{"@if(", TokIfStart, 4},
		{"@for(", TokForStart, 5},
		{"@tpl(\"", TokTplStart, 5},
		{"@include(\"", TokIncludeStart, 10},
		{"@namespace(\"", TokNamespaceStart, 12},
		{"@use(\"", TokUseStart, 6},
	}

	for i := range table {
		entry := &table[i]
		if !l.matchRunes([]rune(entry.text)) {
			continue
		}
		tok := Token{Type: entry.kind, Value: entry.text, Pos: l.curPos()}
		l.advanceN(entry.consume)
		return tok, true
	}

	return Token{}, false
}

// matchRunes reports whether the input at the current position starts with
// the given rune sequence. Nothing is consumed; an empty sequence always
// matches.
func (l *Lexer) matchRunes(runes []rune) bool {
	for i := 0; i < len(runes); i++ {
		if got := l.peek(i); got != runes[i] {
			return false
		}
	}
	return true
}

// readComment consumes a comment that starts at the current '#' and runs
// through the end of the line. The terminating '\n', when present, is
// consumed too so the whole comment line disappears from the output. The
// returned TokComment is positioned at the '#'; its Value is the text after
// the '#' with trailing whitespace removed.
func (l *Lexer) readComment() Token {
	tok := Token{Type: TokComment, Pos: l.curPos()}

	l.advance() // skip the leading '#'
	begin := l.pos
	for l.pos < len(l.input) {
		isNewline := l.input[l.pos] == '\n'
		l.advance()
		if isNewline {
			break
		}
	}

	body := string(l.input[begin:l.pos])
	tok.Value = strings.TrimRightFunc(body, unicode.IsSpace)
	return tok
}