From 1405d2b8b105476a4290bb886dadadb9ab2bf5d5 Mon Sep 17 00:00:00 2001 From: Eduard Urbach Date: Thu, 19 Jun 2025 15:58:29 +0200 Subject: [PATCH] Added token package --- src/token/Count.go | 14 + src/token/Count_test.go | 17 + src/token/Instructions.go | 61 ++++ src/token/Instructions_test.go | 109 +++++++ src/token/Kind.go | 79 +++++ src/token/List.go | 76 +++++ src/token/List_test.go | 71 ++++ src/token/Token.go | 77 +++++ src/token/Token_test.go | 58 ++++ src/token/Tokenize.go | 64 ++++ src/token/Tokenize_test.go | 576 +++++++++++++++++++++++++++++++++ src/token/dash.go | 25 ++ src/token/digit.go | 38 +++ src/token/identifier.go | 52 +++ src/token/operator.go | 86 +++++ src/token/quote.go | 28 ++ src/token/slash.go | 34 ++ src/token/zero.go | 35 ++ 18 files changed, 1500 insertions(+) create mode 100644 src/token/Count.go create mode 100644 src/token/Count_test.go create mode 100644 src/token/Instructions.go create mode 100644 src/token/Instructions_test.go create mode 100644 src/token/Kind.go create mode 100644 src/token/List.go create mode 100644 src/token/List_test.go create mode 100644 src/token/Token.go create mode 100644 src/token/Token_test.go create mode 100644 src/token/Tokenize.go create mode 100644 src/token/Tokenize_test.go create mode 100644 src/token/dash.go create mode 100644 src/token/digit.go create mode 100644 src/token/identifier.go create mode 100644 src/token/operator.go create mode 100644 src/token/quote.go create mode 100644 src/token/slash.go create mode 100644 src/token/zero.go diff --git a/src/token/Count.go b/src/token/Count.go new file mode 100644 index 0000000..f3dc148 --- /dev/null +++ b/src/token/Count.go @@ -0,0 +1,14 @@ +package token + +// Count counts how often the given token appears in the token list. +func Count(tokens []Token, buffer []byte, kind Kind, name string) uint8 { + count := uint8(0) + + for _, t := range tokens { + if t.Kind == kind && t.String(buffer) == name { + count++ + } + } + + return count +} \ No newline at end of file diff --git a/src/token/Count_test.go b/src/token/Count_test.go new file mode 100644 index 0000000..a255a55 --- /dev/null +++ b/src/token/Count_test.go @@ -0,0 +1,17 @@ +package token_test + +import ( + "testing" + + "git.urbach.dev/cli/q/src/token" + "git.urbach.dev/go/assert" +) + +func TestCount(t *testing.T) { + buffer := []byte(`a b b c c c`) + tokens := token.Tokenize(buffer) + assert.Equal(t, token.Count(tokens, buffer, token.Identifier, "a"), 1) + assert.Equal(t, token.Count(tokens, buffer, token.Identifier, "b"), 2) + assert.Equal(t, token.Count(tokens, buffer, token.Identifier, "c"), 3) + assert.Equal(t, token.Count(tokens, buffer, token.Identifier, "d"), 0) +} \ No newline at end of file diff --git a/src/token/Instructions.go b/src/token/Instructions.go new file mode 100644 index 0000000..3005976 --- /dev/null +++ b/src/token/Instructions.go @@ -0,0 +1,61 @@ +package token + +// Instructions yields on each AST node. 
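+// An instruction normally ends at a newline; newlines inside '(' groups
+// and '{' blocks are skipped, so a multi-line call or block is yielded
+// as a single instruction. The signature fits Go's range-over-func
+// iteration, which is how the tests below consume it:
+//
+//	for instruction := range tokens.Instructions {
+//		// instruction is a token.List
+//	}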
+func (list List) Instructions(yield func(List) bool) { + start := 0 + groupLevel := 0 + blockLevel := 0 + + for i, t := range list { + switch t.Kind { + case NewLine: + if start == i { + start = i + 1 + continue + } + + if groupLevel > 0 || blockLevel > 0 { + continue + } + + if !yield(list[start:i]) { + return + } + + start = i + 1 + + case GroupStart: + groupLevel++ + + case GroupEnd: + groupLevel-- + + case BlockStart: + blockLevel++ + + case BlockEnd: + blockLevel-- + + if groupLevel > 0 || blockLevel > 0 { + continue + } + + if !yield(list[start : i+1]) { + return + } + + start = i + 1 + + case EOF: + if start < i { + yield(list[start:i]) + } + + return + } + } + + if start < len(list) { + yield(list[start:]) + } +} \ No newline at end of file diff --git a/src/token/Instructions_test.go b/src/token/Instructions_test.go new file mode 100644 index 0000000..530a648 --- /dev/null +++ b/src/token/Instructions_test.go @@ -0,0 +1,109 @@ +package token_test + +import ( + "testing" + + "git.urbach.dev/cli/q/src/token" + "git.urbach.dev/go/assert" +) + +func TestInstructionsBasic(t *testing.T) { + src := []byte("a := 1\nb := 2\n") + tokens := token.Tokenize(src) + nodes := []string{} + + for param := range tokens.Instructions { + nodes = append(nodes, param.String(src)) + } + + assert.DeepEqual(t, nodes, []string{"a:=1", "b:=2"}) +} + +func TestInstructionsBlock(t *testing.T) { + src := []byte("a := 1\nif x > 0 {\nx = 0\n}\nb := 2\n") + tokens := token.Tokenize(src) + nodes := []string{} + + for param := range tokens.Instructions { + nodes = append(nodes, param.String(src)) + } + + assert.DeepEqual(t, nodes, []string{"a:=1", "ifx>0{\nx=0\n}", "b:=2"}) +} + +func TestInstructionsGroup(t *testing.T) { + src := []byte("a := 1\ncall(\nx,\ny\n)\nb := 2\n") + tokens := token.Tokenize(src) + nodes := []string{} + + for param := range tokens.Instructions { + nodes = append(nodes, param.String(src)) + } + + assert.DeepEqual(t, nodes, []string{"a:=1", "call(\nx,\ny\n)", "b:=2"}) +} + +func TestInstructionsBreak(t *testing.T) { + src := []byte("a := 1\nb := 2\n") + tokens := token.Tokenize(src) + count := 0 + + for range tokens.Instructions { + if count == 1 { + break + } + + count++ + } +} + +func TestInstructionsEOF(t *testing.T) { + src := []byte("a := 1") + tokens := token.Tokenize(src) + count := 0 + + for range tokens.Instructions { + count++ + } + + assert.Equal(t, count, 1) +} + +func TestInstructionsNoEOF(t *testing.T) { + tokens := token.List{ + token.Token{Position: 0, Length: 1, Kind: token.Identifier}, + } + + count := 0 + + for range tokens.Instructions { + count++ + } + + assert.Equal(t, count, 1) +} + +func TestInstructionsMultiBlock(t *testing.T) { + src := []byte("if x == 0 { if y == 0 {} }") + tokens := token.Tokenize(src) + count := 0 + + for range tokens.Instructions { + count++ + } + + assert.Equal(t, count, 1) +} + +func TestInstructionsMultiBlockBreak(t *testing.T) { + src := []byte("if x == 0 { if y == 0 {} }") + tokens := token.Tokenize(src) + count := 0 + + for range tokens.Instructions { + count++ + break + } + + assert.Equal(t, count, 1) +} \ No newline at end of file diff --git a/src/token/Kind.go b/src/token/Kind.go new file mode 100644 index 0000000..b4a8dbd --- /dev/null +++ b/src/token/Kind.go @@ -0,0 +1,79 @@ +package token + +// Kind represents the type of token. +type Kind uint8 + +const ( + Invalid Kind = iota // Invalid is an invalid token. + EOF // EOF is the end of file. + NewLine // NewLine is the newline character. 
+ Identifier // Identifier is a series of characters used to identify a variable or function. + Number // Number is a series of numerical characters. + Rune // Rune is a single unicode code point. + String // String is an uninterpreted series of characters in the source code. + Comment // Comment is a comment. + GroupStart // ( + GroupEnd // ) + BlockStart // { + BlockEnd // } + ArrayStart // [ + ArrayEnd // ] + ReturnType // -> + ___OPERATORS___ // + Add // + + Sub // - + Mul // * + Div // / + Mod // % + And // & + Or // | + Xor // ^ + Shl // << + Shr // >> + LogicalAnd // && + LogicalOr // || + Define // := + Dot // . + Range // .. + Call // x() + Array // [x] + Separator // , + ___ASSIGNMENTS___ // + Assign // = + AddAssign // += + SubAssign // -= + MulAssign // *= + DivAssign // /= + ModAssign // %= + AndAssign // &= + OrAssign // |= + XorAssign // ^= + ShlAssign // <<= + ShrAssign // >>= + ___END_ASSIGNMENTS___ // + ___COMPARISONS___ // + Equal // == + NotEqual // != + Less // < + Greater // > + LessEqual // <= + GreaterEqual // >= + ___END_COMPARISONS___ // + ___UNARY___ // + Not // ! (unary) + Negate // - (unary) + ___END_UNARY___ // + ___END_OPERATORS___ // + ___KEYWORDS___ // + Assert // assert + Const // const + Else // else + Extern // extern + For // for + If // if + Import // import + Loop // loop + Return // return + Switch // switch + ___END_KEYWORDS___ // +) \ No newline at end of file diff --git a/src/token/List.go b/src/token/List.go new file mode 100644 index 0000000..6b18ed8 --- /dev/null +++ b/src/token/List.go @@ -0,0 +1,76 @@ +package token + +import ( + "strings" +) + +// List is a slice of tokens. +type List []Token + +// IndexKind returns the position of a token kind within a token list. +func (list List) IndexKind(kind Kind) int { + for i, token := range list { + if token.Kind == kind { + return i + } + } + + return -1 +} + +// LastIndexKind returns the position of the last token kind within a token list. +func (list List) LastIndexKind(kind Kind) int { + for i := len(list) - 1; i >= 0; i-- { + if list[i].Kind == kind { + return i + } + } + + return -1 +} + +// Split calls the callback function on each set of tokens in a comma separated list. +func (list List) Split(yield func(List) bool) { + if len(list) == 0 { + return + } + + start := 0 + groupLevel := 0 + + for i, t := range list { + switch t.Kind { + case GroupStart, ArrayStart, BlockStart: + groupLevel++ + + case GroupEnd, ArrayEnd, BlockEnd: + groupLevel-- + + case Separator: + if groupLevel > 0 { + continue + } + + parameter := list[start:i] + + if !yield(parameter) { + return + } + + start = i + 1 + } + } + + yield(list[start:]) +} + +// String returns the concatenated token strings. 
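+// Whitespace between tokens is not preserved: the tokens of "a := 1"
+// concatenate back to "a:=1".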
+func (list List) String(source []byte) string { + tmp := strings.Builder{} + + for _, t := range list { + tmp.WriteString(t.String(source)) + } + + return tmp.String() +} \ No newline at end of file diff --git a/src/token/List_test.go b/src/token/List_test.go new file mode 100644 index 0000000..83db69d --- /dev/null +++ b/src/token/List_test.go @@ -0,0 +1,71 @@ +package token_test + +import ( + "testing" + + "git.urbach.dev/cli/q/src/token" + "git.urbach.dev/go/assert" +) + +func TestIndexKind(t *testing.T) { + tokens := token.Tokenize([]byte("a{{}}")) + assert.Equal(t, tokens.IndexKind(token.NewLine), -1) + assert.Equal(t, tokens.LastIndexKind(token.NewLine), -1) + assert.Equal(t, tokens.IndexKind(token.BlockStart), 1) + assert.Equal(t, tokens.LastIndexKind(token.BlockStart), 2) + assert.Equal(t, tokens.IndexKind(token.BlockEnd), 3) + assert.Equal(t, tokens.LastIndexKind(token.BlockEnd), 4) +} + +func TestSplit(t *testing.T) { + src := []byte("1+2,3*4,5*6,7+8") + tokens := token.Tokenize(src) + parameters := []string{} + + for param := range tokens.Split { + parameters = append(parameters, param.String(src)) + } + + assert.DeepEqual(t, parameters, []string{"1+2", "3*4", "5*6", "7+8"}) +} + +func TestSplitBreak(t *testing.T) { + src := []byte("1,2") + tokens := token.Tokenize(src) + + for range tokens.Split { + break + } +} + +func TestSplitEmpty(t *testing.T) { + tokens := token.List{} + + for range tokens.Split { + t.Fail() + } +} + +func TestSplitGroups(t *testing.T) { + src := []byte("f(1,2),g(3,4)") + tokens := token.Tokenize(src) + parameters := []string{} + + for param := range tokens.Split { + parameters = append(parameters, param.String(src)) + } + + assert.DeepEqual(t, parameters, []string{"f(1,2)", "g(3,4)"}) +} + +func TestSplitSingle(t *testing.T) { + src := []byte("123") + tokens := token.Tokenize(src) + parameters := []string{} + + for param := range tokens.Split { + parameters = append(parameters, param.String(src)) + } + + assert.DeepEqual(t, parameters, []string{"123"}) +} \ No newline at end of file diff --git a/src/token/Token.go b/src/token/Token.go new file mode 100644 index 0000000..cacd8f4 --- /dev/null +++ b/src/token/Token.go @@ -0,0 +1,77 @@ +package token + +import ( + "unsafe" +) + +// Position is the data type for storing file offsets. +type Position = uint32 + +// Length is the data type for storing token lengths. +type Length = uint16 + +// Token represents a single element in a source file. +// The characters that make up an identifier are grouped into a single token. +// This makes parsing easier and allows us to do better syntax checks. +type Token struct { + Position Position + Length Length + Kind Kind +} + +// Bytes returns the byte slice. +func (t Token) Bytes(buffer []byte) []byte { + return buffer[t.Position : t.Position+Position(t.Length)] +} + +// End returns the position after the token. +func (t Token) End() Position { + return t.Position + Position(t.Length) +} + +// IsAssignment returns true if the token is an assignment operator. +func (t Token) IsAssignment() bool { + return t.Kind > ___ASSIGNMENTS___ && t.Kind < ___END_ASSIGNMENTS___ +} + +// IsComparison returns true if the token is a comparison operator. +func (t Token) IsComparison() bool { + return t.Kind > ___COMPARISONS___ && t.Kind < ___END_COMPARISONS___ +} + +// IsExpressionStart returns true if the token starts an expression. 
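+// That is, the kind is one of the opening brackets '(', '[' and '{'.
+// dash.go relies on this to distinguish unary minus from subtraction:
+// directly after an opening bracket, a '-' can only be a negation.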
+func (t Token) IsExpressionStart() bool { + return t.Kind == GroupStart || t.Kind == ArrayStart || t.Kind == BlockStart +} + +// IsKeyword returns true if the token is a keyword. +func (t Token) IsKeyword() bool { + return t.Kind > ___KEYWORDS___ && t.Kind < ___END_KEYWORDS___ +} + +// IsNumeric returns true if the token is a number or rune. +func (t Token) IsNumeric() bool { + return t.Kind == Number || t.Kind == Rune +} + +// IsOperator returns true if the token is an operator. +func (t Token) IsOperator() bool { + return t.Kind > ___OPERATORS___ && t.Kind < ___END_OPERATORS___ +} + +// IsUnaryOperator returns true if the token is a unary operator. +func (t Token) IsUnaryOperator() bool { + return t.Kind > ___UNARY___ && t.Kind < ___END_UNARY___ +} + +// Reset resets the token to default values. +func (t *Token) Reset() { + t.Position = 0 + t.Length = 0 + t.Kind = Invalid +} + +// String returns the token string. +func (t Token) String(buffer []byte) string { + return unsafe.String(unsafe.SliceData(t.Bytes(buffer)), t.Length) +} \ No newline at end of file diff --git a/src/token/Token_test.go b/src/token/Token_test.go new file mode 100644 index 0000000..a3d38f0 --- /dev/null +++ b/src/token/Token_test.go @@ -0,0 +1,58 @@ +package token_test + +import ( + "testing" + + "git.urbach.dev/cli/q/src/token" + "git.urbach.dev/go/assert" +) + +func TestTokenEnd(t *testing.T) { + hello := token.Token{ + Kind: token.Identifier, + Position: 0, + Length: 5, + } + + assert.Equal(t, hello.End(), 5) +} + +func TestTokenReset(t *testing.T) { + hello := token.Token{ + Kind: token.Identifier, + Position: 1, + Length: 5, + } + + hello.Reset() + assert.Equal(t, hello.Position, 0) + assert.Equal(t, hello.Length, 0) + assert.Equal(t, hello.Kind, token.Invalid) +} + +func TestTokenString(t *testing.T) { + buffer := []byte("hello, world") + hello := token.Token{Kind: token.Identifier, Position: 0, Length: 5} + comma := token.Token{Kind: token.Separator, Position: 5, Length: 1} + world := token.Token{Kind: token.Identifier, Position: 7, Length: 5} + + assert.Equal(t, hello.String(buffer), "hello") + assert.Equal(t, comma.String(buffer), ",") + assert.Equal(t, world.String(buffer), "world") +} + +func TestTokenGroups(t *testing.T) { + assignment := token.Token{Kind: token.Assign} + operator := token.Token{Kind: token.Add} + keyword := token.Token{Kind: token.If} + unary := token.Token{Kind: token.Not} + number := token.Token{Kind: token.Number} + comparison := token.Token{Kind: token.Equal} + + assert.True(t, assignment.IsAssignment()) + assert.True(t, operator.IsOperator()) + assert.True(t, keyword.IsKeyword()) + assert.True(t, unary.IsUnaryOperator()) + assert.True(t, number.IsNumeric()) + assert.True(t, comparison.IsComparison()) +} \ No newline at end of file diff --git a/src/token/Tokenize.go b/src/token/Tokenize.go new file mode 100644 index 0000000..e9bd066 --- /dev/null +++ b/src/token/Tokenize.go @@ -0,0 +1,64 @@ +package token + +// Tokenize turns the file contents into a list of tokens. 
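+// The returned list is always terminated by an EOF token. Tokens store
+// only position, length and kind; the text is recovered from the
+// original buffer on demand:
+//
+//	src := []byte("a := 1")
+//	tokens := Tokenize(src)
+//	tokens[0].String(src) // "a"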
+func Tokenize(buffer []byte) List { + var ( + i Position + tokens = make(List, 0, 8+len(buffer)/2) + ) + + for i < Position(len(buffer)) { + switch buffer[i] { + case ' ', '\t', '\r': + case ',': + tokens = append(tokens, Token{Kind: Separator, Position: i, Length: 1}) + case '(': + tokens = append(tokens, Token{Kind: GroupStart, Position: i, Length: 1}) + case ')': + tokens = append(tokens, Token{Kind: GroupEnd, Position: i, Length: 1}) + case '{': + tokens = append(tokens, Token{Kind: BlockStart, Position: i, Length: 1}) + case '}': + tokens = append(tokens, Token{Kind: BlockEnd, Position: i, Length: 1}) + case '[': + tokens = append(tokens, Token{Kind: ArrayStart, Position: i, Length: 1}) + case ']': + tokens = append(tokens, Token{Kind: ArrayEnd, Position: i, Length: 1}) + case '\n': + tokens = append(tokens, Token{Kind: NewLine, Position: i, Length: 1}) + case '-': + tokens, i = dash(tokens, buffer, i) + case '/': + tokens, i = slash(tokens, buffer, i) + continue + case '"', '\'': + tokens, i = quote(tokens, buffer, i) + continue + case '0': + tokens, i = zero(tokens, buffer, i) + continue + default: + if isIdentifierStart(buffer[i]) { + tokens, i = identifier(tokens, buffer, i) + continue + } + + if isDigit(buffer[i]) { + tokens, i = digit(tokens, buffer, i) + continue + } + + if isOperator(buffer[i]) { + tokens, i = operator(tokens, buffer, i) + continue + } + + tokens = append(tokens, Token{Kind: Invalid, Position: i, Length: 1}) + } + + i++ + } + + tokens = append(tokens, Token{Kind: EOF, Position: i, Length: 0}) + return tokens +} \ No newline at end of file diff --git a/src/token/Tokenize_test.go b/src/token/Tokenize_test.go new file mode 100644 index 0000000..cf5414c --- /dev/null +++ b/src/token/Tokenize_test.go @@ -0,0 +1,576 @@ +package token_test + +import ( + "testing" + + "git.urbach.dev/cli/q/src/token" + "git.urbach.dev/go/assert" +) + +func TestFunction(t *testing.T) { + tokens := token.Tokenize([]byte("main(){}")) + + expected := []token.Kind{ + token.Identifier, + token.GroupStart, + token.GroupEnd, + token.BlockStart, + token.BlockEnd, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestKeyword(t *testing.T) { + tokens := token.Tokenize([]byte("assert const else extern if import for loop return switch")) + + expected := []token.Kind{ + token.Assert, + token.Const, + token.Else, + token.Extern, + token.If, + token.Import, + token.For, + token.Loop, + token.Return, + token.Switch, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestArray(t *testing.T) { + tokens := token.Tokenize([]byte("array[i]")) + + expected := []token.Kind{ + token.Identifier, + token.ArrayStart, + token.Identifier, + token.ArrayEnd, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNewline(t *testing.T) { + tokens := token.Tokenize([]byte("\n\n")) + + expected := []token.Kind{ + token.NewLine, + token.NewLine, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNumber(t *testing.T) { + tokens := token.Tokenize([]byte(`123 456`)) + + expected := []token.Kind{ + token.Number, + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestOperator(t *testing.T) { + tokens := token.Tokenize([]byte(`a + b - c * d / e % f << g >> h & i | j ^ k`)) + + expected := []token.Kind{ + token.Identifier, + 
token.Add, + token.Identifier, + token.Sub, + token.Identifier, + token.Mul, + token.Identifier, + token.Div, + token.Identifier, + token.Mod, + token.Identifier, + token.Shl, + token.Identifier, + token.Shr, + token.Identifier, + token.And, + token.Identifier, + token.Or, + token.Identifier, + token.Xor, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestOperatorAssign(t *testing.T) { + tokens := token.Tokenize([]byte(`a = b += c -= d *= e /= f %= g &= h |= i ^= j <<= k >>= l`)) + + expected := []token.Kind{ + token.Identifier, + token.Assign, + token.Identifier, + token.AddAssign, + token.Identifier, + token.SubAssign, + token.Identifier, + token.MulAssign, + token.Identifier, + token.DivAssign, + token.Identifier, + token.ModAssign, + token.Identifier, + token.AndAssign, + token.Identifier, + token.OrAssign, + token.Identifier, + token.XorAssign, + token.Identifier, + token.ShlAssign, + token.Identifier, + token.ShrAssign, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestOperatorEquality(t *testing.T) { + tokens := token.Tokenize([]byte(`a == b != c <= d >= e < f > g`)) + + expected := []token.Kind{ + token.Identifier, + token.Equal, + token.Identifier, + token.NotEqual, + token.Identifier, + token.LessEqual, + token.Identifier, + token.GreaterEqual, + token.Identifier, + token.Less, + token.Identifier, + token.Greater, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestOperatorLogical(t *testing.T) { + tokens := token.Tokenize([]byte(`a && b || c`)) + + expected := []token.Kind{ + token.Identifier, + token.LogicalAnd, + token.Identifier, + token.LogicalOr, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestDefine(t *testing.T) { + tokens := token.Tokenize([]byte(`a := b`)) + + expected := []token.Kind{ + token.Identifier, + token.Define, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestDot(t *testing.T) { + tokens := token.Tokenize([]byte(`a.b.c`)) + + expected := []token.Kind{ + token.Identifier, + token.Dot, + token.Identifier, + token.Dot, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNot(t *testing.T) { + tokens := token.Tokenize([]byte(`!a`)) + + expected := []token.Kind{ + token.Not, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNegateFirstToken(t *testing.T) { + tokens := token.Tokenize([]byte(`-a`)) + + expected := []token.Kind{ + token.Negate, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNegateAfterGroupStart(t *testing.T) { + tokens := token.Tokenize([]byte(`(-a)`)) + + expected := []token.Kind{ + token.GroupStart, + token.Negate, + token.Identifier, + token.GroupEnd, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNegateSub(t *testing.T) { + tokens := token.Tokenize([]byte(`-a-b`)) + + expected := []token.Kind{ + token.Negate, + token.Identifier, + token.Sub, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, 
tokens[i].Kind, kind) + } +} + +func TestNegateAfterOperator(t *testing.T) { + tokens := token.Tokenize([]byte(`-a + -b`)) + + expected := []token.Kind{ + token.Negate, + token.Identifier, + token.Add, + token.Negate, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestNegateNumber(t *testing.T) { + tokens := token.Tokenize([]byte(`-1`)) + + expected := []token.Kind{ + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestBinaryNumber(t *testing.T) { + tokens := token.Tokenize([]byte(`0b1010`)) + + expected := []token.Kind{ + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestOctalNumber(t *testing.T) { + tokens := token.Tokenize([]byte(`0o755`)) + + expected := []token.Kind{ + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestHexadecimalNumber(t *testing.T) { + tokens := token.Tokenize([]byte(`0xCAFE`)) + + expected := []token.Kind{ + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestStandaloneZero(t *testing.T) { + tokens := token.Tokenize([]byte(`0`)) + + expected := []token.Kind{ + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestLeadingZero(t *testing.T) { + tokens := token.Tokenize([]byte(`0123`)) + + expected := []token.Kind{ + token.Number, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestRange(t *testing.T) { + tokens := token.Tokenize([]byte("a..b")) + + expected := []token.Kind{ + token.Identifier, + token.Range, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestSeparator(t *testing.T) { + tokens := token.Tokenize([]byte("a,b,c")) + + expected := []token.Kind{ + token.Identifier, + token.Separator, + token.Identifier, + token.Separator, + token.Identifier, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestComment(t *testing.T) { + tokens := token.Tokenize([]byte("// Hello\n// World")) + + expected := []token.Kind{ + token.Comment, + token.NewLine, + token.Comment, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } + + tokens = token.Tokenize([]byte("// Hello\n")) + + expected = []token.Kind{ + token.Comment, + token.NewLine, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } + + tokens = token.Tokenize([]byte(`// Hello`)) + + expected = []token.Kind{ + token.Comment, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } + + tokens = token.Tokenize([]byte(`//`)) + + expected = []token.Kind{ + token.Comment, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } + + tokens = token.Tokenize([]byte(`/`)) + + expected = []token.Kind{ + token.Div, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestInvalid(t *testing.T) { + tokens := token.Tokenize([]byte(`##`)) + + expected := []token.Kind{ + token.Invalid, + token.Invalid, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, 
tokens[i].Kind, kind) + } +} + +func TestString(t *testing.T) { + tokens := token.Tokenize([]byte(`"Hello" "World"`)) + + expected := []token.Kind{ + token.String, + token.String, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestStringMultiline(t *testing.T) { + tokens := token.Tokenize([]byte("\"Hello\nWorld\"")) + + expected := []token.Kind{ + token.String, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestStringEOF(t *testing.T) { + tokens := token.Tokenize([]byte(`"EOF`)) + + expected := []token.Kind{ + token.String, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestReturnType(t *testing.T) { + tokens := token.Tokenize([]byte("()->")) + + expected := []token.Kind{ + token.GroupStart, + token.GroupEnd, + token.ReturnType, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestMinusAtEOF(t *testing.T) { + tokens := token.Tokenize([]byte("1-")) + + expected := []token.Kind{ + token.Number, + token.Sub, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} + +func TestRune(t *testing.T) { + tokens := token.Tokenize([]byte("'a'")) + + expected := []token.Kind{ + token.Rune, + token.EOF, + } + + for i, kind := range expected { + assert.Equal(t, tokens[i].Kind, kind) + } +} \ No newline at end of file diff --git a/src/token/dash.go b/src/token/dash.go new file mode 100644 index 0000000..7ded02d --- /dev/null +++ b/src/token/dash.go @@ -0,0 +1,25 @@ +package token + +// dash handles all tokens starting with '-'. +func dash(tokens List, buffer []byte, i Position) (List, Position) { + if len(tokens) == 0 || tokens[len(tokens)-1].IsOperator() || tokens[len(tokens)-1].IsExpressionStart() || tokens[len(tokens)-1].IsKeyword() { + tokens = append(tokens, Token{Kind: Negate, Position: i, Length: 1}) + } else { + if i+1 < Position(len(buffer)) { + switch buffer[i+1] { + case '=': + tokens = append(tokens, Token{Kind: SubAssign, Position: i, Length: 2}) + i++ + case '>': + tokens = append(tokens, Token{Kind: ReturnType, Position: i, Length: 2}) + i++ + default: + tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1}) + } + } else { + tokens = append(tokens, Token{Kind: Sub, Position: i, Length: 1}) + } + } + + return tokens, i +} \ No newline at end of file diff --git a/src/token/digit.go b/src/token/digit.go new file mode 100644 index 0000000..29be31f --- /dev/null +++ b/src/token/digit.go @@ -0,0 +1,38 @@ +package token + +// digit handles all tokens that qualify as a digit. 
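+// If the previous token is a unary minus (Negate), the sign is merged
+// into the number: "-1" produces a single Number token of length 2
+// instead of Negate followed by Number.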
+func digit(tokens List, buffer []byte, i Position) (List, Position) { + position := i + i++ + + for i < Position(len(buffer)) && isDigit(buffer[i]) { + i++ + } + + last := len(tokens) - 1 + + if len(tokens) > 0 && tokens[last].Kind == Negate { + tokens[last].Kind = Number + tokens[last].Length = Length(i-position) + 1 + } else { + tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)}) + } + + return tokens, i +} + +func isDigit(c byte) bool { + return c >= '0' && c <= '9' +} + +func isHexDigit(c byte) bool { + return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') +} + +func isBinaryDigit(c byte) bool { + return c == '0' || c == '1' +} + +func isOctalDigit(c byte) bool { + return c >= '0' && c <= '7' +} \ No newline at end of file diff --git a/src/token/identifier.go b/src/token/identifier.go new file mode 100644 index 0000000..d685059 --- /dev/null +++ b/src/token/identifier.go @@ -0,0 +1,52 @@ +package token + +// identifier handles all tokens that qualify as an identifier. +func identifier(tokens List, buffer []byte, i Position) (List, Position) { + position := i + i++ + + for i < Position(len(buffer)) && isIdentifier(buffer[i]) { + i++ + } + + identifier := buffer[position:i] + kind := Identifier + + switch string(identifier) { + case "assert": + kind = Assert + case "const": + kind = Const + case "if": + kind = If + case "else": + kind = Else + case "extern": + kind = Extern + case "for": + kind = For + case "import": + kind = Import + case "loop": + kind = Loop + case "return": + kind = Return + case "switch": + kind = Switch + } + + tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(len(identifier))}) + return tokens, i +} + +func isIdentifier(c byte) bool { + return isLetter(c) || isDigit(c) || c == '_' +} + +func isIdentifierStart(c byte) bool { + return isLetter(c) || c == '_' +} + +func isLetter(c byte) bool { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +} \ No newline at end of file diff --git a/src/token/operator.go b/src/token/operator.go new file mode 100644 index 0000000..4fa1f25 --- /dev/null +++ b/src/token/operator.go @@ -0,0 +1,86 @@ +package token + +// operator handles all tokens that qualify as an operator. 
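+// It consumes the longest run of operator characters and maps it to a
+// kind in a single step, so "<<=" becomes one ShlAssign token rather
+// than Shl followed by Assign. Runs that match no known operator, such
+// as "+-", are marked Invalid.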
+func operator(tokens List, buffer []byte, i Position) (List, Position) { + position := i + i++ + + for i < Position(len(buffer)) && isOperator(buffer[i]) { + i++ + } + + kind := Invalid + + switch string(buffer[position:i]) { + case "!": + kind = Not + case "!=": + kind = NotEqual + case "%": + kind = Mod + case "%=": + kind = ModAssign + case "&": + kind = And + case "&&": + kind = LogicalAnd + case "&=": + kind = AndAssign + case "*": + kind = Mul + case "*=": + kind = MulAssign + case "+": + kind = Add + case "+=": + kind = AddAssign + case ".": + kind = Dot + case "..": + kind = Range + case ":=": + kind = Define + case "<": + kind = Less + case "<<": + kind = Shl + case "<<=": + kind = ShlAssign + case "<=": + kind = LessEqual + case "=": + kind = Assign + case "==": + kind = Equal + case ">": + kind = Greater + case ">=": + kind = GreaterEqual + case ">>": + kind = Shr + case ">>=": + kind = ShrAssign + case "^": + kind = Xor + case "^=": + kind = XorAssign + case "|": + kind = Or + case "|=": + kind = OrAssign + case "||": + kind = LogicalOr + } + + tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)}) + return tokens, i +} + +func isOperator(c byte) bool { + switch c { + case '=', ':', '.', '+', '-', '*', '/', '<', '>', '&', '|', '^', '%', '!': + return true + default: + return false + } +} \ No newline at end of file diff --git a/src/token/quote.go b/src/token/quote.go new file mode 100644 index 0000000..151db0e --- /dev/null +++ b/src/token/quote.go @@ -0,0 +1,28 @@ +package token + +// quote handles all tokens starting with a single or double quote. +func quote(tokens List, buffer []byte, i Position) (List, Position) { + limiter := buffer[i] + start := i + end := Position(len(buffer)) + i++ + + for i < Position(len(buffer)) { + if buffer[i] == limiter && (buffer[i-1] != '\\' || buffer[i-2] == '\\') { + end = i + 1 + i++ + break + } + + i++ + } + + kind := String + + if limiter == '\'' { + kind = Rune + } + + tokens = append(tokens, Token{Kind: kind, Position: start, Length: Length(end - start)}) + return tokens, i +} \ No newline at end of file diff --git a/src/token/slash.go b/src/token/slash.go new file mode 100644 index 0000000..b5312d1 --- /dev/null +++ b/src/token/slash.go @@ -0,0 +1,34 @@ +package token + +// slash handles all tokens starting with '/'. +func slash(tokens List, buffer []byte, i Position) (List, Position) { + if i+1 < Position(len(buffer)) && buffer[i+1] == '/' { + position := i + + for i < Position(len(buffer)) && buffer[i] != '\n' { + i++ + } + + tokens = append(tokens, Token{Kind: Comment, Position: position, Length: Length(i - position)}) + } else { + position := i + i++ + + for i < Position(len(buffer)) && isOperator(buffer[i]) { + i++ + } + + kind := Invalid + + switch string(buffer[position:i]) { + case "/": + kind = Div + case "/=": + kind = DivAssign + } + + tokens = append(tokens, Token{Kind: kind, Position: position, Length: Length(i - position)}) + } + + return tokens, i +} \ No newline at end of file diff --git a/src/token/zero.go b/src/token/zero.go new file mode 100644 index 0000000..dab2d9a --- /dev/null +++ b/src/token/zero.go @@ -0,0 +1,35 @@ +package token + +// zero handles all tokens starting with a '0'. 
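+// The prefixes 0x, 0b and 0o switch to hexadecimal (0-9, A-F), binary
+// and octal digits respectively; any other leading zero is read as a
+// plain decimal number, so "0123" stays a single Number token.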
+func zero(tokens List, buffer []byte, i Position) (List, Position) { + position := i + i++ + + if i >= Position(len(buffer)) { + tokens = append(tokens, Token{Kind: Number, Position: position, Length: 1}) + return tokens, i + } + + filter := isDigit + + switch buffer[i] { + case 'x': + i++ + filter = isHexDigit + + case 'b': + i++ + filter = isBinaryDigit + + case 'o': + i++ + filter = isOctalDigit + } + + for i < Position(len(buffer)) && filter(buffer[i]) { + i++ + } + + tokens = append(tokens, Token{Kind: Number, Position: position, Length: Length(i - position)}) + return tokens, i +} \ No newline at end of file
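
-- 
Note (not part of the diff): typical usage of the package, as exercised
by the tests above - tokenize the buffer once, then iterate over the
instructions:

	src := []byte("a := 1\nb := 2\n")
	tokens := token.Tokenize(src)

	for instruction := range tokens.Instructions {
		println(instruction.String(src)) // "a:=1", then "b:=2"
	}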