From 4c95efa46907e09f2b68754c8e40e78375eec661 Mon Sep 17 00:00:00 2001 From: Mike Farah Date: Thu, 17 Sep 2020 21:58:01 +1000 Subject: [PATCH] wip --- go.mod | 2 + go.sum | 4 ++ pkg/yqlib/path_tokeniser.go | 103 +++++++++++++++++++++++++++++++ pkg/yqlib/path_tokeniser_test.go | 52 ++++++++++++++++ 4 files changed, 161 insertions(+) create mode 100644 pkg/yqlib/path_tokeniser.go create mode 100644 pkg/yqlib/path_tokeniser_test.go diff --git a/go.mod b/go.mod index 9253a567..89d1d959 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,8 @@ require ( github.com/pkg/errors v0.9.1 github.com/spf13/cobra v1.0.0 github.com/spf13/pflag v1.0.5 // indirect + github.com/timtadh/data-structures v0.5.3 // indirect + github.com/timtadh/lexmachine v0.2.2 golang.org/x/sys v0.0.0-20200905004654-be1d3432aa8f // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect gopkg.in/op/go-logging.v1 v1.0.0-20160211212156-b2cb9fa56473 diff --git a/go.sum b/go.sum index 7672e2c1..4710d6b5 100644 --- a/go.sum +++ b/go.sum @@ -111,6 +111,10 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+ github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/timtadh/data-structures v0.5.3 h1:F2tEjoG9qWIyUjbvXVgJqEOGJPMIiYn7U5W5mE+i/vQ= +github.com/timtadh/data-structures v0.5.3/go.mod h1:9R4XODhJ8JdWFEI8P/HJKqxuJctfBQw6fDibMQny2oU= +github.com/timtadh/lexmachine v0.2.2 h1:g55RnjdYazm5wnKv59pwFcBJHOyvTPfDEoz21s4PHmY= +github.com/timtadh/lexmachine v0.2.2/go.mod h1:GBJvD5OAfRn/gnp92zb9KTgHLB7akKyxmVivoYCcjQI= github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U= github.com/ugorji/go v1.1.4/go.mod h1:uQMGLiO92mf5W77hV/PUCpI3pbzQx3CRekS0kk+RGrc= github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU= diff --git a/pkg/yqlib/path_tokeniser.go b/pkg/yqlib/path_tokeniser.go new file mode 100644 index 00000000..2283aed9 --- /dev/null +++ b/pkg/yqlib/path_tokeniser.go @@ -0,0 +1,103 @@ +package yqlib + +import ( + "strings" + + lex "github.com/timtadh/lexmachine" + "github.com/timtadh/lexmachine/machines" +) + +var Literals []string // The tokens representing literal strings +var Keywords []string // The keyword tokens +var Tokens []string // All of the tokens (including literals and keywords) +var TokenIds map[string]int // A map from the token names to their int ids + +func initTokens() { + Literals = []string{ + "(", + ")", + "[+]", + "[*]", + "**", + } + Tokens = []string{ + "OPERATION", // ==, OR, AND + "PATH", // a.b.c + "ARRAY_INDEX", // 1234 + "PATH_JOIN", // "." + } + Tokens = append(Tokens, Literals...) + TokenIds = make(map[string]int) + for i, tok := range Tokens { + TokenIds[tok] = i + } +} + +func skip(*lex.Scanner, *machines.Match) (interface{}, error) { + return nil, nil +} + +func token(name string) lex.Action { + return func(s *lex.Scanner, m *machines.Match) (interface{}, error) { + return s.Token(TokenIds[name], string(m.Bytes), m), nil + } +} + +// Creates the lexer object and compiles the NFA. +func initLexer() (*lex.Lexer, error) { + lexer := lex.NewLexer() + for _, lit := range Literals { + r := "\\" + strings.Join(strings.Split(lit, ""), "\\") + lexer.Add([]byte(r), token(lit)) + } + lexer.Add([]byte(`([Oo][Rr]|[Aa][Nn][Dd]|==)`), token("OPERATION")) + lexer.Add([]byte(`\[-?[0-9]+\]`), token("ARRAY_INDEX")) + lexer.Add([]byte("( |\t|\n|\r)+"), skip) + lexer.Add([]byte(`"[^ "]+"`), token("PATH")) + lexer.Add([]byte(`[^ \.\[\(\)=]+`), token("PATH")) + lexer.Add([]byte(`\.`), skip) + err := lexer.Compile() + if err != nil { + return nil, err + } + return lexer, nil +} + +type PathTokeniser interface { + Tokenise(path string) ([]*lex.Token, error) +} + +type pathTokeniser struct { + lexer *lex.Lexer +} + +func NewPathTokeniser() PathTokeniser { + initTokens() + var lexer, err = initLexer() + if err != nil { + panic(err) + } + return &pathTokeniser{lexer} +} + +func (p *pathTokeniser) Tokenise(path string) ([]*lex.Token, error) { + scanner, err := p.lexer.Scanner([]byte(path)) + + if err != nil { + return nil, err + } + var tokens []*lex.Token + for tok, err, eof := scanner.Next(); !eof; tok, err, eof = scanner.Next() { + + if tok != nil { + token := tok.(*lex.Token) + log.Debugf("Processing %v - %v", token.Value, Tokens[token.Type]) + tokens = append(tokens, token) + } + if err != nil { + return nil, err + } + } + + return tokens, nil +} diff --git a/pkg/yqlib/path_tokeniser_test.go b/pkg/yqlib/path_tokeniser_test.go new file mode 100644 index 00000000..b12aca67 --- /dev/null +++ b/pkg/yqlib/path_tokeniser_test.go @@ -0,0 +1,52 @@ +package yqlib + +import ( + "testing" + + "github.com/mikefarah/yq/v3/test" +) + +var tokeniserTests = []struct { + path string + expectedTokens []interface{} +}{ // TODO: Ensure ALL documented examples have tests! sheesh + + // {"apples.BANANAS", append(make([]interface{}, 0), "apples", "BANANAS")}, + // {"a.b.**", append(make([]interface{}, 0), "a", "b", "**")}, + // {"a.\"=\".frog", append(make([]interface{}, 0), "a", "=", "frog")}, + // {"a.b.*", append(make([]interface{}, 0), "a", "b", "*")}, + // {"a.b.thin*", append(make([]interface{}, 0), "a", "b", "thin*")}, + // {"a.b[0]", append(make([]interface{}, 0), "a", "b", "0")}, + // {"a.b[*]", append(make([]interface{}, 0), "a", "b", "[*]")}, + // {"a.b[-12]", append(make([]interface{}, 0), "a", "b", "-12")}, + // {"a.b.0", append(make([]interface{}, 0), "a", "b", "0")}, + // {"a.b.d[+]", append(make([]interface{}, 0), "a", "b", "d", "[+]")}, + // {"a", append(make([]interface{}, 0), "a")}, + // {"\"a.b\".c", append(make([]interface{}, 0), "a.b", "c")}, + // {`b."foo.bar"`, append(make([]interface{}, 0), "b", "foo.bar")}, + // {"animals(.==cat)", append(make([]interface{}, 0), "animals", "(", "==", "cat", ")")}, // TODO validate this dot is not a join? + // {"animals(.==c*)", append(make([]interface{}, 0), "animals", "(", "==", "c*", ")")}, // TODO validate this dot is not a join? + // {"[1].a.d", append(make([]interface{}, 0), int64(1), "a", "d")}, + // {"a[0].c", append(make([]interface{}, 0), "a", int64(0), "c")}, + // {"[0]", append(make([]interface{}, 0), int64(0))}, + // {"a.cool(s.d.f==cool)", append(make([]interface{}, 0), "a", "cool", "(", "s", "d", "f", "==", "cool", ")")}, + {"a.cool(s.d.f==cool OR t.b.h==frog).caterpillar", append(make([]interface{}, 0), "a", "cool", "(", "s", "d", "f", "==", "cool", "OR", "t", "b", "h", "==", "frog", ")", "caterpillar")}, + {"a.cool(s.d.f==cool and t.b.h==frog)*", append(make([]interface{}, 0), "a", "cool", "(", "s", "d", "f", "==", "cool", "and", "t", "b", "h", "==", "frog", ")", "*")}, + {"a.cool(s.d.f==cool and t.b.h==frog).th*", append(make([]interface{}, 0), "a", "cool", "(", "s", "d", "f", "==", "cool", "and", "t", "b", "h", "==", "frog", ")", "th*")}, +} + +var tokeniser = NewPathTokeniser() + +func TestTokeniser(t *testing.T) { + for _, tt := range tokeniserTests { + tokens, err := tokeniser.Tokenise(tt.path) + if err != nil { + t.Error(tt.path, err) + } + var tokenValues []interface{} + for _, token := range tokens { + tokenValues = append(tokenValues, token.Value) + } + test.AssertResultComplex(t, tt.expectedTokens, tokenValues) + } +}