Added capture regex operator

This commit is contained in:
Mike Farah 2021-07-11 11:08:18 +10:00
parent 3394feee0d
commit 5c0a5bd9d3
5 changed files with 175 additions and 46 deletions

View File

@ -24,76 +24,110 @@ cat; meow; 1; ; true
## Match string ## Match string
Given a sample.yml file of: Given a sample.yml file of:
```yaml ```yaml
cat foo bar foo
``` ```
then then
```bash ```bash
yq eval 'match("at")' sample.yml yq eval 'match("foo")' sample.yml
``` ```
will output will output
```yaml ```yaml
string: at string: foo
offset: 1 offset: 0
length: 2 length: 3
captures: [] captures: []
``` ```
## Match string, case insensitive ## Match string, case insensitive
Given a sample.yml file of: Given a sample.yml file of:
```yaml ```yaml
cAt foo bar FOO
``` ```
then then
```bash ```bash
yq eval 'match("(?i)at")' sample.yml yq eval 'match("(?i)foo"; "g")' sample.yml
``` ```
will output will output
```yaml ```yaml
string: At string: foo
offset: 1 offset: 0
length: 2 length: 3
captures: []
string: FOO
offset: 8
length: 3
captures: [] captures: []
``` ```
## Match with capture groups ## Match with capture groups
Given a sample.yml file of: Given a sample.yml file of:
```yaml ```yaml
a cat abc abc
``` ```
then then
```bash ```bash
yq eval 'match("c(.t)")' sample.yml yq eval 'match("(abc)+"; "g")' sample.yml
``` ```
will output will output
```yaml ```yaml
string: cat string: abc
offset: 2 offset: 0
length: 3 length: 3
captures: captures:
- string: at - string: abc
offset: 3 offset: 0
length: 2 length: 3
string: abc
offset: 4
length: 3
captures:
- string: abc
offset: 4
length: 3
``` ```
## Match with named capture groups ## Match with named capture groups
Given a sample.yml file of: Given a sample.yml file of:
```yaml ```yaml
a cat foo bar foo foo foo
``` ```
then then
```bash ```bash
yq eval 'match("c(?P<cool>.t)")' sample.yml yq eval 'match("foo (?P<bar123>bar)? foo"; "g")' sample.yml
``` ```
will output will output
```yaml ```yaml
string: cat string: foo bar foo
offset: 2 offset: 0
length: 3 length: 11
captures: captures:
- string: at - string: bar
offset: 3 offset: 4
length: 2 length: 3
name: cool name: bar123
string: foo foo
offset: 12
length: 8
captures:
- string: null
offset: -1
length: 0
name: bar123
```
## Capture named groups into a map
Given a sample.yml file of:
```yaml
xyzzy-14
```
then
```bash
yq eval 'capture("(?P<a>[a-z]+)-(?P<n>[0-9]+)")' sample.yml
```
will output
```yaml
a: xyzzy
n: "14"
``` ```
## Match without global flag ## Match without global flag

View File

@ -277,6 +277,7 @@ func initLexer() (*lex.Lexer, error) {
lexer.Add([]byte(`join`), opToken(joinStringOpType)) lexer.Add([]byte(`join`), opToken(joinStringOpType))
lexer.Add([]byte(`sub`), opToken(subStringOpType)) lexer.Add([]byte(`sub`), opToken(subStringOpType))
lexer.Add([]byte(`match`), opToken(matchOpType)) lexer.Add([]byte(`match`), opToken(matchOpType))
lexer.Add([]byte(`capture`), opToken(captureOpType))
lexer.Add([]byte(`test`), opToken(testOpType)) lexer.Add([]byte(`test`), opToken(testOpType))
lexer.Add([]byte(`any`), opToken(anyOpType)) lexer.Add([]byte(`any`), opToken(anyOpType))

View File

@ -84,7 +84,8 @@ var sortKeysOpType = &operationType{Type: "SORT_KEYS", NumArgs: 1, Precedence: 5
var joinStringOpType = &operationType{Type: "JOIN", NumArgs: 1, Precedence: 50, Handler: joinStringOperator} var joinStringOpType = &operationType{Type: "JOIN", NumArgs: 1, Precedence: 50, Handler: joinStringOperator}
var subStringOpType = &operationType{Type: "SUBSTR", NumArgs: 1, Precedence: 50, Handler: substituteStringOperator} var subStringOpType = &operationType{Type: "SUBSTR", NumArgs: 1, Precedence: 50, Handler: substituteStringOperator}
var matchOpType = &operationType{Type: "MATCH", NumArgs: 1, Precedence: 50, Handler: matchOperator} var matchOpType = &operationType{Type: "MATCH", NumArgs: 1, Precedence: 50, Handler: matchOperator}
var testOpType = &operationType{Type: "MATCH", NumArgs: 1, Precedence: 50, Handler: testOperator} var captureOpType = &operationType{Type: "CAPTURE", NumArgs: 1, Precedence: 50, Handler: captureOperator}
var testOpType = &operationType{Type: "TEST", NumArgs: 1, Precedence: 50, Handler: testOperator}
var splitStringOpType = &operationType{Type: "SPLIT", NumArgs: 1, Precedence: 50, Handler: splitStringOperator} var splitStringOpType = &operationType{Type: "SPLIT", NumArgs: 1, Precedence: 50, Handler: splitStringOperator}
var keysOpType = &operationType{Type: "KEYS", NumArgs: 0, Precedence: 50, Handler: keysOperator} var keysOpType = &operationType{Type: "KEYS", NumArgs: 0, Precedence: 50, Handler: keysOperator}

View File

@ -75,9 +75,22 @@ func substituteStringOperator(d *dataTreeNavigator, context Context, expressionN
} }
func addMatch(original []*yaml.Node, match string, offset int, name string) []*yaml.Node { func addMatch(original []*yaml.Node, match string, offset int, name string) []*yaml.Node {
newContent := append(original, newContent := append(original,
createScalarNode("string", "string"), createScalarNode("string", "string"))
if offset < 0 {
// offset of -1 means there was no match, force a null value like jq
newContent = append(newContent,
createScalarNode(nil, "null"),
)
} else {
newContent = append(newContent,
createScalarNode(match, match), createScalarNode(match, match),
)
}
newContent = append(newContent,
createScalarNode("offset", "offset"), createScalarNode("offset", "offset"),
createScalarNode(offset, fmt.Sprintf("%v", offset)), createScalarNode(offset, fmt.Sprintf("%v", offset)),
createScalarNode("length", "length"), createScalarNode("length", "length"),
@ -96,11 +109,7 @@ type matchPreferences struct {
Global bool Global bool
} }
func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *CandidateNode, value string, results *list.List) { func getMatches(matchPrefs matchPreferences, regEx *regexp.Regexp, value string) ([][]string, [][]int) {
subNames := regEx.SubexpNames()
log.Debugf("subNames %v", subNames)
var allMatches [][]string var allMatches [][]string
var allIndices [][]int var allIndices [][]int
@ -113,6 +122,12 @@ func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *Candida
} }
log.Debug("allMatches, %v", allMatches) log.Debug("allMatches, %v", allMatches)
return allMatches, allIndices
}
func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *CandidateNode, value string, results *list.List) {
subNames := regEx.SubexpNames()
allMatches, allIndices := getMatches(matchPrefs, regEx, value)
// if all matches just has an empty array in it, // if all matches just has an empty array in it,
// then nothing matched // then nothing matched
@ -141,6 +156,43 @@ func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *Candida
} }
// capture runs regEx against value (via getMatches, honouring matchPrefs) and,
// for every match found, pushes onto results a child of candidate holding a
// mapping node of capture group name -> captured text.
//
// Only named capture groups are included: regexp.SubexpNames reports unnamed
// groups as the empty string, and emitting those would produce empty (and
// potentially duplicated) map keys. This mirrors jq's capture, which collects
// named captures only.
//
// A named group that did not participate in the match is emitted with a null
// value. Nothing is pushed onto results when the value did not match at all.
func capture(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *CandidateNode, value string, results *list.List) {
	subNames := regEx.SubexpNames()
	allMatches, allIndices := getMatches(matchPrefs, regEx, value)

	// if all matches just has an empty array in it,
	// then nothing matched
	if len(allMatches) > 0 && len(allMatches[0]) == 0 {
		return
	}

	for i, matches := range allMatches {
		capturesNode := &yaml.Node{Kind: yaml.MappingNode}

		// matches[0] is the whole match; the sub-matches follow it, so
		// sub-match j corresponds to subNames[j+1].
		for j, submatch := range matches[1:] {
			name := subNames[j+1]
			if name == "" {
				// unnamed group: there is no key to store it under
				continue
			}

			capturesNode.Content = append(capturesNode.Content,
				createScalarNode(name, name))

			// indices come in (start, end) pairs, the first pair being the
			// whole match, hence the start of sub-match j is at 2+j*2.
			// offset of -1 means there was no match, force a null value like jq
			if allIndices[i][2+j*2] < 0 {
				capturesNode.Content = append(capturesNode.Content,
					createScalarNode(nil, "null"))
			} else {
				capturesNode.Content = append(capturesNode.Content,
					createScalarNode(submatch, submatch))
			}
		}

		results.PushBack(candidate.CreateChild(nil, capturesNode))
	}
}
func extractMatchArguments(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (*regexp.Regexp, matchPreferences, error) { func extractMatchArguments(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (*regexp.Regexp, matchPreferences, error) {
regExExpNode := expressionNode.Rhs regExExpNode := expressionNode.Rhs
@ -205,6 +257,27 @@ func matchOperator(d *dataTreeNavigator, context Context, expressionNode *Expres
return context.ChildContext(results), nil return context.ChildContext(results), nil
} }
// captureOperator is the handler for the capture operator. It resolves the
// regular expression and match preferences from expressionNode, then applies
// capture to every node currently matched in context, collecting the
// resulting nodes into a new child context.
//
// An error is returned if the match arguments cannot be extracted, or if any
// matched node is not tagged as a string (!!str).
func captureOperator(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (Context, error) {
	regEx, matchPrefs, err := extractMatchArguments(d, context, expressionNode)
	if err != nil {
		return Context{}, err
	}

	captured := list.New()

	item := context.MatchingNodes.Front()
	for item != nil {
		candidate := item.Value.(*CandidateNode)
		target := unwrapDoc(candidate.Node)

		// only string scalars can be run through the regex
		if target.Tag != "!!str" {
			return Context{}, fmt.Errorf("cannot match with %v, can only match strings. Hint: Most often you'll want to use '|=' over '=' for this operation", target.Tag)
		}

		capture(matchPrefs, regEx, candidate, target.Value, captured)
		item = item.Next()
	}

	return context.ChildContext(captured), nil
}
func testOperator(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (Context, error) { func testOperator(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (Context, error) {
regEx, _, err := extractMatchArguments(d, context, expressionNode) regEx, _, err := extractMatchArguments(d, context, expressionNode)
if err != nil { if err != nil {

View File

@ -15,34 +15,54 @@ var stringsOperatorScenarios = []expressionScenario{
}, },
{ {
description: "Match string", description: "Match string",
document: `cat`, document: `foo bar foo`,
expression: `match("at")`, expression: `match("foo")`,
expected: []string{ expected: []string{
"D0, P[], ()::string: at\noffset: 1\nlength: 2\ncaptures: []\n", "D0, P[], ()::string: foo\noffset: 0\nlength: 3\ncaptures: []\n",
}, },
}, },
{ {
description: "Match string, case insensitive", description: "Match string, case insensitive",
document: `cAt`, document: `foo bar FOO`,
expression: `match("(?i)at")`, expression: `match("(?i)foo"; "g")`,
expected: []string{ expected: []string{
"D0, P[], ()::string: At\noffset: 1\nlength: 2\ncaptures: []\n", "D0, P[], ()::string: foo\noffset: 0\nlength: 3\ncaptures: []\n",
"D0, P[], ()::string: FOO\noffset: 8\nlength: 3\ncaptures: []\n",
}, },
}, },
{ {
description: "Match with capture groups", description: "Match with capture groups",
document: `a cat`, document: `abc abc`,
expression: `match("c(.t)")`, expression: `match("(abc)+"; "g")`,
expected: []string{ expected: []string{
"D0, P[], ()::string: cat\noffset: 2\nlength: 3\ncaptures:\n - string: at\n offset: 3\n length: 2\n", "D0, P[], ()::string: abc\noffset: 0\nlength: 3\ncaptures:\n - string: abc\n offset: 0\n length: 3\n",
"D0, P[], ()::string: abc\noffset: 4\nlength: 3\ncaptures:\n - string: abc\n offset: 4\n length: 3\n",
}, },
}, },
{ {
description: "Match with named capture groups", description: "Match with named capture groups",
document: `a cat`, document: `foo bar foo foo foo`,
expression: `match("c(?P<cool>.t)")`, expression: `match("foo (?P<bar123>bar)? foo"; "g")`,
expected: []string{ expected: []string{
"D0, P[], ()::string: cat\noffset: 2\nlength: 3\ncaptures:\n - string: at\n offset: 3\n length: 2\n name: cool\n", "D0, P[], ()::string: foo bar foo\noffset: 0\nlength: 11\ncaptures:\n - string: bar\n offset: 4\n length: 3\n name: bar123\n",
"D0, P[], ()::string: foo foo\noffset: 12\nlength: 8\ncaptures:\n - string: null\n offset: -1\n length: 0\n name: bar123\n",
},
},
{
description: "Capture named groups into a map",
document: `xyzzy-14`,
expression: `capture("(?P<a>[a-z]+)-(?P<n>[0-9]+)")`,
expected: []string{
"D0, P[], ()::a: xyzzy\nn: \"14\"\n",
},
},
{
skipDoc: true,
description: "Capture named groups into a map, with null",
document: `xyzzy-14`,
expression: `capture("(?P<a>[a-z]+)-(?P<n>[0-9]+)(?P<bar123>bar)?")`,
expected: []string{
"D0, P[], ()::a: xyzzy\nn: \"14\"\nbar123: null\n",
}, },
}, },
{ {