Added capture regex operator

This commit is contained in:
Mike Farah 2021-07-11 11:08:18 +10:00
parent 3394feee0d
commit 5c0a5bd9d3
5 changed files with 175 additions and 46 deletions

View File

@ -24,76 +24,110 @@ cat; meow; 1; ; true
## Match string
Given a sample.yml file of:
```yaml
cat
foo bar foo
```
then
```bash
yq eval 'match("at")' sample.yml
yq eval 'match("foo")' sample.yml
```
will output
```yaml
string: at
offset: 1
length: 2
string: foo
offset: 0
length: 3
captures: []
```
## Match string, case insensitive
Given a sample.yml file of:
```yaml
cAt
foo bar FOO
```
then
```bash
yq eval 'match("(?i)at")' sample.yml
yq eval 'match("(?i)foo"; "g")' sample.yml
```
will output
```yaml
string: At
offset: 1
length: 2
string: foo
offset: 0
length: 3
captures: []
string: FOO
offset: 8
length: 3
captures: []
```
## Match with capture groups
Given a sample.yml file of:
```yaml
a cat
abc abc
```
then
```bash
yq eval 'match("c(.t)")' sample.yml
yq eval 'match("(abc)+"; "g")' sample.yml
```
will output
```yaml
string: cat
offset: 2
string: abc
offset: 0
length: 3
captures:
- string: at
offset: 3
length: 2
- string: abc
offset: 0
length: 3
string: abc
offset: 4
length: 3
captures:
- string: abc
offset: 4
length: 3
```
## Match with named capture groups
Given a sample.yml file of:
```yaml
a cat
foo bar foo foo foo
```
then
```bash
yq eval 'match("c(?P<cool>.t)")' sample.yml
yq eval 'match("foo (?P<bar123>bar)? foo"; "g")' sample.yml
```
will output
```yaml
string: cat
offset: 2
length: 3
string: foo bar foo
offset: 0
length: 11
captures:
- string: at
offset: 3
length: 2
name: cool
- string: bar
offset: 4
length: 3
name: bar123
string: foo foo
offset: 12
length: 8
captures:
- string: null
offset: -1
length: 0
name: bar123
```
## Capture named groups into a map
Given a sample.yml file of:
```yaml
xyzzy-14
```
then
```bash
yq eval 'capture("(?P<a>[a-z]+)-(?P<n>[0-9]+)")' sample.yml
```
will output
```yaml
a: xyzzy
n: "14"
```
## Match without global flag

View File

@ -277,6 +277,7 @@ func initLexer() (*lex.Lexer, error) {
lexer.Add([]byte(`join`), opToken(joinStringOpType))
lexer.Add([]byte(`sub`), opToken(subStringOpType))
lexer.Add([]byte(`match`), opToken(matchOpType))
lexer.Add([]byte(`capture`), opToken(captureOpType))
lexer.Add([]byte(`test`), opToken(testOpType))
lexer.Add([]byte(`any`), opToken(anyOpType))

View File

@ -84,7 +84,8 @@ var sortKeysOpType = &operationType{Type: "SORT_KEYS", NumArgs: 1, Precedence: 5
var joinStringOpType = &operationType{Type: "JOIN", NumArgs: 1, Precedence: 50, Handler: joinStringOperator}
var subStringOpType = &operationType{Type: "SUBSTR", NumArgs: 1, Precedence: 50, Handler: substituteStringOperator}
var matchOpType = &operationType{Type: "MATCH", NumArgs: 1, Precedence: 50, Handler: matchOperator}
var testOpType = &operationType{Type: "MATCH", NumArgs: 1, Precedence: 50, Handler: testOperator}
var captureOpType = &operationType{Type: "CAPTURE", NumArgs: 1, Precedence: 50, Handler: captureOperator}
var testOpType = &operationType{Type: "TEST", NumArgs: 1, Precedence: 50, Handler: testOperator}
var splitStringOpType = &operationType{Type: "SPLIT", NumArgs: 1, Precedence: 50, Handler: splitStringOperator}
var keysOpType = &operationType{Type: "KEYS", NumArgs: 0, Precedence: 50, Handler: keysOperator}

View File

@ -75,9 +75,22 @@ func substituteStringOperator(d *dataTreeNavigator, context Context, expressionN
}
func addMatch(original []*yaml.Node, match string, offset int, name string) []*yaml.Node {
newContent := append(original,
createScalarNode("string", "string"),
createScalarNode(match, match),
createScalarNode("string", "string"))
if offset < 0 {
// offset of -1 means there was no match, force a null value like jq
newContent = append(newContent,
createScalarNode(nil, "null"),
)
} else {
newContent = append(newContent,
createScalarNode(match, match),
)
}
newContent = append(newContent,
createScalarNode("offset", "offset"),
createScalarNode(offset, fmt.Sprintf("%v", offset)),
createScalarNode("length", "length"),
@ -96,11 +109,7 @@ type matchPreferences struct {
Global bool
}
func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *CandidateNode, value string, results *list.List) {
subNames := regEx.SubexpNames()
log.Debugf("subNames %v", subNames)
func getMatches(matchPrefs matchPreferences, regEx *regexp.Regexp, value string) ([][]string, [][]int) {
var allMatches [][]string
var allIndices [][]int
@ -113,6 +122,12 @@ func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *Candida
}
log.Debug("allMatches, %v", allMatches)
return allMatches, allIndices
}
func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *CandidateNode, value string, results *list.List) {
subNames := regEx.SubexpNames()
allMatches, allIndices := getMatches(matchPrefs, regEx, value)
// if all matches just has an empty array in it,
// then nothing matched
@ -141,6 +156,43 @@ func match(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *Candida
}
// capture runs regEx against value and, for each match reported by getMatches,
// pushes onto results a child of candidate (via candidate.CreateChild) whose
// node is a mapping of capture-group name to captured text.
//
// Only named groups are collected. regexp.SubexpNames reports "" for unnamed
// groups; emitting those would add empty-string keys to the map (and duplicate
// keys as soon as the pattern has more than one unnamed group), so they are
// skipped, matching jq's `capture`.
//
// A named group that did not participate in the match maps to null.
func capture(matchPrefs matchPreferences, regEx *regexp.Regexp, candidate *CandidateNode, value string, results *list.List) {
	subNames := regEx.SubexpNames()
	allMatches, allIndices := getMatches(matchPrefs, regEx, value)

	// If the only entry in allMatches is an empty slice, nothing matched.
	if len(allMatches) > 0 && len(allMatches[0]) == 0 {
		return
	}

	for i, matches := range allMatches {
		capturesNode := &yaml.Node{Kind: yaml.MappingNode}

		// matches[0] is the whole match; capture group g is at matches[g].
		for g := 1; g < len(matches); g++ {
			name := subNames[g]
			if name == "" {
				// Unnamed group: it has no key to be stored under.
				continue
			}
			capturesNode.Content = append(capturesNode.Content,
				createScalarNode(name, name))

			// allIndices[i][2*g] is the start offset of group g.
			// An offset of -1 means there was no match for this group,
			// force a null value like jq.
			if allIndices[i][2*g] < 0 {
				capturesNode.Content = append(capturesNode.Content,
					createScalarNode(nil, "null"))
			} else {
				capturesNode.Content = append(capturesNode.Content,
					createScalarNode(matches[g], matches[g]))
			}
		}

		results.PushBack(candidate.CreateChild(nil, capturesNode))
	}
}
func extractMatchArguments(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (*regexp.Regexp, matchPreferences, error) {
regExExpNode := expressionNode.Rhs
@ -205,6 +257,27 @@ func matchOperator(d *dataTreeNavigator, context Context, expressionNode *Expres
return context.ChildContext(results), nil
}
// captureOperator is the handler for the `capture` operator.
//
// It resolves the regular expression and match preferences from
// expressionNode, then calls capture on every node in context.MatchingNodes.
// Each node must be tagged "!!str"; the first node with any other tag aborts
// the whole operation with an error. On success the collected results are
// returned wrapped in a child context.
func captureOperator(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (Context, error) {
	pattern, prefs, err := extractMatchArguments(d, context, expressionNode)
	if err != nil {
		return Context{}, err
	}

	captured := list.New()

	for item := context.MatchingNodes.Front(); item != nil; item = item.Next() {
		current := item.Value.(*CandidateNode)
		target := unwrapDoc(current.Node)

		switch target.Tag {
		case "!!str":
			capture(prefs, pattern, current, target.Value, captured)
		default:
			// Only string scalars can be run through a regular expression.
			return Context{}, fmt.Errorf("cannot match with %v, can only match strings. Hint: Most often you'll want to use '|=' over '=' for this operation", target.Tag)
		}
	}

	return context.ChildContext(captured), nil
}
func testOperator(d *dataTreeNavigator, context Context, expressionNode *ExpressionNode) (Context, error) {
regEx, _, err := extractMatchArguments(d, context, expressionNode)
if err != nil {

View File

@ -15,34 +15,54 @@ var stringsOperatorScenarios = []expressionScenario{
},
{
description: "Match string",
document: `cat`,
expression: `match("at")`,
document: `foo bar foo`,
expression: `match("foo")`,
expected: []string{
"D0, P[], ()::string: at\noffset: 1\nlength: 2\ncaptures: []\n",
"D0, P[], ()::string: foo\noffset: 0\nlength: 3\ncaptures: []\n",
},
},
{
description: "Match string, case insensitive",
document: `cAt`,
expression: `match("(?i)at")`,
document: `foo bar FOO`,
expression: `match("(?i)foo"; "g")`,
expected: []string{
"D0, P[], ()::string: At\noffset: 1\nlength: 2\ncaptures: []\n",
"D0, P[], ()::string: foo\noffset: 0\nlength: 3\ncaptures: []\n",
"D0, P[], ()::string: FOO\noffset: 8\nlength: 3\ncaptures: []\n",
},
},
{
description: "Match with capture groups",
document: `a cat`,
expression: `match("c(.t)")`,
document: `abc abc`,
expression: `match("(abc)+"; "g")`,
expected: []string{
"D0, P[], ()::string: cat\noffset: 2\nlength: 3\ncaptures:\n - string: at\n offset: 3\n length: 2\n",
"D0, P[], ()::string: abc\noffset: 0\nlength: 3\ncaptures:\n - string: abc\n offset: 0\n length: 3\n",
"D0, P[], ()::string: abc\noffset: 4\nlength: 3\ncaptures:\n - string: abc\n offset: 4\n length: 3\n",
},
},
{
description: "Match with named capture groups",
document: `a cat`,
expression: `match("c(?P<cool>.t)")`,
document: `foo bar foo foo foo`,
expression: `match("foo (?P<bar123>bar)? foo"; "g")`,
expected: []string{
"D0, P[], ()::string: cat\noffset: 2\nlength: 3\ncaptures:\n - string: at\n offset: 3\n length: 2\n name: cool\n",
"D0, P[], ()::string: foo bar foo\noffset: 0\nlength: 11\ncaptures:\n - string: bar\n offset: 4\n length: 3\n name: bar123\n",
"D0, P[], ()::string: foo foo\noffset: 12\nlength: 8\ncaptures:\n - string: null\n offset: -1\n length: 0\n name: bar123\n",
},
},
{
description: "Capture named groups into a map",
document: `xyzzy-14`,
expression: `capture("(?P<a>[a-z]+)-(?P<n>[0-9]+)")`,
expected: []string{
"D0, P[], ()::a: xyzzy\nn: \"14\"\n",
},
},
{
skipDoc: true,
description: "Capture named groups into a map, with null",
document: `xyzzy-14`,
expression: `capture("(?P<a>[a-z]+)-(?P<n>[0-9]+)(?P<bar123>bar)?")`,
expected: []string{
"D0, P[], ()::a: xyzzy\nn: \"14\"\nbar123: null\n",
},
},
{