wip better comment parsing

This commit is contained in:
Mike Farah 2021-12-22 11:31:28 +11:00
parent 0881ce2476
commit a72743f9c9
3 changed files with 145 additions and 254 deletions

View File

@ -4,6 +4,7 @@ import (
"encoding/xml" "encoding/xml"
"fmt" "fmt"
"io" "io"
"strings"
"unicode" "unicode"
"golang.org/x/net/html/charset" "golang.org/x/net/html/charset"
@ -61,17 +62,19 @@ func (dec *xmlDecoder) createSequence(nodes []*xmlNode) (*yaml.Node, error) {
} }
func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) { func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) {
yamlNode := &yaml.Node{Kind: yaml.MappingNode, HeadComment: n.Comment} log.Debug("createMap: headC: %v, footC: %v", n.HeadComment, n.FootComment)
yamlNode := &yaml.Node{Kind: yaml.MappingNode, HeadComment: n.HeadComment}
if len(n.Data) > 0 { if len(n.Data) > 0 {
label := dec.contentPrefix label := dec.contentPrefix
yamlNode.Content = append(yamlNode.Content, createScalarNode(label, label), createScalarNode(n.Data, n.Data)) yamlNode.Content = append(yamlNode.Content, createScalarNode(label, label), createScalarNode(n.Data, n.Data))
} }
for _, keyValuePair := range n.Children { for i, keyValuePair := range n.Children {
label := keyValuePair.K label := keyValuePair.K
children := keyValuePair.V children := keyValuePair.V
labelNode := createScalarNode(label, label) labelNode := createScalarNode(label, label)
// labelNode.HeadComment = n.HeadComment
var valueNode *yaml.Node var valueNode *yaml.Node
var err error var err error
log.Debug("len of children in %v is %v", label, len(children)) log.Debug("len of children in %v is %v", label, len(children))
@ -81,10 +84,15 @@ func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) {
return nil, err return nil, err
} }
} else { } else {
valueNode, err = dec.convertToYamlNode(children[0]) valueNode, err = dec.convertToYamlNode(children[0])
if err != nil { if err != nil {
return nil, err return nil, err
} }
if i == len(n.Children)-1 {
valueNode.FootComment = n.FootComment
}
} }
yamlNode.Content = append(yamlNode.Content, labelNode, valueNode) yamlNode.Content = append(yamlNode.Content, labelNode, valueNode)
} }
@ -97,7 +105,9 @@ func (dec *xmlDecoder) convertToYamlNode(n *xmlNode) (*yaml.Node, error) {
return dec.createMap(n) return dec.createMap(n)
} }
scalar := createScalarNode(n.Data, n.Data) scalar := createScalarNode(n.Data, n.Data)
scalar.HeadComment = n.Comment log.Debug("scalar headC: %v, footC: %v", n.HeadComment, n.FootComment)
scalar.LineComment = n.HeadComment
return scalar, nil return scalar, nil
} }
@ -125,7 +135,8 @@ func (dec *xmlDecoder) Decode(rootYamlNode *yaml.Node) error {
type xmlNode struct { type xmlNode struct {
Children []*xmlChildrenKv Children []*xmlChildrenKv
Comment string HeadComment string
FootComment string
Data string Data string
} }
@ -158,6 +169,7 @@ type element struct {
parent *element parent *element
n *xmlNode n *xmlNode
label string label string
state string
} }
// this code is heavily based on https://github.com/basgys/goxml2json // this code is heavily based on https://github.com/basgys/goxml2json
@ -183,6 +195,8 @@ func (dec *xmlDecoder) decodeXml(root *xmlNode) error {
switch se := t.(type) { switch se := t.(type) {
case xml.StartElement: case xml.StartElement:
log.Debug("start element %v", se.Name.Local)
elem.state = "started"
// Build a new current element and link it to its parent // Build a new current element and link it to its parent
elem = &element{ elem = &element{
parent: elem, parent: elem,
@ -198,6 +212,8 @@ func (dec *xmlDecoder) decodeXml(root *xmlNode) error {
// Extract XML data (if any) // Extract XML data (if any)
elem.n.Data = trimNonGraphic(string(se)) elem.n.Data = trimNonGraphic(string(se))
case xml.EndElement: case xml.EndElement:
log.Debug("end element %v", elem.label)
elem.state = "finished"
// And add it to its parent list // And add it to its parent list
if elem.parent != nil { if elem.parent != nil {
elem.parent.n.AddChild(elem.label, elem.n) elem.parent.n.AddChild(elem.label, elem.n)
@ -206,13 +222,32 @@ func (dec *xmlDecoder) decodeXml(root *xmlNode) error {
// Then change the current element to its parent // Then change the current element to its parent
elem = elem.parent elem = elem.parent
case xml.Comment: case xml.Comment:
elem.n.Comment = trimNonGraphic(string(xml.CharData(se)))
commentStr := trimNonGraphic(string(xml.CharData(se)))
if elem.state == "started" {
log.Debug("got a foot comment for %v: %v", elem.label, commentStr)
elem.n.FootComment = commentStr
} else {
log.Debug("got a head comment for %v: %v", elem.label, commentStr)
elem.n.HeadComment = joinFilter([]string{elem.n.HeadComment, commentStr})
}
} }
} }
return nil return nil
} }
// joinFilter concatenates the non-empty entries of rawStrings, separated by a
// single space. Empty entries are dropped first so they never contribute a
// stray separator; if nothing remains the result is the empty string.
func joinFilter(rawStrings []string) string {
	var kept []string
	for idx := range rawStrings {
		if rawStrings[idx] == "" {
			continue
		}
		kept = append(kept, rawStrings[idx])
	}
	return strings.Join(kept, " ")
}
// trimNonGraphic returns a slice of the string s, with all leading and trailing // trimNonGraphic returns a slice of the string s, with all leading and trailing
// non graphic characters and spaces removed. // non graphic characters and spaces removed.
// //

View File

@ -22,179 +22,3 @@ XML nodes that have attributes then plain content, e.g:
The content of the node will be set as a field in the map with the key "+content". Use the `--xml-content-name` flag to change this. The content of the node will be set as a field in the map with the key "+content". Use the `--xml-content-name` flag to change this.
## Parse xml: simple
Given a sample.xml file of:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<cat>meow</cat>
```
then
```bash
yq e -p=xml '.' sample.xml
```
will output
```yaml
cat: meow
```
## Parse xml: array
Consecutive nodes with identical xml names are assumed to be arrays.
Given a sample.xml file of:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<animal>1</animal>
<animal>2</animal>
```
then
```bash
yq e -p=xml '.' sample.xml
```
will output
```yaml
animal:
- "1"
- "2"
```
## Parse xml: attributes
Attributes are converted to fields, with the attribute prefix.
Given a sample.xml file of:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<cat legs="4">
<legs>7</legs>
</cat>
```
then
```bash
yq e -p=xml '.' sample.xml
```
will output
```yaml
cat:
+legs: "4"
legs: "7"
```
## Parse xml: attributes with content
Content is added as a field, using the content name.
Given a sample.xml file of:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<cat legs="4">meow</cat>
```
then
```bash
yq e -p=xml '.' sample.xml
```
will output
```yaml
cat:
+content: meow
+legs: "4"
```
## Encode xml: simple
Given a sample.yml file of:
```yaml
cat: purrs
```
then
```bash
yq e -o=xml '.' sample.yml
```
will output
```xml
<cat>purrs</cat>
```
## Encode xml: array
Given a sample.yml file of:
```yaml
pets:
cat:
- purrs
- meows
```
then
```bash
yq e -o=xml '.' sample.yml
```
will output
```xml
<pets>
<cat>purrs</cat>
<cat>meows</cat>
</pets>
```
## Encode xml: attributes
Fields with the matching xml-attribute-prefix are assumed to be attributes.
Given a sample.yml file of:
```yaml
cat:
+name: tiger
meows: true
```
then
```bash
yq e -o=xml '.' sample.yml
```
will output
```xml
<cat name="tiger">
<meows>true</meows>
</cat>
```
## Encode xml: attributes with content
Fields with the matching xml-content-name are assumed to be content.
Given a sample.yml file of:
```yaml
cat:
+name: tiger
+content: cool
```
then
```bash
yq e -o=xml '.' sample.yml
```
will output
```xml
<cat name="tiger">cool</cat>
```
## Encode xml: comments
A best attempt is made to copy comments to xml.
Given a sample.yml file of:
```yaml
# above_cat
cat: # inline_cat
# above_array
array: # inline_array
- val1 # inline_val1
# above_val2
- val2 # inline_val2
# below_cat
```
then
```bash
yq e -o=xml '.' sample.yml
```
will output
```xml
<!-- above_cat inline_cat--><cat><!-- above_array inline_array-->
<array><!-- inline_val1-->val1</array>
<array><!-- above_val2 inline_val2-->val2</array>
</cat><!-- below_cat-->
```

View File

@ -24,12 +24,18 @@ func decodeXml(t *testing.T, xml string) *CandidateNode {
return &CandidateNode{Node: node} return &CandidateNode{Node: node}
} }
func yamlToXml(sampleYaml string, indent int) string { func processScenario(s xmlScenario) string {
var output bytes.Buffer var output bytes.Buffer
writer := bufio.NewWriter(&output) writer := bufio.NewWriter(&output)
var encoder = NewXmlEncoder(writer, indent, "+", "+content") var encoder = NewXmlEncoder(writer, 2, "+", "+content")
inputs, err := readDocuments(strings.NewReader(sampleYaml), "sample.yml", 0, NewYamlDecoder())
var decoder = NewYamlDecoder()
if s.scenarioType == "roundtrip" {
decoder = NewXmlDecoder("+", "+content")
}
inputs, err := readDocuments(strings.NewReader(s.input), "sample.yml", 0, decoder)
if err != nil { if err != nil {
panic(err) panic(err)
} }
@ -49,10 +55,24 @@ type xmlScenario struct {
description string description string
subdescription string subdescription string
skipDoc bool skipDoc bool
encodeScenario bool scenarioType string
} }
var yamlWithComments = `need to fix leadingContent thing. This should fail.# above_cat var expectedDecodeYamlWithComments = `D0, P[], (doc)::# before cat
cat:
# in cat
x: "3" # xca
# cool
# smart
y:
# befored
d: "4" # ind ind2
# afterd
# after cat
`
var yamlWithComments = `# above_cat
cat: # inline_cat cat: # inline_cat
# above_array # above_array
array: # inline_array array: # inline_array
@ -69,73 +89,85 @@ var expectedXmlWithComments = `<!-- above_cat inline_cat--><cat><!-- above_array
` `
var xmlScenarios = []xmlScenario{ var xmlScenarios = []xmlScenario{
{ // {
description: "Parse xml: simple", // description: "Parse xml: simple",
input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat>meow</cat>", // input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat>meow</cat>",
expected: "D0, P[], (doc)::cat: meow\n", // expected: "D0, P[], (doc)::cat: meow\n",
}, // },
{ // {
description: "Parse xml: array", // description: "Parse xml: array",
subdescription: "Consecutive nodes with identical xml names are assumed to be arrays.", // subdescription: "Consecutive nodes with identical xml names are assumed to be arrays.",
input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<animal>1</animal>\n<animal>2</animal>", // input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<animal>1</animal>\n<animal>2</animal>",
expected: "D0, P[], (doc)::animal:\n - \"1\"\n - \"2\"\n", // expected: "D0, P[], (doc)::animal:\n - \"1\"\n - \"2\"\n",
}, // },
{ // {
description: "Parse xml: attributes", // description: "Parse xml: attributes",
subdescription: "Attributes are converted to fields, with the attribute prefix.", // subdescription: "Attributes are converted to fields, with the attribute prefix.",
input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat legs=\"4\">\n <legs>7</legs>\n</cat>", // input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat legs=\"4\">\n <legs>7</legs>\n</cat>",
expected: "D0, P[], (doc)::cat:\n +legs: \"4\"\n legs: \"7\"\n", // expected: "D0, P[], (doc)::cat:\n +legs: \"4\"\n legs: \"7\"\n",
}, // },
{ // {
description: "Parse xml: attributes with content", // description: "Parse xml: attributes with content",
subdescription: "Content is added as a field, using the content name", // subdescription: "Content is added as a field, using the content name",
input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat legs=\"4\">meow</cat>", // input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat legs=\"4\">meow</cat>",
expected: "D0, P[], (doc)::cat:\n +content: meow\n +legs: \"4\"\n", // expected: "D0, P[], (doc)::cat:\n +content: meow\n +legs: \"4\"\n",
}, // },
{
description: "Encode xml: simple",
input: "cat: purrs",
expected: "<cat>purrs</cat>\n",
encodeScenario: true,
},
{
description: "Encode xml: array",
input: "pets:\n cat:\n - purrs\n - meows",
expected: "<pets>\n <cat>purrs</cat>\n <cat>meows</cat>\n</pets>\n",
encodeScenario: true,
},
{
description: "Encode xml: attributes",
subdescription: "Fields with the matching xml-attribute-prefix are assumed to be attributes.",
input: "cat:\n +name: tiger\n meows: true\n",
expected: "<cat name=\"tiger\">\n <meows>true</meows>\n</cat>\n",
encodeScenario: true,
},
{ {
skipDoc: true, skipDoc: true,
input: "cat:\n ++name: tiger\n meows: true\n", input: "<!-- before cat --><cat><!-- in cat --><x>3<!--xca\ncool\nsmart --></x><y><!-- befored --><d><!-- ind -->4<!-- ind2 --></d><!-- afterd --></y><!-- after --></cat><!-- after cat -->",
expected: "<cat +name=\"tiger\">\n <meows>true</meows>\n</cat>\n", expected: expectedDecodeYamlWithComments,
encodeScenario: true, scenarioType: "decode",
},
{
description: "Encode xml: attributes with content",
subdescription: "Fields with the matching xml-content-name is assumed to be content.",
input: "cat:\n +name: tiger\n +content: cool\n",
expected: "<cat name=\"tiger\">cool</cat>\n",
encodeScenario: true,
},
{
description: "Encode xml: comments",
subdescription: "A best attempt is made to copy comments to xml.",
input: yamlWithComments,
expected: expectedXmlWithComments,
encodeScenario: true,
}, },
// {
// description: "Encode xml: simple",
// input: "cat: purrs",
// expected: "<cat>purrs</cat>\n",
// scenarioType: "encode",
// },
// {
// description: "Encode xml: array",
// input: "pets:\n cat:\n - purrs\n - meows",
// expected: "<pets>\n <cat>purrs</cat>\n <cat>meows</cat>\n</pets>\n",
// scenarioType: "encode",
// },
// {
// description: "Encode xml: attributes",
// subdescription: "Fields with the matching xml-attribute-prefix are assumed to be attributes.",
// input: "cat:\n +name: tiger\n meows: true\n",
// expected: "<cat name=\"tiger\">\n <meows>true</meows>\n</cat>\n",
// scenarioType: "encode",
// },
// {
// skipDoc: true,
// input: "cat:\n ++name: tiger\n meows: true\n",
// expected: "<cat +name=\"tiger\">\n <meows>true</meows>\n</cat>\n",
// scenarioType: "encode",
// },
// {
// description: "Encode xml: attributes with content",
// subdescription: "Fields with the matching xml-content-name is assumed to be content.",
// input: "cat:\n +name: tiger\n +content: cool\n",
// expected: "<cat name=\"tiger\">cool</cat>\n",
// scenarioType: "encode",
// },
// {
// description: "Encode xml: comments",
// subdescription: "A best attempt is made to copy comments to xml.",
// input: yamlWithComments,
// expected: expectedXmlWithComments,
// scenarioType: "encode",
// },
// {
// skipDoc: true,
// input: "<!-- beforeCat --><cat><!-- in cat -->value<!-- after --></cat><!-- after cat -->",
// expected: "<!-- beforeCat --><cat><!-- in cat -->value</cat><!-- after cat -->",
// scenarioType: "roundtrip",
// },
} }
func testXmlScenario(t *testing.T, s *xmlScenario) { func testXmlScenario(t *testing.T, s xmlScenario) {
if s.encodeScenario { if s.scenarioType == "encode" || s.scenarioType == "roundtrip" {
test.AssertResultWithContext(t, s.expected, yamlToXml(s.input, 2), s.description) test.AssertResultWithContext(t, s.expected, processScenario(s), s.description)
} else { } else {
var actual = resultToString(t, decodeXml(t, s.input)) var actual = resultToString(t, decodeXml(t, s.input))
test.AssertResultWithContext(t, s.expected, actual, s.description) test.AssertResultWithContext(t, s.expected, actual, s.description)
@ -148,7 +180,7 @@ func documentXmlScenario(t *testing.T, w *bufio.Writer, i interface{}) {
if s.skipDoc { if s.skipDoc {
return return
} }
if s.encodeScenario { if s.scenarioType == "encode" {
documentXmlEncodeScenario(w, s) documentXmlEncodeScenario(w, s)
} else { } else {
documentXmlDecodeScenario(t, w, s) documentXmlDecodeScenario(t, w, s)
@ -200,12 +232,12 @@ func documentXmlEncodeScenario(w *bufio.Writer, s xmlScenario) {
writeOrPanic(w, "```bash\nyq e -o=xml '.' sample.yml\n```\n") writeOrPanic(w, "```bash\nyq e -o=xml '.' sample.yml\n```\n")
writeOrPanic(w, "will output\n") writeOrPanic(w, "will output\n")
writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", yamlToXml(s.input, 2))) writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processScenario(s)))
} }
func TestXmlScenarios(t *testing.T) { func TestXmlScenarios(t *testing.T) {
for _, tt := range xmlScenarios { for _, tt := range xmlScenarios {
testXmlScenario(t, &tt) testXmlScenario(t, tt)
} }
genericScenarios := make([]interface{}, len(xmlScenarios)) genericScenarios := make([]interface{}, len(xmlScenarios))
for i, s := range xmlScenarios { for i, s := range xmlScenarios {