yq/pkg/yqlib/decoder_xml.go

322 lines
8.1 KiB
Go
Raw Normal View History

2021-12-21 04:02:07 +00:00
package yqlib
import (
"encoding/xml"
"fmt"
"io"
2021-12-22 00:31:28 +00:00
"strings"
2021-12-31 08:58:39 +00:00
// "strings"
2021-12-21 04:02:07 +00:00
"unicode"
"golang.org/x/net/html/charset"
yaml "gopkg.in/yaml.v3"
)
// InputFormat identifies the syntax of the data handed to a decoder.
type InputFormat uint

// Supported input formats. These are untyped integer constants whose values
// are 1 (yaml) and 2 (xml).
const (
	YamlInputFormat = 1 << iota
	XmlInputFormat
)

// InputFormatFromString maps a user supplied format name to its InputFormat.
// "yaml"/"y" select YamlInputFormat and "xml"/"x" select XmlInputFormat; any
// other name yields the zero InputFormat together with an error.
func InputFormatFromString(format string) (InputFormat, error) {
	if format == "yaml" || format == "y" {
		return YamlInputFormat, nil
	}
	if format == "xml" || format == "x" {
		return XmlInputFormat, nil
	}
	return 0, fmt.Errorf("unknown format '%v' please use [yaml|xml]", format)
}
// xmlDecoder carries the configuration and progress of one XML decode.
type xmlDecoder struct {
	// reader is the XML source; it is assigned by Init.
	reader io.Reader
	// attributePrefix is prepended to attribute names when attributes are
	// recorded as child keys. contentPrefix is the key under which an
	// element's character data is stored when the element becomes a map.
	attributePrefix, contentPrefix string
	// finished is set after Decode has produced its document; further
	// Decode calls then return io.EOF.
	finished bool
}
// NewXmlDecoder builds an XML decoder. attributePrefix is prepended to the
// names of XML attributes; contentPrefix names the key used for element text
// and falls back to "content" when empty. Init must be called with a reader
// before decoding.
func NewXmlDecoder(attributePrefix string, contentPrefix string) Decoder {
	dec := &xmlDecoder{
		attributePrefix: attributePrefix,
		contentPrefix:   "content",
	}
	if contentPrefix != "" {
		dec.contentPrefix = contentPrefix
	}
	return dec
}
// Init points the decoder at a new reader and clears the finished flag so
// that the next Decode call reads from it.
func (dec *xmlDecoder) Init(reader io.Reader) {
	dec.finished = false
	dec.reader = reader
}
// createSequence converts each xmlNode in nodes, in order, and collects the
// results as the content of a yaml sequence node. The first conversion error
// aborts the whole sequence.
func (dec *xmlDecoder) createSequence(nodes []*xmlNode) (*yaml.Node, error) {
	seq := &yaml.Node{Kind: yaml.SequenceNode}
	for idx := range nodes {
		item, err := dec.convertToYamlNode(nodes[idx])
		if err != nil {
			return nil, err
		}
		seq.Content = append(seq.Content, item)
	}
	return seq, nil
}
2021-12-31 08:58:39 +00:00
// processComment turns raw XML comment text into a yaml comment string: a "#"
// is put in front and trailing spaces are stripped. An empty comment stays
// empty so that no comment is attached at all.
func (dec *xmlDecoder) processComment(c string) string {
	if len(c) > 0 {
		return "#" + strings.TrimRight(c, " ")
	}
	return ""
}
2021-12-21 04:02:07 +00:00
func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) {
2021-12-22 00:31:28 +00:00
log.Debug("createMap: headC: %v, footC: %v", n.HeadComment, n.FootComment)
2022-01-01 01:24:44 +00:00
yamlNode := &yaml.Node{Kind: yaml.MappingNode, FootComment: dec.processComment(n.FootComment)}
2021-12-21 04:02:07 +00:00
if len(n.Data) > 0 {
label := dec.contentPrefix
2022-01-01 01:24:44 +00:00
labelNode := createScalarNode(label, label)
labelNode.HeadComment = dec.processComment(n.HeadComment)
yamlNode.Content = append(yamlNode.Content, labelNode, createScalarNode(n.Data, n.Data))
2021-12-21 04:02:07 +00:00
}
2022-01-01 01:24:44 +00:00
for i, keyValuePair := range n.Children {
2021-12-21 04:02:07 +00:00
label := keyValuePair.K
children := keyValuePair.V
labelNode := createScalarNode(label, label)
var valueNode *yaml.Node
var err error
2022-01-01 01:24:44 +00:00
if i == 0 {
labelNode.HeadComment = dec.processComment(n.HeadComment)
}
2021-12-21 04:02:07 +00:00
log.Debug("len of children in %v is %v", label, len(children))
if len(children) > 1 {
valueNode, err = dec.createSequence(children)
if err != nil {
return nil, err
}
} else {
2021-12-31 08:58:39 +00:00
// comment hack for maps of scalars
// if the value is a scalar, the head comment of the scalar needs to go on the key?
// add tests for <z/> as well as multiple <ds> of inputXmlWithComments > yaml
2022-01-01 01:24:44 +00:00
if len(children[0].Children) == 0 && children[0].HeadComment != "" {
labelNode.HeadComment = labelNode.HeadComment + "\n" + strings.TrimSpace(children[0].HeadComment)
2021-12-31 08:58:39 +00:00
children[0].HeadComment = ""
}
2021-12-21 04:02:07 +00:00
valueNode, err = dec.convertToYamlNode(children[0])
if err != nil {
return nil, err
}
}
yamlNode.Content = append(yamlNode.Content, labelNode, valueNode)
}
return yamlNode, nil
}
func (dec *xmlDecoder) convertToYamlNode(n *xmlNode) (*yaml.Node, error) {
if len(n.Children) > 0 {
return dec.createMap(n)
}
scalar := createScalarNode(n.Data, n.Data)
2021-12-22 00:31:28 +00:00
log.Debug("scalar headC: %v, footC: %v", n.HeadComment, n.FootComment)
2021-12-31 08:58:39 +00:00
scalar.HeadComment = dec.processComment(n.HeadComment)
scalar.LineComment = dec.processComment(n.LineComment)
scalar.FootComment = dec.processComment(n.FootComment)
2021-12-22 00:31:28 +00:00
2021-12-21 04:02:07 +00:00
return scalar, nil
}
// Decode parses the XML from the reader given to Init and stores the result
// in rootYamlNode as a document node with a single content node. After one
// successful call the decoder is marked finished and every further call
// returns io.EOF until Init is called again.
func (dec *xmlDecoder) Decode(rootYamlNode *yaml.Node) error {
	if dec.finished {
		return io.EOF
	}

	// The XML is first read into our own tree rather than via xj, because xj
	// does not keep the order of map keys.
	root := &xmlNode{}
	if err := dec.decodeXml(root); err != nil {
		return err
	}

	firstNode, err := dec.convertToYamlNode(root)
	if err != nil {
		return err
	}

	rootYamlNode.Kind = yaml.DocumentNode
	rootYamlNode.Content = []*yaml.Node{firstNode}
	dec.finished = true
	return nil
}
// xmlNode is the intermediate, order-preserving representation of one XML
// element (or attribute value) before it is converted to yaml.
type xmlNode struct {
	// Children lists the child entries in the order their keys first appeared.
	Children []*xmlChildrenKv
	// HeadComment, FootComment and LineComment hold raw XML comment text
	// collected for this node while decoding.
	HeadComment string
	FootComment string
	LineComment string
	// Data is the node's character data (or the attribute value).
	Data string
}

// xmlChildrenKv groups every child node that shares the same key.
type xmlChildrenKv struct {
	// K is the element name, or the prefixed attribute name.
	K string
	// V holds the nodes recorded under K, in the order they were added.
	V []*xmlNode
}
// AddChild records c as a child of n under the key s. If an entry for s
// already exists, c is appended to that entry's list; otherwise a new entry
// is added after the existing ones, so keys keep their first-seen order.
func (n *xmlNode) AddChild(s string, c *xmlNode) {
	log.Debug("looking for %s", s)

	// Reuse the existing entry for this key when there is one.
	for idx := range n.Children {
		entry := n.Children[idx]
		if entry.K != s {
			continue
		}
		log.Debug("found it, appending an entry%s", s)
		entry.V = append(entry.V, c)
		log.Debug("yay len of children in %v is %v", s, len(entry.V))
		return
	}

	log.Debug("not there, making a new one %s", s)
	n.Children = append(n.Children, &xmlChildrenKv{K: s, V: []*xmlNode{c}})
}
type element struct {
parent *element
n *xmlNode
label string
2021-12-22 00:31:28 +00:00
state string
2021-12-21 04:02:07 +00:00
}
// this code is heavily based on https://github.com/basgys/goxml2json
// main changes are to decode into a structure that preserves the original order
// of the map keys.
func (dec *xmlDecoder) decodeXml(root *xmlNode) error {
xmlDec := xml.NewDecoder(dec.reader)
// That will convert the charset if the provided XML is non-UTF-8
xmlDec.CharsetReader = charset.NewReaderLabel
// Create first element from the root node
elem := &element{
parent: nil,
n: root,
}
for {
t, _ := xmlDec.Token()
if t == nil {
break
}
switch se := t.(type) {
case xml.StartElement:
2021-12-22 00:31:28 +00:00
log.Debug("start element %v", se.Name.Local)
elem.state = "started"
2021-12-21 04:02:07 +00:00
// Build new a new current element and link it to its parent
elem = &element{
parent: elem,
n: &xmlNode{},
label: se.Name.Local,
}
// Extract attributes as children
for _, a := range se.Attr {
elem.n.AddChild(dec.attributePrefix+a.Name.Local, &xmlNode{Data: a.Value})
}
case xml.CharData:
// Extract XML data (if any)
elem.n.Data = trimNonGraphic(string(se))
2021-12-31 01:50:16 +00:00
if elem.n.Data != "" {
elem.state = "chardata"
}
2021-12-21 04:02:07 +00:00
case xml.EndElement:
2021-12-22 00:31:28 +00:00
log.Debug("end element %v", elem.label)
elem.state = "finished"
2021-12-21 04:02:07 +00:00
// And add it to its parent list
if elem.parent != nil {
elem.parent.n.AddChild(elem.label, elem.n)
}
// Then change the current element to its parent
elem = elem.parent
case xml.Comment:
2021-12-22 00:31:28 +00:00
2021-12-31 01:36:59 +00:00
commentStr := string(xml.CharData(se))
2021-12-22 00:31:28 +00:00
if elem.state == "started" {
2021-12-31 08:58:39 +00:00
log.Debug("got a foot comment for %v: [%v]", elem.label, commentStr)
2021-12-31 01:36:59 +00:00
// elem.n.FootComment = elem.n.FootComment + commentStr
// put the comment on the foot of the last child
if len(elem.n.Children) > 0 {
child := elem.n.Children[len(elem.n.Children)-1]
log.Debug("putting it here: %v", child.K)
2021-12-31 08:58:39 +00:00
child.V[len(child.V)-1].FootComment = joinFilter([]string{child.V[len(child.V)-1].FootComment, commentStr})
2021-12-31 01:36:59 +00:00
} else {
log.Debug("putting it on the element")
2021-12-31 01:50:16 +00:00
elem.n.FootComment = joinFilter([]string{elem.n.FootComment, commentStr})
2021-12-31 01:36:59 +00:00
}
2021-12-31 01:50:16 +00:00
} else if elem.state == "chardata" {
2021-12-31 08:58:39 +00:00
log.Debug("got a line comment for (%v) %v: [%v]", elem.state, elem.label, commentStr)
2021-12-31 01:50:16 +00:00
elem.n.LineComment = joinFilter([]string{elem.n.LineComment, commentStr})
2021-12-22 00:31:28 +00:00
} else {
2021-12-31 08:58:39 +00:00
log.Debug("got a head comment for (%v) %v: [%v]", elem.state, elem.label, commentStr)
2021-12-22 00:31:28 +00:00
elem.n.HeadComment = joinFilter([]string{elem.n.HeadComment, commentStr})
}
2021-12-21 04:02:07 +00:00
}
}
return nil
}
2021-12-22 00:31:28 +00:00
// joinFilter concatenates the non-empty entries of rawStrings, in order,
// separated by a single space. Empty entries are skipped, so they never
// produce doubled or dangling separators.
func joinFilter(rawStrings []string) string {
	var kept []string
	for idx := range rawStrings {
		if part := rawStrings[idx]; len(part) > 0 {
			kept = append(kept, part)
		}
	}
	return strings.Join(kept, " ")
}
2021-12-21 04:02:07 +00:00
// trimNonGraphic returns a slice of the string s, with all leading and trailing
// non graphic characters and spaces removed.
//
// Graphic characters include letters, marks, numbers, punctuation, symbols,
// and spaces, from categories L, M, N, P, S, Zs.
// Spacing characters are set by category Z and property Pattern_White_Space.
//
// Characters in the interior of s are left untouched; a string made up only of
// non graphic characters and spaces yields "".
func trimNonGraphic(s string) string {
	return strings.TrimFunc(s, func(r rune) bool {
		return !unicode.IsGraphic(r) || unicode.IsSpace(r)
	})
}