2021-12-21 04:02:07 +00:00
|
|
|
package yqlib
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/xml"
|
2022-03-28 03:05:10 +00:00
|
|
|
"errors"
|
2021-12-21 04:02:07 +00:00
|
|
|
"io"
|
2022-01-15 00:57:59 +00:00
|
|
|
"strings"
|
2021-12-21 04:02:07 +00:00
|
|
|
"unicode"
|
|
|
|
|
|
|
|
"golang.org/x/net/html/charset"
|
|
|
|
yaml "gopkg.in/yaml.v3"
|
|
|
|
)
|
|
|
|
|
|
|
|
// xmlDecoder converts an XML stream into a yaml.Node document while keeping
// the original ordering of elements and attributes.
type xmlDecoder struct {
	// reader is the XML source handed over through Init.
	reader io.Reader
	// readAnything is set once Decode has got past the XML conversion step.
	readAnything bool

	// attributePrefix is prepended to attribute names when they become map keys.
	// contentName is the map key under which an element's own text is stored
	// when that element also has attributes or child elements.
	attributePrefix string
	contentName     string

	// strictMode is copied to xml.Decoder.Strict.
	// keepNamespace prefixes attribute names with their namespace ("space:local").
	// useRawToken reads tokens with xml.Decoder.RawToken rather than Token.
	strictMode, keepNamespace, useRawToken bool

	// finished makes Decode return io.EOF on every further call until Init.
	finished bool
}
|
|
|
|
|
2022-06-14 23:40:31 +00:00
|
|
|
func NewXMLDecoder(attributePrefix string, contentName string, strictMode bool, keepNamespace bool, useRawToken bool) Decoder {
|
2022-01-15 00:57:59 +00:00
|
|
|
if contentName == "" {
|
|
|
|
contentName = "content"
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
2022-06-14 23:40:31 +00:00
|
|
|
return &xmlDecoder{
|
|
|
|
attributePrefix: attributePrefix,
|
|
|
|
contentName: contentName,
|
|
|
|
finished: false,
|
|
|
|
strictMode: strictMode,
|
|
|
|
keepNamespace: keepNamespace,
|
|
|
|
useRawToken: useRawToken,
|
|
|
|
}
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (dec *xmlDecoder) Init(reader io.Reader) {
|
|
|
|
dec.reader = reader
|
2022-05-27 01:18:38 +00:00
|
|
|
dec.readAnything = false
|
2021-12-21 04:02:07 +00:00
|
|
|
dec.finished = false
|
|
|
|
}
|
|
|
|
|
|
|
|
func (dec *xmlDecoder) createSequence(nodes []*xmlNode) (*yaml.Node, error) {
|
2022-07-29 00:26:50 +00:00
|
|
|
yamlNode := &yaml.Node{Kind: yaml.SequenceNode, Tag: "!!seq"}
|
2021-12-21 04:02:07 +00:00
|
|
|
for _, child := range nodes {
|
|
|
|
yamlChild, err := dec.convertToYamlNode(child)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
yamlNode.Content = append(yamlNode.Content, yamlChild)
|
|
|
|
}
|
|
|
|
|
|
|
|
return yamlNode, nil
|
|
|
|
}
|
|
|
|
|
2022-01-15 00:57:59 +00:00
|
|
|
func (dec *xmlDecoder) processComment(c string) string {
|
|
|
|
if c == "" {
|
|
|
|
return ""
|
|
|
|
}
|
|
|
|
return "#" + strings.TrimRight(c, " ")
|
|
|
|
}
|
|
|
|
|
2021-12-21 04:02:07 +00:00
|
|
|
func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) {
|
2022-01-15 00:57:59 +00:00
|
|
|
log.Debug("createMap: headC: %v, footC: %v", n.HeadComment, n.FootComment)
|
2022-07-29 00:26:50 +00:00
|
|
|
yamlNode := &yaml.Node{Kind: yaml.MappingNode, Tag: "!!map"}
|
2021-12-21 04:02:07 +00:00
|
|
|
|
|
|
|
if len(n.Data) > 0 {
|
2022-01-15 00:57:59 +00:00
|
|
|
label := dec.contentName
|
|
|
|
labelNode := createScalarNode(label, label)
|
|
|
|
labelNode.HeadComment = dec.processComment(n.HeadComment)
|
|
|
|
labelNode.FootComment = dec.processComment(n.FootComment)
|
|
|
|
yamlNode.Content = append(yamlNode.Content, labelNode, createScalarNode(n.Data, n.Data))
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
|
|
|
|
2022-01-15 00:57:59 +00:00
|
|
|
for i, keyValuePair := range n.Children {
|
2021-12-21 04:02:07 +00:00
|
|
|
label := keyValuePair.K
|
|
|
|
children := keyValuePair.V
|
|
|
|
labelNode := createScalarNode(label, label)
|
|
|
|
var valueNode *yaml.Node
|
|
|
|
var err error
|
2022-01-15 00:57:59 +00:00
|
|
|
|
|
|
|
if i == 0 {
|
|
|
|
labelNode.HeadComment = dec.processComment(n.HeadComment)
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
// if i == len(n.Children)-1 {
|
|
|
|
labelNode.FootComment = dec.processComment(keyValuePair.FootComment)
|
|
|
|
// }
|
|
|
|
|
2021-12-21 04:02:07 +00:00
|
|
|
log.Debug("len of children in %v is %v", label, len(children))
|
|
|
|
if len(children) > 1 {
|
|
|
|
valueNode, err = dec.createSequence(children)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
} else {
|
2022-01-15 00:57:59 +00:00
|
|
|
// comment hack for maps of scalars
|
|
|
|
// if the value is a scalar, the head comment of the scalar needs to go on the key?
|
|
|
|
// add tests for <z/> as well as multiple <ds> of inputXmlWithComments > yaml
|
|
|
|
if len(children[0].Children) == 0 && children[0].HeadComment != "" {
|
|
|
|
labelNode.HeadComment = labelNode.HeadComment + "\n" + strings.TrimSpace(children[0].HeadComment)
|
|
|
|
children[0].HeadComment = ""
|
|
|
|
}
|
2021-12-21 04:02:07 +00:00
|
|
|
valueNode, err = dec.convertToYamlNode(children[0])
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
yamlNode.Content = append(yamlNode.Content, labelNode, valueNode)
|
|
|
|
}
|
|
|
|
|
|
|
|
return yamlNode, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (dec *xmlDecoder) convertToYamlNode(n *xmlNode) (*yaml.Node, error) {
|
|
|
|
if len(n.Children) > 0 {
|
|
|
|
return dec.createMap(n)
|
|
|
|
}
|
|
|
|
scalar := createScalarNode(n.Data, n.Data)
|
2022-02-07 00:26:48 +00:00
|
|
|
if n.Data == "" {
|
|
|
|
scalar = createScalarNode(nil, "")
|
|
|
|
}
|
2022-01-15 00:57:59 +00:00
|
|
|
log.Debug("scalar headC: %v, footC: %v", n.HeadComment, n.FootComment)
|
|
|
|
scalar.HeadComment = dec.processComment(n.HeadComment)
|
|
|
|
scalar.LineComment = dec.processComment(n.LineComment)
|
|
|
|
scalar.FootComment = dec.processComment(n.FootComment)
|
|
|
|
|
2021-12-21 04:02:07 +00:00
|
|
|
return scalar, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (dec *xmlDecoder) Decode(rootYamlNode *yaml.Node) error {
|
|
|
|
if dec.finished {
|
|
|
|
return io.EOF
|
|
|
|
}
|
|
|
|
root := &xmlNode{}
|
|
|
|
// cant use xj - it doesn't keep map order.
|
2022-02-07 00:55:55 +00:00
|
|
|
err := dec.decodeXML(root)
|
2021-12-21 04:02:07 +00:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
firstNode, err := dec.convertToYamlNode(root)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2022-02-07 00:26:48 +00:00
|
|
|
} else if firstNode.Tag == "!!null" {
|
2022-02-10 01:02:53 +00:00
|
|
|
dec.finished = true
|
2022-05-27 01:18:38 +00:00
|
|
|
if dec.readAnything {
|
|
|
|
return io.EOF
|
|
|
|
}
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
2022-05-27 01:18:38 +00:00
|
|
|
dec.readAnything = true
|
2021-12-21 04:02:07 +00:00
|
|
|
rootYamlNode.Kind = yaml.DocumentNode
|
|
|
|
rootYamlNode.Content = []*yaml.Node{firstNode}
|
|
|
|
dec.finished = true
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
type xmlNode struct {
|
2022-01-15 00:57:59 +00:00
|
|
|
Children []*xmlChildrenKv
|
|
|
|
HeadComment string
|
|
|
|
FootComment string
|
|
|
|
LineComment string
|
|
|
|
Data string
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
type xmlChildrenKv struct {
|
2022-01-15 00:57:59 +00:00
|
|
|
K string
|
|
|
|
V []*xmlNode
|
|
|
|
FootComment string
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// AddChild appends a node to the list of children
|
|
|
|
func (n *xmlNode) AddChild(s string, c *xmlNode) {
|
|
|
|
|
|
|
|
if n.Children == nil {
|
|
|
|
n.Children = make([]*xmlChildrenKv, 0)
|
|
|
|
}
|
|
|
|
log.Debug("looking for %s", s)
|
|
|
|
// see if we can find an existing entry to add to
|
|
|
|
for _, childEntry := range n.Children {
|
|
|
|
if childEntry.K == s {
|
|
|
|
log.Debug("found it, appending an entry%s", s)
|
|
|
|
childEntry.V = append(childEntry.V, c)
|
|
|
|
log.Debug("yay len of children in %v is %v", s, len(childEntry.V))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
log.Debug("not there, making a new one %s", s)
|
|
|
|
n.Children = append(n.Children, &xmlChildrenKv{K: s, V: []*xmlNode{c}})
|
|
|
|
}
|
|
|
|
|
|
|
|
type element struct {
|
|
|
|
parent *element
|
|
|
|
n *xmlNode
|
|
|
|
label string
|
2022-01-15 00:57:59 +00:00
|
|
|
state string
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// this code is heavily based on https://github.com/basgys/goxml2json
|
|
|
|
// main changes are to decode into a structure that preserves the original order
|
|
|
|
// of the map keys.
|
2022-02-07 00:55:55 +00:00
|
|
|
func (dec *xmlDecoder) decodeXML(root *xmlNode) error {
|
2021-12-21 04:02:07 +00:00
|
|
|
xmlDec := xml.NewDecoder(dec.reader)
|
2022-03-28 03:05:10 +00:00
|
|
|
xmlDec.Strict = dec.strictMode
|
2021-12-21 04:02:07 +00:00
|
|
|
// That will convert the charset if the provided XML is non-UTF-8
|
|
|
|
xmlDec.CharsetReader = charset.NewReaderLabel
|
|
|
|
|
|
|
|
// Create first element from the root node
|
|
|
|
elem := &element{
|
|
|
|
parent: nil,
|
|
|
|
n: root,
|
|
|
|
}
|
|
|
|
|
2022-06-14 23:40:31 +00:00
|
|
|
getToken := func() (xml.Token, error) {
|
|
|
|
if dec.useRawToken {
|
|
|
|
return xmlDec.RawToken()
|
|
|
|
}
|
|
|
|
return xmlDec.Token()
|
|
|
|
}
|
|
|
|
|
2021-12-21 04:02:07 +00:00
|
|
|
for {
|
2022-06-14 23:40:31 +00:00
|
|
|
t, e := getToken()
|
2022-03-28 03:05:10 +00:00
|
|
|
if e != nil && !errors.Is(e, io.EOF) {
|
2022-03-27 00:42:07 +00:00
|
|
|
return e
|
|
|
|
}
|
2021-12-21 04:02:07 +00:00
|
|
|
if t == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
|
|
|
|
switch se := t.(type) {
|
|
|
|
case xml.StartElement:
|
2022-01-15 00:57:59 +00:00
|
|
|
log.Debug("start element %v", se.Name.Local)
|
|
|
|
elem.state = "started"
|
2021-12-21 04:02:07 +00:00
|
|
|
// Build new a new current element and link it to its parent
|
|
|
|
elem = &element{
|
|
|
|
parent: elem,
|
|
|
|
n: &xmlNode{},
|
|
|
|
label: se.Name.Local,
|
|
|
|
}
|
|
|
|
|
|
|
|
// Extract attributes as children
|
|
|
|
for _, a := range se.Attr {
|
2022-06-14 23:40:31 +00:00
|
|
|
if dec.keepNamespace {
|
|
|
|
if a.Name.Space != "" {
|
|
|
|
a.Name.Local = a.Name.Space + ":" + a.Name.Local
|
|
|
|
}
|
|
|
|
}
|
2021-12-21 04:02:07 +00:00
|
|
|
elem.n.AddChild(dec.attributePrefix+a.Name.Local, &xmlNode{Data: a.Value})
|
|
|
|
}
|
|
|
|
case xml.CharData:
|
|
|
|
// Extract XML data (if any)
|
|
|
|
elem.n.Data = trimNonGraphic(string(se))
|
2022-01-15 00:57:59 +00:00
|
|
|
if elem.n.Data != "" {
|
|
|
|
elem.state = "chardata"
|
|
|
|
log.Debug("chardata [%v] for %v", elem.n.Data, elem.label)
|
|
|
|
}
|
2021-12-21 04:02:07 +00:00
|
|
|
case xml.EndElement:
|
2022-01-15 00:57:59 +00:00
|
|
|
log.Debug("end element %v", elem.label)
|
|
|
|
elem.state = "finished"
|
2021-12-21 04:02:07 +00:00
|
|
|
// And add it to its parent list
|
|
|
|
if elem.parent != nil {
|
|
|
|
elem.parent.n.AddChild(elem.label, elem.n)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Then change the current element to its parent
|
|
|
|
elem = elem.parent
|
|
|
|
case xml.Comment:
|
2022-01-15 00:57:59 +00:00
|
|
|
|
|
|
|
commentStr := string(xml.CharData(se))
|
|
|
|
if elem.state == "started" {
|
|
|
|
applyFootComment(elem, commentStr)
|
|
|
|
|
|
|
|
} else if elem.state == "chardata" {
|
|
|
|
log.Debug("got a line comment for (%v) %v: [%v]", elem.state, elem.label, commentStr)
|
|
|
|
elem.n.LineComment = joinFilter([]string{elem.n.LineComment, commentStr})
|
|
|
|
} else {
|
|
|
|
log.Debug("got a head comment for (%v) %v: [%v]", elem.state, elem.label, commentStr)
|
|
|
|
elem.n.HeadComment = joinFilter([]string{elem.n.HeadComment, commentStr})
|
|
|
|
}
|
|
|
|
|
2021-12-21 04:02:07 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-01-15 00:57:59 +00:00
|
|
|
func applyFootComment(elem *element, commentStr string) {
|
|
|
|
|
|
|
|
// first lets try to put the comment on the last child
|
|
|
|
if len(elem.n.Children) > 0 {
|
|
|
|
lastChildIndex := len(elem.n.Children) - 1
|
|
|
|
childKv := elem.n.Children[lastChildIndex]
|
|
|
|
log.Debug("got a foot comment for %v: [%v]", childKv.K, commentStr)
|
|
|
|
childKv.FootComment = joinFilter([]string{elem.n.FootComment, commentStr})
|
|
|
|
} else {
|
|
|
|
log.Debug("got a foot comment for %v: [%v]", elem.label, commentStr)
|
|
|
|
elem.n.FootComment = joinFilter([]string{elem.n.FootComment, commentStr})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// joinFilter concatenates the non-empty entries of rawStrings, separated by a
// single space. Empty entries are skipped entirely, so they never introduce
// leading, trailing or doubled separators.
func joinFilter(rawStrings []string) string {
	var joined strings.Builder
	for _, part := range rawStrings {
		if part == "" {
			continue
		}
		// Anything already written means a separator is due.
		if joined.Len() > 0 {
			joined.WriteByte(' ')
		}
		joined.WriteString(part)
	}
	return joined.String()
}
|
|
|
|
|
2021-12-21 04:02:07 +00:00
|
|
|
// trimNonGraphic returns s with all leading and trailing non graphic
// characters and spaces removed. Characters in the middle of the string are
// left untouched.
//
// Graphic characters include letters, marks, numbers, punctuation, symbols,
// and spaces, from categories L, M, N, P, S, Zs.
// Spacing characters are set by category Z and property Pattern_White_Space.
func trimNonGraphic(s string) string {
	runes := []rune(s)
	keep := func(r rune) bool {
		return unicode.IsGraphic(r) && !unicode.IsSpace(r)
	}

	// Advance past everything that must be dropped at the front.
	start := 0
	for start < len(runes) && !keep(runes[start]) {
		start++
	}
	// Nothing worth keeping (this also covers the empty string).
	if start == len(runes) {
		return ""
	}

	// runes[start] is kept, so this backwards scan always stops at or after it.
	end := len(runes)
	for !keep(runes[end-1]) {
		end--
	}

	return string(runes[start:end])
}
|