From 4571ec825f30feedbb4a6fe963deb87f6dec0d0e Mon Sep 17 00:00:00 2001 From: Mike Farah Date: Thu, 16 Dec 2021 22:18:50 +1100 Subject: [PATCH] wipt --- LICENSE | 2 + examples/mike.xml | 7 +- go.mod | 4 +- pkg/yqlib/decoder_xml.go | 156 ++++++++++++++++++++++++++++++++++++--- 4 files changed, 154 insertions(+), 15 deletions(-) diff --git a/LICENSE b/LICENSE index e2f1fc03..dfbcc195 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,5 @@ +The MIT License (MIT) + Copyright (c) 2017 Mike Farah Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/examples/mike.xml b/examples/mike.xml index b67be84e..3c9360b9 100644 --- a/examples/mike.xml +++ b/examples/mike.xml @@ -1,6 +1,11 @@ + - + + + + cool + bar bar23 \ No newline at end of file diff --git a/go.mod b/go.mod index c86e61f1..ae1ecfcf 100644 --- a/go.mod +++ b/go.mod @@ -1,7 +1,6 @@ module github.com/mikefarah/yq/v4 require ( - github.com/basgys/goxml2json v1.1.0 github.com/elliotchance/orderedmap v1.4.0 github.com/fatih/color v1.13.0 github.com/goccy/go-yaml v1.9.4 @@ -9,17 +8,18 @@ require ( github.com/magiconair/properties v1.8.5 github.com/spf13/cobra v1.3.0 github.com/timtadh/lexmachine v0.2.2 + golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d gopkg.in/op/go-logging.v1 v1.0.0-20160211212156-b2cb9fa56473 gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b ) require ( + github.com/basgys/goxml2json v1.1.0 // indirect github.com/inconshreveable/mousetrap v1.0.0 // indirect github.com/mattn/go-colorable v0.1.12 // indirect github.com/mattn/go-isatty v0.0.14 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/timtadh/data-structures v0.5.3 // indirect - golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d // indirect golang.org/x/sys v0.0.0-20211205182925-97ca703d548d // indirect golang.org/x/text v0.3.7 // indirect golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 // indirect diff --git a/pkg/yqlib/decoder_xml.go b/pkg/yqlib/decoder_xml.go index 563e3ba6..f50cdeae 
100644 --- a/pkg/yqlib/decoder_xml.go +++ b/pkg/yqlib/decoder_xml.go @@ -1,9 +1,11 @@ package yqlib import ( + "encoding/xml" "io" + "unicode" - xj "github.com/basgys/goxml2json" + "golang.org/x/net/html/charset" yaml "gopkg.in/yaml.v3" ) @@ -15,10 +17,10 @@ type xmlDecoder struct { } func NewXmlDecoder(reader io.Reader, attributePrefix string, contentPrefix string) Decoder { - return &xmlDecoder{reader: reader, attributePrefix: attributePrefix, contentPrefix: contentPrefix, finished: false} + return &xmlDecoder{reader: reader, attributePrefix: attributePrefix, contentPrefix: "c", finished: false} } -func (dec *xmlDecoder) createSequence(nodes xj.Nodes) (*yaml.Node, error) { +func (dec *xmlDecoder) createSequence(nodes []*xmlNode) (*yaml.Node, error) { yamlNode := &yaml.Node{Kind: yaml.SequenceNode} for _, child := range nodes { yamlChild, err := dec.convertToYamlNode(child) @@ -31,18 +33,21 @@ func (dec *xmlDecoder) createSequence(nodes xj.Nodes) (*yaml.Node, error) { return yamlNode, nil } -func (dec *xmlDecoder) createMap(n *xj.Node) (*yaml.Node, error) { - yamlNode := &yaml.Node{Kind: yaml.MappingNode} +func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) { + yamlNode := &yaml.Node{Kind: yaml.MappingNode, HeadComment: n.Comment} if len(n.Data) > 0 { - label := dec.contentPrefix + "content" + label := dec.contentPrefix yamlNode.Content = append(yamlNode.Content, createScalarNode(label, label), createScalarNode(n.Data, n.Data)) } - for label, children := range n.Children { + for _, keyValuePair := range n.Children { + label := keyValuePair.K + children := keyValuePair.V labelNode := createScalarNode(label, label) var valueNode *yaml.Node var err error + log.Debug("len of children in %v is %v", label, len(children)) if len(children) > 1 { valueNode, err = dec.createSequence(children) if err != nil { @@ -60,20 +65,22 @@ func (dec *xmlDecoder) createMap(n *xj.Node) (*yaml.Node, error) { return yamlNode, nil } -func (dec *xmlDecoder) convertToYamlNode(n 
*xj.Node) (*yaml.Node, error) { - if n.IsComplex() { +func (dec *xmlDecoder) convertToYamlNode(n *xmlNode) (*yaml.Node, error) { + if len(n.Children) > 0 { return dec.createMap(n) } - return createScalarNode(n.Data, n.Data), nil + scalar := createScalarNode(n.Data, n.Data) + scalar.HeadComment = n.Comment + return scalar, nil } func (dec *xmlDecoder) Decode(rootYamlNode *yaml.Node) error { if dec.finished { return io.EOF } - root := &xj.Node{} + root := &xmlNode{} // cant use xj - it doesn't keep map order. - err := xj.NewDecoder(dec.reader).Decode(root) + err := dec.decodeXml(root) if err != nil { return err @@ -88,3 +95,128 @@ func (dec *xmlDecoder) Decode(rootYamlNode *yaml.Node) error { dec.finished = true return nil } + +type xmlNode struct { + Children []*xmlChildrenKv + Comment string + Data string +} + +type xmlChildrenKv struct { + K string + V []*xmlNode +} + +// AddChild appends a node to the list of children +func (n *xmlNode) AddChild(s string, c *xmlNode) { + + if n.Children == nil { + n.Children = make([]*xmlChildrenKv, 0) + } + log.Debug("looking for %s", s) + // see if we can find an existing entry to add to + for _, childEntry := range n.Children { + if childEntry.K == s { + log.Debug("found it, appending an entry%s", s) + childEntry.V = append(childEntry.V, c) + log.Debug("yay len of children in %v is %v", s, len(childEntry.V)) + return + } + } + log.Debug("not there, making a new one %s", s) + n.Children = append(n.Children, &xmlChildrenKv{K: s, V: []*xmlNode{c}}) +} + +type element struct { + parent *element + n *xmlNode + label string +} + +// this code is heavily based on https://github.com/basgys/goxml2json +// main changes are to decode into a structure that preserves the original order +// of the map keys. 
+func (dec *xmlDecoder) decodeXml(root *xmlNode) error { + xmlDec := xml.NewDecoder(dec.reader) + + // That will convert the charset if the provided XML is non-UTF-8 + xmlDec.CharsetReader = charset.NewReaderLabel + + // Create first element from the root node + elem := &element{ + parent: nil, + n: root, + } + + for { + t, _ := xmlDec.Token() + if t == nil { + break + } + + switch se := t.(type) { + case xml.StartElement: + // Build a new current element and link it to its parent + elem = &element{ + parent: elem, + n: &xmlNode{}, + label: se.Name.Local, + } + + // Extract attributes as children + for _, a := range se.Attr { + elem.n.AddChild(dec.attributePrefix+a.Name.Local, &xmlNode{Data: a.Value}) + } + case xml.CharData: + // Extract XML data (if any) + elem.n.Data = trimNonGraphic(string(xml.CharData(se))) + case xml.EndElement: + // And add it to its parent list + if elem.parent != nil { + elem.parent.n.AddChild(elem.label, elem.n) + } + + // Then change the current element to its parent + elem = elem.parent + case xml.Comment: + elem.n.Comment = trimNonGraphic(string(xml.CharData(se))) + } + } + + return nil +} + +// trimNonGraphic returns a slice of the string s, with all leading and trailing +// non graphic characters and spaces removed. +// +// Graphic characters include letters, marks, numbers, punctuation, symbols, +// and spaces, from categories L, M, N, P, S, Zs. +// Spacing characters are set by category Z and property Pattern_White_Space. +func trimNonGraphic(s string) string { + if s == "" { + return s + } + + var first *int + var last int + for i, r := range []rune(s) { + if !unicode.IsGraphic(r) || unicode.IsSpace(r) { + continue + } + + if first == nil { + f := i // copy i + first = &f + last = i + } else { + last = i + } + } + + // If first is nil, it means there are no graphic characters + if first == nil { + return "" + } + + return string([]rune(s)[*first : last+1]) +}