From d8da9c8ccc0448c2579b4b2d90ff6f994ccf9fb8 Mon Sep 17 00:00:00 2001 From: Mike Farah Date: Tue, 18 Oct 2022 14:15:25 +1100 Subject: [PATCH] Added XML processing instructions and directive support --- README.md | 21 ---- acceptance_tests/inputs-format.sh | 25 ++++- cmd/constant.go | 6 - cmd/root.go | 26 +++-- cmd/utils.go | 4 +- examples/mike.xml | 21 ++-- pkg/yqlib/decoder_xml.go | 45 ++++---- pkg/yqlib/doc/usage/headers/xml.md | 35 +++++- pkg/yqlib/doc/usage/xml.md | 145 +++++++++++++++++++++++-- pkg/yqlib/encoder_xml.go | 94 ++++++++++++---- pkg/yqlib/lexer_participle.go | 2 +- pkg/yqlib/lib.go | 20 ++++ pkg/yqlib/operator_encoder_decoder.go | 9 +- pkg/yqlib/xml_test.go | 151 +++++++++++++++++++++----- 14 files changed, 463 insertions(+), 141 deletions(-) diff --git a/README.md b/README.md index 2df32967..54ef524f 100644 --- a/README.md +++ b/README.md @@ -7,27 +7,6 @@ a lightweight and portable command-line YAML, JSON and XML processor. `yq` uses yq is written in go - so you can download a dependency free binary for your platform and you are good to go! If you prefer there are a variety of package managers that can be used as well as Docker and Podman, all listed below. -## Notice for v4.x versions prior to 4.18.1 -Since 4.18.1, yq's 'eval/e' command is the _default_ command and no longer needs to be specified. - -Older versions will still need to specify 'eval/e'. - -Similarly, '-' is no longer required as a filename to read from STDIN (unless reading from one or more files). - -TLDR: - -Prior to 4.18.1 -```bash -yq e '.cool' - < file.yaml -``` - -4.18+ -```bash -yq '.cool' < file.yaml -``` - -When merging multiple files together, `eval-all/ea` is still required to tell `yq` to run the expression against all the document at once. 
- ## Quick Usage Guide Read a value: diff --git a/acceptance_tests/inputs-format.sh b/acceptance_tests/inputs-format.sh index 2173678e..8979d04d 100755 --- a/acceptance_tests/inputs-format.sh +++ b/acceptance_tests/inputs-format.sh @@ -127,6 +127,7 @@ testInputXmlNamespaces() { EOL read -r -d '' expected << EOM ++p_xml: version="1.0" map: +xmlns: some-namespace +xmlns:xsi: some-instance @@ -140,6 +141,26 @@ EOM assertEquals "$expected" "$X" } +testInputXmlRoundtrip() { + cat >test.yml < + +Meow +EOL + + read -r -d '' expected << EOM + + +Meow +EOM + + X=$(./yq -p=xml -o=xml test.yml) + assertEquals "$expected" "$X" + + X=$(./yq ea -p=xml -o=xml test.yml) + assertEquals "$expected" "$X" +} + testInputXmlStrict() { cat >test.yml < EOL - X=$(./yq -p=xml --xml-strict-mode test.yml 2>&1) + X=$(./yq -p=xml --xml-strict-mode test.yml -o=xml 2>&1) assertEquals 1 $? assertEquals "Error: bad file 'test.yml': XML syntax error on line 7: invalid character entity &writer;" "$X" - X=$(./yq ea -p=xml --xml-strict-mode test.yml 2>&1) + X=$(./yq ea -p=xml --xml-strict-mode test.yml -o=xml 2>&1) assertEquals "Error: bad file 'test.yml': XML syntax error on line 7: invalid character entity &writer;" "$X" } diff --git a/cmd/constant.go b/cmd/constant.go index a152e12a..ef0c31db 100644 --- a/cmd/constant.go +++ b/cmd/constant.go @@ -8,12 +8,6 @@ var outputToJSON = false var outputFormat = "yaml" var inputFormat = "yaml" -var xmlAttributePrefix = "+" -var xmlContentName = "+content" -var xmlStrictMode = false -var xmlKeepNamespace = true -var xmlUseRawToken = true - var exitStatus = false var forceColor = false var forceNoColor = false diff --git a/cmd/root.go b/cmd/root.go index 3951ed06..df740104 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -47,14 +47,18 @@ yq -P sample.json if verbose { backend.SetLevel(logging.DEBUG, "") } else { - backend.SetLevel(logging.ERROR, "") + backend.SetLevel(logging.WARNING, "") } logging.SetBackend(backend) yqlib.InitExpressionParser() - 
yqlib.XMLPreferences.AttributePrefix = xmlAttributePrefix - yqlib.XMLPreferences.ContentName = xmlContentName - yqlib.XMLPreferences.StrictMode = xmlStrictMode + if (inputFormat == "x" || inputFormat == "xml") && + outputFormat != "x" && outputFormat != "xml" && + yqlib.XMLPreferences.AttributePrefix == "+" { + yqlib.GetLogger().Warning("The default xml-attribute-prefix will change in the v4.30 to `+@` to avoid " + + "naming conflicts with the default content name, directive name and proc inst prefix. If you need to keep " + + "`+` please set that value explicityly with --xml-attribute-prefix.") + } }, } @@ -69,11 +73,15 @@ yq -P sample.json rootCmd.PersistentFlags().StringVarP(&outputFormat, "output-format", "o", "yaml", "[yaml|y|json|j|props|p|xml|x] output format type.") rootCmd.PersistentFlags().StringVarP(&inputFormat, "input-format", "p", "yaml", "[yaml|y|props|p|xml|x] parse format for input. Note that json is a subset of yaml.") - rootCmd.PersistentFlags().StringVar(&xmlAttributePrefix, "xml-attribute-prefix", "+", "prefix for xml attributes") - rootCmd.PersistentFlags().StringVar(&xmlContentName, "xml-content-name", "+content", "name for xml content (if no attribute name is present).") - rootCmd.PersistentFlags().BoolVar(&xmlStrictMode, "xml-strict-mode", false, "enables strict parsing of XML. See https://pkg.go.dev/encoding/xml for more details.") - rootCmd.PersistentFlags().BoolVar(&xmlKeepNamespace, "xml-keep-namespace", true, "enables keeping namespace after parsing attributes") - rootCmd.PersistentFlags().BoolVar(&xmlUseRawToken, "xml-raw-token", true, "enables using RawToken method instead Token. Commonly disables namespace translations. 
See https://pkg.go.dev/encoding/xml#Decoder.RawToken for details.") + rootCmd.PersistentFlags().StringVar(&yqlib.XMLPreferences.AttributePrefix, "xml-attribute-prefix", "+", "prefix for xml attributes") + rootCmd.PersistentFlags().StringVar(&yqlib.XMLPreferences.ContentName, "xml-content-name", "+content", "name for xml content (if no attribute name is present).") + rootCmd.PersistentFlags().BoolVar(&yqlib.XMLPreferences.StrictMode, "xml-strict-mode", false, "enables strict parsing of XML. See https://pkg.go.dev/encoding/xml for more details.") + rootCmd.PersistentFlags().BoolVar(&yqlib.XMLPreferences.KeepNamespace, "xml-keep-namespace", true, "enables keeping namespace after parsing attributes") + rootCmd.PersistentFlags().BoolVar(&yqlib.XMLPreferences.UseRawToken, "xml-raw-token", true, "enables using RawToken method instead Token. Commonly disables namespace translations. See https://pkg.go.dev/encoding/xml#Decoder.RawToken for details.") + rootCmd.PersistentFlags().StringVar(&yqlib.XMLPreferences.ProcInstPrefix, "xml-proc-inst-prefix", "+p_", "prefix for xml processing instructions (e.g. )") + rootCmd.PersistentFlags().StringVar(&yqlib.XMLPreferences.DirectiveName, "xml-directive-name", "+directive", "name for xml directives (e.g. )") + rootCmd.PersistentFlags().BoolVar(&yqlib.XMLPreferences.SkipProcInst, "xml-skip-proc-inst", false, "skip over process instructions (e.g. )") + rootCmd.PersistentFlags().BoolVar(&yqlib.XMLPreferences.SkipDirectives, "xml-skip-directives", false, "skip over directives (e.g. )") rootCmd.PersistentFlags().BoolVarP(&nullInput, "null-input", "n", false, "Don't read input, simply evaluate the expression given. 
Useful for creating docs from scratch.") rootCmd.PersistentFlags().BoolVarP(&noDocSeparators, "no-doc", "N", false, "Don't print document separators (---)") diff --git a/cmd/utils.go b/cmd/utils.go index e0aaf1d6..b201b1db 100644 --- a/cmd/utils.go +++ b/cmd/utils.go @@ -63,7 +63,7 @@ func configureDecoder() (yqlib.Decoder, error) { } switch yqlibInputFormat { case yqlib.XMLInputFormat: - return yqlib.NewXMLDecoder(xmlAttributePrefix, xmlContentName, xmlStrictMode, xmlKeepNamespace, xmlUseRawToken), nil + return yqlib.NewXMLDecoder(yqlib.XMLPreferences), nil case yqlib.PropertiesInputFormat: return yqlib.NewPropertiesDecoder(), nil case yqlib.JsonInputFormat: @@ -107,7 +107,7 @@ func configureEncoder(format yqlib.PrinterOutputFormat) yqlib.Encoder { case yqlib.YamlOutputFormat: return yqlib.NewYamlEncoder(indent, colorsEnabled, !noDocSeparators, unwrapScalar) case yqlib.XMLOutputFormat: - return yqlib.NewXMLEncoder(indent, xmlAttributePrefix, xmlContentName) + return yqlib.NewXMLEncoder(indent, yqlib.XMLPreferences) } panic("invalid encoder") } diff --git a/examples/mike.xml b/examples/mike.xml index 94773b86..4543a106 100644 --- a/examples/mike.xml +++ b/examples/mike.xml @@ -1,14 +1,7 @@ - - - - 3 - - - 4 - - - - - \ No newline at end of file + + + + + +things + \ No newline at end of file diff --git a/pkg/yqlib/decoder_xml.go b/pkg/yqlib/decoder_xml.go index d053f6ca..d74c579a 100644 --- a/pkg/yqlib/decoder_xml.go +++ b/pkg/yqlib/decoder_xml.go @@ -12,27 +12,16 @@ import ( ) type xmlDecoder struct { - reader io.Reader - readAnything bool - attributePrefix string - contentName string - strictMode bool - keepNamespace bool - useRawToken bool - finished bool + reader io.Reader + readAnything bool + finished bool + prefs xmlPreferences } -func NewXMLDecoder(attributePrefix string, contentName string, strictMode bool, keepNamespace bool, useRawToken bool) Decoder { - if contentName == "" { - contentName = "content" - } +func NewXMLDecoder(prefs xmlPreferences) 
Decoder { return &xmlDecoder{ - attributePrefix: attributePrefix, - contentName: contentName, - finished: false, - strictMode: strictMode, - keepNamespace: keepNamespace, - useRawToken: useRawToken, + finished: false, + prefs: prefs, } } @@ -67,7 +56,7 @@ func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) { yamlNode := &yaml.Node{Kind: yaml.MappingNode, Tag: "!!map"} if len(n.Data) > 0 { - label := dec.contentName + label := dec.prefs.ContentName labelNode := createScalarNode(label, label) labelNode.HeadComment = dec.processComment(n.HeadComment) labelNode.FootComment = dec.processComment(n.FootComment) @@ -86,9 +75,7 @@ func (dec *xmlDecoder) createMap(n *xmlNode) (*yaml.Node, error) { } - // if i == len(n.Children)-1 { labelNode.FootComment = dec.processComment(keyValuePair.FootComment) - // } log.Debug("len of children in %v is %v", label, len(children)) if len(children) > 1 { @@ -205,7 +192,7 @@ type element struct { // of the map keys. func (dec *xmlDecoder) decodeXML(root *xmlNode) error { xmlDec := xml.NewDecoder(dec.reader) - xmlDec.Strict = dec.strictMode + xmlDec.Strict = dec.prefs.StrictMode // That will convert the charset if the provided XML is non-UTF-8 xmlDec.CharsetReader = charset.NewReaderLabel @@ -216,7 +203,7 @@ func (dec *xmlDecoder) decodeXML(root *xmlNode) error { } getToken := func() (xml.Token, error) { - if dec.useRawToken { + if dec.prefs.UseRawToken { return xmlDec.RawToken() } return xmlDec.Token() @@ -244,12 +231,12 @@ func (dec *xmlDecoder) decodeXML(root *xmlNode) error { // Extract attributes as children for _, a := range se.Attr { - if dec.keepNamespace { + if dec.prefs.KeepNamespace { if a.Name.Space != "" { a.Name.Local = a.Name.Space + ":" + a.Name.Local } } - elem.n.AddChild(dec.attributePrefix+a.Name.Local, &xmlNode{Data: a.Value}) + elem.n.AddChild(dec.prefs.AttributePrefix+a.Name.Local, &xmlNode{Data: a.Value}) } case xml.CharData: // Extract XML data (if any) @@ -282,6 +269,14 @@ func (dec *xmlDecoder) 
decodeXML(root *xmlNode) error { elem.n.HeadComment = joinFilter([]string{elem.n.HeadComment, commentStr}) } + case xml.ProcInst: + if !dec.prefs.SkipProcInst { + elem.n.AddChild(dec.prefs.ProcInstPrefix+se.Target, &xmlNode{Data: string(se.Inst)}) + } + case xml.Directive: + if !dec.prefs.SkipDirectives { + elem.n.AddChild(dec.prefs.DirectiveName, &xmlNode{Data: string(se)}) + } } } diff --git a/pkg/yqlib/doc/usage/headers/xml.md b/pkg/yqlib/doc/usage/headers/xml.md index fd8e79d6..bcd9ba4b 100644 --- a/pkg/yqlib/doc/usage/headers/xml.md +++ b/pkg/yqlib/doc/usage/headers/xml.md @@ -4,4 +4,37 @@ Encode and decode to and from XML. Whitespace is not conserved for round trips - Consecutive xml nodes with the same name are assumed to be arrays. -XML content data and attributes are created as fields. This can be controlled by the `'--xml-attribute-prefix` and `--xml-content-name` flags - see below for examples. +XML content data, attributes, processing instructions and directives are all created as plain fields. + +This can be controlled by: + +| Flag | Default |Sample XML | +| -- | -- | -- | + | `--xml-attribute-prefix` | `+` (changing to `+@` soon) | Legs in ```<cat legs="4"/>``` | + | `--xml-content-name` | `+content` | Meow in ```<cat>Meow <fur>true</fur></cat>``` | + | `--xml-directive-name` | `+directive` | ```<!DOCTYPE config system "blah">``` | + | `--xml-proc-inst-prefix` | `+p_` | ```<?xml version="1"?>``` | + + +{% hint style="warning" %} +Default Attribute Prefix will be changing in v4.30! +In order to avoid name conflicts (e.g. having an attribute named "content" will create a field that clashes with the default content name of "+content") the attribute prefix will be changing to "+@". + +This will affect users that have not set their own prefix and are not roundtripping XML changes. 
+ +{% endhint %} + +## Encoder / Decoder flag options + +In addition to the above flags, there are the following xml encoder/decoder options controlled by flags: + +| Flag | Default | Description | +| -- | -- | -- | +| `--xml-strict-mode` | false | Strict mode enforces the requirements of the XML specification. When switched off the parser allows input containing common mistakes. See [the Golang xml decoder](https://pkg.go.dev/encoding/xml#Decoder) for more details.| +| `--xml-keep-namespace` | true | Keeps the namespace of attributes | +| `--xml-raw-token` | true | Does not verify that start and end elements match and does not translate name space prefixes to their corresponding URLs. | +| `--xml-skip-proc-inst` | false | Skips over processing instructions, e.g. `<?xml version="1"?>` | +| `--xml-skip-directives` | false | Skips over directives, e.g. ```<!DOCTYPE config system "blah">``` | + + +See below for examples diff --git a/pkg/yqlib/doc/usage/xml.md b/pkg/yqlib/doc/usage/xml.md index 729c8dc1..41102009 100644 --- a/pkg/yqlib/doc/usage/xml.md +++ b/pkg/yqlib/doc/usage/xml.md @@ -4,7 +4,40 @@ Encode and decode to and from XML. Whitespace is not conserved for round trips - Consecutive xml nodes with the same name are assumed to be arrays. -XML content data and attributes are created as fields. This can be controlled by the `'--xml-attribute-prefix` and `--xml-content-name` flags - see below for examples. +XML content data, attributes, processing instructions and directives are all created as plain fields. + +This can be controlled by: + +| Flag | Default |Sample XML | +| -- | -- | -- | + | `--xml-attribute-prefix` | `+` (changing to `+@` soon) | Legs in ```<cat legs="4"/>``` | + | `--xml-content-name` | `+content` | Meow in ```<cat>Meow <fur>true</fur></cat>``` | + | `--xml-directive-name` | `+directive` | ```<!DOCTYPE config system "blah">``` | + | `--xml-proc-inst-prefix` | `+p_` | ```<?xml version="1"?>``` | + + +{% hint style="warning" %} +Default Attribute Prefix will be changing in v4.30! +In order to avoid name conflicts (e.g. 
having an attribute named "content" will create a field that clashes with the default content name of "+content") the attribute prefix will be changing to "+@". + +This will affect users that have not set their own prefix and are not roundtripping XML changes. + +{% endhint %} + +## Encoder / Decoder flag options + +In addition to the above flags, there are the following xml encoder/decoder options controlled by flags: + +| Flag | Default | Description | +| -- | -- | -- | +| `--xml-strict-mode` | false | Strict mode enforces the requirements of the XML specification. When switched off the parser allows input containing common mistakes. See [the Golang xml decoder ](https://pkg.go.dev/encoding/xml#Decoder) for more details.| +| `--xml-keep-namespace` | true | Keeps the namespace of attributes | +| `--xml-raw-token` | true | Does not verify that start and end elements match and does not translate name space prefixes to their corresponding URLs. | +| `--xml-skip-proc-inst` | false | Skips over processing instructions, e.g. `` | +| `--xml-skip-directives` | false | Skips over directives, e.g. `````` | + + +See below for examples {% hint style="warning" %} Note that versions prior to 4.18 require the 'eval/e' command to be specified. @@ -30,6 +63,7 @@ yq -p=xml '.' sample.xml ``` will output ```yaml ++p_xml: version="1.0" encoding="UTF-8" cat: says: meow legs: "4" @@ -54,6 +88,7 @@ yq -p=xml ' (.. | select(tag == "!!str")) |= from_yaml' sample.xml ``` will output ```yaml ++p_xml: version="1.0" encoding="UTF-8" cat: says: meow legs: 4 @@ -75,6 +110,7 @@ yq -p=xml '.' sample.xml ``` will output ```yaml ++p_xml: version="1.0" encoding="UTF-8" animal: - cat - goat @@ -96,6 +132,7 @@ yq -p=xml '.' sample.xml ``` will output ```yaml ++p_xml: version="1.0" encoding="UTF-8" cat: +legs: "4" legs: "7" @@ -115,13 +152,14 @@ yq -p=xml '.' 
sample.xml ``` will output ```yaml ++p_xml: version="1.0" encoding="UTF-8" cat: +content: meow +legs: "4" ``` ## Parse xml: custom dtd -DTD entities are ignored. +DTD entities are processed as directives. Given a sample.xml file of: ```xml @@ -137,12 +175,45 @@ Given a sample.xml file of: ``` then ```bash -yq -p=xml '.' sample.xml +yq -p=xml -o=xml '.' sample.xml ``` will output -```yaml -root: - item: '&writer;©right;' +```xml + + + +]> + + &writer;&copyright; + +``` + +## Parse xml: skip custom dtd +DTDs are directives, skip over directives to skip DTDs. + +Given a sample.xml file of: +```xml + + + + +]> + + &writer;©right; + +``` +then +```bash +yq -p=xml -o=xml --xml-skip-directives '.' sample.xml +``` +will output +```xml + + + &writer;&copyright; + ``` ## Parse xml: with comments @@ -207,12 +278,14 @@ yq -p=xml -o=xml --xml-keep-namespace '.' sample.xml ``` will output ```xml + ``` instead of ```xml - + + ``` ## Parse xml: keep raw attribute namespace @@ -230,11 +303,13 @@ yq -p=xml -o=xml --xml-keep-namespace --xml-raw-token '.' sample.xml ``` will output ```xml - + + ``` instead of ```xml + ``` @@ -339,6 +414,32 @@ will output ``` +## Encode: doctype and xml declaration +Use the special xml names to add/modify proc instructions and directives. + +Given a sample.yml file of: +```yaml ++p_xml: version="1.0" ++directive: 'DOCTYPE config SYSTEM "/etc/iwatch/iwatch.dtd" ' +apple: + +p_coolioo: version="1.0" + +directive: 'CATYPE meow purr puss ' + b: things + +``` +then +```bash +yq -o=xml '.' sample.yml +``` +will output +```xml + + + + things + +``` + ## Round trip: with comments A best effort is made, but comment positions and white space are not preserved perfectly. @@ -380,3 +481,31 @@ in d before --> ``` +## Roundtrip: with doctype and declaration +yq parses XML proc instructions and directives into nodes. +Unfortunately the underlying XML parser loses whitespace information. 
+ +Given a sample.xml file of: +```xml + + + + + + things + + +``` +then +```bash +yq -p=xml -o=xml '.' sample.xml +``` +will output +```xml + + + + things + +``` + diff --git a/pkg/yqlib/encoder_xml.go b/pkg/yqlib/encoder_xml.go index b84b59de..012e64b7 100644 --- a/pkg/yqlib/encoder_xml.go +++ b/pkg/yqlib/encoder_xml.go @@ -9,21 +9,19 @@ import ( yaml "gopkg.in/yaml.v3" ) -var XMLPreferences = xmlPreferences{AttributePrefix: "+", ContentName: "+content", StrictMode: false, UseRawToken: false} - type xmlEncoder struct { - attributePrefix string - contentName string - indentString string + indentString string + writer io.Writer + prefs xmlPreferences } -func NewXMLEncoder(indent int, attributePrefix string, contentName string) Encoder { +func NewXMLEncoder(indent int, prefs xmlPreferences) Encoder { var indentString = "" for index := 0; index < indent; index++ { indentString = indentString + " " } - return &xmlEncoder{attributePrefix, contentName, indentString} + return &xmlEncoder{indentString, nil, prefs} } func (e *xmlEncoder) CanHandleAliases() bool { @@ -40,6 +38,8 @@ func (e *xmlEncoder) PrintLeadingContent(writer io.Writer, content string) error func (e *xmlEncoder) Encode(writer io.Writer, node *yaml.Node) error { encoder := xml.NewEncoder(writer) + // hack so we can manually add newlines to procInst and directives + e.writer = writer encoder.Indent("", e.indentString) switch node.Kind { @@ -77,6 +77,23 @@ func (e *xmlEncoder) Encode(writer io.Writer, node *yaml.Node) error { } func (e *xmlEncoder) encodeTopLevelMap(encoder *xml.Encoder, node *yaml.Node) error { + // make sure processing instructions are encoded first + for i := 0; i < len(node.Content); i += 2 { + key := node.Content[i] + value := node.Content[i+1] + + if key.Value == (e.prefs.ProcInstPrefix + "xml") { + name := strings.Replace(key.Value, e.prefs.ProcInstPrefix, "", 1) + procInst := xml.ProcInst{Target: name, Inst: []byte(value.Value)} + if err := encoder.EncodeToken(procInst); err != nil 
{ + return err + } + if _, err := e.writer.Write([]byte("\n")); err != nil { + log.Warning("Unable to write newline, skipping: %w", err) + } + } + } + err := e.encodeComment(encoder, headAndLineComment(node)) if err != nil { return err @@ -92,11 +109,33 @@ func (e *xmlEncoder) encodeTopLevelMap(encoder *xml.Encoder, node *yaml.Node) er return err } - log.Debugf("recursing") + if key.Value == (e.prefs.ProcInstPrefix + "xml") { + // dont double process these. + } else if strings.HasPrefix(key.Value, e.prefs.ProcInstPrefix) { + name := strings.Replace(key.Value, e.prefs.ProcInstPrefix, "", 1) + procInst := xml.ProcInst{Target: name, Inst: []byte(value.Value)} + if err := encoder.EncodeToken(procInst); err != nil { + return err + } + if _, err := e.writer.Write([]byte("\n")); err != nil { + log.Warning("Unable to write newline, skipping: %w", err) + } + } else if key.Value == e.prefs.DirectiveName { + var directive xml.Directive = []byte(value.Value) + if err := encoder.EncodeToken(directive); err != nil { + return err + } + if _, err := e.writer.Write([]byte("\n")); err != nil { + log.Warning("Unable to write newline, skipping: %w", err) + } + } else { - err = e.doEncode(encoder, value, start) - if err != nil { - return err + log.Debugf("recursing") + + err = e.doEncode(encoder, value, start) + if err != nil { + return err + } } err = e.encodeComment(encoder, footComment(key)) if err != nil { @@ -180,6 +219,13 @@ func (e *xmlEncoder) encodeArray(encoder *xml.Encoder, node *yaml.Node, start xm return e.encodeComment(encoder, footComment(node)) } +func (e *xmlEncoder) isAttribute(name string) bool { + return strings.HasPrefix(name, e.prefs.AttributePrefix) && + name != e.prefs.ContentName && + name != e.prefs.DirectiveName && + !strings.HasPrefix(name, e.prefs.ProcInstPrefix) +} + func (e *xmlEncoder) encodeMap(encoder *xml.Encoder, node *yaml.Node, start xml.StartElement) error { log.Debug("its a map") @@ -188,9 +234,9 @@ func (e *xmlEncoder) encodeMap(encoder 
*xml.Encoder, node *yaml.Node, start xml. key := node.Content[i] value := node.Content[i+1] - if strings.HasPrefix(key.Value, e.attributePrefix) && key.Value != e.contentName { + if e.isAttribute(key.Value) { if value.Kind == yaml.ScalarNode { - attributeName := strings.Replace(key.Value, e.attributePrefix, "", 1) + attributeName := strings.Replace(key.Value, e.prefs.AttributePrefix, "", 1) start.Attr = append(start.Attr, xml.Attr{Name: xml.Name{Local: attributeName}, Value: value.Value}) } else { return fmt.Errorf("cannot use %v as attribute, only scalars are supported", value.Tag) @@ -212,14 +258,18 @@ func (e *xmlEncoder) encodeMap(encoder *xml.Encoder, node *yaml.Node, start xml. if err != nil { return err } - - if !strings.HasPrefix(key.Value, e.attributePrefix) && key.Value != e.contentName { - start := xml.StartElement{Name: xml.Name{Local: key.Value}} - err := e.doEncode(encoder, value, start) - if err != nil { + if strings.HasPrefix(key.Value, e.prefs.ProcInstPrefix) { + name := strings.Replace(key.Value, e.prefs.ProcInstPrefix, "", 1) + procInst := xml.ProcInst{Target: name, Inst: []byte(value.Value)} + if err := encoder.EncodeToken(procInst); err != nil { return err } - } else if key.Value == e.contentName { + } else if key.Value == e.prefs.DirectiveName { + var directive xml.Directive = []byte(value.Value) + if err := encoder.EncodeToken(directive); err != nil { + return err + } + } else if key.Value == e.prefs.ContentName { // directly encode the contents err = e.encodeComment(encoder, headAndLineComment(value)) if err != nil { @@ -234,6 +284,12 @@ func (e *xmlEncoder) encodeMap(encoder *xml.Encoder, node *yaml.Node, start xml. 
if err != nil { return err } + } else if !e.isAttribute(key.Value) { + start := xml.StartElement{Name: xml.Name{Local: key.Value}} + err := e.doEncode(encoder, value, start) + if err != nil { + return err + } } err = e.encodeComment(encoder, footComment(key)) if err != nil { diff --git a/pkg/yqlib/lexer_participle.go b/pkg/yqlib/lexer_participle.go index 170424a7..6cc745cc 100644 --- a/pkg/yqlib/lexer_participle.go +++ b/pkg/yqlib/lexer_participle.go @@ -76,7 +76,7 @@ var participleYqRules = []*participleYqRule{ {"Base64d", `@base64d`, decodeOp(Base64InputFormat), 0}, {"Base64", `@base64`, encodeWithIndent(Base64OutputFormat, 0), 0}, - {"LoadXML", `load_?xml|xml_?load`, loadOp(NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName, XMLPreferences.StrictMode, XMLPreferences.KeepNamespace, XMLPreferences.UseRawToken), false), 0}, + {"LoadXML", `load_?xml|xml_?load`, loadOp(NewXMLDecoder(XMLPreferences), false), 0}, {"LoadBase64", `load_?base64`, loadOp(NewBase64Decoder(), false), 0}, diff --git a/pkg/yqlib/lib.go b/pkg/yqlib/lib.go index e5f42a49..910ab801 100644 --- a/pkg/yqlib/lib.go +++ b/pkg/yqlib/lib.go @@ -27,8 +27,28 @@ type xmlPreferences struct { StrictMode bool KeepNamespace bool UseRawToken bool + ProcInstPrefix string + DirectiveName string + SkipProcInst bool + SkipDirectives bool } +func NewDefaultXmlPreferences() xmlPreferences { + return xmlPreferences{ + AttributePrefix: "+", + ContentName: "+content", + StrictMode: false, + KeepNamespace: true, + UseRawToken: false, + ProcInstPrefix: "+p_", + DirectiveName: "+directive", + SkipProcInst: false, + SkipDirectives: false, + } +} + +var XMLPreferences = NewDefaultXmlPreferences() + var log = logging.MustGetLogger("yq-lib") var PrettyPrintExp = `(... 
| (select(tag != "!!str"), select(tag == "!!str") | select(test("(?i)^(y|yes|n|no|on|off)$") | not)) ) style=""` diff --git a/pkg/yqlib/operator_encoder_decoder.go b/pkg/yqlib/operator_encoder_decoder.go index afb321ef..e8b8a963 100644 --- a/pkg/yqlib/operator_encoder_decoder.go +++ b/pkg/yqlib/operator_encoder_decoder.go @@ -23,7 +23,7 @@ func configureEncoder(format PrinterOutputFormat, indent int) Encoder { case YamlOutputFormat: return NewYamlEncoder(indent, false, true, true) case XMLOutputFormat: - return NewXMLEncoder(indent, XMLPreferences.AttributePrefix, XMLPreferences.ContentName) + return NewXMLEncoder(indent, XMLPreferences) case Base64OutputFormat: return NewBase64Encoder() } @@ -104,12 +104,7 @@ func decodeOperator(d *dataTreeNavigator, context Context, expressionNode *Expre case YamlInputFormat: decoder = NewYamlDecoder() case XMLInputFormat: - decoder = NewXMLDecoder( - XMLPreferences.AttributePrefix, - XMLPreferences.ContentName, - XMLPreferences.StrictMode, - XMLPreferences.KeepNamespace, - XMLPreferences.UseRawToken) + decoder = NewXMLDecoder(XMLPreferences) case Base64InputFormat: decoder = NewBase64Decoder() case PropertiesInputFormat: diff --git a/pkg/yqlib/xml_test.go b/pkg/yqlib/xml_test.go index caaf52de..af78d76e 100644 --- a/pkg/yqlib/xml_test.go +++ b/pkg/yqlib/xml_test.go @@ -159,13 +159,15 @@ const inputXMLWithNamespacedAttr = ` ` -const expectedYAMLWithNamespacedAttr = `map: +const expectedYAMLWithNamespacedAttr = `+p_xml: version="1.0" +map: +xmlns: some-namespace +xmlns:xsi: some-instance +some-instance:schemaLocation: some-url ` -const expectedYAMLWithRawNamespacedAttr = `map: +const expectedYAMLWithRawNamespacedAttr = `+p_xml: version="1.0" +map: +xmlns: some-namespace +xmlns:xsi: some-instance +xsi:schemaLocation: some-url @@ -181,8 +183,44 @@ const xmlWithCustomDtd = ` &writer;©right; ` -const expectedDtd = `root: - item: '&writer;©right;' +const expectedDtd = ` + + +]> + + &writer;&copyright; + +` + +const expectedSkippedDtd = 
` + + &writer;&copyright; + +` + +const xmlWithProcInstAndDirectives = ` + + + + + things + +` + +const yamlWithProcInstAndDirectives = `+p_xml: version="1.0" ++directive: 'DOCTYPE config SYSTEM "/etc/iwatch/iwatch.dtd" ' +apple: + +p_coolioo: version="1.0" + +directive: 'CATYPE meow purr puss ' + b: things +` + +const expectedXmlWithProcInstAndDirectives = ` + + + things + ` var xmlScenarios = []formatScenario{ @@ -190,38 +228,46 @@ var xmlScenarios = []formatScenario{ description: "Parse xml: simple", subdescription: "Notice how all the values are strings, see the next example on how you can fix that.", input: "\n\n meow\n 4\n true\n", - expected: "cat:\n says: meow\n legs: \"4\"\n cute: \"true\"\n", + expected: "+p_xml: version=\"1.0\" encoding=\"UTF-8\"\ncat:\n says: meow\n legs: \"4\"\n cute: \"true\"\n", }, { description: "Parse xml: number", subdescription: "All values are assumed to be strings when parsing XML, but you can use the `from_yaml` operator on all the strings values to autoparse into the correct type.", input: "\n\n meow\n 4\n true\n", expression: " (.. | select(tag == \"!!str\")) |= from_yaml", - expected: "cat:\n says: meow\n legs: 4\n cute: true\n", + expected: "+p_xml: version=\"1.0\" encoding=\"UTF-8\"\ncat:\n says: meow\n legs: 4\n cute: true\n", }, { description: "Parse xml: array", subdescription: "Consecutive nodes with identical xml names are assumed to be arrays.", input: "\ncat\ngoat", - expected: "animal:\n - cat\n - goat\n", + expected: "+p_xml: version=\"1.0\" encoding=\"UTF-8\"\nanimal:\n - cat\n - goat\n", }, { description: "Parse xml: attributes", subdescription: "Attributes are converted to fields, with the default attribute prefix '+'. 
Use '--xml-attribute-prefix` to set your own.", input: "\n\n 7\n", - expected: "cat:\n +legs: \"4\"\n legs: \"7\"\n", + expected: "+p_xml: version=\"1.0\" encoding=\"UTF-8\"\ncat:\n +legs: \"4\"\n legs: \"7\"\n", }, { description: "Parse xml: attributes with content", subdescription: "Content is added as a field, using the default content name of `+content`. Use `--xml-content-name` to set your own.", input: "\nmeow", - expected: "cat:\n +content: meow\n +legs: \"4\"\n", + expected: "+p_xml: version=\"1.0\" encoding=\"UTF-8\"\ncat:\n +content: meow\n +legs: \"4\"\n", }, { description: "Parse xml: custom dtd", - subdescription: "DTD entities are ignored.", + subdescription: "DTD entities are processed as directives.", input: xmlWithCustomDtd, expected: expectedDtd, + scenarioType: "roundtrip", + }, + { + description: "Parse xml: skip custom dtd", + subdescription: "DTDs are directives, skip over directives to skip DTDs.", + input: xmlWithCustomDtd, + expected: expectedSkippedDtd, + scenarioType: "roundtrip-skip-directives", }, { description: "Parse xml: with comments", @@ -322,9 +368,10 @@ var xmlScenarios = []formatScenario{ scenarioType: "encode", }, { + description: "double prefix", skipDoc: true, - input: "cat:\n ++name: tiger\n meows: true\n", - expected: "\n true\n\n", + input: "cat:\n ++@name: tiger\n meows: true\n", + expected: "\n true\n\n", scenarioType: "encode", }, { @@ -341,6 +388,13 @@ var xmlScenarios = []formatScenario{ expected: expectedXMLWithComments, scenarioType: "encode", }, + { + description: "Encode: doctype and xml declaration", + subdescription: "Use the special xml names to add/modify proc instructions and directives.", + input: yamlWithProcInstAndDirectives, + expected: expectedXmlWithProcInstAndDirectives, + scenarioType: "encode", + }, { description: "Round trip: with comments", subdescription: "A best effort is made, but comment positions and white space are not preserved perfectly.", @@ -348,21 +402,35 @@ var xmlScenarios = 
[]formatScenario{ expected: expectedRoundtripXMLWithComments, scenarioType: "roundtrip", }, + { + description: "Roundtrip: with doctype and declaration", + subdescription: "yq parses XML proc instructions and directives into nodes.\nUnfortunately the underlying XML parser loses whitespace information.", + input: xmlWithProcInstAndDirectives, + expected: expectedXmlWithProcInstAndDirectives, + scenarioType: "roundtrip", + }, } func testXMLScenario(t *testing.T, s formatScenario) { switch s.scenarioType { case "", "decode": - test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content", false, false, false), NewYamlEncoder(4, false, true, true)), s.description) + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder(XMLPreferences), NewYamlEncoder(4, false, true, true)), s.description) case "encode": - test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewXMLEncoder(2, "+", "+content")), s.description) + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewXMLEncoder(2, XMLPreferences)), s.description) case "roundtrip": - test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content", false, false, false), NewXMLEncoder(2, "+", "+content")), s.description) + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder(XMLPreferences), NewXMLEncoder(2, XMLPreferences)), s.description) case "decode-keep-ns": - test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content", false, true, false), NewYamlEncoder(2, false, true, true)), s.description) + prefs := NewDefaultXmlPreferences() + prefs.KeepNamespace = true + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder(prefs), NewYamlEncoder(2, false, true, true)), s.description) case "decode-raw-token": - test.AssertResultWithContext(t, s.expected, 
processFormatScenario(s, NewXMLDecoder("+", "+content", false, true, true), NewYamlEncoder(2, false, true, true)), s.description) - + prefs := NewDefaultXmlPreferences() + prefs.UseRawToken = true + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder(prefs), NewYamlEncoder(2, false, true, true)), s.description) + case "roundtrip-skip-directives": + prefs := NewDefaultXmlPreferences() + prefs.SkipDirectives = true + test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder(prefs), NewXMLEncoder(2, prefs)), s.description) default: panic(fmt.Sprintf("unhandled scenario type %q", s.scenarioType)) } @@ -385,6 +453,8 @@ func documentXMLScenario(t *testing.T, w *bufio.Writer, i interface{}) { documentXMLDecodeKeepNsScenario(w, s) case "decode-raw-token": documentXMLDecodeKeepNsRawTokenScenario(w, s) + case "roundtrip-skip-directives": + documentXMLSkipDirectrivesScenario(w, s) default: panic(fmt.Sprintf("unhandled scenario type %q", s.scenarioType)) @@ -410,7 +480,7 @@ func documentXMLDecodeScenario(w *bufio.Writer, s formatScenario) { writeOrPanic(w, fmt.Sprintf("```bash\nyq -p=xml '%v' sample.xml\n```\n", expression)) writeOrPanic(w, "will output\n") - writeOrPanic(w, fmt.Sprintf("```yaml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false, false, false), NewYamlEncoder(2, false, true, true)))) + writeOrPanic(w, fmt.Sprintf("```yaml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(XMLPreferences), NewYamlEncoder(2, false, true, true)))) } func documentXMLDecodeKeepNsScenario(w *bufio.Writer, s formatScenario) { @@ -427,11 +497,14 @@ func documentXMLDecodeKeepNsScenario(w *bufio.Writer, s formatScenario) { writeOrPanic(w, "then\n") writeOrPanic(w, "```bash\nyq -p=xml -o=xml --xml-keep-namespace '.' 
sample.xml\n```\n") writeOrPanic(w, "will output\n") + prefs := NewDefaultXmlPreferences() + prefs.KeepNamespace = true + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(prefs), NewXMLEncoder(2, prefs)))) - writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false, true, false), NewXMLEncoder(2, "+", "+content")))) - + prefsWithout := NewDefaultXmlPreferences() + prefs.KeepNamespace = false writeOrPanic(w, "instead of\n") - writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false, false, false), NewXMLEncoder(2, "+", "+content")))) + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(prefsWithout), NewXMLEncoder(2, prefsWithout)))) } func documentXMLDecodeKeepNsRawTokenScenario(w *bufio.Writer, s formatScenario) { @@ -449,10 +522,16 @@ func documentXMLDecodeKeepNsRawTokenScenario(w *bufio.Writer, s formatScenario) writeOrPanic(w, "```bash\nyq -p=xml -o=xml --xml-keep-namespace --xml-raw-token '.' 
sample.xml\n```\n") writeOrPanic(w, "will output\n") - writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false, true, true), NewXMLEncoder(2, "+", "+content")))) + prefs := NewDefaultXmlPreferences() + prefs.KeepNamespace = true + + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(prefs), NewXMLEncoder(2, prefs)))) + + prefsWithout := NewDefaultXmlPreferences() + prefsWithout.KeepNamespace = false writeOrPanic(w, "instead of\n") - writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false, false, false), NewXMLEncoder(2, "+", "+content")))) + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(prefsWithout), NewXMLEncoder(2, prefsWithout)))) } func documentXMLEncodeScenario(w *bufio.Writer, s formatScenario) { @@ -470,7 +549,7 @@ func documentXMLEncodeScenario(w *bufio.Writer, s formatScenario) { writeOrPanic(w, "```bash\nyq -o=xml '.' sample.yml\n```\n") writeOrPanic(w, "will output\n") - writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewYamlDecoder(), NewXMLEncoder(2, "+", "+content")))) + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewYamlDecoder(), NewXMLEncoder(2, XMLPreferences)))) } func documentXMLRoundTripScenario(w *bufio.Writer, s formatScenario) { @@ -488,7 +567,27 @@ func documentXMLRoundTripScenario(w *bufio.Writer, s formatScenario) { writeOrPanic(w, "```bash\nyq -p=xml -o=xml '.' 
sample.xml\n```\n") writeOrPanic(w, "will output\n") - writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false, false, false), NewXMLEncoder(2, "+", "+content")))) + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(XMLPreferences), NewXMLEncoder(2, XMLPreferences)))) +} + +func documentXMLSkipDirectrivesScenario(w *bufio.Writer, s formatScenario) { + writeOrPanic(w, fmt.Sprintf("## %v\n", s.description)) + + if s.subdescription != "" { + writeOrPanic(w, s.subdescription) + writeOrPanic(w, "\n\n") + } + + writeOrPanic(w, "Given a sample.xml file of:\n") + writeOrPanic(w, fmt.Sprintf("```xml\n%v\n```\n", s.input)) + + writeOrPanic(w, "then\n") + writeOrPanic(w, "```bash\nyq -p=xml -o=xml --xml-skip-directives '.' sample.xml\n```\n") + writeOrPanic(w, "will output\n") + prefs := NewDefaultXmlPreferences() + prefs.SkipDirectives = true + + writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder(prefs), NewXMLEncoder(2, prefs)))) } func TestXMLScenarios(t *testing.T) {