Disable strict XML parsing by default #1155

This commit is contained in:
Mike Farah 2022-03-28 14:05:10 +11:00
parent 3a1e2c7518
commit bbeae229ca
11 changed files with 86 additions and 14 deletions

View File

@ -58,6 +58,27 @@ EOM
assertEquals "$expected" "$X" assertEquals "$expected" "$X"
} }
testInputXmlStrict() {
cat >test.yml <<EOL
<?xml version="1.0"?>
<!DOCTYPE root [
<!ENTITY writer "Catherine.">
<!ENTITY copyright "(r) Great">
]>
<root>
<item>&writer;&copyright;</item>
</root>
EOL
X=$(./yq -p=xml --xml-strict-mode test.yml 2>&1)
assertEquals 1 $?
assertEquals "Error: bad file 'test.yml': XML syntax error on line 7: invalid character entity &writer;" "$X"
X=$(./yq ea -p=xml --xml-strict-mode test.yml 2>&1)
assertEquals "Error: bad file 'test.yml': XML syntax error on line 7: invalid character entity &writer;" "$X"
}
testInputXmlGithubAction() { testInputXmlGithubAction() {
cat >test.yml <<EOL cat >test.yml <<EOL
<cat legs="4">BiBi</cat> <cat legs="4">BiBi</cat>

View File

@ -10,6 +10,7 @@ var inputFormat = "yaml"
var xmlAttributePrefix = "+" var xmlAttributePrefix = "+"
var xmlContentName = "+content" var xmlContentName = "+content"
var xmlStrictMode = false
var exitStatus = false var exitStatus = false
var forceColor = false var forceColor = false

View File

@ -54,6 +54,7 @@ yq -P sample.json
yqlib.InitExpressionParser() yqlib.InitExpressionParser()
yqlib.XMLPreferences.AttributePrefix = xmlAttributePrefix yqlib.XMLPreferences.AttributePrefix = xmlAttributePrefix
yqlib.XMLPreferences.ContentName = xmlContentName yqlib.XMLPreferences.ContentName = xmlContentName
yqlib.XMLPreferences.StrictMode = xmlStrictMode
}, },
} }
@ -70,6 +71,7 @@ yq -P sample.json
rootCmd.PersistentFlags().StringVar(&xmlAttributePrefix, "xml-attribute-prefix", "+", "prefix for xml attributes") rootCmd.PersistentFlags().StringVar(&xmlAttributePrefix, "xml-attribute-prefix", "+", "prefix for xml attributes")
rootCmd.PersistentFlags().StringVar(&xmlContentName, "xml-content-name", "+content", "name for xml content (if no attribute name is present).") rootCmd.PersistentFlags().StringVar(&xmlContentName, "xml-content-name", "+content", "name for xml content (if no attribute name is present).")
rootCmd.PersistentFlags().BoolVar(&xmlStrictMode, "xml-strict-mode", false, "enables strict parsing of XML. See https://pkg.go.dev/encoding/xml for more details.")
rootCmd.PersistentFlags().BoolVarP(&nullInput, "null-input", "n", false, "Don't read input, simply evaluate the expression given. Useful for creating docs from scratch.") rootCmd.PersistentFlags().BoolVarP(&nullInput, "null-input", "n", false, "Don't read input, simply evaluate the expression given. Useful for creating docs from scratch.")
rootCmd.PersistentFlags().BoolVarP(&noDocSeparators, "no-doc", "N", false, "Don't print document separators (---)") rootCmd.PersistentFlags().BoolVarP(&noDocSeparators, "no-doc", "N", false, "Don't print document separators (---)")

View File

@ -53,7 +53,7 @@ func configureDecoder() (yqlib.Decoder, error) {
} }
switch yqlibInputFormat { switch yqlibInputFormat {
case yqlib.XMLInputFormat: case yqlib.XMLInputFormat:
return yqlib.NewXMLDecoder(xmlAttributePrefix, xmlContentName), nil return yqlib.NewXMLDecoder(xmlAttributePrefix, xmlContentName, xmlStrictMode), nil
case yqlib.PropertiesInputFormat: case yqlib.PropertiesInputFormat:
return yqlib.NewPropertiesDecoder(), nil return yqlib.NewPropertiesDecoder(), nil
} }

View File

@ -2,6 +2,7 @@ package yqlib
import ( import (
"encoding/xml" "encoding/xml"
"errors"
"io" "io"
"strings" "strings"
"unicode" "unicode"
@ -14,14 +15,15 @@ type xmlDecoder struct {
reader io.Reader reader io.Reader
attributePrefix string attributePrefix string
contentName string contentName string
strictMode bool
finished bool finished bool
} }
func NewXMLDecoder(attributePrefix string, contentName string) Decoder { func NewXMLDecoder(attributePrefix string, contentName string, strictMode bool) Decoder {
if contentName == "" { if contentName == "" {
contentName = "content" contentName = "content"
} }
return &xmlDecoder{attributePrefix: attributePrefix, contentName: contentName, finished: false} return &xmlDecoder{attributePrefix: attributePrefix, contentName: contentName, finished: false, strictMode: strictMode}
} }
func (dec *xmlDecoder) Init(reader io.Reader) { func (dec *xmlDecoder) Init(reader io.Reader) {
@ -189,7 +191,7 @@ type element struct {
// of the map keys. // of the map keys.
func (dec *xmlDecoder) decodeXML(root *xmlNode) error { func (dec *xmlDecoder) decodeXML(root *xmlNode) error {
xmlDec := xml.NewDecoder(dec.reader) xmlDec := xml.NewDecoder(dec.reader)
xmlDec.Strict = dec.strictMode
// That will convert the charset if the provided XML is non-UTF-8 // That will convert the charset if the provided XML is non-UTF-8
xmlDec.CharsetReader = charset.NewReaderLabel xmlDec.CharsetReader = charset.NewReaderLabel
@ -201,7 +203,7 @@ func (dec *xmlDecoder) decodeXML(root *xmlNode) error {
for { for {
t, e := xmlDec.Token() t, e := xmlDec.Token()
if e != nil { if e != nil && !errors.Is(e, io.EOF) {
return e return e
} }
if t == nil { if t == nil {

View File

@ -120,6 +120,31 @@ cat:
+legs: "4" +legs: "4"
``` ```
## Parse xml: custom dtd
DTD entities are ignored: entity references are not expanded and appear in the output unchanged.
Given a sample.xml file of:
```xml
<?xml version="1.0"?>
<!DOCTYPE root [
<!ENTITY writer "Blah.">
<!ENTITY copyright "Blah">
]>
<root>
<item>&writer;&copyright;</item>
</root>
```
then
```bash
yq -p=xml '.' sample.xml
```
will output
```yaml
root:
item: '&writer;&copyright;'
```
## Parse xml: with comments ## Parse xml: with comments
A best attempt is made to preserve comments. A best attempt is made to preserve comments.

View File

@ -9,7 +9,7 @@ import (
yaml "gopkg.in/yaml.v3" yaml "gopkg.in/yaml.v3"
) )
var XMLPreferences = xmlPreferences{AttributePrefix: "+", ContentName: "+content"} var XMLPreferences = xmlPreferences{AttributePrefix: "+", ContentName: "+content", StrictMode: false}
type xmlEncoder struct { type xmlEncoder struct {
attributePrefix string attributePrefix string

View File

@ -413,9 +413,9 @@ func initLexer() (*lex.Lexer, error) {
lexer.Add([]byte(`load`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewYamlDecoder()})) lexer.Add([]byte(`load`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewYamlDecoder()}))
lexer.Add([]byte(`xmlload`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName)})) lexer.Add([]byte(`xmlload`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName, XMLPreferences.StrictMode)}))
lexer.Add([]byte(`load_xml`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName)})) lexer.Add([]byte(`load_xml`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName, XMLPreferences.StrictMode)}))
lexer.Add([]byte(`loadxml`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName)})) lexer.Add([]byte(`loadxml`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName, XMLPreferences.StrictMode)}))
lexer.Add([]byte(`load_base64`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewBase64Decoder()})) lexer.Add([]byte(`load_base64`), opTokenWithPrefs(loadOpType, nil, loadPrefs{loadAsString: false, decoder: NewBase64Decoder()}))

View File

@ -24,6 +24,7 @@ func InitExpressionParser() {
type xmlPreferences struct { type xmlPreferences struct {
AttributePrefix string AttributePrefix string
ContentName string ContentName string
StrictMode bool
} }
var log = logging.MustGetLogger("yq-lib") var log = logging.MustGetLogger("yq-lib")

View File

@ -104,7 +104,7 @@ func decodeOperator(d *dataTreeNavigator, context Context, expressionNode *Expre
case YamlInputFormat: case YamlInputFormat:
decoder = NewYamlDecoder() decoder = NewYamlDecoder()
case XMLInputFormat: case XMLInputFormat:
decoder = NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName) decoder = NewXMLDecoder(XMLPreferences.AttributePrefix, XMLPreferences.ContentName, XMLPreferences.StrictMode)
case Base64InputFormat: case Base64InputFormat:
decoder = NewBase64Decoder() decoder = NewBase64Decoder()
} }

View File

@ -153,6 +153,20 @@ var expectedXMLWithComments = `<!-- above_cat inline_cat --><cat><!-- above_arra
</cat><!-- below_cat --> </cat><!-- below_cat -->
` `
var xmlWithCustomDtd = `
<?xml version="1.0"?>
<!DOCTYPE root [
<!ENTITY writer "Blah.">
<!ENTITY copyright "Blah">
]>
<root>
<item>&writer;&copyright;</item>
</root>`
var expectedDtd = `root:
item: '&writer;&copyright;'
`
var xmlScenarios = []formatScenario{ var xmlScenarios = []formatScenario{
{ {
description: "Parse xml: simple", description: "Parse xml: simple",
@ -185,6 +199,12 @@ var xmlScenarios = []formatScenario{
input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat legs=\"4\">meow</cat>", input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<cat legs=\"4\">meow</cat>",
expected: "cat:\n +content: meow\n +legs: \"4\"\n", expected: "cat:\n +content: meow\n +legs: \"4\"\n",
}, },
{
description: "Parse xml: custom dtd",
subdescription: "DTD entities are ignored.",
input: xmlWithCustomDtd,
expected: expectedDtd,
},
{ {
description: "Parse xml: with comments", description: "Parse xml: with comments",
subdescription: "A best attempt is made to preserve comments.", subdescription: "A best attempt is made to preserve comments.",
@ -286,9 +306,9 @@ func testXMLScenario(t *testing.T, s formatScenario) {
if s.scenarioType == "encode" { if s.scenarioType == "encode" {
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewXMLEncoder(2, "+", "+content")), s.description) test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewXMLEncoder(2, "+", "+content")), s.description)
} else if s.scenarioType == "roundtrip" { } else if s.scenarioType == "roundtrip" {
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content"), NewXMLEncoder(2, "+", "+content")), s.description) test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content", false), NewXMLEncoder(2, "+", "+content")), s.description)
} else { } else {
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content"), NewYamlEncoder(4, false, true, true)), s.description) test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewXMLDecoder("+", "+content", false), NewYamlEncoder(4, false, true, true)), s.description)
} }
} }
@ -327,7 +347,7 @@ func documentXMLDecodeScenario(w *bufio.Writer, s formatScenario) {
writeOrPanic(w, fmt.Sprintf("```bash\nyq -p=xml '%v' sample.xml\n```\n", expression)) writeOrPanic(w, fmt.Sprintf("```bash\nyq -p=xml '%v' sample.xml\n```\n", expression))
writeOrPanic(w, "will output\n") writeOrPanic(w, "will output\n")
writeOrPanic(w, fmt.Sprintf("```yaml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content"), NewYamlEncoder(2, false, true, true)))) writeOrPanic(w, fmt.Sprintf("```yaml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false), NewYamlEncoder(2, false, true, true))))
} }
func documentXMLEncodeScenario(w *bufio.Writer, s formatScenario) { func documentXMLEncodeScenario(w *bufio.Writer, s formatScenario) {
@ -363,7 +383,7 @@ func documentXMLRoundTripScenario(w *bufio.Writer, s formatScenario) {
writeOrPanic(w, "```bash\nyq -p=xml -o=xml '.' sample.xml\n```\n") writeOrPanic(w, "```bash\nyq -p=xml -o=xml '.' sample.xml\n```\n")
writeOrPanic(w, "will output\n") writeOrPanic(w, "will output\n")
writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content"), NewXMLEncoder(2, "+", "+content")))) writeOrPanic(w, fmt.Sprintf("```xml\n%v```\n\n", processFormatScenario(s, NewXMLDecoder("+", "+content", false), NewXMLEncoder(2, "+", "+content"))))
} }
func TestXMLScenarios(t *testing.T) { func TestXMLScenarios(t *testing.T) {