Add `--nul-output|-0` flag to separate element with NUL character (#1550)

This ensures robust parsing of complex data (any binary content
except NUL chars) by separating the output of each member of the `yq`
root collection with a NUL char. As a safeguard, an error is raised
when NUL-separated output is requested and a value itself contains a
NUL character.
This commit is contained in:
Valentin Lab 2023-03-29 00:51:55 +02:00 committed by GitHub
parent 311622d14b
commit 5fd2890d1b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 389 additions and 3 deletions

286
acceptance_tests/nul-separator.sh Executable file
View File

@ -0,0 +1,286 @@
#!/bin/bash
## shunit2 hook run before every test: removes the YAML fixtures left
## over by the previous test so each test starts from a clean state.
setUp() {
    ## `-f` keeps the call silent when no fixture exists yet (first test),
    ## `--` protects against file names starting with a dash. The
    ## `|| true` keeps the hook from ever failing the test run.
    rm -f -- test*.yml || true
}
## Read NUL separated values from stdin into the variables whose names
## are given as arguments (one value per variable, in order).
##
## Returns 0 when every variable received a NUL terminated value, and
## 1 when the end of the stream was reached first. Variables that could
## not be filled are still assigned (so they end up empty).
read-0() {
    ## An empty IFS prevents `read` from trimming whitespace.
    local IFS='' hit_eof=""
    while [[ $# -gt 0 && -n "$1" ]]; do
        ## - `-r` keeps backslash sequences (like '\n') untouched.
        ## - `-d ''` selects NUL as the record delimiter.
        ## - `--` ends option parsing, so a variable name starting
        ##   with "-" produces a readable error instead of being
        ##   taken for an option.
        if ! read -r -d '' -- "$1"; then
            hit_eof=1
        fi
        shift
    done
    ## Fail when EOF was met before all variables were filled.
    [[ -z "$hit_eof" ]]
}
## Read NUL separated values from stdin the safe way AND recover the
## errorlevel of the process that produced the stream. The producer is
## expected to print its errorlevel, without a trailing NUL, as the very
## last thing on its standard output (this is what `p-err` does).
##
## Usage: read-0-err STATUS_VAR VAR...
##
## - Returns 0 when every VAR was filled with a NUL terminated value;
##   STATUS_VAR is then set to "0".
## - Returns 1 when the end of the stream was reached. STATUS_VAR then
##   holds either:
##   - the producer errorlevel (0-255) when the stream ended cleanly on
##     a record boundary,
##   - "not-enough-values" when the stream ended in the middle of a
##     record,
##   - "last-value-not-a-number" when the trailing data is not a valid
##     errorlevel.
read-0-err() {
    ## IFS is emptied so `read` does not trim leading/trailing
    ## whitespace from the values (same behavior as `read-0`).
    local ret="$1" eof="" idx=0 last= IFS=''
    read -r -- "${ret?}" <<<"0"
    shift
    while [ "$1" ]; do
        last=$idx
        read -r -d '' -- "$1" || {
            ## EOF reached: what was read is (hopefully) the errorlevel
            ## of the producer, store it in ${!ret}
            eof="$1"
            read -r -- "$ret" <<<"${!eof}"
            break
        }
        ## Plain assignment: `((idx++))` has a non-zero status when
        ## idx is 0.
        idx=$((idx + 1))
        shift
    done
    [ -z "$eof" ] && return 0
    if [ "$last" != 0 ]; then
        ## Uhoh, we have no idea if the errorlevel of the internal
        ## command was properly delimited with a NUL char, and
        ## anyway something went really wrong at least about the
        ## number of fields separated by NUL char and the one
        ## expected.
        echo "Error: read-0-err couldn't fill all value $ret = '${!ret}', '$eof', '${!eof}'" >&2
        read -r -- "$ret" <<<"not-enough-values"
    elif ! [[ "${!ret}" =~ ^[0-9]+$ && "${!ret}" -ge 0 && "${!ret}" -le 255 ]]; then
        ## This could happen if you don't use `p-err` wrapper,
        ## or used stdout in unexpected ways in your inner
        ## command. Valid errorlevels range from 0 to 255 (values
        ## above 127 are used e.g. for processes killed by a signal).
        echo "Error: last value is not a number, did you finish with an errorlevel ?" >&2
        read -r -- "$ret" <<<"last-value-not-a-number"
    fi
    false
}
## Runs the command given as arguments, then appends its errorlevel
## (without any trailing separator) to the standard output. Is expected
## to be used in tandem with `read-0-err`, which parses that trailing
## errorlevel.
##
## No local variable is declared here on purpose: a local would shadow
## a caller variable of the same name when the command is a shell
## function.
p-err() {
    "$@"
    printf "%s" "$?"
}
## Runs `./yq` on expression "$1" with NUL separated output and scalar
## unwrapping disabled (`-r=false`), then appends the errorlevel of
## `./yq` to the standard output (same output contract as `p-err`).
wyq-r() {
    ./yq e -0 -r=false "$1"
    printf "%s" "$?"
}
## Checks the exact bytes produced by `./yq -0`: each value must be
## followed by a NUL character.
testBasicUsageRaw() {
    local expected_dump actual_dump
    printf '%s\n' 'a: foo' 'b: bar' >test.yml
    printf "foo\0bar\0" > expected.out

    ## Binary content has to be compared here, so both sides are passed
    ## through a representation that gets rid of the NUL chars while
    ## accurately transcribing the content. 'hd' (a widely available
    ## shortcut to 'hexdump') is used because it pretty-prints any
    ## binary to its hexadecimal representation, which gives a readable
    ## output if the test fails.
    ##
    ## The values cannot be given directly to `assertEquals`: it
    ## compares its arguments, and arguments can't hold NUL characters
    ## (a limitation of the C API of the `exec*(..)` functions, which
    ## take a `const char *argv[]` of NUL terminated strings). As a
    ## consequence, NUL characters get removed from bash arguments.
    expected_dump="$(hd expected.out)"
    actual_dump="$(./yq e -0 '.a, .b' test.yml | hd)"
    assertEquals "$expected_dump" "$actual_dump"

    rm expected.out
}
## Nominal usage of `read-0` on the NUL separated output of `./yq`:
## enough values, not enough values, and a failing inner command.
## `assertEquals` is always called as (expected, actual).
testBasicUsage() {
    local a b
    cat >test.yml <<EOL
a: foo
b: bar
EOL

    ## We provide 2 values, and ask to fill 2 variables.
    read-0 a b < <(./yq e -0 '.a, .b' test.yml)
    assertEquals "0" "$?"   ## Everything is fine
    assertEquals "foo" "$a" ## Values are correctly parsed
    assertEquals "bar" "$b"

    a=YYY ; b=XXX
    ## Not enough values provided to fill `a` and `b`.
    read-0 a b < <(./yq e -0 '.a' test.yml)
    assertEquals "1" "$?"   ## An error was emitted
    assertEquals "foo" "$a" ## First value was correctly parsed
    assertEquals "" "$b"    ## Second was still reset

    ## Errors from the inner command cannot be caught with `read-0`:
    ## only the first value is requested here, so the later failure of
    ## `./yq` goes unnoticed. Use `read-0-err`/`p-err` for that.
    read-0 a < <(printf "\0"; ./yq e -0 'xxx' test.yml; )
    assertEquals "0" "$?"
}
## With `-o=json`, every NUL separated value is a JSON document: the
## map under `.a` is printed as a multi-line JSON object, the scalar
## under `.b` as a quoted JSON string.
testBasicUsageJson() {
    ## Keep the read values out of the global scope, as the other
    ## tests do.
    local a b
    ## `x` must be nested under `a` for `.a` to be a map.
    cat >test.yml <<EOL
a:
  x: foo
b: bar
EOL

    read-0 a b < <(./yq e -0 -o=json '.a, .b' test.yml)

    assertEquals '{
  "x": "foo"
}' "$a"
    assertEquals '"bar"' "$b"
}
## Values that contain a NUL char cannot be emitted unwrapped in NUL
## separated output. This checks how `read-0` behaves in that case, and
## that `-r=false` works around it.
testFailWithValueContainingNUL() {
    local a b c
    ## Note that value of field 'a' actually contains a NUL char !
    ## The body of the block scalar `c` must be indented to be valid
    ## YAML.
    cat >test.yml <<EOL
a: "foo\u0000bar"
b: 1
c: |
  wiz
  boom
EOL

    ## We are looking for trouble by asking to separate fields with NUL
    ## char while the requested value `.a` itself contains a NUL char !
    read-0 a b c < <(./yq e -0 '.a, .b, .c' test.yml)
    assertNotEquals "0" "$?" ## read-0 failed to fill all values

    ## But here, we can request for one value, even if `./yq` fails
    read-0 b < <(./yq e -0 '.b, .a' test.yml)
    assertEquals "0" "$?" ## read-0 succeeds at feeding the first value
    ## Note: to catch the failure of `yq`, see in the next tests the usage
    ## of `read-0-err`.

    ## Using -r=false solves any NUL containing value issues, but keeps
    ## all values in their YAML representation:
    read-0 a b c < <(./yq e -0 -r=false '.a, .b, .c' test.yml)
    assertEquals "0" "$?" ## All goes well despite asking for `a` value
    assertEquals '"foo\0bar"' "$a" ## This is a YAML string representation
    assertEquals '1' "$b"
    assertEquals '|
  wiz
  boom' "$c"
}
## Nominal `read-0-err`/`p-err` loop: the 4 values of the sequence are
## consumed two at a time as key/value pairs.
testStandardLoop() {
    local E a b
    local res=""

    ## Everything is normal here: an even number of values.
    printf '%s\n' '- yay' '- wiz' '- hop' '- pow' >test.yml

    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)

    assertEquals "0" "$E"                    ## errorlevel of internal command
    assertEquals "yay: wiz;hop: pow;" "$res" ## expected result
}
## `read-0-err` loop over an odd number of values: the last pair cannot
## be completed, which must be reported through `E`.
testStandardLoopWithoutEnoughValues() {
    local E a b
    local res=""

    ## 5 values: one will be missing when reading them by pairs.
    printf '%s\n' '- yay' '- wiz' '- hop' '- pow' '- kwak' >test.yml

    ## Two successful passes are expected, then a failure.
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)

    assertEquals "not-enough-values" "$E"    ## Not enough value error
    assertEquals "yay: wiz;hop: pow;" "$res" ## the 2 full key/value pairs
}
## `read-0-err` loop where the inner `./yq` command fails on a record
## boundary: its errorlevel must end up in `E`.
testStandardLoopWithInternalCmdError() {
    local E a b
    local res=""

    ## Note the third value contains a NUL char !
    printf '%s\n' '- yay' '- wiz' '- "foo\0bar"' '- hop' '- pow' >test.yml

    ## Only the second pass in the loop is expected to make
    ## read-0-err notice that an error occurred.
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)

    assertEquals "1" "$E"           ## Internal command errorlevel (from `./yq`)
    assertEquals "yay: wiz;" "$res" ## first 2 values were ok at least
}
## `read-0-err` loop where the inner `./yq` command fails in the middle
## of a record: the 'not-enough-values' error takes precedence over the
## errorlevel of the inner command.
testStandardLoopNotEnoughErrorEatsCmdError() {
    local E a b
    local res=""

    ## The errorlevel printed by `p-err` on the standard output might
    ## get mangled with the unfinished record in some edge cases. For
    ## that reason `read-0-err E ...` does NOT report the internal
    ## command error in E here, and stores 'not-enough-values' instead.
    ## In the real world you will want to react the same way whether
    ## the internal command failed and/or fewer values than expected
    ## were read. Also keep in mind that standard error is not
    ## swallowed: reports from the inner command AND from `read-0-err`
    ## can still be read.

    ## Here, note that the fourth value contains a NUL char !
    printf '%s\n' '- yay' '- wiz' '- hop' '- "foo\0bar"' '- pow' >test.yml

    ## Only the second pass in the loop is expected to make
    ## read-0-err notice that there is not enough data to fill the
    ## requested variables.
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)

    assertEquals "not-enough-values" "$E" ## Not enough values error eats internal error !
    assertEquals "yay: wiz;" "$res"       ## first 2 values were ok at least
}
source ./scripts/shunit2

View File

@ -18,6 +18,7 @@ var colorsEnabled = false
var indent = 2 var indent = 2
var noDocSeparators = false var noDocSeparators = false
var nullInput = false var nullInput = false
var nulSepOutput = false
var verbose = false var verbose = false
var version = false var version = false
var prettyPrint = false var prettyPrint = false

View File

@ -90,6 +90,9 @@ func evaluateAll(cmd *cobra.Command, args []string) (cmdError error) {
} }
printer := yqlib.NewPrinter(encoder, printerWriter) printer := yqlib.NewPrinter(encoder, printerWriter)
if nulSepOutput {
printer.SetNulSepOutput(true)
}
if frontMatter != "" { if frontMatter != "" {
frontMatterHandler := yqlib.NewFrontMatterHandler(args[0]) frontMatterHandler := yqlib.NewFrontMatterHandler(args[0])

View File

@ -99,6 +99,9 @@ func evaluateSequence(cmd *cobra.Command, args []string) (cmdError error) {
} }
printer := yqlib.NewPrinter(encoder, printerWriter) printer := yqlib.NewPrinter(encoder, printerWriter)
if nulSepOutput {
printer.SetNulSepOutput(true)
}
decoder, err := configureDecoder(false) decoder, err := configureDecoder(false)
if err != nil { if err != nil {

View File

@ -86,6 +86,7 @@ yq -P sample.json
rootCmd.PersistentFlags().BoolVarP(&writeInplace, "inplace", "i", false, "update the file inplace of first file given.") rootCmd.PersistentFlags().BoolVarP(&writeInplace, "inplace", "i", false, "update the file inplace of first file given.")
rootCmd.PersistentFlags().VarP(unwrapScalarFlag, "unwrapScalar", "r", "unwrap scalar, print the value with no quotes, colors or comments. Defaults to true for yaml") rootCmd.PersistentFlags().VarP(unwrapScalarFlag, "unwrapScalar", "r", "unwrap scalar, print the value with no quotes, colors or comments. Defaults to true for yaml")
rootCmd.PersistentFlags().Lookup("unwrapScalar").NoOptDefVal = "true" rootCmd.PersistentFlags().Lookup("unwrapScalar").NoOptDefVal = "true"
rootCmd.PersistentFlags().BoolVarP(&nulSepOutput, "nul-output", "0", false, "Use NUL char to separate values. If unwrap scalar is also set, fail if unwrapped scalar contains NUL char.")
rootCmd.PersistentFlags().BoolVarP(&prettyPrint, "prettyPrint", "P", false, "pretty print, shorthand for '... style = \"\"'") rootCmd.PersistentFlags().BoolVarP(&prettyPrint, "prettyPrint", "P", false, "pretty print, shorthand for '... style = \"\"'")
rootCmd.PersistentFlags().BoolVarP(&exitStatus, "exit-status", "e", false, "set exit status if there are no matches or null or false is returned") rootCmd.PersistentFlags().BoolVarP(&exitStatus, "exit-status", "e", false, "set exit status if there are no matches or null or false is returned")

View File

@ -2,6 +2,7 @@ package yqlib
import ( import (
"bufio" "bufio"
"bytes"
"container/list" "container/list"
"fmt" "fmt"
"io" "io"
@ -15,6 +16,7 @@ type Printer interface {
PrintedAnything() bool PrintedAnything() bool
//e.g. when given a front-matter doc, like jekyll //e.g. when given a front-matter doc, like jekyll
SetAppendix(reader io.Reader) SetAppendix(reader io.Reader)
SetNulSepOutput(nulSepOutput bool)
} }
type PrinterOutputFormat uint32 type PrinterOutputFormat uint32
@ -59,6 +61,7 @@ type resultsPrinter struct {
printedMatches bool printedMatches bool
treeNavigator DataTreeNavigator treeNavigator DataTreeNavigator
appendixReader io.Reader appendixReader io.Reader
nulSepOutput bool
} }
func NewPrinter(encoder Encoder, printerWriter PrinterWriter) Printer { func NewPrinter(encoder Encoder, printerWriter PrinterWriter) Printer {
@ -67,9 +70,16 @@ func NewPrinter(encoder Encoder, printerWriter PrinterWriter) Printer {
printerWriter: printerWriter, printerWriter: printerWriter,
firstTimePrinting: true, firstTimePrinting: true,
treeNavigator: NewDataTreeNavigator(), treeNavigator: NewDataTreeNavigator(),
nulSepOutput: false,
} }
} }
// SetNulSepOutput stores nulSepOutput on the printer. When the flag is
// set, PrintResults writes a NUL byte after each printed result.
// A debug message is logged on every call, whatever the value given.
func (p *resultsPrinter) SetNulSepOutput(nulSepOutput bool) {
	log.Debug("Setting NUL separator output")
	p.nulSepOutput = nulSepOutput
}
func (p *resultsPrinter) SetAppendix(reader io.Reader) { func (p *resultsPrinter) SetAppendix(reader io.Reader) {
p.appendixReader = reader p.appendixReader = reader
} }
@ -84,6 +94,16 @@ func (p *resultsPrinter) printNode(node *yaml.Node, writer io.Writer) error {
return p.encoder.Encode(writer, node) return p.encoder.Encode(writer, node)
} }
func removeLastEOL(b *bytes.Buffer) {
data := b.Bytes()
n := len(data)
if n >= 2 && data[n-2] == '\r' && data[n-1] == '\n' {
b.Truncate(n - 2)
} else if n >= 1 && (data[n-1] == '\r' || data[n-1] == '\n') {
b.Truncate(n - 1)
}
}
func (p *resultsPrinter) PrintResults(matchingNodes *list.List) error { func (p *resultsPrinter) PrintResults(matchingNodes *list.List) error {
log.Debug("PrintResults for %v matches", matchingNodes.Len()) log.Debug("PrintResults for %v matches", matchingNodes.Len())
@ -128,18 +148,40 @@ func (p *resultsPrinter) PrintResults(matchingNodes *list.List) error {
} }
} }
if err := p.encoder.PrintLeadingContent(writer, mappedDoc.LeadingContent); err != nil { var destination io.Writer = writer
tempBuffer := bytes.NewBuffer(nil)
if p.nulSepOutput {
destination = tempBuffer
}
if err := p.encoder.PrintLeadingContent(destination, mappedDoc.LeadingContent); err != nil {
return err return err
} }
if err := p.printNode(mappedDoc.Node, writer); err != nil { if err := p.printNode(mappedDoc.Node, destination); err != nil {
return err return err
} }
if err := p.encoder.PrintLeadingContent(writer, mappedDoc.TrailingContent); err != nil { if err := p.encoder.PrintLeadingContent(destination, mappedDoc.TrailingContent); err != nil {
return err return err
} }
if p.nulSepOutput {
removeLastEOL(tempBuffer)
tempBufferBytes := tempBuffer.Bytes()
if bytes.IndexByte(tempBufferBytes, 0) != -1 {
return fmt.Errorf(
"Can't serialize value because it contains NUL char and you are using NUL separated output",
)
}
if _, err := writer.Write(tempBufferBytes); err != nil {
return err
}
if _, err := writer.Write([]byte{0}); err != nil {
return err
}
}
p.previousDocIndex = mappedDoc.Document p.previousDocIndex = mappedDoc.Document
if err := writer.Flush(); err != nil { if err := writer.Flush(); err != nil {
return err return err

View File

@ -340,3 +340,53 @@ func TestPrinterMultipleDocsJson(t *testing.T) {
writer.Flush() writer.Flush()
test.AssertResult(t, expected, output.String()) test.AssertResult(t, expected, output.String())
} }
// TestPrinterNulSeparator evaluates ".a" against multiDocSample with NUL
// separated output enabled, and expects every result to be followed by a
// NUL byte.
func TestPrinterNulSeparator(t *testing.T) {
	var out bytes.Buffer
	bufferedOut := bufio.NewWriter(&out)

	printer := NewSimpleYamlPrinter(bufferedOut, YamlOutputFormat, true, false, 2, false)
	printer.SetNulSepOutput(true)

	expression, parseErr := getExpressionParser().ParseExpression(".a")
	if parseErr != nil {
		panic(parseErr)
	}

	evaluator := NewStreamEvaluator()
	if _, evalErr := evaluator.Evaluate("sample", strings.NewReader(multiDocSample), expression, printer, NewYamlDecoder(ConfiguredYamlPreferences)); evalErr != nil {
		panic(evalErr)
	}

	bufferedOut.Flush()
	test.AssertResult(t, "banana\x00apple\x00coconut\x00", out.String())
}
// TestPrinterNulSeparatorWithJson prints the documents of multiDocSample
// through a JSON encoder with NUL separated output enabled. Each JSON
// document is expected to be followed by a NUL byte, and the leading
// content set on the first document must not show up in the output.
func TestPrinterNulSeparatorWithJson(t *testing.T) {
	var out bytes.Buffer
	bufferedOut := bufio.NewWriter(&out)

	jsonEncoder := NewJSONEncoder(0, false, false)
	if jsonEncoder == nil {
		t.Skipf("no support for %s output format", "json")
	}
	printer := NewPrinter(jsonEncoder, NewSinglePrinterWriter(bufferedOut))
	printer.SetNulSepOutput(true)

	docs, readErr := readDocuments(strings.NewReader(multiDocSample), "sample.yml", 0, NewYamlDecoder(ConfiguredYamlPreferences))
	if readErr != nil {
		panic(readErr)
	}

	// Leading content on the first document; it is absent from the
	// expected output below.
	docs.Front().Value.(*CandidateNode).LeadingContent = "# ignore this\n"

	if printErr := printer.PrintResults(docs); printErr != nil {
		panic(printErr)
	}

	bufferedOut.Flush()
	expected := `{"a":"banana"}` + "\x00" + `{"a":"apple"}` + "\x00" + `{"a":"coconut"}` + "\x00"
	test.AssertResult(t, expected, out.String())
}