Add `--nul-output|-0` flag to separate element with NUL character (#1550)

This ensures robust parsing of complex data (any binary content except NUL chars) by terminating the output of each member of `yq`'s root collection with a NUL char. As a safeguard, an error is raised when NUL-separated output is requested and a value to be output itself contains a NUL character.
2025-02-03 20:54:18 +00:00 · 2023-03-29 00:51:55 +02:00 · 2023-03-29 00:51:55 +02:00 · 5fd2890d1b
commit 5fd2890d1b
parent 311622d14b
7 changed files with 389 additions and 3 deletions
--- a/acceptance_tests/nul-separator.sh
+++ b/acceptance_tests/nul-separator.sh
@ -0,0 +1,286 @@
 #!/bin/bash
 ## Test-suite setup hook: clears YAML fixtures left over by a previous test.
 setUp() {
    ## `-f` keeps rm quiet when the glob matches nothing; `|| true`
    ## still swallows any other failure so setup stays best-effort.
    rm -f test*.yml || true
 }
 ## Convenient bash shortcut to read records of NUL separated values
 ## from stdin the safe way. See example usage in the next tests.
 read-0() {
    local IFS='' eof=""
    ## Consume one NUL-terminated record per variable name given,
    ## stopping at the first empty (or missing) argument.
    until [ -z "$1" ]; do
        ## - `-r` keeps backslashes literal, so sequences such as '\n'
        ##   are not interpreted.
        ## - `-d ''` selects NUL as the record delimiter.
        ## - `--` ends option parsing, so a "$1" starting with "-" is
        ##   handed to `read` as a variable name (and rejected with a
        ##   readable error) instead of being taken for an option.
        if ! read -r -d '' -- "$1"; then
            eof=1
        fi
        shift
    done
    ## Fail if any read ended before reaching its NUL delimiter.
    [ -z "$eof" ]
 }
 ## Convenient bash shortcut to be used with the next function `p-err`
 ## to read NUL separated values the safe way AND catch any errors from
 ## the process creating the stream of NUL separated data.  See example
 ## usage in the tests.
 read-0-err() {
    ## $1 is the NAME of the variable receiving the status; the
    ## remaining arguments are the names of the variables to fill, one
    ## NUL-terminated record each.
    local ret="$1" eof="" idx=0 last=
    ## Start with a status of "0" (`${ret?}` aborts if no name was given).
    read -r -- "${ret?}" <<<"0"
    shift
    while [ "$1" ]; do
        ## Remember the index of the field being read right now.
        last=$idx
        read -r -d '' -- "$1" || {
            ## The read ended without a NUL delimiter: keep the name of
            ## the variable it was filling and copy whatever it got
            ## into ${!ret}.
            eof="$1"
            read -r -- "$ret" <<<"${!eof}"
            break
        }
        ((idx++))
        shift
    done
    ## From here on, only the "a read hit EOF" case is handled; the
    ## function then always returns failure.
    [ -z "$eof" ] || {
        if [ "$last" != 0 ]; then
            ## EOF happened after the first field of the record: we
            ## cannot tell whether the errorlevel of the internal
            ## command was properly delimited with a NUL char, and in
            ## any case the number of NUL-separated fields does not
            ## match the number expected.
            echo "Error: read-0-err couldn't fill all value $ret = '${!ret}', '$eof', '${!eof}'" >&2
            read -r -- "$ret" <<<"not-enough-values"
        else
            ## EOF happened on the first field: the leftover data is
            ## expected to be an errorlevel, accepted here only as an
            ## integer between 0 and 127.
            if ! [[ "${!ret}" =~ ^[0-9]+$ && "${!ret}" -ge 0 && "${!ret}" -le 127 ]]; then
                ## This can happen if the `p-err` wrapper was not used,
                ## or if the inner command used stdout in unexpected
                ## ways.
                echo "Error: last value is not a number, did you finish with an errorlevel ?" >&2
                read -r -- "$ret" <<<"last-value-not-a-number"
            fi
        fi
        false
    }
 }
 ## Runs the command given as arguments, then appends its exit status
 ## to standard output (with no trailing NUL or newline). Meant to be
 ## used in tandem with `read-0-err`, which picks that status up as the
 ## final, undelimited field of the stream.
 p-err() {
    "$@"
    printf "%s" "$?"
 }
 ## Runs `./yq e -0 -r=false` on the expression given as first argument,
 ## then appends yq's exit status to standard output (no trailing NUL or
 ## newline), the same way `p-err` does.
 wyq-r() {
    ./yq e -0 -r=false "$1"
    printf "%s" "$?"
 }
 testBasicUsageRaw() {
   cat >test.yml <<EOL
 a: foo
 b: bar
 EOL
   printf "foo\0bar\0" > expected.out
   ## We need to compare binary content here, so both sides are filtered
   ## through a representation that gets rid of NUL chars but still
   ## transcribes the content accurately.
   ## As a readable output is welcome when the test fails, we use 'hd':
   ## a shortcut to 'hexdump' that pretty-prints any binary content to
   ## its hexadecimal representation.
   ##
   ## Note that the standard `assertEquals` compares the values of its
   ## arguments, and these can't hold NUL characters (a limitation
   ## coming from the C API of the `exec*(..)` functions, which take a
   ## `const char *argv[]` of NUL-terminated strings). As a
   ## consequence, NUL characters get removed from bash arguments.
   assertEquals "$(hd expected.out)" \
               "$(./yq e -0 '.a, .b' test.yml | hd)"
   rm expected.out
 }
 ## Exercises `read-0` against NUL-separated yq output: enough values,
 ## not enough values, and a failing inner command.
 testBasicUsage() {
   local a b
   cat >test.yml <<EOL
 a: foo
 b: bar
 EOL
   ## We provide 2 values, and ask to fill 2 variables.
   read-0 a b < <(./yq e -0 '.a, .b' test.yml)
   ## assertEquals takes the expected value first, then the actual one.
   assertEquals "0" "$?"      ## Everything is fine
   assertEquals "foo" "$a"    ## Values are correctly parsed
   assertEquals "bar" "$b"
   a=YYY ; b=XXX
   ## Not enough values provided to fill `a` and `b`.
   read-0 a b < <(./yq e -0 '.a' test.yml)
   assertEquals "1" "$?"      ## An error was emitted
   assertEquals "foo" "$a"    ## First value was correctly parsed
   assertEquals "" "$b"       ## Second was still reset
   ## Errors from the inner command can't be caught by `read-0`. Use
   ## `read-0-err`/`p-err` for that.
   read-0 a < <(printf "\0"; ./yq e -0 'xxx' test.yml; )
   assertEquals "0" "$?"
 }
 ## Checks that JSON-encoded values (`-o=json`) are NUL-separated too and
 ## can be read back one per variable with `read-0`.
 testBasicUsageJson() {
   ## Keep the variables filled by `read-0` local so they don't leak
   ## into the other tests.
   local a b
   cat >test.yml <<EOL
 a:
  x: foo
 b: bar
 EOL
   read-0 a b < <(./yq e -0 -o=json '.a, .b' test.yml)
   assertEquals '{
  "x": "foo"
 }' "$a"
   assertEquals '"bar"' "$b"
 }
 testFailWithValueContainingNUL() {
   local a b c
   ## Note that the value of field 'a' actually contains a NUL char!
   cat >test.yml <<EOL
 a: "foo\u0000bar"
 b: 1
 c: |
  wiz
  boom
 EOL
   ## We are asking for trouble here: fields are to be separated with a
   ## NUL char while the requested value `.a` itself contains a NUL char.
   read-0 a b c < <(./yq e -0 '.a, .b, .c' test.yml)
   assertNotEquals "0" "$?"   ## read-0 failed to fill all values
   ## But we can still request a single value, even if `./yq` fails
   ## on a later one.
   read-0 b < <(./yq e -0 '.b, .a' test.yml)
   assertEquals "0" "$?"   ## read-0 succeeds at filling the first value
   ## Note: to catch the failure of `yq`, see the usage of `read-0-err`
   ## in the next tests.
   ## Using -r=false avoids the issue with values containing NUL, but
   ## keeps everything in its YAML representation:
   read-0 a b c < <(./yq e -0 -r=false '.a, .b, .c' test.yml)
   assertEquals "0" "$?"    ## All goes well despite asking for the value of `a`
   assertEquals '"foo\0bar"' "$a"   ## This is a YAML string representation
   assertEquals '1' "$b"
   assertEquals '|
  wiz
  boom' "$c"
 }
 testStandardLoop() {
    local E a b res=""
    ## Nominal case: 4 values, consumed two at a time as key/value
    ## pairs.
    cat <<EOL >test.yml
 - yay
 - wiz
 - hop
 - pow
 EOL
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)
    assertEquals "yay: wiz;hop: pow;" "$res"  ## both pairs were read
    assertEquals "0" "$E"                     ## errorlevel of the internal command
 }
 testStandardLoopWithoutEnoughValues() {
    local E a b res=""
    ## 5 values for a loop reading pairs: the last pair will be one
    ## value short.
    cat <<EOL >test.yml
 - yay
 - wiz
 - hop
 - pow
 - kwak
 EOL
    ## Two passes are expected to succeed before the loop fails.
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)
    assertEquals "yay: wiz;hop: pow;" "$res"  ## the 2 complete key/value pairs
    assertEquals "not-enough-values" "$E"     ## reported as a missing-value error
 }
 testStandardLoopWithInternalCmdError() {
    local E a b res=""
    ## The third value contains a NUL char!
    cat <<EOL >test.yml
 - yay
 - wiz
 - "foo\0bar"
 - hop
 - pow
 EOL
    ## Only on the second pass through the loop should read-0-err
    ## notice that the inner command failed.
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)
    assertEquals "yay: wiz;" "$res"  ## the first 2 values were read fine
    assertEquals "1" "$E"            ## errorlevel of the internal command (`./yq`)
 }
 testStandardLoopNotEnoughErrorEatsCmdError() {
    local E a b res=""
    ## The errorlevel that `p-err` prints on standard output may end up
    ## mangled with an unfinished record. In that case
    ## `read-0-err E ...` does NOT report the internal command's error
    ## in E: it stores 'not-enough-values' instead. In practice you
    ## will want to react the same way whether the internal command
    ## failed and/or fewer values than expected were read. Standard
    ## error is not swallowed either, so reports from both the inner
    ## command and `read-0-err` remain readable.
    ## Here, the fourth value contains a NUL char!
    cat <<EOL >test.yml
 - yay
 - wiz
 - hop
 - "foo\0bar"
 - pow
 EOL
    ## Only on the second pass through the loop should read-0-err
    ## notice that there is not enough data to fill the requested
    ## variables.
    while read-0-err E a b; do
        res="${res}${a}: ${b};"
    done < <(p-err ./yq -0 '.[]' test.yml)
    assertEquals "yay: wiz;" "$res"  ## the first 2 values were read fine
    assertEquals "not-enough-values" "$E"          ## the missing-value error hides the internal error!
 }
 source ./scripts/shunit2
--- a/cmd/constant.go
+++ b/cmd/constant.go
@ -18,6 +18,7 @@ var colorsEnabled = false
 var indent = 2
 var noDocSeparators = false
 var nullInput = false
 var nulSepOutput = false
 var verbose = false
 var version = false
 var prettyPrint = false
--- a/cmd/evaluate_all_command.go
+++ b/cmd/evaluate_all_command.go
@ -90,6 +90,9 @@ func evaluateAll(cmd *cobra.Command, args []string) (cmdError error) {
 	}
 	printer := yqlib.NewPrinter(encoder, printerWriter)
 	if nulSepOutput {
 		printer.SetNulSepOutput(true)
 	}
 	if frontMatter != "" {
 		frontMatterHandler := yqlib.NewFrontMatterHandler(args[0])
--- a/cmd/evalute_sequence_command.go
+++ b/cmd/evalute_sequence_command.go
@ -99,6 +99,9 @@ func evaluateSequence(cmd *cobra.Command, args []string) (cmdError error) {
 	}
 	printer := yqlib.NewPrinter(encoder, printerWriter)
 	if nulSepOutput {
 		printer.SetNulSepOutput(true)
 	}
 	decoder, err := configureDecoder(false)
 	if err != nil {
--- a/cmd/root.go
+++ b/cmd/root.go
@ -86,6 +86,7 @@ yq -P sample.json
 	rootCmd.PersistentFlags().BoolVarP(&writeInplace, "inplace", "i", false, "update the file inplace of first file given.")
 	rootCmd.PersistentFlags().VarP(unwrapScalarFlag, "unwrapScalar", "r", "unwrap scalar, print the value with no quotes, colors or comments. Defaults to true for yaml")
 	rootCmd.PersistentFlags().Lookup("unwrapScalar").NoOptDefVal = "true"
 	rootCmd.PersistentFlags().BoolVarP(&nulSepOutput, "nul-output", "0", false, "Use NUL char to separate values. If unwrap scalar is also set, fail if unwrapped scalar contains NUL char.")
 	rootCmd.PersistentFlags().BoolVarP(&prettyPrint, "prettyPrint", "P", false, "pretty print, shorthand for '... style = \"\"'")
 	rootCmd.PersistentFlags().BoolVarP(&exitStatus, "exit-status", "e", false, "set exit status if there are no matches or null or false is returned")
--- a/pkg/yqlib/printer.go
+++ b/pkg/yqlib/printer.go
@ -2,6 +2,7 @@ package yqlib
 import (
 	"bufio"
 	"bytes"
 	"container/list"
 	"fmt"
 	"io"
@ -15,6 +16,7 @@ type Printer interface {
 	PrintedAnything() bool
 	//e.g. when given a front-matter doc, like jekyll
 	SetAppendix(reader io.Reader)
 	SetNulSepOutput(nulSepOutput bool)
 }
 type PrinterOutputFormat uint32
@ -59,6 +61,7 @@ type resultsPrinter struct {
 	printedMatches    bool
 	treeNavigator     DataTreeNavigator
 	appendixReader    io.Reader
 	nulSepOutput      bool
 }
 func NewPrinter(encoder Encoder, printerWriter PrinterWriter) Printer {
@ -67,9 +70,16 @@ func NewPrinter(encoder Encoder, printerWriter PrinterWriter) Printer {
 		printerWriter:     printerWriter,
 		firstTimePrinting: true,
 		treeNavigator:     NewDataTreeNavigator(),
 		nulSepOutput:      false,
 	}
 }
 // SetNulSepOutput records whether this printer should use NUL-separated
 // output, and emits a debug log line.
 func (p *resultsPrinter) SetNulSepOutput(nulSepOutput bool) {
 	p.nulSepOutput = nulSepOutput
 	log.Debug("Setting NUL separator output")
 }
 func (p *resultsPrinter) SetAppendix(reader io.Reader) {
 	p.appendixReader = reader
 }
@ -84,6 +94,16 @@ func (p *resultsPrinter) printNode(node *yaml.Node, writer io.Writer) error {
 	return p.encoder.Encode(writer, node)
 }
 // removeLastEOL drops at most one trailing line terminator from b:
 // a "\r\n" pair if present, otherwise a single "\r" or "\n".
 // The buffer is left untouched when it does not end with one.
 func removeLastEOL(b *bytes.Buffer) {
 	content := b.Bytes()
 	switch {
 	case bytes.HasSuffix(content, []byte("\r\n")):
 		b.Truncate(len(content) - 2)
 	case bytes.HasSuffix(content, []byte("\r")), bytes.HasSuffix(content, []byte("\n")):
 		b.Truncate(len(content) - 1)
 	}
 }
 func (p *resultsPrinter) PrintResults(matchingNodes *list.List) error {
 	log.Debug("PrintResults for %v matches", matchingNodes.Len())
@ -128,18 +148,40 @@ func (p *resultsPrinter) PrintResults(matchingNodes *list.List) error {
 			}
 		}
-		if err := p.encoder.PrintLeadingContent(writer, mappedDoc.LeadingContent); err != nil {
+		var destination io.Writer = writer
 		tempBuffer := bytes.NewBuffer(nil)
 		if p.nulSepOutput {
 			destination = tempBuffer
 		}
 		if err := p.encoder.PrintLeadingContent(destination, mappedDoc.LeadingContent); err != nil {
 			return err
 		}
-		if err := p.printNode(mappedDoc.Node, writer); err != nil {
+		if err := p.printNode(mappedDoc.Node, destination); err != nil {
 			return err
 		}
-		if err := p.encoder.PrintLeadingContent(writer, mappedDoc.TrailingContent); err != nil {
+		if err := p.encoder.PrintLeadingContent(destination, mappedDoc.TrailingContent); err != nil {
 			return err
 		}
 		if p.nulSepOutput {
 			removeLastEOL(tempBuffer)
 			tempBufferBytes := tempBuffer.Bytes()
 			if bytes.IndexByte(tempBufferBytes, 0) != -1 {
 				return fmt.Errorf(
 					"Can't serialize value because it contains NUL char and you are using NUL separated output",
 				)
 			}
 			if _, err := writer.Write(tempBufferBytes); err != nil {
 				return err
 			}
 			if _, err := writer.Write([]byte{0}); err != nil {
 				return err
 			}
 		}
 		p.previousDocIndex = mappedDoc.Document
 		if err := writer.Flush(); err != nil {
 			return err
--- a/pkg/yqlib/printer_test.go
+++ b/pkg/yqlib/printer_test.go
@ -340,3 +340,53 @@ func TestPrinterMultipleDocsJson(t *testing.T) {
 	writer.Flush()
 	test.AssertResult(t, expected, output.String())
 }
 // TestPrinterNulSeparator evaluates ".a" over multiDocSample with
 // NUL-separated output enabled and expects each result to be followed by a
 // NUL byte, with no newline in between.
 func TestPrinterNulSeparator(t *testing.T) {
 	var output bytes.Buffer
 	writer := bufio.NewWriter(&output)
 	printer := NewSimpleYamlPrinter(writer, YamlOutputFormat, true, false, 2, false)
 	printer.SetNulSepOutput(true)
 	expression, err := getExpressionParser().ParseExpression(".a")
 	if err != nil {
 		panic(err)
 	}
 	decoder := NewYamlDecoder(ConfiguredYamlPreferences)
 	if _, err = NewStreamEvaluator().Evaluate("sample", strings.NewReader(multiDocSample), expression, printer, decoder); err != nil {
 		panic(err)
 	}
 	writer.Flush()
 	test.AssertResult(t, "banana\x00apple\x00coconut\x00", output.String())
 }
 // TestPrinterNulSeparatorWithJson prints the documents of multiDocSample
 // through a JSON encoder with NUL-separated output enabled and expects each
 // document to be followed by a NUL byte.
 func TestPrinterNulSeparatorWithJson(t *testing.T) {
 	var output bytes.Buffer
 	writer := bufio.NewWriter(&output)
 	jsonEncoder := NewJSONEncoder(0, false, false)
 	if jsonEncoder == nil {
 		t.Skipf("no support for %s output format", "json")
 	}
 	printer := NewPrinter(jsonEncoder, NewSinglePrinterWriter(writer))
 	printer.SetNulSepOutput(true)
 	docs, err := readDocuments(strings.NewReader(multiDocSample), "sample.yml", 0, NewYamlDecoder(ConfiguredYamlPreferences))
 	if err != nil {
 		panic(err)
 	}
 	// The leading comment set on the first document is absent from the
 	// expected output below.
 	docs.Front().Value.(*CandidateNode).LeadingContent = "# ignore this\n"
 	if err = printer.PrintResults(docs); err != nil {
 		panic(err)
 	}
 	writer.Flush()
 	expected := strings.Join([]string{`{"a":"banana"}`, `{"a":"apple"}`, `{"a":"coconut"}`}, "\x00") + "\x00"
 	test.AssertResult(t, expected, output.String())
 }