WIP: adding CSV decoder

This commit is contained in:
Mike Farah 2022-07-31 17:31:27 +10:00
parent 4508bc2dc2
commit 183a42c249
6 changed files with 424 additions and 1 deletions

View File

@ -68,6 +68,10 @@ func configureDecoder() (yqlib.Decoder, error) {
return yqlib.NewPropertiesDecoder(), nil
case yqlib.JsonInputFormat:
return yqlib.NewJSONDecoder(), nil
case yqlib.CSVObjectInputFormat:
return yqlib.NewCSVObjectDecoder(','), nil
case yqlib.TSVObjectInputFormat:
return yqlib.NewCSVObjectDecoder('\t'), nil
}
return yqlib.NewYamlDecoder(), nil

193
pkg/yqlib/csv_test.go Normal file
View File

@ -0,0 +1,193 @@
package yqlib
import (
"bufio"
"fmt"
"testing"
"github.com/mikefarah/yq/v4/test"
)
const csvSimple = `name,numberOfCats,likesApples,height
Gary,1,true,168.8
Samantha's Rabbit,2,false,-188.8
`
const csvSimpleShort = `Name,Number of Cats
Gary,1
Samantha's Rabbit,2
`
const tsvSimple = `name numberOfCats likesApples height
Gary 1 true 168.8
Samantha's Rabbit 2 false -188.8
`
const expectedYamlFromCSV = `- name: Gary
numberOfCats: 1
likesApples: true
height: 168.8
- name: Samantha's Rabbit
numberOfCats: 2
likesApples: false
height: -188.8
`
const csvTestSimpleYaml = `- [i, like, csv]
- [because, excel, is, cool]`
const csvTestExpectedSimpleCsv = `i,like,csv
because,excel,is,cool
`
const tsvTestExpectedSimpleCsv = `i like csv
because excel is cool
`
var csvScenarios = []formatScenario{
{
description: "Encode CSV simple",
input: csvTestSimpleYaml,
expected: csvTestExpectedSimpleCsv,
scenarioType: "encode-csv",
},
{
description: "Encode TSV simple",
input: csvTestSimpleYaml,
expected: tsvTestExpectedSimpleCsv,
scenarioType: "encode-tsv",
},
{
description: "Encode array of objects to csv",
subdescription: "Add the header row manually, then the we convert each object into an array of values - resulting in an array of arrays. Nice thing about this method is you can pick the columns and call the header whatever you like.",
input: expectedYamlFromCSV,
expected: csvSimpleShort,
expression: `[["Name", "Number of Cats"]] + [.[] | [.name, .numberOfCats ]]`,
scenarioType: "encode-csv",
},
{
description: "Encode array of objects to csv - generic",
subdescription: "This is a little trickier than the previous example - we dynamically work out the $header, and use that to automatically create the value arrays.",
input: expectedYamlFromCSV,
expected: csvSimple,
expression: `(.[0] | keys | .[] ) as $header | [[$header]] + [.[] | [ .[$header] ]]`,
scenarioType: "encode-csv",
},
{
description: "Parse CSV into an array of objects",
subdescription: "First row is assumed to define the fields",
input: csvSimple,
expected: expectedYamlFromCSV,
scenarioType: "decode-csv-object",
},
{
description: "Parse TSV into an array of objects",
subdescription: "First row is assumed to define the fields",
input: tsvSimple,
expected: expectedYamlFromCSV,
scenarioType: "decode-tsv-object",
},
}
func testCSVScenario(t *testing.T, s formatScenario) {
switch s.scenarioType {
case "encode-csv":
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewCsvEncoder(',')), s.description)
case "encode-tsv":
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewYamlDecoder(), NewCsvEncoder('\t')), s.description)
case "decode-csv-object":
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewCSVObjectDecoder(','), NewYamlEncoder(2, false, true, true)), s.description)
case "decode-tsv-object":
test.AssertResultWithContext(t, s.expected, processFormatScenario(s, NewCSVObjectDecoder('\t'), NewYamlEncoder(2, false, true, true)), s.description)
default:
panic(fmt.Sprintf("unhandled scenario type %q", s.scenarioType))
}
}
func documentCSVDecodeObjectScenario(t *testing.T, w *bufio.Writer, s formatScenario, formatType string) {
writeOrPanic(w, fmt.Sprintf("## %v\n", s.description))
if s.subdescription != "" {
writeOrPanic(w, s.subdescription)
writeOrPanic(w, "\n\n")
}
writeOrPanic(w, fmt.Sprintf("Given a sample.%v file of:\n", formatType))
writeOrPanic(w, fmt.Sprintf("```%v\n%v\n```\n", formatType, s.input))
writeOrPanic(w, "then\n")
writeOrPanic(w, fmt.Sprintf("```bash\nyq -p=%v sample.%v\n```\n", formatType, formatType))
writeOrPanic(w, "will output\n")
separator := ','
if formatType == "tsv" {
separator = '\t'
}
writeOrPanic(w, fmt.Sprintf("```yaml\n%v```\n\n",
processFormatScenario(s, NewCSVObjectDecoder(separator), NewYamlEncoder(s.indent, false, true, true))),
)
}
func documentCSVEncodeScenario(w *bufio.Writer, s formatScenario, formatType string) {
writeOrPanic(w, fmt.Sprintf("## %v\n", s.description))
if s.subdescription != "" {
writeOrPanic(w, s.subdescription)
writeOrPanic(w, "\n\n")
}
writeOrPanic(w, "Given a sample.yml file of:\n")
writeOrPanic(w, fmt.Sprintf("```yaml\n%v\n```\n", s.input))
writeOrPanic(w, "then\n")
expression := s.expression
if expression != "" {
writeOrPanic(w, fmt.Sprintf("```bash\nyq -o=%v '%v' sample.yml\n```\n", formatType, expression))
} else {
writeOrPanic(w, fmt.Sprintf("```bash\nyq -o=%v sample.yml\n```\n", formatType))
}
writeOrPanic(w, "will output\n")
separator := ','
if formatType == "tsv" {
separator = '\t'
}
writeOrPanic(w, fmt.Sprintf("```%v\n%v```\n\n", formatType,
processFormatScenario(s, NewYamlDecoder(), NewCsvEncoder(separator))),
)
}
func documentCSVScenario(t *testing.T, w *bufio.Writer, i interface{}) {
s := i.(formatScenario)
if s.skipDoc {
return
}
switch s.scenarioType {
case "encode-csv":
documentCSVEncodeScenario(w, s, "csv")
case "encode-tsv":
documentCSVEncodeScenario(w, s, "tsv")
case "decode-csv-object":
documentCSVDecodeObjectScenario(t, w, s, "csv")
case "decode-tsv-object":
documentCSVDecodeObjectScenario(t, w, s, "tsv")
default:
panic(fmt.Sprintf("unhandled scenario type %q", s.scenarioType))
}
}
func TestCSVScenarios(t *testing.T) {
for _, tt := range csvScenarios {
testCSVScenario(t, tt)
}
genericScenarios := make([]interface{}, len(csvScenarios))
for i, s := range csvScenarios {
genericScenarios[i] = s
}
documentScenarios(t, "usage", "csv-tsv", genericScenarios, documentCSVScenario)
}

View File

@ -15,6 +15,8 @@ const (
PropertiesInputFormat
Base64InputFormat
JsonInputFormat
CSVObjectInputFormat
TSVObjectInputFormat
)
type Decoder interface {
@ -32,6 +34,10 @@ func InputFormatFromString(format string) (InputFormat, error) {
return PropertiesInputFormat, nil
case "json", "ndjson", "j":
return JsonInputFormat, nil
case "csv":
return CSVObjectInputFormat, nil
case "tsv":
return TSVObjectInputFormat, nil
default:
return 0, fmt.Errorf("unknown format '%v' please use [yaml|xml|props]", format)
}

View File

@ -0,0 +1,77 @@
package yqlib
import (
"encoding/csv"
"errors"
"io"
yaml "gopkg.in/yaml.v3"
)
type csvObjectDecoder struct {
separator rune
reader csv.Reader
finished bool
}
func NewCSVObjectDecoder(separator rune) Decoder {
return &csvObjectDecoder{separator: separator}
}
func (dec *csvObjectDecoder) Init(reader io.Reader) {
dec.reader = *csv.NewReader(reader)
dec.reader.Comma = dec.separator
dec.finished = false
}
func (dec *csvObjectDecoder) convertToYamlNode(content string) *yaml.Node {
node, err := parseSnippet(content)
if err != nil {
return createScalarNode(content, content)
}
return node
}
func (dec *csvObjectDecoder) createObject(headerRow []string, contentRow []string) *yaml.Node {
objectNode := &yaml.Node{Kind: yaml.MappingNode, Tag: "!!map"}
for i, header := range headerRow {
objectNode.Content = append(
objectNode.Content,
createScalarNode(header, header),
dec.convertToYamlNode(contentRow[i]))
}
return objectNode
}
func (dec *csvObjectDecoder) Decode(rootYamlNode *yaml.Node) error {
if dec.finished {
return io.EOF
}
headerRow, err := dec.reader.Read()
log.Debugf(": headerRow%v", headerRow)
if err != nil {
return err
}
rootArray := &yaml.Node{Kind: yaml.SequenceNode, Tag: "!!seq"}
contentRow, err := dec.reader.Read()
for err == nil && len(contentRow) > 0 {
log.Debugf("Adding contentRow: %v", contentRow)
rootArray.Content = append(rootArray.Content, dec.createObject(headerRow, contentRow))
contentRow, err = dec.reader.Read()
log.Debugf("Read next contentRow: %v, %v", contentRow, err)
}
if !errors.Is(err, io.EOF) {
return err
}
log.Debugf("finished, contentRow%v", contentRow)
log.Debugf("err: %v", err)
rootYamlNode.Kind = yaml.DocumentNode
rootYamlNode.Content = []*yaml.Node{rootArray}
return nil
}

View File

@ -0,0 +1,143 @@
{% hint style="warning" %}
Note that versions prior to 4.18 require the 'eval/e' command to be specified. 
`yq e <exp> <file>`
{% endhint %}
## Encode CSV simple
Given a sample.yml file of:
```yaml
- [i, like, csv]
- [because, excel, is, cool]
```
then
```bash
yq -o=csv sample.yml
```
will output
```csv
i,like,csv
because,excel,is,cool
```
## Encode TSV simple
Given a sample.yml file of:
```yaml
- [i, like, csv]
- [because, excel, is, cool]
```
then
```bash
yq -o=tsv sample.yml
```
will output
```tsv
i like csv
because excel is cool
```
## Encode array of objects to csv
Add the header row manually, then the we convert each object into an array of values - resulting in an array of arrays. Nice thing about this method is you can pick the columns and call the header whatever you like.
Given a sample.yml file of:
```yaml
- name: Gary
numberOfCats: 1
likesApples: true
height: 168.8
- name: Samantha's Rabbit
numberOfCats: 2
likesApples: false
height: -188.8
```
then
```bash
yq -o=csv '[["Name", "Number of Cats"]] + [.[] | [.name, .numberOfCats ]]' sample.yml
```
will output
```csv
Name,Number of Cats
Gary,1
Samantha's Rabbit,2
```
## Encode array of objects to csv - generic
This is a little trickier than the previous example - we dynamically work out the $header, and use that to automatically create the value arrays.
Given a sample.yml file of:
```yaml
- name: Gary
numberOfCats: 1
likesApples: true
height: 168.8
- name: Samantha's Rabbit
numberOfCats: 2
likesApples: false
height: -188.8
```
then
```bash
yq -o=csv '(.[0] | keys | .[] ) as $header | [[$header]] + [.[] | [ .[$header] ]]' sample.yml
```
will output
```csv
name,numberOfCats,likesApples,height
Gary,1,true,168.8
Samantha's Rabbit,2,false,-188.8
```
## Parse CSV into an array of objects
First row is assumed to define the fields
Given a sample.csv file of:
```csv
name,numberOfCats,likesApples,height
Gary,1,true,168.8
Samantha's Rabbit,2,false,-188.8
```
then
```bash
yq -p=csv sample.csv
```
will output
```yaml
- name: Gary
numberOfCats: 1
likesApples: true
height: 168.8
- name: Samantha's Rabbit
numberOfCats: 2
likesApples: false
height: -188.8
```
## Parse TSV into an array of objects
First row is assumed to define the fields
Given a sample.tsv file of:
```tsv
name numberOfCats likesApples height
Gary 1 true 168.8
Samantha's Rabbit 2 false -188.8
```
then
```bash
yq -p=tsv sample.tsv
```
will output
```yaml
- name: Gary
numberOfCats: 1
likesApples: true
height: 168.8
- name: Samantha's Rabbit
numberOfCats: 2
likesApples: false
height: -188.8
```

View File

@ -31,7 +31,7 @@ type expressionScenario struct {
}
func TestMain(m *testing.M) {
logging.SetLevel(logging.ERROR, "")
logging.SetLevel(logging.DEBUG, "")
Now = func() time.Time {
return time.Date(2021, time.May, 19, 1, 2, 3, 4, time.UTC)
}