Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add option to cat command to remove PARGO_PREFIX_ from field name #471

Merged
merged 1 commit into from
Dec 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 47 additions & 22 deletions cmd/cat.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"strings"
"time"

"github.com/itchyny/gojq"
"github.com/xitongsys/parquet-go/parquet"
"github.com/xitongsys/parquet-go/reader"
"github.com/xitongsys/parquet-go/types"
Expand All @@ -31,6 +32,7 @@ type CatCmd struct {
NoHeader bool `help:"(CSV/TSV only) do not output field name as header" default:"false"`
URI string `arg:"" predictor:"file" help:"URI of Parquet file."`
FailOnInt96 bool `help:"fail command if INT96 data type presents." name:"fail-on-int96" default:"false"`
PargoPrefix string `help:"remove this prefix from field names." default:""`
}

// here are performance numbers for different SkipPageSize:
Expand Down Expand Up @@ -100,7 +102,14 @@ func (c CatCmd) outputHeader(schemaRoot *internal.SchemaNode) ([]string, error)
}
fieldList[index] = child.Name
}
line, err := valuesToCSV(fieldList, delimiter[c.Format].fieldDelimiter)
headerList := make([]string, len(schemaRoot.Children))
_ = copy(headerList, fieldList)
if c.PargoPrefix != "" {
for i := range headerList {
headerList[i] = strings.TrimPrefix(headerList[i], c.PargoPrefix)
}
}
line, err := valuesToCSV(headerList, delimiter[c.Format].fieldDelimiter)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -145,6 +154,34 @@ func (c CatCmd) retrieveFieldDef(fileReader *reader.ParquetReader) ([]string, ma
return fieldList, reinterpretFields, nil
}

func (c CatCmd) outputSingleRow(rowStruct interface{}, jq *gojq.Query, fieldList []string) error {
switch c.Format {
case "json", "jsonl":
v, _ := jq.Run(rowStruct).Next()
buf, _ := json.Marshal(v)
fmt.Print(string(buf))
case "csv", "tsv":
flatValues := rowStruct.(map[string]interface{})
values := make([]string, len(flatValues))
for index, field := range fieldList {
switch val := flatValues[field].(type) {
case nil:
// nil is just empty
default:
values[index] = fmt.Sprint(val)
}
}

line, err := valuesToCSV(values, delimiter[c.Format].fieldDelimiter)
if err != nil {
return err
}
fmt.Print(strings.TrimRight(line, "\n"))
}

return nil
}

func (c CatCmd) outputRows(fileReader *reader.ParquetReader) error {
fieldList, reinterpretFields, err := c.retrieveFieldDef(fileReader)
if err != nil {
Expand All @@ -156,6 +193,13 @@ func (c CatCmd) outputRows(fileReader *reader.ParquetReader) error {
return err
}

// handle PARGO_PREFIX_ with jq jq
queryString := fmt.Sprintf(`walk(if type == "object" then with_entries(.key = (.key | sub("%s"; ""))) else . end)`, c.PargoPrefix)
jq, err := gojq.Parse(queryString)
if err != nil {
return fmt.Errorf("unable to use [%s] as prefix: %w", c.PargoPrefix, err)
}

// Output rows one by one to avoid running out of memory with a jumbo list
fmt.Print(delimiter[c.Format].begin)
for counter := uint64(0); counter < c.Limit; {
Expand All @@ -176,27 +220,8 @@ func (c CatCmd) outputRows(fileReader *reader.ParquetReader) error {
}
// there is no known error at this moment
rowStruct, _ := rowToStruct(rows[i], reinterpretFields)
switch c.Format {
case "json", "jsonl":
buf, _ := json.Marshal(rowStruct)
fmt.Print(string(buf))
case "csv", "tsv":
flatValues := rowStruct.(map[string]interface{})
values := make([]string, len(flatValues))
for index, field := range fieldList {
switch val := flatValues[field].(type) {
case nil:
// nil is just empty
default:
values[index] = fmt.Sprint(val)
}
}

line, err := valuesToCSV(values, delimiter[c.Format].fieldDelimiter)
if err != nil {
return err
}
fmt.Print(strings.TrimRight(line, "\n"))
if err := c.outputSingleRow(rowStruct, jq, fieldList); err != nil {
return err
}
counter++
}
Expand Down
60 changes: 60 additions & 0 deletions cmd/cat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -415,3 +415,63 @@ func Test_CatCmd_Run_fail_on_int96(t *testing.T) {
require.Equal(t, "", stdout)
require.Equal(t, "", stderr)
}

func Test_CatCmd_Run_csv_pargo_prefix(t *testing.T) {
cmd := &CatCmd{}
cmd.ReadPageSize = 10
cmd.SampleRatio = 1.0
cmd.URI = "../testdata/pargo-prefix-flat.parquet"
cmd.Format = "csv"

stdout, stderr := captureStdoutStderr(func() {
require.Nil(t, cmd.Run())
})

expected := loadExpected(t, "../testdata/golden/cat-pargo-prefix-flat-keep.csv")
require.Equal(t, expected, stdout)
require.Equal(t, "", stderr)

cmd.PargoPrefix = "PARGO_PREFIX_"
stdout, stderr = captureStdoutStderr(func() {
require.Nil(t, cmd.Run())
})

expected = loadExpected(t, "../testdata/golden/cat-pargo-prefix-flat-remove.csv")
require.Equal(t, expected, stdout)
require.Equal(t, "", stderr)
}

func Test_CatCmd_Run_json_pargo_prefix(t *testing.T) {
cmd := &CatCmd{}
cmd.ReadPageSize = 10
cmd.SampleRatio = 1.0
cmd.URI = "../testdata/pargo-prefix-nested.parquet"
cmd.Format = "json"

stdout, stderr := captureStdoutStderr(func() {
require.Nil(t, cmd.Run())
})

expected := loadExpected(t, "../testdata/golden/cat-pargo-prefix-nested-keep.json")
require.Equal(t, expected, stdout)
require.Equal(t, "", stderr)

cmd.PargoPrefix = "PARGO_PREFIX_"
stdout, stderr = captureStdoutStderr(func() {
require.Nil(t, cmd.Run())
})

expected = loadExpected(t, "../testdata/golden/cat-pargo-prefix-nested-remove.json")
require.Equal(t, expected, stdout)
require.Equal(t, "", stderr)

cmd.PargoPrefix = `\“`
stdout, stderr = captureStdoutStderr(func() {
err := cmd.Run()
require.NotNil(t, err)
require.Contains(t, err.Error(), `unable to use [\“] as prefix: invalid escape sequence`)
})

require.Equal(t, "", stdout)
require.Equal(t, "", stderr)
}
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/aws/aws-sdk-go-v2 v1.32.7
github.com/aws/aws-sdk-go-v2/config v1.28.7
github.com/aws/aws-sdk-go-v2/service/s3 v1.71.1
github.com/itchyny/gojq v0.12.17
github.com/posener/complete v1.2.3
github.com/stretchr/testify v1.10.0
github.com/willabides/kongplete v0.4.0
Expand Down Expand Up @@ -65,6 +66,7 @@ require (
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/hashicorp/go-multierror v1.1.1 // indirect
github.com/hashicorp/go-uuid v0.0.0-20180228145832-27454136f036 // indirect
github.com/itchyny/timefmt-go v0.1.6 // indirect
github.com/jcmturner/gofork v0.0.0-20180107083740-2aebee971930 // indirect
github.com/klauspost/asmfmt v1.3.2 // indirect
github.com/klauspost/compress v1.16.7 // indirect
Expand Down
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -474,6 +474,10 @@ github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUq
github.com/hexops/gotextdiff v1.0.3/go.mod h1:pSWU5MAI3yDq+fZBTazCSJysOMbxWL1BSow5/V2vxeg=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
github.com/itchyny/gojq v0.12.17 h1:8av8eGduDb5+rvEdaOO+zQUjA04MS0m3Ps8HiD+fceg=
github.com/itchyny/gojq v0.12.17/go.mod h1:WBrEMkgAfAGO1LUcGOckBl5O726KPp+OlkKug0I/FEY=
github.com/itchyny/timefmt-go v0.1.6 h1:ia3s54iciXDdzWzwaVKXZPbiXzxxnv1SPGFfM/myJ5Q=
github.com/itchyny/timefmt-go v0.1.6/go.mod h1:RRDZYC5s9ErkjQvTvvU7keJjxUYzIISJGxm9/mAERQg=
github.com/jackc/chunkreader v1.0.0/go.mod h1:RT6O25fNZIuasFJRyZ4R/Y2BbhasbmZXF9QQ7T3kePo=
github.com/jackc/chunkreader/v2 v2.0.0/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk=
github.com/jackc/chunkreader/v2 v2.0.1/go.mod h1:odVSm741yZoC3dpHEUXIqA9tQRhFrgOHwnPIn9lDKlk=
Expand Down
4 changes: 4 additions & 0 deletions testdata/golden/cat-pargo-prefix-flat-keep.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PARGO_PREFIX__shoe_brand,Shoe_name
nike,air_griffey
fila,grant_hill_2
steph_curry,curry7
14 changes: 14 additions & 0 deletions testdata/golden/cat-pargo-prefix-flat-keep.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[
{
"PARGO_PREFIX__shoe_brand": "nike",
"Shoe_name": "air_griffey"
},
{
"PARGO_PREFIX__shoe_brand": "fila",
"Shoe_name": "grant_hill_2"
},
{
"PARGO_PREFIX__shoe_brand": "steph_curry",
"Shoe_name": "curry7"
}
]
4 changes: 4 additions & 0 deletions testdata/golden/cat-pargo-prefix-flat-remove.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
_shoe_brand,Shoe_name
nike,air_griffey
fila,grant_hill_2
steph_curry,curry7
14 changes: 14 additions & 0 deletions testdata/golden/cat-pargo-prefix-flat-remove.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
[
{
"_shoe_brand": "nike",
"Shoe_name": "air_griffey"
},
{
"_shoe_brand": "fila",
"Shoe_name": "grant_hill_2"
},
{
"_shoe_brand": "steph_curry",
"Shoe_name": "curry7"
}
]
Loading
Loading