Skip to content

Commit

Permalink
Add stats tool for total file stats (#921)
Browse files Browse the repository at this point in the history
  • Loading branch information
brancz authored Jul 24, 2024
1 parent bd8afbd commit 808dfa8
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 0 deletions.
1 change: 1 addition & 0 deletions cmd/parquet-tool/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,5 @@ func init() {
rootCmd.AddCommand(rowgroupCmd)
rootCmd.AddCommand(columnCmd)
rootCmd.AddCommand(rowCmd)
rootCmd.AddCommand(statsCmd)
}
97 changes: 97 additions & 0 deletions cmd/parquet-tool/cmd/stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package cmd

import (
"fmt"
"os"
"strings"

"github.com/dustin/go-humanize"
"github.com/olekukonko/tablewriter"
"github.com/spf13/cobra"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
)

// statsCmd is the "stats" subcommand. It requires exactly one positional
// argument, which is handed to runStats as the file to inspect.
var statsCmd = &cobra.Command{
	Use:     "stats",
	Short:   "print total stats of a parquet file",
	Example: "parquet-tool stats <file.parquet>",
	Args:    cobra.ExactArgs(1),
	RunE: func(_ *cobra.Command, positional []string) error {
		// cobra.ExactArgs(1) has already validated that index 0 exists.
		path := positional[0]
		return runStats(path)
	},
}

// stats holds the totals accumulated for a single column across all row
// groups of a parquet file.
type stats struct {
	// Type is the string form of the column's physical type.
	Type string
	// NumVal is the number of values summed over every column chunk.
	NumVal int64
	// Encoding is a space-separated list of the encodings reported for the
	// column.
	Encoding string
	// TotalCompressedSize and TotalUncompressedSize are the column's chunk
	// sizes summed over all row groups. TotalByteSize is the sum of the
	// TotalByteSize of each row group in which the column appears.
	TotalCompressedSize, TotalUncompressedSize, TotalByteSize int64
}

// runStats opens the parquet file at the given path, sums the column-chunk
// metadata of every row group per column, and renders one table row per
// column to stdout, sorted by column path.
//
// Columns are keyed by their schema path joined with "/". NumVal and the
// size fields are summed across row groups; Type and Encoding reflect the
// most recently visited row group for that column.
//
// It returns the error from openParquetFile if the file cannot be opened,
// and nil otherwise.
func runStats(file string) error {
	pf, closer, err := openParquetFile(file)
	if err != nil {
		return err
	}
	defer closer.Close()

	meta := pf.Metadata()

	s := map[string]stats{}
	for _, rg := range meta.RowGroups {
		for _, ds := range rg.Columns {
			col := strings.Join(ds.MetaData.PathInSchema, "/")

			// Join the encoding names with single spaces; building the
			// list first avoids a dangling trailing separator.
			encs := make([]string, 0, len(ds.MetaData.Encoding))
			for _, e := range ds.MetaData.Encoding {
				encs = append(encs, e.String())
			}

			// A missing key yields the zero-valued stats, so the first
			// sighting of a column and every later one share one code path.
			agg := s[col]
			agg.Type = ds.MetaData.Type.String()
			agg.Encoding = strings.Join(encs, " ")
			agg.NumVal += ds.MetaData.NumValues
			agg.TotalCompressedSize += ds.MetaData.TotalCompressedSize
			agg.TotalUncompressedSize += ds.MetaData.TotalUncompressedSize
			agg.TotalByteSize += rg.TotalByteSize
			s[col] = agg
		}
	}

	table := tablewriter.NewWriter(os.Stdout)
	table.SetHeader([]string{"Col", "Type", "NumVal", "Encoding", "TotalCompressedSize", "TotalUncompressedSize", "Compression", "%"})

	// Map iteration order is random; sort the keys for stable output.
	keys := maps.Keys(s)
	slices.Sort(keys)

	for _, k := range keys {
		row := s[k]
		table.Append(
			[]string{
				k,
				row.Type,
				fmt.Sprintf("%d", row.NumVal),
				row.Encoding,
				humanize.Bytes(uint64(row.TotalCompressedSize)),
				humanize.Bytes(uint64(row.TotalUncompressedSize)),
				// Bytes saved by compression, relative to the compressed size.
				statsPercent(row.TotalUncompressedSize-row.TotalCompressedSize, row.TotalCompressedSize),
				// Column's uncompressed size relative to the summed row-group byte size.
				statsPercent(row.TotalUncompressedSize, row.TotalByteSize),
			})
	}
	table.Render()

	return nil
}

// statsPercent formats num/den as a percentage with two decimals. It returns
// "n/a" when den is zero, because the float division would otherwise print
// "NaN" or "+Inf" for empty columns or row groups.
func statsPercent(num, den int64) string {
	if den == 0 {
		return "n/a"
	}
	return fmt.Sprintf("%.2f", float64(num)/float64(den)*100)
}

0 comments on commit 808dfa8

Please sign in to comment.