Mirror of https://github.com/golang/build.git
perf: fork and use benchtab over benchseries
benchseries seems to fall over on some of our benchmarks, and it's difficult to say why. benchstat, however, works just fine. This CL forks the internals of benchstat and rewrites benchCompare to use them.

The comparison made by benchstat is slightly different from the one made by benchseries, though it's not clear to me exactly how. It might make sense to flush InfluxDB and refill it with the new comparison, but I suspect it's probably not worth it.

This is a reapplication of go.dev/cl/623275 to correctly propagate bootstrapped ratio data instead of just a summary of the direct benchmark data.

Change-Id: I24d161130934e59aee4082629e8d0698fef3fde5
Reviewed-on: https://go-review.googlesource.com/c/build/+/626640
Reviewed-by: Michael Pratt <mpratt@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
This commit is contained in:
Parent
92dcf21b9d
Commit
a21e2438b3
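For orientation before the diff: the rewritten benchCompare (below) returns a push-style iterator, `func(func(comparison, error) bool)`, instead of a slice of benchseries summaries, and pushRunToInflux drains it with a callback. A minimal sketch of that usage pattern, assuming it lives in the same app package; the helper name logComparisons is hypothetical and not part of this CL:

```go
// logComparisons is a hypothetical helper illustrating how the iterator
// returned by benchCompare is consumed; pgoOff is one of the comparison
// configurations defined in this CL.
func logComparisons(r io.Reader, name string) error {
	forAll, err := benchCompare(r, name, pgoOff)
	if err != nil {
		return err
	}
	forAll(func(c comparison, err error) bool {
		if err != nil {
			// Log and keep iterating, mirroring pushRunToInflux.
			log.Printf("error: %s: %s: %v", c.benchmarkName, c.unit, err)
			return true
		}
		// c.center is the bootstrapped experiment/baseline ratio;
		// c.lo and c.hi bound its confidence interval.
		log.Printf("%s (%s): %.3f [%.3f, %.3f]", c.benchmarkName, c.unit, c.center, c.lo, c.hi)
		return true // return false to stop early
	})
	return nil
}
```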
@@ -7,10 +7,14 @@ package app
import (
"bytes"
"context"
"errors"
"fmt"
"io"
"log"
"math"
"math/rand/v2"
"net/http"
"slices"
"strings"
"time"
@@ -19,9 +23,12 @@ import (
"cloud.google.com/go/secretmanager/apiv1/secretmanagerpb"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
"golang.org/x/build/internal/influx"
"golang.org/x/build/perf/app/internal/benchtab"
"golang.org/x/build/perfdata"
"golang.org/x/perf/benchfmt"
"golang.org/x/perf/benchseries"
"golang.org/x/perf/benchmath"
"golang.org/x/perf/benchproc"
"golang.org/x/perf/benchunit"
"google.golang.org/api/idtoken"
)
@@ -263,25 +270,26 @@ func (a *App) pushRunToInflux(ctx context.Context, ifxc influxdb2.Client, u perf
}
// TODO(mknyszek): Use new iterators when possible.
forAll(func(c comparison, err error) bool {
comparisonID := c.keys["experiment-commit"]
if err != nil {
// Just log this error. We don't want to quit early if we have other good comparisons.
log.Printf("%s: %s: %v", c.series, c.benchmarkName, err)
log.Printf("error: %s: %s: %s: %v", comparisonID, c.benchmarkName, c.unit, err)
return true
}
measurement := "benchmark-result" // measurement
benchmarkName := c.benchmarkName + cfg.suffix // tag
series := c.series // time
center, low, high := c.Center, c.Low, c.High // fields
unit := c.unit // tag
uploadTime := c.residues["upload-time"] // field
cpu := c.residues["cpu"] // tag
goarch := c.residues["goarch"] // tag
goos := c.residues["goos"] // tag
benchmarksCommit := c.residues["benchmarks-commit"] // field
baselineCommit := c.hashPairs[series].DenHash // field
experimentCommit := c.hashPairs[series].NumHash // field
repository := c.residues["repository"] // tag
branch := c.residues["branch"] // tag
measurement := "benchmark-result" // measurement
benchmarkName := c.benchmarkName + cfg.suffix // tag
timestamp := c.keys["experiment-commit-time"] // time
center, low, high := c.center, c.lo, c.hi // fields
unit := c.unit // tag
uploadTime := c.keys["upload-time"] // field
cpu := c.keys["cpu"] // tag
goarch := c.keys["goarch"] // tag
goos := c.keys["goos"] // tag
benchmarksCommit := c.keys["benchmarks-commit"] // field
baselineCommit := c.keys["baseline-commit"] // field
experimentCommit := c.keys["experiment-commit"] // field
repository := c.keys["repository"] // tag
branch := c.keys["branch"] // tag

// cmd/bench didn't set repository prior to
// CL 413915. Older runs are all against go.
@@ -290,12 +298,12 @@ func (a *App) pushRunToInflux(ctx context.Context, ifxc influxdb2.Client, u perf
}

// Push to influx.
t, err := benchseries.ParseNormalizedDateString(series)
t, err := time.Parse(time.RFC3339Nano, timestamp)
if err != nil {
log.Printf("%s: %s: error parsing normalized date: %v", c.series, c.benchmarkName, err)
log.Printf("error: %s: %s: %s: parsing experiment-commit-time: %v", comparisonID, c.benchmarkName, c.unit, err)
return true
}
fields := map[string]interface{}{
fields := map[string]any{
"center": center,
"low": low,
"high": high,
@@ -312,12 +320,11 @@ func (a *App) pushRunToInflux(ctx context.Context, ifxc influxdb2.Client, u perf
"goos": goos,
"repository": repository,
"branch": branch,
// TODO(prattmic): Add pkg, which
// benchseries currently can't handle.
// TODO(mknyszek): Revisit adding pkg, now that we're not using benchseries.
}
p := influxdb2.NewPoint(measurement, tags, fields, t)
if err := wapi.WritePoint(ctx, p); err != nil {
log.Printf("%s: %s: error writing point: %v", c.series, c.benchmarkName, err)
log.Printf("%s: %s: %s: error writing point: %v", comparisonID, c.benchmarkName, c.unit, err)
return true
}
return true
@@ -327,46 +334,36 @@ func (a *App) pushRunToInflux(ctx context.Context, ifxc influxdb2.Client, u perf
}

type comparisonConfig struct {
suffix string
compare string
numerator string
denominator string
filter string
suffix string
columnExpr string
filter string
}

var (
pgoOff = comparisonConfig{
// Default: toolchain:baseline vs experiment without PGO
compare: "toolchain",
numerator: "experiment",
denominator: "baseline",
filter: "-pgo:on", // "off" or unset (bent doesn't set pgo).
columnExpr: "toolchain@(baseline experiment)",
filter: "-pgo:on", // "off" or unset (bent doesn't set pgo).
}
pgoOn = comparisonConfig{
// toolchain:baseline vs experiment with PGO
suffix: "/pgo=on,toolchain:baseline-vs-experiment",
compare: "toolchain",
numerator: "experiment",
denominator: "baseline",
filter: "pgo:on",
suffix: "/pgo=on,toolchain:baseline-vs-experiment",
columnExpr: "toolchain@(baseline experiment)",
filter: "pgo:on",
}
pgoVs = comparisonConfig{
// pgo:off vs on with experiment toolchain (impact of enabling PGO)
suffix: "/toolchain:experiment,pgo=off-vs-on",
compare: "pgo",
numerator: "on",
denominator: "off",
filter: "toolchain:experiment",
suffix: "/toolchain:experiment,pgo=off-vs-on",
columnExpr: "pgo@(off on)",
filter: "toolchain:experiment",
}
)

type comparison struct {
series string
benchmarkName string
unit string
residues map[string]string
hashPairs map[string]benchseries.ComparisonHashes
*benchseries.ComparisonSummary
benchmarkName string
unit string
keys map[string]string
lo, center, hi float64
}

// benchCompare reads r, assuming it contains benchmark data, and performs the provided comparison
@@ -374,95 +371,120 @@ type comparison struct {
func benchCompare(rr io.Reader, name string, c comparisonConfig) (func(func(comparison, error) bool), error) {
r := benchfmt.NewReader(rr, name)

// Use the default comparisons. Namely:
// 1. Build a series out of commit dates (in our case, this is length 1).
// 2. Split out comparisons by benchmark name (unit we get for free).
//
// Copy the options for mutation.
opts := *benchseries.DefaultBuilderOptions()
opts.Compare = c.compare
opts.Numerator = c.numerator
opts.Denominator = c.denominator
if opts.Filter == "" {
opts.Filter = c.filter
} else {
opts.Filter += " " + c.filter
filter, err := benchproc.NewFilter(c.filter)
if err != nil {
return nil, fmt.Errorf("parsing filter: %s", err)
}

var parser benchproc.ProjectionParser
var parseErr error
mustParse := func(name, val string, unit bool) *benchproc.Projection {
var proj *benchproc.Projection
var err error
if unit {
proj, _, err = parser.ParseWithUnit(val, filter)
} else {
proj, err = parser.Parse(val, filter)
}
if err != nil && parseErr == nil {
parseErr = fmt.Errorf("parsing %s: %s", name, err)
}
return proj
}
tableBy := mustParse("table", ".config", true)
rowBy := mustParse("row", ".fullname", false)
colBy := mustParse("col", c.columnExpr, false)
mustParse("ignore", "go,tip,base,bentstamp,shortname,suite", false)
residue := parser.Residue()

// Check parse error.
if parseErr != nil {
return nil, fmt.Errorf("internal error: failed to parse projections for configuration: %v", parseErr)
}

// Scan the results into a benchseries builder.
builder, err := benchseries.NewBuilder(&opts)
if err != nil {
return nil, fmt.Errorf("failed to create benchseries builder: %v", err)
}
stat := benchtab.NewBuilder(tableBy, rowBy, colBy, residue)
for r.Scan() {
rec := r.Result()
if err, ok := rec.(*benchfmt.SyntaxError); ok {
switch rec := r.Result(); rec := rec.(type) {
case *benchfmt.SyntaxError:
// Non-fatal result parse error. Warn
// but keep going.
log.Printf("Parse error: %v", err)
continue
case *benchfmt.Result:
if ok, _ := filter.Apply(rec); !ok {
continue
}
stat.Add(rec)
}
res := rec.(*benchfmt.Result)
builder.Add(res)
}
if err := r.Err(); err != nil {
return nil, err
}

// Run the comparison. We don't have any existing results so our
// duplicate policy doesn't matter here. Just pick replacement.
comparisons, err := builder.AllComparisonSeries(nil, benchseries.DUPE_REPLACE)
if err != nil {
return nil, fmt.Errorf("failed to creation comparison series: %w", err)
// Prepopulate some assumptions about binary size units.
// bent does emit these, but they get stripped by perfdata.
// TODO(mknyszek): Remove this once perfdata stops doing that.
units := r.Units()
assumeExact := func(unit string) {
_, tidyUnit := benchunit.Tidy(1, unit)
key := benchfmt.UnitMetadataKey{Unit: tidyUnit, Key: "assume"}
if _, ok := units[key]; ok {
return // There was an assumption in the benchmark data.
}
units[key] = &benchfmt.UnitMetadata{
UnitMetadataKey: key,
OrigUnit: unit,
Value: "exact",
}
}
assumeExact("total-bytes")
assumeExact("text-bytes")
assumeExact("data-bytes")
assumeExact("rodata-bytes")
assumeExact("pclntab-bytes")
assumeExact("debug-bytes")

const (
confidence = 0.95
bootstrap = 1000
)
// Build the comparison table.
const confidence = 0.95
thresholds := benchmath.DefaultThresholds
tables := stat.ToTables(benchtab.TableOpts{
Confidence: confidence,
Thresholds: &thresholds,
Units: r.Units(),
})

// Iterate over the comparisons and extract the results
return func(yield func(sum comparison, err error) bool) {
comparisonLoop:
for _, cs := range comparisons {
cs.AddSummaries(confidence, bootstrap)

summaries := cs.Summaries

// Build a map of residues with single values. Our benchmark pipeline enforces
// that the only key that has a differing value across benchmark runs of the same
// name and unit is "toolchain."
//
// Most other keys are singular for *all* benchmarks in a run (like "goos") but
// even those that are not (like "pkg") remain the same even if "toolchain" differs.
//
// We build a map instead of just using them because we need to decide at upload
// time whether the key is an Influx tag or field.
residues := make(map[string]string)
for _, r := range cs.Residues {
if len(r.Slice) > 1 {
err := fmt.Errorf("found non-singular key %q with values %v; comparison may be invalid, skipping...", r.S, r.Slice)
if !yield(comparison{}, err) {
return
}
continue comparisonLoop
}
residues[r.S] = r.Slice[0]
for t, table := range tables.Tables {
// All the other keys, which should be identical, are captured as
// sub-fields of .config, our table projection.
keys := make(map[string]string)
for _, f := range tableBy.Fields()[0].Sub {
keys[f.Name] = tables.Keys[t].Get(f)
}

// N.B. In our case Series should have length 1, because we're processing
// a single result here. By default the string value here is the commit date.
for i, series := range cs.Series {
for j, benchmarkName := range cs.Benchmarks {
sum := summaries[i][j]
if !sum.Defined() {
err := fmt.Errorf("summary not defined for %s %s", series, benchmarkName)
for _, row := range table.Rows {
benchmarkName := row.StringValues()
for _, col := range table.Cols {
cell, ok := table.Cells[benchtab.TableKey{Row: row, Col: col}]
if !ok {
// Cell not present due to missing data.
err := fmt.Errorf("summary not defined %s", benchmarkName)
if !yield(comparison{}, err) {
return
}
continue
}
if !yield(comparison{series, benchmarkName, cs.Unit, residues, cs.HashPairs, sum}, nil) {
if cell.Baseline == nil {
// Non-comparison cell.
continue
}
if len(cell.Summary.Warnings) != 0 {
// TODO(mknyszek): Make this an actual failure once it stops failing for x/tools.
// x/tools has 5 runs per benchmark, but we need 6 for 0.95 confidence.
log.Printf("warning: %s: %s: %s: %v", name, benchmarkName, table.Unit, errors.Join(cell.Summary.Warnings...))
}
lo, center, hi := ratioSummary(cell.Baseline.Sample, cell.Sample, confidence, 1000)
if !yield(comparison{benchmarkName, table.Unit, keys, lo, center, hi}, nil) {
return
}
}
@@ -470,3 +492,82 @@ func benchCompare(rr io.Reader, name string, c comparisonConfig) (func(func(comp
}
}, nil
}

func ratioSummary(baseline, experiment *benchmath.Sample, confidence float64, bootstrapN int) (lo, center, hi float64) {
ratios := make([]float64, bootstrapN)
sampleNum := make([]float64, len(experiment.Values))
sampleDen := make([]float64, len(baseline.Values))
for i := range ratios {
resampleInto(experiment.Values, sampleNum)
resampleInto(baseline.Values, sampleDen)
den := median(sampleDen)
if den == 0 {
num := median(sampleNum)
if num >= 0 {
ratios[i] = (num + 1)
} else {
ratios[i] = (num - 1)
}
} else {
ratios[i] = median(sampleNum) / den
}
}
slices.Sort(ratios)
p := (1 - confidence) / 2
lo = percentile(ratios, p)
hi = percentile(ratios, 1-p)
center = median(ratios)
return
}

func percentile(a []float64, p float64) float64 {
if len(a) == 0 {
return math.NaN()
}
if p == 0 {
return a[0]
}
n := len(a)
if p == 1 {
return a[n-1]
}
f := float64(float64(n) * p) // Suppress fused-multiply-add
i := int(f)
x := f - float64(i)
r := a[i]
if x > 0 && i+1 < len(a) {
r = float64(r*(1-x)) + float64(a[i+1]*x) // Suppress fused-multiply-add
}
return r
}

func median(a []float64) float64 {
l := len(a)
if l&1 == 1 {
return a[l/2]
}
return (a[l/2] + a[l/2-1]) / 2
}

func norm(a []float64, l float64) float64 {
if len(a) == 0 {
return math.NaN()
}
n := 0.0
sum := 0.0
for _, x := range a {
if math.IsInf(x, 0) || math.IsNaN(x) {
continue
}
sum += math.Pow(math.Abs(x), l)
n++
}
return math.Pow(sum/n, 1/l)
}

func resampleInto(sample, dst []float64) {
for i := range dst {
dst[i] = sample[rand.N[int](len(sample))]
}
slices.Sort(dst)
}
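ratioSummary above is a plain percentile bootstrap over medians: resample numerator and denominator with replacement, take the ratio of the resampled medians, and read the interval off the sorted ratios. A hedged sketch of calling it directly with invented sample values (same package; benchmath.NewSample and benchmath.DefaultThresholds are the x/perf APIs already used elsewhere in this CL):

```go
// Invented data: six baseline and six experiment measurements of some unit.
thresholds := benchmath.DefaultThresholds
baseline := benchmath.NewSample([]float64{100, 101, 99, 102, 98, 100}, &thresholds)
experiment := benchmath.NewSample([]float64{90, 92, 91, 89, 90, 91}, &thresholds)

// 1000 resamples at 95% confidence, matching the constants benchCompare uses.
// center is the median of the bootstrapped ratios; lo and hi are roughly its
// 2.5th and 97.5th percentiles.
lo, center, hi := ratioSummary(baseline, experiment, 0.95, 1000)
fmt.Printf("ratio %.3f [%.3f, %.3f]\n", center, lo, hi)
```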
@@ -0,0 +1,5 @@
# internal/benchtab

This is a fork of `golang.org/x/perf/cmd/benchstat/internal/benchtab`,
which is the core of the `benchstat` command.
The perf service reuses that core to perform its own comparisons.
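A usage sketch (not part of the CL) of the forked API, following the same flow `benchCompare` uses: build projections with `benchproc`, feed filtered results into a `Builder`, then finalize with `ToTables`. The projection and filter strings mirror the ones in `perf/app`; error handling is elided for brevity:

```go
package main

import (
	"log"
	"os"

	"golang.org/x/build/perf/app/internal/benchtab"
	"golang.org/x/perf/benchfmt"
	"golang.org/x/perf/benchmath"
	"golang.org/x/perf/benchproc"
)

func main() {
	filter, _ := benchproc.NewFilter("*")
	var parser benchproc.ProjectionParser
	tableBy, _, _ := parser.ParseWithUnit(".config", filter) // ends in .unit, as NewBuilder requires
	rowBy, _ := parser.Parse(".fullname", filter)
	colBy, _ := parser.Parse("toolchain@(baseline experiment)", filter)
	residue := parser.Residue()

	stat := benchtab.NewBuilder(tableBy, rowBy, colBy, residue)
	r := benchfmt.NewReader(os.Stdin, "stdin")
	for r.Scan() {
		if rec, ok := r.Result().(*benchfmt.Result); ok {
			stat.Add(rec)
		}
	}
	if err := r.Err(); err != nil {
		log.Fatal(err)
	}

	thresholds := benchmath.DefaultThresholds
	tables := stat.ToTables(benchtab.TableOpts{
		Confidence: 0.95,
		Thresholds: &thresholds,
		Units:      r.Units(),
	})
	// Each Table holds one unit; cells with a non-nil Baseline carry a
	// comparison against the first column (the baseline toolchain).
	for i, t := range tables.Tables {
		log.Printf("table %v: unit %s, %d cells", tables.Keys[i], t.Unit, len(t.Cells))
	}
}
```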
@@ -0,0 +1,344 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package benchtab presents benchmark results as comparison tables.
package benchtab

import (
"errors"
"fmt"
"math"
"runtime"
"strings"
"sync"

"github.com/aclements/go-moremath/stats"
"golang.org/x/perf/benchfmt"
"golang.org/x/perf/benchmath"
"golang.org/x/perf/benchproc"
)

// TODO: Color by good/bad (or nothing for unknown units)

// A Builder collects benchmark results into a Tables set.
type Builder struct {
tableBy, rowBy, colBy *benchproc.Projection
residue *benchproc.Projection

unitField *benchproc.Field

// tables maps from tableBy to table.
tables map[benchproc.Key]*builderTable
}

type builderTable struct {
// Observed row and col Keys within this group. Within the
// group, we show only the row and col labels for the data in
// the group, but we sort them according to the global
// observation order for consistency across groups.
rows map[benchproc.Key]struct{}
cols map[benchproc.Key]struct{}

// cells maps from (row, col) to each cell.
cells map[TableKey]*builderCell
}

type builderCell struct {
// values is the observed values in this cell.
values []float64
// residue is the set of residue keys mapped to this cell.
// It is used to check for non-unique keys.
residue map[benchproc.Key]struct{}
}

// NewBuilder creates a new Builder for collecting benchmark results
// into tables. Each result will be mapped to a Table by tableBy.
// Within each table, the results are mapped to cells by rowBy and
// colBy. Any results within a single cell that vary by residue will
// be reported as warnings. tableBy must have a ".unit" field.
func NewBuilder(tableBy, rowBy, colBy, residue *benchproc.Projection) *Builder {
tableFields := tableBy.Fields()
unitField := tableFields[len(tableFields)-1]
if unitField.Name != ".unit" {
panic("tableBy projection missing .unit field")
}
return &Builder{
tableBy: tableBy, rowBy: rowBy, colBy: colBy, residue: residue,
unitField: unitField,
tables: make(map[benchproc.Key]*builderTable),
}
}

// Add adds all of the values in result to the tables in the Builder.
func (b *Builder) Add(result *benchfmt.Result) {
// Project the result.
tableKeys := b.tableBy.ProjectValues(result)
rowKey := b.rowBy.Project(result)
colKey := b.colBy.Project(result)
residueKey := b.residue.Project(result)
cellKey := TableKey{rowKey, colKey}

// Map to tables.
for unitI, tableKey := range tableKeys {
table := b.tables[tableKey]
if table == nil {
table = b.newTable()
b.tables[tableKey] = table
}

// Map to a cell.
c := table.cells[cellKey]
if c == nil {
c = new(builderCell)
c.residue = make(map[benchproc.Key]struct{})
table.cells[cellKey] = c
table.rows[rowKey] = struct{}{}
table.cols[colKey] = struct{}{}
}

// Add to the cell.
c.values = append(c.values, result.Values[unitI].Value)
c.residue[residueKey] = struct{}{}
}
}

func (b *Builder) newTable() *builderTable {
return &builderTable{
rows: make(map[benchproc.Key]struct{}),
cols: make(map[benchproc.Key]struct{}),
cells: make(map[TableKey]*builderCell),
}
}

// TableOpts provides options for constructing the final analysis
// tables from a Builder.
type TableOpts struct {
// Confidence is the desired confidence level in summary
// intervals; e.g., 0.95 for 95%.
Confidence float64

// Thresholds is the thresholds to use for statistical tests.
Thresholds *benchmath.Thresholds

// Units is the unit metadata. This gives distributional
// assumptions for units, among other properties.
Units benchfmt.UnitMetadataMap
}

// Tables is a sequence of benchmark statistic tables.
type Tables struct {
// Tables is a slice of statistic tables. Within a Table, all
// results have the same table Key (including unit).
Tables []*Table
// Keys is a slice of table keys, corresponding 1:1 to
// the Tables slice. These always end with a ".unit"
// field giving the unit.
Keys []benchproc.Key
}

// ToTables finalizes a Builder into a sequence of statistic tables.
func (b *Builder) ToTables(opts TableOpts) *Tables {
// Sort tables.
var keys []benchproc.Key
for k := range b.tables {
keys = append(keys, k)
}
benchproc.SortKeys(keys)

// We're going to compute table cells in parallel because the
// statistics are somewhat expensive. This is entirely
// CPU-bound, so we put a simple concurrency limit on it.
limit := make(chan struct{}, 2*runtime.GOMAXPROCS(-1))
var wg sync.WaitGroup

// Process each table.
var tables []*Table
for _, k := range keys {
cTable := b.tables[k]

// Get the configured assumption for this unit.
unit := k.Get(b.unitField)
assumption := opts.Units.GetAssumption(unit)

// Sort the rows and columns.
rowKeys, colKeys := mapKeys(cTable.rows), mapKeys(cTable.cols)
table := &Table{
Unit: unit,
Opts: opts,
Assumption: assumption,
Rows: rowKeys,
Cols: colKeys,
Cells: make(map[TableKey]*TableCell),
}
tables = append(tables, table)

// Create all TableCells and fill their Samples. This
// is fast enough it's not worth parallelizing. This
// enables the second pass to look up baselines and
// their samples.
for k, cCell := range cTable.cells {
table.Cells[k] = &TableCell{
Sample: benchmath.NewSample(cCell.values, opts.Thresholds),
}
}

// Populate cells.
baselineCfg := colKeys[0]
wg.Add(len(cTable.cells))
for k, cCell := range cTable.cells {
cell := table.Cells[k]

// Look up the baseline.
if k.Col != baselineCfg {
base, ok := table.Cells[TableKey{k.Row, baselineCfg}]
if ok {
cell.Baseline = base
}
}

limit <- struct{}{}
cCell := cCell
go func() {
summarizeCell(cCell, cell, assumption, opts.Confidence)
<-limit
wg.Done()
}()
}
}
wg.Wait()

// Add summary rows to each table.
for _, table := range tables {
table.SummaryLabel = "geomean"
table.Summary = make(map[benchproc.Key]*TableSummary)

// Count the number of baseline benchmarks. If later
// columns don't have the same number of baseline
// pairings, we know the benchmark sets don't match.
nBase := 0
baseCol := table.Cols[0]
for _, row := range table.Rows {
if _, ok := table.Cells[TableKey{row, baseCol}]; ok {
nBase++
}
}

for i, col := range table.Cols {
var s TableSummary
table.Summary[col] = &s
isBase := i == 0

limit <- struct{}{}
table, col := table, col
wg.Add(1)
go func() {
summarizeCol(table, col, &s, nBase, isBase)
<-limit
wg.Done()
}()
}
}
wg.Wait()

return &Tables{tables, keys}
}

func mapKeys(m map[benchproc.Key]struct{}) []benchproc.Key {
var keys []benchproc.Key
for k := range m {
keys = append(keys, k)
}
benchproc.SortKeys(keys)
return keys
}

func summarizeCell(cCell *builderCell, cell *TableCell, assumption benchmath.Assumption, confidence float64) {
cell.Summary = assumption.Summary(cell.Sample, confidence)

// If there's a baseline, compute comparison.
if cell.Baseline != nil {
cell.Comparison = assumption.Compare(cell.Baseline.Sample, cell.Sample)
}

// Warn for non-singular keys in this cell.
nsk := benchproc.NonSingularFields(mapKeys(cCell.residue))
if len(nsk) > 0 {
// Emit a warning.
var warn strings.Builder
warn.WriteString("benchmarks vary in ")
for i, field := range nsk {
if i > 0 {
warn.WriteString(", ")
}
warn.WriteString(field.Name)
}

cell.Sample.Warnings = append(cell.Sample.Warnings, errors.New(warn.String()))
}
}

func summarizeCol(table *Table, col benchproc.Key, s *TableSummary, nBase int, isBase bool) {
// Collect cells.
//
// This computes the geomean of the summary ratios rather than
// ratio of the summary geomeans. These are identical *if* the
// benchmark sets are the same. But if the benchmark sets
// differ, this leads to more sensible ratios because it's
// still the geomean of the column, rather than being a
// comparison of two incomparable numbers. It's still easy to
// misinterpret, but at least it's not meaningless.
var summaries, ratios []float64
badRatio := false
for _, row := range table.Rows {
cell, ok := table.Cells[TableKey{row, col}]
if !ok {
continue
}
summaries = append(summaries, cell.Summary.Center)
if cell.Baseline != nil {
var ratio float64
a, b := cell.Summary.Center, cell.Baseline.Summary.Center
if a == b {
// Treat 0/0 as 1.
ratio = 1
} else if b == 0 {
badRatio = true
// Keep nBase check working.
ratios = append(ratios, 0)
continue
} else {
ratio = a / b
}
ratios = append(ratios, ratio)
}
}

// If the number of cells in this column that had a baseline
// is the same as the total number of baselines, then we know
// the benchmark sets match. Otherwise, they don't and these
// numbers are probably misleading.
if !isBase && nBase != len(ratios) {
s.Warnings = append(s.Warnings, fmt.Errorf("benchmark set differs from baseline; geomeans may not be comparable"))
}

// Summarize centers.
gm := stats.GeoMean(summaries)
if math.IsNaN(gm) {
s.Warnings = append(s.Warnings, fmt.Errorf("summaries must be >0 to compute geomean"))
} else {
s.HasSummary = true
s.Summary = gm
}

// Summarize ratios.
if !isBase && !badRatio {
gm := stats.GeoMean(ratios)
if math.IsNaN(gm) {
s.Warnings = append(s.Warnings, fmt.Errorf("ratios must be >0 to compute geomean"))
} else {
s.HasRatio = true
s.Ratio = gm
}
}
}
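To make the geomean-of-ratios comment in summarizeCol concrete, here is a small self-contained example with invented numbers; the two quantities coincide only because both columns cover the same benchmark set:

```go
package main

import (
	"fmt"
	"math"
)

// geomean computes the geometric mean of xs (all values assumed > 0).
func geomean(xs []float64) float64 {
	logSum := 0.0
	for _, x := range xs {
		logSum += math.Log(x)
	}
	return math.Exp(logSum / float64(len(xs)))
}

func main() {
	baseline := []float64{100, 10}  // invented baseline centers
	experiment := []float64{50, 40} // invented experiment centers

	ratios := []float64{experiment[0] / baseline[0], experiment[1] / baseline[1]}
	fmt.Printf("geomean of ratios: %.3f\n", geomean(ratios))                       // ~1.414
	fmt.Printf("ratio of geomeans: %.3f\n", geomean(experiment)/geomean(baseline)) // ~1.414
	// If the experiment column were missing the second benchmark, the ratio of
	// geomeans would compare mismatched sets, while the geomean of ratios still
	// summarizes only the rows that pair up (with a warning, per summarizeCol).
}
```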
@@ -0,0 +1,105 @@
// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package benchtab

import (
"golang.org/x/perf/benchmath"
"golang.org/x/perf/benchproc"
"golang.org/x/perf/benchunit"
)

// A Table summarizes and compares benchmark results in a 2D grid.
// Each cell summarizes a Sample of results with identical row and
// column Keys. Comparisons are done within each row between the
// Sample in the first column and the Samples in any remaining
// columns.
type Table struct {
// Opts is the configuration options for this table.
Opts TableOpts

// Unit is the benchmark unit of all samples in this Table.
Unit string

// Assumption is the distributional assumption used for all
// samples in this table.
Assumption benchmath.Assumption

// Rows and Cols give the sequence of row and column Keys
// in this table. All row Keys have the same Projection and all
// col Keys have the same Projection.
Rows, Cols []benchproc.Key

// Cells is the cells in the body of this table. Each key in
// this map is a pair of some Key from Rows and some Key
// from Cols. However, not all Pairs may be present in the
// map.
Cells map[TableKey]*TableCell

// Summary is the final row of this table, which gives summary
// information across all benchmarks in this table. It is
// keyed by Cols.
Summary map[benchproc.Key]*TableSummary

// SummaryLabel is the label for the summary row.
SummaryLabel string
}

// TableKey is a map key used to index a single cell in a Table.
type TableKey struct {
Row, Col benchproc.Key
}

// TableCell is a single cell in a Table. It represents a sample of
// benchmark results with the same row and column Key.
type TableCell struct {
// Sample is the set of benchmark results in this cell.
Sample *benchmath.Sample

// Summary is the summary of Sample, as computed by the
// Table's distributional assumption.
Summary benchmath.Summary

// Baseline is the baseline cell used for comparisons with
// this cell, or nil if there is no comparison. This is the
// cell in the first column of this cell's row, if any.
Baseline *TableCell

// Comparison is the comparison with the Baseline cell, as
// computed by the Table's distributional assumption. If
// Baseline is nil, this value is meaningless.
Comparison benchmath.Comparison
}

// TableSummary is a cell that summarizes a column of a Table.
// It appears in the last row of a table.
type TableSummary struct {
// HasSummary indicates that Summary is valid.
HasSummary bool
// Summary summarizes all of the TableCell.Summary values in
// this column.
Summary float64

// HasRatio indicates that Ratio is valid.
HasRatio bool
// Ratio summarizes all of the TableCell.Comparison values in
// this column.
Ratio float64

// Warnings is a list of warnings for this summary cell.
Warnings []error
}

// RowScaler returns a common scaler for the values in row.
func (t *Table) RowScaler(row benchproc.Key, unitClass benchunit.Class) benchunit.Scaler {
// Collect the row summaries.
var values []float64
for _, col := range t.Cols {
cell, ok := t.Cells[TableKey{row, col}]
if ok {
values = append(values, cell.Summary.Center)
}
}
return benchunit.CommonScale(values, unitClass)
}