More accurate and stable estimation of standard error. Unit tests for new functionality. Added reporting of errors to make_summary.py.
Ilya Mironov 2015-05-04 19:09:56 -07:00
Parent 4e45641bbc
Commit f5632ba57f
5 changed files with 276 additions and 185 deletions

View file

@@ -69,13 +69,6 @@ EstimateBloomCounts <- function(params, obs_counts) {
p_hats <- pmax(0, pmin(1, p_hats)) # clamp to [0,1]
r <- p_hats * p11 + (1 - p_hats) * p01 # expectation of a reported 1
N * r * (1 - r) / p2^2 # variance of the binomial
# using the formula for the random sum of random variables:
# var11 <- p_hats * N * p11 * (1 - p11) + p11^2 * p_hats * (1 - p_hats) * N
# var01 <- (1 - p_hats) * N * p01 * (1 - p01) +
# p01^2 * (1 - p_hats) * p_hats * N
# (var11 + var01) / p2^2
})
# Transform counts from absolute values to fractional, removing bias due to
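For intuition behind the simplified variance above: each reported bit is a Bernoulli draw with success probability r, so the raw count of 1s over N reports is Binomial(N, r) with variance N * r * (1 - r). A quick simulation sketch with made-up parameter values (illustrative only, not part of the commit):

    # Illustrative check that the count of reported 1s has variance N * r * (1 - r)
    set.seed(42)
    N <- 10000; p_hat <- 0.3; p11 <- 0.75; p01 <- 0.25   # hypothetical values
    r <- p_hat * p11 + (1 - p_hat) * p01
    sim_counts <- replicate(2000, sum(rbinom(N, 1, r)))
    var(sim_counts)   # empirical variance of the count
    N * r * (1 - r)   # close to the binomial variance used above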
@@ -108,11 +101,10 @@ FitLasso <- function(X, Y, intercept = TRUE) {
# If fitting fails, return an empty data.frame.
if (class(mod)[1] == "try-error") {
coefs <- rep(0, ncol(X))
names(coefs) <- colnames(X)
coefs <- setNames(rep(0, ncol(X)), colnames(X))
} else {
coefs <- coef(mod)
coefs <- coefs[-1, ncol(coefs)]
coefs <- coefs[-1, ncol(coefs), drop = FALSE]
}
coefs
}
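Several hunks in this commit add drop = FALSE to subscripting, as in coefs[-1, ncol(coefs), drop = FALSE] above. The reason, shown on a generic example unrelated to these particular matrices: R drops dimensions by default when a subscript selects a single row or column, silently turning a one-column matrix into a plain vector.

    m <- matrix(1:6, nrow = 3)
    m[-1, 2]                 # dimensions dropped: a plain numeric vector
    m[-1, 2, drop = FALSE]   # stays a 2 x 1 matrix, so matrix code downstream keeps working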
@@ -135,34 +127,29 @@ PerformInference <- function(X, Y, N, mod, params, alpha, correction) {
betas <- matrix(mod$coefs, ncol = 1)
# This is what we want
# mod_var <- summary(mod$fit)$sigma^2
# betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
# This is what we have
mod_var <- 0
betas_sd <- 1
z_values <- betas / betas_sd
# 1-sided t-test.
p_values <- pnorm(z_values, lower = FALSE)
# mod_var <- summary(mod$fit)$sigma^2
# betas_sd <- rep(sqrt(max(resid_var, mod_var) / (m * h)), length(betas))
#
# z_values <- betas / betas_sd
#
# # 1-sided t-test.
# p_values <- pnorm(z_values, lower = FALSE)
fit <- data.frame(String = colnames(X), Estimate = betas,
SD = betas_sd, z_stat = z_values, pvalue = p_values,
SD = mod$stds, # z_stat = z_values, pvalue = p_values,
stringsAsFactors = FALSE)
if (correction == "FDR") {
fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
if (length(ind) > 0) {
fit <- fit[1:max(ind), ]
} else {
fit <- fit[numeric(0), ]
}
} else {
fit <- fit[fit$p < alpha, ]
}
# if (correction == "FDR") {
# fit <- fit[order(fit$pvalue, decreasing = FALSE), ]
# ind <- which(fit$pvalue < (1:nrow(fit)) * alpha / nrow(fit))
# if (length(ind) > 0) {
# fit <- fit[1:max(ind), ]
# } else {
# fit <- fit[numeric(0), ]
# }
# } else {
# fit <- fit[fit$p < alpha, ]
# }
fit <- fit[order(fit$Estimate, decreasing = TRUE), ]
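For reference, the FDR branch commented out above implements the Benjamini-Hochberg step-up rule. A standalone sketch of the same selection on a toy p-value vector (values are made up, not tied to this codebase):

    pvals <- c(0.001, 0.008, 0.039, 0.041, 0.30)
    alpha <- 0.05
    ord <- order(pvals)
    # keep everything up to the largest index that passes the step-up threshold
    passed <- which(pvals[ord] < seq_along(pvals) * alpha / length(pvals))
    selected <- if (length(passed) > 0) ord[1:max(passed)] else integer(0)
    pvals[selected]   # the discoveries at FDR level alpha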
@@ -284,10 +271,11 @@ Decode <- function(counts, map, params, alpha = 0.05,
es <- EstimateBloomCounts(params, counts)
estimates_stds_filtered <- list(estimates = es$estimates[filter_cohorts,],
stds = es$stds[filter_cohorts,])
estimates_stds_filtered <-
list(estimates = es$estimates[filter_cohorts, , drop = FALSE],
stds = es$stds[filter_cohorts, , drop = FALSE])
coefs <- vector()
coefs_all <- vector()
for(r in 1:5)
{
@@ -295,21 +283,24 @@ Decode <- function(counts, map, params, alpha = 0.05,
e <- Resample(estimates_stds_filtered)
else
e <- estimates_stds_filtered
coefs <- rbind(coefs, FitDistribution(e, map[filter_bits,]))
coefs_all <- rbind(coefs_all, FitDistribution(e, map[filter_bits,]))
}
coefs_ssd <- N * apply(coefs, 2, sd) # compute sample standard deviations
coefs <- N * apply(coefs, 2, median)
coefs_ssd <- N * apply(coefs_all, 2, sd) # compute sample standard deviations
coefs_ave <- N * apply(coefs_all, 2, mean)
coefs[coefs < coefs_ssd] <- 0 # zero out coefficients within ssd from 0
# Only select coefficients more than two standard deviations from 0. May
# exaggerate empirical SD of the estimates.
reported <- which(coefs_ave > 1E-6 + 2 * coefs_ssd)
mod <- list(coefs = coefs, resid = NULL) # a stub for now
mod <- list(coefs = coefs_ave[reported], stds = coefs_ssd[reported])
if (correction == "Bonferroni") {
alpha <- alpha / S
}
inf <- PerformInference(map[filter_bits,],
inf <- PerformInference(map[filter_bits,reported, drop = FALSE],
as.vector(t(estimates_stds_filtered$estimates)),
N, mod, params, alpha,
correction)
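The new selection rule above keeps only coefficients whose averaged estimate exceeds two empirical standard deviations (plus a small tolerance). A toy sketch of that cut with made-up numbers:

    est <- c(v1 = 520, v2 = 35, v3 = 4)   # hypothetical averaged estimates
    ssd <- c(v1 = 40,  v2 = 30, v3 = 5)   # hypothetical sample standard deviations
    reported <- which(est > 1E-6 + 2 * ssd)
    est[reported]   # only v1 clears the two-sigma threshold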
@@ -319,12 +310,11 @@ Decode <- function(counts, map, params, alpha = 0.05,
if (sum(map) == sum(diag(map))) {
fit$Estimate <- colSums(counts)[-1]
}
resid <- mod$resid / inf$resid_sigma
# Estimates from the model are per instance so must be multiplied by h.
# Standard errors are also adjusted.
fit$Total_Est <- floor(fit$Estimate)
fit$Total_SD <- floor(fit$SD * m)
fit$Total_SD <- floor(fit$SD)
fit$Prop <- fit$Total_Est / N
fit$LPB <- fit$Prop - 1.96 * fit$Total_SD / N
fit$UPB <- fit$Prop + 1.96 * fit$Total_SD / N
@@ -351,7 +341,7 @@ Decode <- function(counts, map, params, alpha = 0.05,
list(fit = fit, summary = res_summary, privacy = privacy, params = params,
lasso = NULL, ests = as.vector(t(estimates_stds_filtered$estimates)),
counts = counts[, -1], resid = resid)
counts = counts[, -1], resid = NULL)
}
ComputeCounts <- function(reports, cohorts, params) {

View file

@@ -12,12 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This library implements the RAPPOR marginal decoding algorithms using LASSO.
library(RUnit)
library(abind)
source('analysis/R/decode.R')
source('tests/gen_counts.R')
L1Distance <- function(X, Y) {
@@ -39,61 +37,101 @@ LInfDistance <- function(X, Y) {
Y[!names(Y) %in% common])
}
MatrixVectorMerge <- function(mat, vec) {
# Attaches a vector to a matrix, matching corresponding column names
RunMultipleTests <- function(title, fun, repetitions, ...)
{
cat(title, ": ")
pb <- txtProgressBar(min = 0, max = repetitions,
width = getOption("width") - 20 - nchar(title))
mat_only <- setdiff(colnames(mat), names(vec))
vec_only <- setdiff(names(vec), colnames(mat))
for(i in 1:repetitions)
{
setTxtProgressBar(pb, i)
# extend the vector with missing columns
vec_long <- c(vec, setNames(rep(NA, length(mat_only)), mat_only))
fun(...)
# extend the matrix with missing columns
newcols <- matrix(NA, nrow = nrow(mat), ncol = length(vec_only))
colnames(newcols) <- vec_only
mat_long <- cbind(mat, newcols)
# Now vec and mat have the same columns, but in the wrong order. Sort the
# columns lexicographically.
if(length(vec_long) > 0) {
mat_long <- mat_long[, order(colnames(mat_long)), drop = FALSE]
vec_long <- vec_long[order(names(vec_long))]
}
cat(" Done.")
close(pb)
rbind(mat_long, vec_long)
}
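Assuming MatrixVectorMerge returns the matrix with the vector appended as a new row, aligning columns by name and filling columns that only one side has with NA (as the code above suggests), a hypothetical use looks like this:

    m <- matrix(1:4, nrow = 2, dimnames = list(NULL, c("a", "b")))
    v <- setNames(c(10, 20), c("b", "c"))
    MatrixVectorMerge(m, v)
    # columns aligned by name and sorted lexicographically; roughly:
    #      a  b  c
    #      1  3 NA
    #      2  4 NA
    #     NA 10 20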
TestEstimatesAndStdsHelper <- function(params, map, partition) {
RunMultipleTests <- function(title, fun, repetitions, ...) {
# Run a function with an annotated progress indicator
cat(title, ": ")
if(repetitions == 1) {
# only run once
fun(...)
cat(" Done.")
}
else { # run multiple times
pb <- txtProgressBar(min = 0, max = repetitions,
width = getOption("width") - 20 - nchar(title))
for(i in 1:repetitions) {
setTxtProgressBar(pb, i)
fun(...)
}
cat(" Done.")
close(pb)
}
}
TestEstimatesAndStdsHelper <- function(params, map, pdf, total) {
# Helper function for TestEstimateBloomCounts.
partition <- RandomPartition(total, pdf)
counts <- GenerateCounts(params, map, partition, 1)
e <- EstimateBloomCounts(params, counts)
results$estimates <<- abind(results$estimates, e$estimates, along = 3)
results$stds <<- abind(results$stds, e$stds, along = 3)
results$counts <<- abind(results$counts, counts, along = 3)
}
TestEstimatesAndStds <- function(repetitions, title,
params, map, partition, true_distr) {
v <- 1 # only handle one report per client
total <- sum(partition)
results <<- c(estimates = list(), stds = list(), counts = list())
TestEstimatesAndStds <- function(repetitions, title, params, map, pdf, total) {
# Checks that the estimates returned by EstimateBloomCounts on simulated
# inputs match the ground truth, and that the empirically observed standard
# deviations match the standard deviations reported by EstimateBloomCounts.
#
# Input:
# repetitions: the number of runs of EstimateBloomCounts
# title: label
# params: params vector
# map: the map table
# pdf: probability density function of the distribution from which simulated
# clients are sampled
# total: number of reports
results <<- c(estimates = list(), stds = list())
RunMultipleTests(title, TestEstimatesAndStdsHelper, repetitions,
params, map, partition)
params, map, pdf, total)
ave_e <- apply(results$estimates,1:2, mean)
observed_stds <- apply(results$estimates,1:2, sd)
ave_stds <- apply(results$stds,1:2, mean)
if(!is.null(true_distr))
checkTrue(!any((ave_e - true_distr) > (ave_stds / repetitions^.5) * 5),
ground_truth <- matrix(map %*% pdf, nrow = params$m, byrow = TRUE)
checkTrue(!any(abs(ave_e - ground_truth) > 1E-9 + # tolerance level
(ave_stds / repetitions^.5) * 5),
"Averages deviate too much from expectations.")
checkTrue(!any(observed_stds > ave_stds * 2),
"Expected standard deviations are too pessimistic.")
checkTrue(!any(observed_stds > ave_stds * (1 + 5 * repetitions^.5)),
"Expected standard deviations are too high")
checkTrue(!any(observed_stds < ave_stds / 2),
"Expected standard deviations are too optimistic")
checkTrue(!any(observed_stds < ave_stds * (1 - 5 * repetitions^.5)),
"Expected standard deviations are too low")
}
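The ground_truth computation above reshapes map %*% pdf, the expected fraction of 1s per Bloom filter bit under no noise, into one row per cohort. A small self-contained illustration with a hypothetical 2-cohort, 4-bit, 2-value map (not the map used in the tests below):

    library(Matrix)
    m <- 2; k <- 4                        # cohorts and bits per cohort
    map <- Matrix(0, nrow = m * k, ncol = 2, sparse = TRUE,
                  dimnames = list(NULL, c("v1", "v2")))
    map[1, "v1"] <- 1                     # v1 sets bit 1 of cohort 1
    map[2, "v2"] <- 1                     # v2 sets bit 2 of cohort 1
    map[5, "v1"] <- 1; map[5, "v2"] <- 1  # both values collide on bit 1 of cohort 2
    pdf <- c(v1 = 2/3, v2 = 1/3)
    ground_truth <- matrix(as.vector(map %*% pdf), nrow = m, byrow = TRUE)
    ground_truth  # row i holds the expected fraction of 1s in each bit of cohort i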
TestEstimateBloomCounts <- function() {
# Unit tests for the EstimateBloomCounts function.
report4x2 <- list(k = 4, m = 2) # 2 cohorts, 4 bits each
map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values
map0[1,] <- c(1, 0, 0)
@@ -104,19 +142,17 @@ TestEstimateBloomCounts <- function() {
colnames(map0) <- c('v1', 'v2', 'v3')
partition0 <- c(3, 2, 1) * 100
names(partition0) <- colnames(map0)
true_distr <- matrix(c(1/2, 1/3, 1/6, 1, 1/6, 0, 0, 0), 2, 4, byrow = TRUE)
pdf0 <- c(1/2, 1/3, 1/6)
names(pdf0) <- colnames(map0)
noise0 <- list(p = 0, q = 1, f = 0) # no noise at all
TestEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (1/3)",
c(report4x2, noise0), map0, partition0, true_distr)
c(report4x2, noise0), map0, pdf0, 100)
noise1 <- list(p = 0.4, q = .6, f = 0.5)
TestEstimatesAndStds(repetitions = 1000, "Testing estimates and stds (2/3)",
c(report4x2, noise1), map0, partition0, true_distr)
c(report4x2, noise1), map0, pdf0, 100)
# MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports
values <- 100
@@ -127,35 +163,66 @@ TestEstimateBloomCounts <- function() {
colnames(map1) <- sprintf("v%d", 1:values)
pdf <- ComputePdf("zipf1", values)
partition1 <- RandomPartition(10^9, pdf)
pdf1 <- ComputePdf("zipf1", values)
TestEstimatesAndStds(repetitions = 100, "Testing estimates and stds (3/3)",
c(report8x32, noise1), map1, partition1, NULL)
c(report8x32, noise1), map1, pdf1, 10^9)
}
TestDecodeHelper <- function(params, map, partition, tolerance_l1,
tolerance_linf) {
# Helper function for TestDecode.
TestDecodeHelper <- function(params, map, pdf, num_clients,
tolerance_l1, tolerance_linf) {
# Helper function for TestDecode. Simulates a RAPPOR run and checks Decode's
# output against the ground truth. Results are appended to a global
# list.
partition <- RandomPartition(num_clients, pdf)
counts <- GenerateCounts(params, map, partition, 1)
total <- sum(partition)
decoded <- Decode(counts, map, params)
l1 <- L1Distance(setNames(decoded$fit$estimate, decoded$fit$strings),
partition)
decoded_partition <- setNames(decoded$fit$estimate, decoded$fit$strings)
checkTrue(L1Distance(setNames(decoded$fit$estimate, decoded$fit$strings),
partition) < total^.5 * tolerance_l1,
results$estimates <<- MatrixVectorMerge(results$estimates, decoded_partition)
results$stds <<- MatrixVectorMerge(results$stds,
setNames(decoded$fit$std_dev,
decoded$fit$strings))
checkTrue(L1Distance(decoded_partition, partition) < total^.5 * tolerance_l1,
"L1 distance is too large")
checkTrue(LInfDistance(setNames(decoded$fit$estimate, decoded$fit$strings),
partition) < max(partition)^.5 * tolerance_linf,
"L_inf distance is too large")
checkTrue(LInfDistance(decoded_partition, partition) <
max(partition)^.5 * tolerance_linf, "L_inf distance is too large")
}
TestDecodeAveAndStds <- function(...) {
# Runs Decode multiple times (specified by the repetitions argument), checks
# individual runs against the ground truth, and the estimates of the standard
# error against empirical observations.
results <<- list(estimates = matrix(nrow = 0, ncol = 0),
stds = matrix(nrow = 0, ncol = 0))
RunMultipleTests(...)
empirical_stds <- apply(results$estimates, 2, sd, na.rm = TRUE)
estimated_stds <- apply(results$stds, 2, mean, na.rm = TRUE)
if(dim(results$estimates)[1] > 1)
{
checkTrue(any(estimated_stds > empirical_stds / 2),
"Our estimate for the standard deviation is too low")
checkTrue(any(estimated_stds < empirical_stds * 3),
"Our estimate for the standard deviation is too high")
}
}
TestDecode <- function() {
# Unit tests for the Decode function.
# TOY TESTS: three values, 2 cohorts, 4 bits each
report4x2 <- list(k = 4, m = 2, h = 2) # 2 cohorts, 4 bits each
map0 <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values
map0[1,] <- c(1, 0, 0)
@@ -165,30 +232,28 @@ TestDecode <- function() {
map0[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3
colnames(map0) <- c('v1', 'v2', 'v3')
distribution0 <- setNames(c(1/2, 1/3, 1/6), colnames(map0))
# toy example
distribution0 <- setNames(c(.5, .3, 1/6), colnames(map0))
noise0 <- list(p = 0, q = 1, f = 0) # no noise whatsoever
# Even in the absence of noise, the inferred counts won't necessarily
# match the ground truth. Must be close enough though.
noise0 <- list(p = 0, q = 1, f = 0) # no noise whatsoever
# RunMultipleTests("Testing Decode (1/5)", TestDecodeHelper, 100,
# c(report4x2, noise0), map0, partition0,
# tolerance_l1 = 5,
# tolerance_linf = 3)
TestDecodeAveAndStds("Testing Decode (1/5)", TestDecodeHelper, 100,
c(report4x2, noise0), map0, distribution0, 100,
tolerance_l1 = 5,
tolerance_linf = 3)
noise1 <- list(p = .4, q = .6, f = .5) # substantial noise
RunMultipleTests("Testing Decode (2/5)", TestDecodeHelper, 100,
c(report4x2, noise1), map0, partition0,
tolerance_l1 = 20,
tolerance_linf = 10)
noise1 <- list(p = .4, q = .6, f = .5) # substantial noise, very few reports
TestDecodeAveAndStds("Testing Decode (2/5)", TestDecodeHelper, 100,
c(report4x2, noise1), map0, distribution0, 100,
tolerance_l1 = 20,
tolerance_linf = 20)
partition1 <- setNames(c(3, 2, 1) * 100000, colnames(map0)) # many reports
RunMultipleTests("Testing Decode (3/5)", TestDecodeHelper, 100,
c(report4x2, noise1), map0, partition1,
tolerance_l1 = 50,
tolerance_linf = 40)
# substantial noise, many reports
TestDecodeAveAndStds("Testing Decode (3/5)", TestDecodeHelper, 100,
c(report4x2, noise1), map0, distribution0, 100000,
tolerance_l1 = 50,
tolerance_linf = 40)
# MEDIUM TEST: 100 values, 32 cohorts, 8 bits each, 10^6 reports
values <- 100
@@ -199,12 +264,12 @@ TestDecode <- function() {
colnames(map1) <- sprintf("v%d", 1:values)
pdf <- ComputePdf("zipf1", values)
partition1 <- setNames(RandomPartition(10^6, pdf), colnames(map1))
RunMultipleTests("Testing Decode (4/5)", TestDecodeHelper, 100,
c(report8x32, noise1), map1, partition1,
distribution1 <- ComputePdf("zipf1", values)
names(distribution1) <- colnames(map1)
TestDecodeAveAndStds("Testing Decode (4/5)", TestDecodeHelper, 100,
c(report8x32, noise1), map1, distribution1, 10^6,
tolerance_l1 = values * 3,
tolerance_linf = 50)
tolerance_linf = 100)
# Testing LASSO: 500 values, 32 cohorts, 8 bits each, 10^6 reports
values <- 500
@@ -215,10 +280,11 @@ TestDecode <- function() {
colnames(map2) <- sprintf("v%d", 1:values)
pdf <- ComputePdf("zipf1.5", values)
partition2 <- setNames(RandomPartition(10^6, pdf), colnames(map2))
RunMultipleTests("Testing Decode (5/5)", TestDecodeHelper, 1,
c(report8x32, noise0), map2, partition2,
distribution2 <- ComputePdf("zipf1.5", values)
names(distribution2) <- colnames(map2)
TestDecodeAveAndStds("Testing Decode (5/5)", TestDecodeHelper, 1,
c(report8x32, noise0), map2, distribution2, 10^6,
tolerance_l1 = values * 3,
tolerance_linf = 20)
@@ -229,5 +295,4 @@ TestAll <- function() {
TestDecode()
}
TestAll()
TestAll()

View file

@@ -232,7 +232,7 @@ make-summary() {
local dir=$1
local filename=${2:-results.html}
tests/make_summary.py $dir > $dir/rows.html
tests/make_summary.py $dir $dir/rows.html
pushd $dir >/dev/null
@@ -240,6 +240,8 @@ make-summary() {
| sed -e '/TABLE_ROWS/ r rows.html' \
> $filename
rm rows.html
popd >/dev/null
log "Wrote $dir/$filename"

View file

@@ -19,18 +19,19 @@ source('analysis/R/read_input.R')
RandomPartition <- function(total, weights) {
# Outputs a random partition according to a specified distribution
# Args:
# total - number of balls
# weights - vector encoding the probability that a ball lands into a bin
# total - number of samples
# weights - weights that are proportional to the probability density
# function of the target distribution
# Returns:
# an integer vector summing up to total
# a histogram sampled according to the pdf
# Example:
# > RandomPartition(100, c(3, 2, 1, 0, 1))
# [1] 47 24 15 0 14
if (any(weights < 0))
stop("Weights cannot be negative")
stop("Probabilities cannot be negative")
if (sum(weights) == 0)
stop("Weights cannot sum up to 0")
stop("Probabilities cannot sum up to 0")
bins <- length(weights)
result <- rep(0, bins)
@@ -59,6 +60,8 @@ RandomPartition <- function(total, weights) {
w <- w - weights[i]
}
names(result) <- names(weights)
return(result)
}
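Functionally, RandomPartition draws a multinomial sample over the bins defined by the weights. A rough base-R equivalent of the docstring example above (the exact draw will differ from run to run):

    set.seed(1)
    weights <- c(3, 2, 1, 0, 1)
    as.vector(rmultinom(1, size = 100, prob = weights / sum(weights)))
    # one possible draw, e.g. 47 24 15 0 14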

View file

@@ -1,5 +1,5 @@
#!/usr/bin/python
"""Given a regtest result tree, prints an HTML summary on stdout.
"""Given a regtest result tree, prints an HTML summary to a file.
See HTML skeleton in tests/regtest.html.
"""
@@ -170,56 +170,64 @@ def ParseMetrics(metrics_file, log_file, num_additional):
"""Processes the metrics file.
Args:
report_dir: A directory name containing metrics.csv and log.txt.
metrics_file: name of the metrics file
log_file: name of the log.txt file
num_additional: A number of bogus candidates added to the candidate list.
Returns a pair:
- A dictionary of metrics (some can be []).
- An HTML-formatted portion of the report row.
"""
with open(metrics_file) as m:
m.readline()
metrics_row = m.readline().split(',')
(num_actual, num_rappor, num_false_pos, num_false_neg, total_variation,
allocated_mass) = metrics_row
if not os.path.isfile(metrics_file):
metrics_row_str = ['', '', '', '', '', '']
metrics_row_dict = {}
else:
with open(metrics_file) as m:
m.readline()
metrics_row = m.readline().split(',')
num_actual = int(num_actual)
num_rappor = int(num_rappor)
(num_actual, num_rappor, num_false_pos, num_false_neg, total_variation,
allocated_mass) = metrics_row
num_false_pos = int(num_false_pos)
num_false_neg = int(num_false_neg)
num_actual = int(num_actual)
num_rappor = int(num_rappor)
total_variation = float(total_variation)
allocated_mass = float(allocated_mass)
num_false_pos = int(num_false_pos)
num_false_neg = int(num_false_neg)
total_variation = float(total_variation)
allocated_mass = float(allocated_mass)
# e.g. if there are 20 additional candidates added, and 1 false positive,
# the false positive rate is 5%.
fp_rate = float(num_false_pos) / num_additional if num_additional else 0
# e.g. if there are 100 strings in the true input, and 80 strings
# detected by RAPPOR, then we have 20 false negatives, and a false
# negative rate of 20%.
fn_rate = float(num_false_neg) / num_actual
metrics_row_str = [
str(num_actual),
str(num_rappor),
'%.1f%% (%d)' % (fp_rate * 100, num_false_pos) if num_additional
else '',
'%.1f%% (%d)' % (fn_rate * 100, num_false_neg),
'%.3f' % total_variation,
'%.3f' % allocated_mass,
]
metrics_row_dict = {
'tv': [total_variation],
'fpr': [fp_rate] if num_additional else [],
'fnr': [fn_rate],
'am': [allocated_mass],
}
elapsed_time = ExtractTime(log_file)
# e.g. if there are 20 additional candidates added, and 1 false positive,
# the false positive rate is 5%.
fp_rate = float(num_false_pos) / num_additional if num_additional else 0
# e.g. if there are 100 strings in the true input, and 80 strings
# detected by RAPPOR, then we have 20 false negatives, and a false
# negative rate of 20%.
fn_rate = float(num_false_neg) / num_actual
metrics_row_str = [
str(num_actual),
str(num_rappor),
'%.1f%% (%d)' % (fp_rate * 100, num_false_pos) if num_additional else '',
'%.1f%% (%d)' % (fn_rate * 100, num_false_neg),
'%.3f' % total_variation,
'%.3f' % allocated_mass,
'%.2f' % elapsed_time if elapsed_time is not None else '',
]
metrics_row_dict = {
'tv': [total_variation],
'fpr': [fp_rate] if num_additional else [],
'fnr': [fn_rate],
'am': [allocated_mass],
'time': [elapsed_time] if elapsed_time is not None else [],
}
if elapsed_time is not None:
metrics_row_str = metrics_row_str + ['%.2f' % elapsed_time]
metrics_row_dict['time'] = [elapsed_time]
# return metrics formatted as HTML table entries
return (metrics_row_dict,
@@ -292,11 +300,15 @@ def FormatPlots(base_dir, test_instances):
def main(argv):
base_dir = argv[1]
output_file = open(argv[2], 'w')
# This file has the test case names, in the order that they should be
# displayed.
path = os.path.join(base_dir, 'test-instances.txt')
with open(path) as f:
instances_file = os.path.join(base_dir, 'test-instances.txt')
if not os.path.isfile(instances_file):
raise RuntimeError('{} is missing'.format(instances_file))
with open(instances_file) as f:
test_instances = [line.strip() for line in f]
# Metrics are assembled into a dictionary of dictionaries. The top-level
@@ -314,6 +326,10 @@ def main(argv):
# file. Instead, rows' names are links to the corresponding .png files.
include_plots = len(test_instances) < 20
instances_succeeded = 0
instances_failed = 0
instances_running = 0
for instance in test_instances:
# A test instance is identified by the test name and the test run.
test_case, test_instance, _ = instance.split(' ')
@@ -334,33 +350,48 @@ def main(argv):
cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file,
plot_file, include_plots)
if os.path.isfile(metrics_file):
# ParseMetrics outputs an HTML table row and also updates lists
metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file,
num_additional)
# ParseMetrics outputs an HTML table row and also updates lists
metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file,
num_additional)
# Update the metrics structure. Initialize dictionaries if necessary.
for m in metrics:
# Update the metrics structure. Initialize dictionaries if necessary.
for m in metrics:
if m in metrics_dict:
if not test_case in metrics[m]:
metrics[m][test_case] = metrics_dict[m]
else:
metrics[m][test_case] += metrics_dict[m]
print '<tr>{}{}{}</tr>'.format(cell1_html, spec_html, metrics_html)
print >>output_file, '<tr>{}{}{}</tr>'.format(cell1_html,
spec_html, metrics_html)
print FormatSummaryRow(metrics)
# Update counters
if 'tv' in metrics_dict:
instances_succeeded += 1
else:
if 'time' in metrics_dict:
instances_failed += 1
else:
if os.path.isfile(log_file):
instances_running += 1
print '</tbody>'
print '</table>'
print '<p style="padding-bottom: 3em"></p>' # vertical space
print >>output_file, FormatSummaryRow(metrics)
print >>output_file, '</tbody>'
print >>output_file, '</table>'
print >>output_file, '<p style="padding-bottom: 3em"></p>' # vertical space
# Plot links.
if include_plots:
print FormatPlots(base_dir, test_instances)
print >>output_file, FormatPlots(base_dir, test_instances)
else:
print ('<p>Too many tests to include plots. '
'Click links within rows for details.</p>')
print >>output_file, ('<p>Too many tests to include plots. '
'Click links within rows for details.</p>')
print ('Instances'
' succeeded: {} failed: {} running: {} total: {}'.
format(instances_succeeded, instances_failed, instances_running,
len(test_instances)))
if __name__ == '__main__':
try: