Remove duplication by making demo.sh call regtest.sh, and fix lint errors.
2015-04-01 14:09:44 -07:00 · 2015-04-01 14:09:44 -07:00 · bbe802081f
--- a/client/python/rappor.py
+++ b/client/python/rappor.py
@ -53,7 +53,7 @@ class Params(object):

  def to_json(self):
    """Convert this instance to JSON.
-    
+
    TODO: The names should be compatible with apps/api.
    """
    return json.dumps({
@ -63,7 +63,7 @@ class Params(object):
        'p': self.prob_p,
        'q': self.prob_q,
        'f': self.prob_f,
-        })
+    })

  # NOTE:
  # - from_csv is currently used in sum_bits.py
--- a/demo.sh
+++ b/demo.sh
@ -9,11 +9,8 @@
 #
 #   $ ./demo.sh run
 #
-# Run demo for just one distribution (no HTML output):
-#
-#   $ ./demo.sh run-dist [exp|gauss|unif]
-#
-# (This takes a minute or so)
+# This takes a minute or so.  It runs a subset of tests from regtest.sh and
+# writes an HTML summary.

 set -o nounset
 set -o pipefail
@ -32,41 +29,8 @@ export PYTHONPATH=$CLIENT_DIR
 # Semi-automated demos
 #

-readonly NUM_UNIQUE_VALUES=50  # number of actual values
-
-# This generates the simulated input s1 .. s<n> with 3 different distributions.
-gen-sim-input-demo() {
-  local dist=$1
-  local num_clients=$2
-  local num_unique_values=${3:-$NUM_UNIQUE_VALUES}
-
-  mkdir -p _tmp
-
-  # Simulating 10,000 clients runs reasonably fast but the results look poor.
-  # 100,000 is slow but looks better.
-  # 50 different client values are easier to plot (default is 100)
-  time tests/gen_sim_input.py \
-    -d $dist \
-    -n $num_clients \
-    -r $num_unique_values \
-    -c 7 \
-    -o _tmp/$dist.csv
-}
-
-rappor-sim() {
-  time tests/rappor_sim.py "$@"
-}
-
-# Do the RAPPOR transformation on our simulated input.
-rappor-sim-demo() {
-  local dist=$1
-  shift
-  rappor-sim -i _tmp/$dist.csv "$@"
-    #-s 0  # deterministic seed
-}
-
-# Like rappor-sim, but run it through the Python profiler.
-rappor-sim-demo-profile() {
+# Run rappor-sim through the Python profiler.
+rappor-sim-profile() {
  local dist=$1
  shift

@ -78,161 +42,21 @@ rappor-sim-demo-profile() {
    | tee _tmp/profile.txt
 }

-# Add some more candidates here.  We hope these are estimated at 0.
-# e.g. if add_start=51, and num_additional is 20, show v51-v70
-more-candidates() {
-  local last_true=$1
-  local num_additional=$2
-
-  local begin
-  local end
-  begin=$(expr $last_true + 1)
-  end=$(expr $last_true + $num_additional)
-
-  seq $begin $end | awk '{print "v" $1}'
-}
-
-# Args:
-#   true_inputs: File of true inputs
-#   last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
-#   num_additional: additional candidates to generate (starting at 'last_true')
-#   to_remove: Regex of true values to omit from the candidates list, or the
-#     string 'NONE' if none should be.  (Our values look like 'v1', 'v2', etc. so
-#     there isn't any ambiguity.)
-print-candidates() {
-  local true_inputs=$1
-  local last_true=$2
-  local num_additional=$3 
-  local to_remove=$4
-
-  if test $to_remove = NONE; then
-    cat $true_inputs  # include all true inputs
-  else
-    egrep -v $to_remove $true_inputs  # remove some true inputs
-  fi
-  more-candidates $last_true $num_additional
-}
-
-hash-candidates() {
-  local dist=$1
-  shift
-  local out=_tmp/${dist}_map.csv
-  time analysis/tools/hash_candidates.py \
-    _tmp/${dist}_params.csv \
-    < _tmp/${dist}_candidates.txt \
-    > $out
-  log "Wrote $out"
-}
-
-sum-bits() {
-  local dist=$1
-  shift
-  local out=_tmp/${dist}_counts.csv
-  analysis/tools/sum_bits.py \
-    _tmp/${dist}_params.csv \
-    < _tmp/${dist}_out.csv \
-    > $out
-  log "Wrote $out"
-}
-
-# Analyze output of Python client library.
-analyze() {
-  local dist=$1
-  local title=$2
-  local prefix=_tmp/$dist
-
-  local out_dir=_tmp/${dist}_report
-  mkdir -p $out_dir
-
-  # The shebang on analyze.R is /usr/bin/Rscript.  With some Linux distros
-  # (Ubuntu), you often need to compile your own R to get say R 3.0 instead of
-  # 2.14.  In that case, do something like:
-  #
-  # export R_SCRIPT=/usr/local/bin/Rscript
-
-  local r_script=${R_SCRIPT:-env}
-  time $r_script tests/analyze.R -t "$title" $prefix $out_dir
-}
-
-# Run end to end for one distribution.
-run-dist() {
-  local dist=$1
-  # TODO: parameterize output dirs by num_clients
-  local num_clients=${2:-100000}
-  local num_additional=${3:-10}  # number of additional candidates
-  local to_remove=${4:-NONE}  # empty by default, set to 'v1|v2' to remove
-
-  banner "Generating simulated input data ($dist)"
-  gen-sim-input-demo $dist $num_clients
-
-  banner "Running RAPPOR ($dist)"
-  rappor-sim-demo $dist
-
-  banner "Generating candidates ($dist)"
-
-  # Keep all candidates
-  print-candidates \
-    _tmp/${dist}_true_inputs.txt $NUM_UNIQUE_VALUES $num_additional \
-    $to_remove \
-    > _tmp/${dist}_candidates.txt
-
-  banner "Hashing Candidates ($dist)"
-  hash-candidates $dist
-
-  banner "Summing bits ($dist)"
-  sum-bits $dist
-
-  # TODO:
-  # guess-candidates  # cheat and get them from the true input
-  # hash-candidates  # create map file
-
-  banner "Analyzing RAPPOR output ($dist)"
-  analyze $dist "Distribution Comparison ($dist)"
-}
-
-expand-html() {
-  local template=${1:-../tests/report.html}
-  local out_dir=${2:-_tmp}
-
-  pushd $out_dir >/dev/null
-
-  # Add simulation parameters and RAPPOR parameters.
-  # NOTE: We're arbitrarily using the "exp" values since params are all
-  # independent of distribution.
-
-  cat $template \
-    | sed -e '/SIM_PARAMS/ r exp_sim_params.html' \
-          -e '/RAPPOR_PARAMS/ r exp_params.html' \
-    > report.html
-
-  log "Wrote $out_dir/report.html.  Open this in your browser."
-
-  popd >/dev/null
-}
-
 # Build prerequisites for the demo.
 build() {
  # This is optional; the simulation will fall back to pure Python code.
  ./build.sh fastrand
 }

-_run() {
-  local num_clients=${1:-100000}
-  for dist in exp gauss unif; do
-    run-dist $dist $num_clients
-  done
-
-  wc -l _tmp/*.csv
-
-  # Expand the HTML skeleton
-  expand-html ../tests/report.html _tmp
-}
-
-# Main entry point.  Run it for all distributions, and time the result.
+# Main entry point that is documented in README.md.
 run() {
-  time _run "$@"
+  # Run all the test cases that start with "demo-", and write to "report.html".
+  # (The original demo.sh used "report.html", so we're not changing the name.)
+  ./regtest.sh run-seq '^demo-' report.html
 }

+# TODO: Port these old bad cases to regtest_spec.py.
+
 # Running the demo of the exponential distribution with 10000 reports (x7,
 # which is 70000 values).
 #
--- a/regtest.sh
+++ b/regtest.sh
@ -40,12 +40,47 @@ readonly REGTEST_DIR=_tmp/regtest
 # All the Python tools need this
 export PYTHONPATH=$CLIENT_DIR

-readonly NUM_SPEC_COLS=${NUM_PROCS:-13}
+readonly NUM_SPEC_COLS=13

 # TODO: Get num cpus
 readonly NUM_PROCS=${NUM_PROCS:-12}


+# Add some more candidates here.  We hope these are estimated at 0.
+# e.g. if last_true=50, and num_additional is 20, show v51-v70
+more-candidates() {
+  local last_true=$1
+  local num_additional=$2
+
+  local begin
+  local end
+  begin=$(expr $last_true + 1)
+  end=$(expr $last_true + $num_additional)
+
+  seq $begin $end | awk '{print "v" $1}'
+}
+
+# Args:
+#   true_inputs: File of true inputs
+#   last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
+#   num_additional: additional candidates to generate (after 'last_true')
+#   to_remove: Regex of true values to omit from the candidates list, or the
+#     string 'NONE' if none should be.  (Our values look like 'v1', 'v2', etc. so
+#     there isn't any ambiguity.)
+print-candidates() {
+  local true_inputs=$1
+  local last_true=$2
+  local num_additional=$3 
+  local to_remove=$4
+
+  if test $to_remove = NONE; then
+    cat $true_inputs  # include all true inputs
+  else
+    egrep -v $to_remove $true_inputs  # remove some true inputs
+  fi
+  more-candidates $last_true $num_additional
+}
+
 # Run a single test case, specified by a line of the test spec.
 # This is a helper function for 'run-all'.

@ -108,7 +143,7 @@ _run-one-case() {
  banner "Constructing candidates"

  # Reuse demo.sh function
-  ./demo.sh print-candidates \
+  print-candidates \
    $case_dir/case_true_inputs.txt $num_unique_values \
    $num_additional "$to_remove" \
    > $case_dir/case_candidates.txt
@ -192,6 +227,7 @@ write-test-cases() {
 # run-all should take regex?
 run-seq() {
  local spec_regex=$1  # grep -E format on the spec
+  local html_filename=${2:-results.html}  # demo.sh changes it to report.html

  local spec_list=$REGTEST_DIR/spec-list.txt
  tests/regtest_spec.py | grep -E $spec_regex > $spec_list
@ -203,7 +239,7 @@ run-seq() {

  log "Done running all test cases"

-  make-summary $REGTEST_DIR
+  make-summary $REGTEST_DIR $html_filename
 }

 run-all() {
--- a/tests/gen_sim_input.py
+++ b/tests/gen_sim_input.py
@ -238,7 +238,8 @@ def main(argv):
          elapsed = time.time() - start_time
          log('Generated %d rows in %.2f seconds', i, elapsed)

-        for _ in xrange(VALUES_PER_CLIENT):  # A fixed number of values per user
+        # A fixed number of values per user
+        for _ in xrange(VALUES_PER_CLIENT):
          true_value = 'v%d' % rand_sample()
          c.writerow((i, true_value))
  log('Wrote %s', OUTFILE)
--- a/tests/report.html
+++ b/tests/report.html
@ -1,23 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-  <title>RAPPOR Demo</title>
-</head>
-
-<body style="text-align: center">
-  <h2>RAPPOR Demo</h2>
-
-  <!-- These strings will be replaced by a sed script. -->
-
-  <!-- SIM_PARAMS -->
-
-  <!-- RAPPOR_PARAMS -->
-
-  <hr/>
-
-  <img src="exp_report/dist.png" alt="exponential distribution" />
-  <img src="gauss_report/dist.png" alt="gauss distribution" />
-  <img src="unif_report/dist.png" alt="uniform distribution" />
-</body>
-
-</html>