зеркало из https://github.com/mozilla/rappor.git
Remove duplication by making demo.sh call regtest.sh, and fix lint
errors.
This commit is contained in:
Родитель
1e0f52efbe
Коммит
bbe802081f
|
@ -53,7 +53,7 @@ class Params(object):
|
|||
|
||||
def to_json(self):
|
||||
"""Convert this instance to JSON.
|
||||
|
||||
|
||||
TODO: The names should be compatible with apps/api.
|
||||
"""
|
||||
return json.dumps({
|
||||
|
@ -63,7 +63,7 @@ class Params(object):
|
|||
'p': self.prob_p,
|
||||
'q': self.prob_q,
|
||||
'f': self.prob_f,
|
||||
})
|
||||
})
|
||||
|
||||
# NOTE:
|
||||
# - from_csv is currently used in sum_bits.py
|
||||
|
|
196
demo.sh
196
demo.sh
|
@ -9,11 +9,8 @@
|
|||
#
|
||||
# $ ./demo.sh run
|
||||
#
|
||||
# Run demo for just one distribution (no HTML output):
|
||||
#
|
||||
# $ ./demo.sh run-dist [exp|gauss|unif]
|
||||
#
|
||||
# (This takes a minute or so)
|
||||
# This takes a minute or so. It runs a subset of tests from regtest.sh and
|
||||
# writes an HTML summary.
|
||||
|
||||
set -o nounset
|
||||
set -o pipefail
|
||||
|
@ -32,41 +29,8 @@ export PYTHONPATH=$CLIENT_DIR
|
|||
# Semi-automated demos
|
||||
#
|
||||
|
||||
readonly NUM_UNIQUE_VALUES=50 # number of actual values
|
||||
|
||||
# This generates the simulated input s1 .. s<n> with 3 different distributions.
|
||||
gen-sim-input-demo() {
|
||||
local dist=$1
|
||||
local num_clients=$2
|
||||
local num_unique_values=${3:-$NUM_UNIQUE_VALUES}
|
||||
|
||||
mkdir -p _tmp
|
||||
|
||||
# Simulating 10,000 clients runs reasonably fast but the results look poor.
|
||||
# 100,000 is slow but looks better.
|
||||
# 50 different client values are easier to plot (default is 100)
|
||||
time tests/gen_sim_input.py \
|
||||
-d $dist \
|
||||
-n $num_clients \
|
||||
-r $num_unique_values \
|
||||
-c 7 \
|
||||
-o _tmp/$dist.csv
|
||||
}
|
||||
|
||||
rappor-sim() {
|
||||
time tests/rappor_sim.py "$@"
|
||||
}
|
||||
|
||||
# Do the RAPPOR transformation on our simulated input.
|
||||
rappor-sim-demo() {
|
||||
local dist=$1
|
||||
shift
|
||||
rappor-sim -i _tmp/$dist.csv "$@"
|
||||
#-s 0 # deterministic seed
|
||||
}
|
||||
|
||||
# Like rappor-sim, but run it through the Python profiler.
|
||||
rappor-sim-demo-profile() {
|
||||
# Run rappor-sim through the Python profiler.
|
||||
rappor-sim-profile() {
|
||||
local dist=$1
|
||||
shift
|
||||
|
||||
|
@ -78,161 +42,21 @@ rappor-sim-demo-profile() {
|
|||
| tee _tmp/profile.txt
|
||||
}
|
||||
|
||||
# Add some more candidates here. We hope these are estimated at 0.
|
||||
# e.g. if last_true=50 and num_additional is 20, show v51-v70
|
||||
more-candidates() {
|
||||
local last_true=$1
|
||||
local num_additional=$2
|
||||
|
||||
local begin
|
||||
local end
|
||||
begin=$(expr $last_true + 1)
|
||||
end=$(expr $last_true + $num_additional)
|
||||
|
||||
seq $begin $end | awk '{print "v" $1}'
|
||||
}
|
||||
|
||||
# Args:
|
||||
# true_inputs: File of true inputs
|
||||
# last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
|
||||
# num_additional: additional candidates to generate (starting just after 'last_true')
|
||||
# to_remove: Regex of true values to omit from the candidates list, or the
|
||||
# string 'NONE' if none should be. (Our values look like 'v1', 'v2', etc. so
|
||||
# there isn't any ambiguity.)
|
||||
print-candidates() {
|
||||
local true_inputs=$1
|
||||
local last_true=$2
|
||||
local num_additional=$3
|
||||
local to_remove=$4
|
||||
|
||||
if test $to_remove = NONE; then
|
||||
cat $true_inputs # include all true inputs
|
||||
else
|
||||
egrep -v $to_remove $true_inputs # remove some true inputs
|
||||
fi
|
||||
more-candidates $last_true $num_additional
|
||||
}
|
||||
|
||||
hash-candidates() {
|
||||
local dist=$1
|
||||
shift
|
||||
local out=_tmp/${dist}_map.csv
|
||||
time analysis/tools/hash_candidates.py \
|
||||
_tmp/${dist}_params.csv \
|
||||
< _tmp/${dist}_candidates.txt \
|
||||
> $out
|
||||
log "Wrote $out"
|
||||
}
|
||||
|
||||
sum-bits() {
|
||||
local dist=$1
|
||||
shift
|
||||
local out=_tmp/${dist}_counts.csv
|
||||
analysis/tools/sum_bits.py \
|
||||
_tmp/${dist}_params.csv \
|
||||
< _tmp/${dist}_out.csv \
|
||||
> $out
|
||||
log "Wrote $out"
|
||||
}
|
||||
|
||||
# Analyze output of Python client library.
|
||||
analyze() {
|
||||
local dist=$1
|
||||
local title=$2
|
||||
local prefix=_tmp/$dist
|
||||
|
||||
local out_dir=_tmp/${dist}_report
|
||||
mkdir -p $out_dir
|
||||
|
||||
# The shebang on analyze.R is /usr/bin/Rscript. With some Linux distros
|
||||
# (Ubuntu), you often need to compile your own R to get say R 3.0 instead of
|
||||
# 2.14. In that case, do something like:
|
||||
#
|
||||
# export R_SCRIPT=/usr/local/bin/Rscript
|
||||
|
||||
local r_script=${R_SCRIPT:-env}
|
||||
time $r_script tests/analyze.R -t "$title" $prefix $out_dir
|
||||
}
|
||||
|
||||
# Run end to end for one distribution.
|
||||
run-dist() {
|
||||
local dist=$1
|
||||
# TODO: parameterize output dirs by num_clients
|
||||
local num_clients=${2:-100000}
|
||||
local num_additional=${3:-10} # number of additional candidates
|
||||
local to_remove=${4:-NONE} # empty by default, set to 'v1|v2' to remove
|
||||
|
||||
banner "Generating simulated input data ($dist)"
|
||||
gen-sim-input-demo $dist $num_clients
|
||||
|
||||
banner "Running RAPPOR ($dist)"
|
||||
rappor-sim-demo $dist
|
||||
|
||||
banner "Generating candidates ($dist)"
|
||||
|
||||
# Keep all candidates
|
||||
print-candidates \
|
||||
_tmp/${dist}_true_inputs.txt $NUM_UNIQUE_VALUES $num_additional \
|
||||
$to_remove \
|
||||
> _tmp/${dist}_candidates.txt
|
||||
|
||||
banner "Hashing Candidates ($dist)"
|
||||
hash-candidates $dist
|
||||
|
||||
banner "Summing bits ($dist)"
|
||||
sum-bits $dist
|
||||
|
||||
# TODO:
|
||||
# guess-candidates # cheat and get them from the true input
|
||||
# hash-candidates # create map file
|
||||
|
||||
banner "Analyzing RAPPOR output ($dist)"
|
||||
analyze $dist "Distribution Comparison ($dist)"
|
||||
}
|
||||
|
||||
expand-html() {
|
||||
local template=${1:-../tests/report.html}
|
||||
local out_dir=${2:-_tmp}
|
||||
|
||||
pushd $out_dir >/dev/null
|
||||
|
||||
# Add simulation parameters and RAPPOR parameters.
|
||||
# NOTE: We're arbitrarily using the "exp" values since params are all
|
||||
# independent of distribution.
|
||||
|
||||
cat $template \
|
||||
| sed -e '/SIM_PARAMS/ r exp_sim_params.html' \
|
||||
-e '/RAPPOR_PARAMS/ r exp_params.html' \
|
||||
> report.html
|
||||
|
||||
log "Wrote $out_dir/report.html. Open this in your browser."
|
||||
|
||||
popd >/dev/null
|
||||
}
|
||||
|
||||
# Build prerequisites for the demo.
|
||||
build() {
|
||||
# This is optional; the simulation will fall back to pure Python code.
|
||||
./build.sh fastrand
|
||||
}
|
||||
|
||||
_run() {
|
||||
local num_clients=${1:-100000}
|
||||
for dist in exp gauss unif; do
|
||||
run-dist $dist $num_clients
|
||||
done
|
||||
|
||||
wc -l _tmp/*.csv
|
||||
|
||||
# Expand the HTML skeleton
|
||||
expand-html ../tests/report.html _tmp
|
||||
}
|
||||
|
||||
# Main entry point. Run it for all distributions, and time the result.
|
||||
# Main entry point that is documented in README.md.
|
||||
run() {
|
||||
time _run "$@"
|
||||
# Run all the test cases that start with "demo-", and write to "report.html".
|
||||
# (The original demo.sh used "report.html", so we're not changing the name.)
|
||||
./regtest.sh run-seq '^demo-' report.html
|
||||
}
|
||||
|
||||
# TODO: Port these old bad cases to regtest_spec.py.
|
||||
|
||||
# Running the demo of the exponential distribution with 10000 reports (x7,
|
||||
# which is 70000 values).
|
||||
#
|
||||
|
|
42
regtest.sh
42
regtest.sh
|
@ -40,12 +40,47 @@ readonly REGTEST_DIR=_tmp/regtest
|
|||
# All the Python tools need this
|
||||
export PYTHONPATH=$CLIENT_DIR
|
||||
|
||||
readonly NUM_SPEC_COLS=${NUM_PROCS:-13}
|
||||
readonly NUM_SPEC_COLS=13
|
||||
|
||||
# TODO: Get num cpus
|
||||
readonly NUM_PROCS=${NUM_PROCS:-12}
|
||||
|
||||
|
||||
# Add some more candidates here. We hope these are estimated at 0.
|
||||
# e.g. if last_true=50 and num_additional is 20, show v51-v70
|
||||
more-candidates() {
|
||||
local last_true=$1
|
||||
local num_additional=$2
|
||||
|
||||
local begin
|
||||
local end
|
||||
begin=$(expr $last_true + 1)
|
||||
end=$(expr $last_true + $num_additional)
|
||||
|
||||
seq $begin $end | awk '{print "v" $1}'
|
||||
}
|
||||
|
||||
# Args:
|
||||
# true_inputs: File of true inputs
|
||||
# last_true: last true input, e.g. 50 if we generated "v1" .. "v50".
|
||||
# num_additional: additional candidates to generate (starting just after 'last_true')
|
||||
# to_remove: Regex of true values to omit from the candidates list, or the
|
||||
# string 'NONE' if none should be. (Our values look like 'v1', 'v2', etc. so
|
||||
# there isn't any ambiguity.)
|
||||
print-candidates() {
|
||||
local true_inputs=$1
|
||||
local last_true=$2
|
||||
local num_additional=$3
|
||||
local to_remove=$4
|
||||
|
||||
if test $to_remove = NONE; then
|
||||
cat $true_inputs # include all true inputs
|
||||
else
|
||||
egrep -v $to_remove $true_inputs # remove some true inputs
|
||||
fi
|
||||
more-candidates $last_true $num_additional
|
||||
}
|
||||
|
||||
# Run a single test case, specified by a line of the test spec.
|
||||
# This is a helper function for 'run-all'.
|
||||
|
||||
|
@ -108,7 +143,7 @@ _run-one-case() {
|
|||
banner "Constructing candidates"
|
||||
|
||||
# Reuse demo.sh function
|
||||
./demo.sh print-candidates \
|
||||
print-candidates \
|
||||
$case_dir/case_true_inputs.txt $num_unique_values \
|
||||
$num_additional "$to_remove" \
|
||||
> $case_dir/case_candidates.txt
|
||||
|
@ -192,6 +227,7 @@ write-test-cases() {
|
|||
# run-all should take regex?
|
||||
run-seq() {
|
||||
local spec_regex=$1 # grep -E format on the spec
|
||||
local html_filename=${2:-results.html} # demo.sh changes it to report.html
|
||||
|
||||
local spec_list=$REGTEST_DIR/spec-list.txt
|
||||
tests/regtest_spec.py | grep -E $spec_regex > $spec_list
|
||||
|
@ -203,7 +239,7 @@ run-seq() {
|
|||
|
||||
log "Done running all test cases"
|
||||
|
||||
make-summary $REGTEST_DIR
|
||||
make-summary $REGTEST_DIR $html_filename
|
||||
}
|
||||
|
||||
run-all() {
|
||||
|
|
|
@ -238,7 +238,8 @@ def main(argv):
|
|||
elapsed = time.time() - start_time
|
||||
log('Generated %d rows in %.2f seconds', i, elapsed)
|
||||
|
||||
for _ in xrange(VALUES_PER_CLIENT): # A fixed number of values per user
|
||||
# A fixed number of values per user
|
||||
for _ in xrange(VALUES_PER_CLIENT):
|
||||
true_value = 'v%d' % rand_sample()
|
||||
c.writerow((i, true_value))
|
||||
log('Wrote %s', OUTFILE)
|
||||
|
|
|
@ -1,23 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>RAPPOR Demo</title>
|
||||
</head>
|
||||
|
||||
<body style="text-align: center">
|
||||
<h2>RAPPOR Demo</h2>
|
||||
|
||||
<!-- These strings will be replaced by a sed script. -->
|
||||
|
||||
<!-- SIM_PARAMS -->
|
||||
|
||||
<!-- RAPPOR_PARAMS -->
|
||||
|
||||
<hr/>
|
||||
|
||||
<img src="exp_report/dist.png" alt="exponential distribution" />
|
||||
<img src="gauss_report/dist.png" alt="gauss distribution" />
|
||||
<img src="unif_report/dist.png" alt="uniform distribution" />
|
||||
</body>
|
||||
|
||||
</html>
|
Загрузка…
Ссылка в новой задаче