- Addressing reviewer's comments

- Adding the ability to generate multiple reports per client in the fast count mode
Ilya Mironov 2015-04-17 00:23:14 -07:00
Parent 9a9d690223
Commit 3eafbfb018
10 changed files with 545 additions and 648 deletions

View file

@ -52,7 +52,7 @@ build() {
run() {
# Run all the test cases that start with "demo", sequentially, one instance
# each, with full (slow) report generation.
./regtest.sh run-seq '^demo-' report.html
./regtest.sh run-seq '^demo' 1 F
}
# TODO: Port these old bad cases to regtest_spec.py.

View file

@ -5,24 +5,31 @@
# Usage:
# ./regtest.sh <function name>
# Examples:
#
# $ export NUM_PROCS=20 # 12 by default
# $ ./regtest.sh run-all # run all reg tests with 20 parallel processes
#
# At the end, it will print an HTML summary.
# To run a subset of tests or debug a specific test case, use the 'run-seq'
# function:
#
# The three main functions are:
#   run [<pattern> [<num> [<fast>]]]     - run tests matching <pattern> in
#                                          parallel, each <num> times. The fast
#                                          mode (T/F) skips generating
#                                          per-client reports and samples the
#                                          counts directly.
#   run-seq [<pattern> [<num> [<fast>]]] - ditto, except that tests are run
#                                          sequentially
#   run-all [<num>]                      - run all tests, in parallel, each
#                                          <num> times
#
# $ ./regtest.sh run-seq demo-exp # Sequential run, matches 1 case
# $ ./regtest.sh run-seq demo- # Sequential run, matches multiple cases
# Examples:
# $ ./regtest.sh run-seq unif-small-typical # Sequential run, matches 1 case
# $ ./regtest.sh run-seq unif-small- 3 F # Sequential, each test is run three
#                                          times, using slow generation
# $ ./regtest.sh run unif- # Parallel run, matches multiple cases
# $ ./regtest.sh run unif- 5 # Parallel run, matches multiple cases, each test
#                              is run 5 times
# $ ./regtest.sh run-all # Run all tests once
#
# The first argument to run-seq is a regex in 'grep -E' format. (Detail: Don't
# The <pattern> argument is a regex in 'grep -E' format. (Detail: Don't
# use $ in the pattern, since it matches the whole spec line and not just the
# test case name.)
# test case name.) The number of processors used in a parallel run is one less
# than the number of CPUs on the machine.
# The first argument to run-all is the number of repetitions of each test
# Future speedups:
# - Reuse the same input -- come up with naming scheme based on params
@ -42,11 +49,6 @@ readonly REGTEST_DIR=_tmp/regtest
# All the Python tools need this
export PYTHONPATH=$CLIENT_DIR
readonly NUM_SPEC_COLS=14
# TODO: Get num cpus
readonly NUM_PROCS=${NUM_PROCS:-12}
print-true-inputs() {
local num_unique_values=$1
seq 1 $num_unique_values | awk '{print "v" $1}'
@ -88,128 +90,102 @@ print-candidates() {
}
# Generate a single test case, specified by a line of the test spec.
# This is a helper function for 'run-all'.
_generate-one-case() {
local test_case_id=$1
local test_case_run=$2
# This is a helper function for _run_tests().
_setup-one-case() {
local test_case=$1
# input params
local dist=$3
local num_unique_values=$4
local num_clients=$5
local values_per_client=$6
local dist=$2
local num_unique_values=$3
local num_clients=$4
local values_per_client=$5
# RAPPOR params
local num_bits=$7
local num_hashes=$8
local num_cohorts=$9
local p=${10} # need curly braces to get 10th arg
local q=${11}
local f=${12}
local num_bits=$6
local num_hashes=$7
local num_cohorts=$8
local p=$9
local q=${10} # need curly braces to get the 10th arg
local f=${11}
# map params
local num_additional=${13}
local to_remove=${14}
local num_additional=${12}
local to_remove=${13}
# NOTE: NUM_SPEC_COLS == 14
banner 'Setting up parameters and candidate files for '$test_case
# proceed only for the first instance out of (possibly) many
if test $test_case_run = 1; then
banner 'Setting up parameters and candidate files for '$test_case_id
local case_dir=$REGTEST_DIR/$test_case
mkdir --verbose -p $case_dir
local case_dir=$REGTEST_DIR/$test_case_id
mkdir --verbose -p $case_dir
# Save the "spec"
echo "$@" > $case_dir/spec.txt
# Save the "spec" for showing in the summary.
echo "$@" > $case_dir/spec.txt
local params_path=$case_dir/case_params.csv
local params_path=$case_dir/case_params.csv
echo 'k,h,m,p,q,f' > $params_path
echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
echo 'k,h,m,p,q,f' > $params_path
echo "$num_bits,$num_hashes,$num_cohorts,$p,$q,$f" >> $params_path
print-true-inputs $num_unique_values > $case_dir/case_true_inputs.txt
print-true-inputs $num_unique_values > $case_dir/case_true_inputs.txt
local true_map_path=$case_dir/case_true_map.csv
local true_map_path=$case_dir/case_true_map.csv
analysis/tools/hash_candidates.py \
$params_path \
< $case_dir/case_true_inputs.txt \
> $true_map_path
analysis/tools/hash_candidates.py \
$params_path \
< $case_dir/case_true_inputs.txt \
> $true_map_path
# banner "Constructing candidates"
# banner "Constructing candidates"
print-candidates \
$case_dir/case_true_inputs.txt $num_unique_values \
$num_additional "$to_remove" \
> $case_dir/case_candidates.txt
# Reuse demo.sh function
print-candidates \
$case_dir/case_true_inputs.txt $num_unique_values \
$num_additional "$to_remove" \
> $case_dir/case_candidates.txt
# banner "Hashing candidates to get 'map'"
# banner "Hashing candidates to get 'map'"
analysis/tools/hash_candidates.py \
$case_dir/case_params.csv \
< $case_dir/case_candidates.txt \
> $case_dir/case_map.csv
fi
analysis/tools/hash_candidates.py \
$case_dir/case_params.csv \
< $case_dir/case_candidates.txt \
> $case_dir/case_map.csv
}
# Run a single test instance, specified by a line of the test spec.
# This is a helper function for 'run-all'.
# Run a single test instance, specified by <test_name, instance_num>.
# This is a helper function for _run_tests().
_run-one-instance() {
local test_case_id=$1
local test_case_run=$2
local test_case=$1
local test_instance=$2
local fast_counts=$3
# input params
local dist=$3
local num_unique_values=$4
local num_clients=$5
local values_per_client=$6
local case_dir=$REGTEST_DIR/$test_case
read -r case_name distr num_unique_values num_clients \
values_per_client num_bits num_hashes num_cohorts p q f num_additional \
to_remove < $case_dir/spec.txt
# RAPPOR params
local num_bits=$7
local num_hashes=$8
local num_cohorts=$9
local p=${10} # need curly braces to get 10th arg
local q=${11}
local f=${12}
# map params
local num_additional=${13}
local to_remove=${14}
# NOTE: NUM_SPEC_COLS == 14
local case_dir=$REGTEST_DIR/$test_case_id
local instance_dir=$REGTEST_DIR/$test_case_id/$test_case_run
local instance_dir=$REGTEST_DIR/$test_case/$test_instance
mkdir --verbose -p $instance_dir
local fast_counts=T
if test $fast_counts = T; then
local params_path=$case_dir/case_params.csv
local true_map_path=$case_dir/case_true_map.csv
local num_reports=$(expr $num_clients \* $values_per_client)
local params_file=$case_dir/case_params.csv
local true_map_file=$case_dir/case_true_map.csv
banner "Using gen_counts.R"
tests/gen_counts.R $params_path $true_map_path $dist $num_reports \
"$instance_dir/case"
echo tests/gen_counts.R $distr $num_clients $values_per_client $params_file \
$true_map_file "$instance_dir/case"
tests/gen_counts.R $distr $num_clients $values_per_client $params_file \
$true_map_file "$instance_dir/case"
else
banner "Generating input"
tests/gen_sim_input.py \
-d $dist \
-c $num_clients \
-u $num_unique_values \
-v $values_per_client \
> $instance_dir/case.csv
tests/gen_reports.R $distr $num_unique_values $num_clients \
$values_per_client $instance_dir/case.csv
banner "Running RAPPOR client"
# Writes encoded "out" file, true histogram, true inputs, params CSV and JSON
# to $case_dir.
# Writes encoded "out" file, true histogram, true inputs to $instance_dir.
tests/rappor_sim.py \
--num-bits $num_bits \
--num-hashes $num_hashes \
@ -231,10 +207,14 @@ _run-one-instance() {
local out_dir=${instance_dir}_report
mkdir --verbose -p $out_dir
# Currently, the summary file shows and aggregates timing of the inference
# engine, which excludes R's loading time and reading of the (possibly
# substantial) map file. Timing below is more inclusive.
TIMEFORMAT='Running analyze.R took %R seconds'
time {
# Input prefix, output dir
tests/analyze.R -t "Test case: $test_case_id (instance $test_case_run)" "$case_dir/case" "$instance_dir/case" $out_dir
tests/analyze.R -t "Test case: $test_case (instance $test_instance)" \
"$case_dir/case" "$instance_dir/case" $out_dir
}
}
@ -247,8 +227,9 @@ _run-one-instance-logged() {
mkdir --verbose -p $log_dir
log "Started '$test_case_id' (instance $test_case_run) -- logging to $log_dir/log.txt"
_run-one-instance "$@" >$log_dir/log.txt 2>&1
log "Test case $test_case_id (instance $test_case_run) done"
_run-one-instance "$@" >$log_dir/log.txt 2>&1 \
&& log "Test case $test_case_id (instance $test_case_run) done" \
|| log "Test case $test_case_id (instance $test_case_run) failed"
}
show-help() {
@ -274,11 +255,6 @@ make-summary() {
log "URL: file://$PWD/$dir/$filename"
}
# Helper to parse spec input with xargs
multi() {
xargs -n $NUM_SPEC_COLS --no-run-if-empty --verbose "$@"
}
test-error() {
local spec_regex=${1:-}
log "Some test cases failed"
@ -289,76 +265,95 @@ test-error() {
# exit 1
}
# Assuming the spec file, write a list of test case names (first column). This
# is read by make_summary.py.
write-test-cases() {
cut -d ' ' -f 1,2 $REGTEST_DIR/spec-list.txt > $REGTEST_DIR/test-cases.txt
# Reads the list of test cases on stdin and writes one line per test instance:
# the test case name (first column), an instance id running from 1 to $1
# (second column), and fast_counts T/F (third column).
_setup-test-instances() {
local instances=$1
local fast_counts=$2
while read line; do
for ((i=1; i<=$instances; i++))
do
read case_name _ <<< $line # extract the first token
echo $case_name $i $fast_counts
done
done
}
# run-all should take regex?
run-seq() {
# Args:
# regexp: A pattern selecting the subset of tests to run
# instances: A number of times each test case is run
# parallel: Whether the tests are run in parallel (T/F)
# fast_counts: Whether counts are sampled directly (T/F)
#
_run-tests() {
local spec_regex=$1 # grep -E format on the spec
local html_filename=${2:-results.html} # demo.sh changes it to report.html
local instances=$2
local parallel=$3
local fast_counts=$4
rm -r --verbose $REGTEST_DIR
mkdir --verbose -p $REGTEST_DIR
local spec_list=$REGTEST_DIR/spec-list.txt
tests/regtest_spec.py | grep -E $spec_regex > $spec_list
write-test-cases
# Generate parameters for all test cases.
cat $spec_list \
| multi -- $0 _generate-one-case || test-error
cat $spec_list \
| multi -- $0 _run-one-instance || test-error $spec_regex
log "Done running all test cases"
make-summary $REGTEST_DIR $html_filename
}
run-all() {
# Number of iterations of each test.
local repetitions=${1:-1}
# Limit it to this number of test cases. By default we run all of them.
local max_cases=${2:-1000000}
local verbose=${3:-F}
mkdir --verbose -p $REGTEST_DIR
# Print the spec
#
# -n3 has to match the number of arguments in the spec.
#local func=_run-one-case-logged
local func
if test $verbose = T; then
func=_run-one-instance # parallel process output mixed on the console
local processors=1
if test $parallel = F; then
func=_run-one-instance # output to the console
else
func=_run-one-instance-logged # one line
func=_run-one-instance-logged
processors=$(grep -c ^processor /proc/cpuinfo)
processors=$(expr $processors - 1)
log "Running $processors parallel processes"
fi
log "Using $NUM_PROCS parallel processes"
local spec_list=$REGTEST_DIR/spec-list.txt
tests/regtest_spec.py -r $repetitions > $spec_list
write-test-cases
local cases_list=$REGTEST_DIR/test-cases.txt
tests/regtest_spec.py | grep -E $spec_regex > $cases_list
# Generate parameters for all test cases.
head -n $max_cases $spec_list \
| multi -P $NUM_PROCS -- $0 _generate-one-case || test-error
cat $cases_list \
| xargs -l -P $processors -- $0 _setup-one-case \
|| test-error
log "Done generating parameters for all test cases"
head -n $max_cases $spec_list \
| multi -P $NUM_PROCS -- $0 $func || test-error
local instances_list=$REGTEST_DIR/test-instances.txt
_setup-test-instances $instances $fast_counts < $cases_list > $instances_list
log "Done running all test cases"
cat $instances_list \
| xargs -l -P $processors -- $0 $func || test-error
log "Done running all test instances"
make-summary $REGTEST_DIR
}
# Run tests sequentially
run-seq() {
local spec_regex=${1:-'^r-'} # grep -E format on the spec
local instances=${2:-1}
local fast_counts=${3:-T}
_run-tests $spec_regex $instances F $fast_counts
}
# Run tests in parallel
run() {
local spec_regex=${1:-'^r-'} # grep -E format on the spec
local instances=${2:-1}
local fast_counts=${3:-T}
_run-tests $spec_regex $instances T $fast_counts
}
# Run tests in parallel
run-all() {
local instances=${1:-1}
log "Running all tests. Can take a while."
_run-tests '^r-' $instances T T
}
"$@"

View file

@ -59,24 +59,33 @@ Log <- function(...) {
cat('\n')
}
LoadInputs <- function(prefix_params, prefix_counts, ctx) {
# prefix: path prefix, e.g. '_tmp/exp'
p <- paste0(prefix_params, '_params.csv')
c <- paste0(prefix_counts, '_counts.csv')
m <- paste0(prefix_params, '_map.csv')
h <- paste0(prefix_counts, '_hist.csv')
LoadInputs <- function(prefix_case, prefix_instance, ctx) {
# prefix_case: path prefix to the test case, e.g. '_tmp/exp'
# prefix_instance: path prefix to the test instance, e.g., '_tmp/exp/1'
p <- paste0(prefix_case, '_params.csv')
m <- paste0(prefix_case, '_map.csv')
c <- paste0(prefix_instance, '_counts.csv')
h <- paste0(prefix_instance, '_hist.csv')
params <- ReadParameterFile(p)
counts <- ReadCountsFile(c)
map <- ReadMapFile(m)
# Calls AnalyzeRAPPOR to run the analysis code
rappor <- AnalyzeRAPPOR(params, counts, map$map, "FDR", 0.05,
timing <- system.time({
# Calls AnalyzeRAPPOR to run the analysis code
rappor <- AnalyzeRAPPOR(params, counts, map$map, "FDR", 0.05,
date="01/01/01", date_num="100001")
})
if (is.null(rappor)) {
stop("RAPPOR analysis failed.")
}
# The line is searched for, and elapsed time is extracted, by make_summary.py.
# Should the formatting or wording change, make_summary must be updated too.
Log(c("Inference took", timing["elapsed"], "seconds"))
Log("Analysis Results:")
str(rappor)
@ -195,8 +204,8 @@ main <- function(parsed) {
args <- parsed$args
options <- parsed$options
input_params_prefix <- args[[1]]
input_counts_prefix <- args[[2]]
input_case_prefix <- args[[1]]
input_instance_prefix <- args[[2]]
output_dir <- args[[3]]
# increase ggplot font size globally
@ -207,7 +216,7 @@ main <- function(parsed) {
# NOTE: It takes more than 2000+ ms to get here, while the analysis only
# takes 500 ms or so (as measured by system.time).
LoadInputs(input_params_prefix, input_counts_prefix, ctx)
LoadInputs(input_case_prefix, input_instance_prefix, ctx)
d <- ProcessAll(ctx)
p <- PlotAll(d$plot_data, options$title)

View file

@ -16,8 +16,6 @@
source('analysis/R/read_input.R')
library(zipfR)
RandomPartition <- function(total, weights) {
# Outputs a random partition according to a specified distribution
# Args:
@ -64,13 +62,13 @@ RandomPartition <- function(total, weights) {
return(result)
}
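A minimal usage sketch of RandomPartition (an editor's illustration, not part
of the commit; the weights are made up): the function splits 'total' among the
entries of the weight vector, so the result should sum to the total.

  part <- RandomPartition(total = 10, weights = c(0.7, 0.2, 0.1))
  stopifnot(length(part) == 3, sum(part) == 10)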
GenerateCounts <- function(params, true_map, partition) {
GenerateCounts <- function(params, true_map, partition, reports_per_client) {
# Fast simulation of the marginal table for RAPPOR reports
# Args:
# params - parameters of the RAPPOR reporting process
# total - number of reports
# true_map - hashed true inputs
# weights - vector encoding the probability that a ball lands into a bin
# partition - allocation of clients between true values
# reports_per_client - number of reports (IRRs) per client
if (nrow(true_map$map) != (params$m * params$k)) {
stop(cat("Map does not match the params file!",
"mk =", params$m * params$k,
@ -87,96 +85,115 @@ GenerateCounts <- function(params, true_map, partition) {
# Expands to (m x k) x strs matrix, where each element (corresponding to the
# bit in the aggregate Bloom filter) is repeated k times.
expanded <- apply(cohorts, 2, function(vec) rep(vec, each = params$k))
# Computes the number of bits set to one BEFORE privacy-preserving transform.
counts_ones <- apply(expanded * true_map$map, 1, sum)
# Computes the number of bits set to zero BEFORE privacy-preserving transform.
counts_zeros <- rep(apply(cohorts, 1, sum), each = params$k) - counts_ones
p <- params$p
q <- params$q
f <- params$f
# probability that a true 1 is reported as "1"
pstar <- (1 - f / 2) * q + (f / 2) * p
# probability that a true 0 is reported as "1"
qstar <- (1 - f / 2) * p + (f / 2) * q
# For each bit, the number of clients reporting this bit:
clients_per_bit <- rep(apply(cohorts, 1, sum), each = params$k)
reported_ones <-
unlist(lapply(counts_ones,
function(x) rbinom(n = 1, size = x, prob = pstar))) +
unlist(lapply(counts_zeros,
function(x) rbinom(n = 1, size = x, prob = qstar)))
# Computes the true number of bits set to one BEFORE PRR.
true_ones <- apply(expanded * true_map$map, 1, sum)
ones_in_prr <-
unlist(lapply(true_ones,
function(x) rbinom(n = 1, size = x, prob = 1 - params$f / 2))) +
unlist(lapply(clients_per_bit - true_ones, # clients where the bit is 0
function(x) rbinom(n = 1, size = x, prob = params$f / 2)))
counts <- cbind(apply(cohorts, 1, sum),
matrix(reported_ones, nrow = params$m, ncol = params$k, byrow = TRUE))
# Number of IRRs where each bit is reported (either as 0 or as 1)
reports_per_bit <- clients_per_bit * reports_per_client
ones_before_irr <- ones_in_prr * reports_per_client
ones_after_irr <-
unlist(lapply(ones_before_irr,
function(x) rbinom(n = 1, size = x, prob = params$q))) +
unlist(lapply(reports_per_bit - ones_before_irr,
function(x) rbinom(n = 1, size = x, prob = params$p)))
counts <- cbind(apply(cohorts, 1, sum) * reports_per_client,
matrix(ones_after_irr, nrow = params$m, ncol = params$k, byrow = TRUE))
if(any(is.na(counts)))
stop("Failed to generate bit counts. Likely due to integer overflow.")
counts
}
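As an aside for readers of the diff: below is a single-bit sketch of the
two-stage sampling that GenerateCounts now performs (PRR over clients, then
IRR over reports). It is an editor's illustration with made-up values; the
function above vectorizes the same computation over all m * k bits.

  set.seed(1)
  f <- 0.2; p <- 0.1; q <- 0.9    # RAPPOR noise parameters (hypothetical)
  clients_per_bit <- 1000         # clients reporting in this bit's cohort
  true_ones <- 300                # clients whose true Bloom-filter bit is 1
  reports_per_client <- 10        # IRRs per client (new in this commit)
  # Stage 1 (PRR): each client's bit is randomized with probability f.
  ones_in_prr <- rbinom(1, true_ones, 1 - f / 2) +
    rbinom(1, clients_per_bit - true_ones, f / 2)
  # Stage 2 (IRR): every memoized PRR bit is reported reports_per_client
  # times, with 1s kept with probability q and 0s flipped with probability p.
  reports_per_bit <- clients_per_bit * reports_per_client
  ones_before_irr <- ones_in_prr * reports_per_client
  ones_after_irr <- rbinom(1, ones_before_irr, q) +
    rbinom(1, reports_per_bit - ones_before_irr, p)
  ones_after_irr  # ~ q * ones_before_irr + p * (reports_per_bit - ones_before_irr)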
ComputePdf <- function(distr, range) {
# Outputs discrete probability density function for a given distribution
# These are the five distributions in gen_sim_input.py
if (distr == 'exp') {
pdf <- dexp(1:range, rate = 5 / range)
} else if (distr == 'gauss') {
half <- range / 2
left <- -half + 1
pdf <- dnorm(left : half, sd = range / 6)
} else if (distr == 'unif') {
# e.g. for N = 4, weights are [0.25, 0.25, 0.25, 0.25]
pdf <- dunif(1:range, max = range)
} else if (distr == 'zipf1') {
# Since the distribution is defined over a finite set, we allow the parameter
# of the Zipf distribution to be 1.
pdf <- sapply(1:range, function(x) 1/x)
} else if (distr == 'zipf1.5') {
pdf <- sapply(1:range, function(x) 1/x^1.5)
}
else {
stop(sprintf("Invalid distribution '%s'", distr))
}
pdf <- pdf / sum(pdf) # normalize
pdf
}
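A quick usage sketch of ComputePdf (editor's illustration, not part of the
commit): the returned vector is a normalized pmf over 1..range.

  pdf_unif <- ComputePdf('unif', 4)    # 0.25 0.25 0.25 0.25
  pdf_zipf <- ComputePdf('zipf1', 3)   # proportional to 1, 1/2, 1/3
  stopifnot(abs(sum(pdf_unif) - 1) < 1e-12,
            abs(sum(pdf_zipf) - 1) < 1e-12)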
# Usage:
#
# $ ./gen_counts.R foo_params.csv foo_true_map.csv exp 10000 \
# foo_counts.csv
# $ ./gen_counts.R exp 10000 1 foo_params.csv foo_true_map.csv foo
#
# 4 inputs and 1 output.
# Inputs:
# distribution name
# number of clients
# reports per client
# parameters file
# map file
# prefix for output files
# Outputs:
# foo_counts.csv
# foo_hist.csv
#
# Warning: the number of reports in any cohort must be less than
# .Machine$integer.max
main <- function(argv) {
params_file <- argv[[1]]
true_map_file <- argv[[2]]
dist <- argv[[3]]
num_reports <- as.integer(argv[[4]])
out_prefix <- argv[[5]]
distr <- argv[[1]]
num_clients <- as.integer(argv[[2]])
reports_per_client <- as.integer(argv[[3]])
params_file <- argv[[4]]
true_map_file <- argv[[5]]
out_prefix <- argv[[6]]
params <- ReadParameterFile(params_file)
true_map <- ReadMapFile(true_map_file)
# print(true_map$strs)
num_unique_values <- length(true_map$strs)
# These are the four distributions in gen_sim_input.py
if (dist == 'exp') {
# NOTE: gen_sim_input.py hard-codes lambda = N/5 for 'exp'
weights <- dexp(1:num_unique_values, rate = 5 / num_unique_values)
} else if (dist == 'gauss') {
# NOTE: gen_sim_input.py hard-codes stddev = N/6 for 'gauss'
half <- num_unique_values / 2
left <- -half + 1
weights <- dnorm(left : half, sd = num_unique_values / 6)
} else if (dist == 'unif') {
# e.g. for N = 4, weights are [0.25, 0.25, 0.25, 0.25]
weights <- dunif(1:num_unique_values, max = num_unique_values)
} else if (dist == 'zipf1') {
# Since the distribution is defined over a finite set, we allow the parameter
# of the Zipf distribution to be 1.
weights <- sapply(1:num_unique_values, function(x) 1/x)
} else if (dist == 'zipf1.5') {
weights <- sapply(1:num_unique_values, function(x) 1/x^1.5)
}
else {
stop(sprintf("Invalid distribution '%s'", dist))
}
print("weights")
print(weights)
pdf <- ComputePdf(distr, num_unique_values)
if (length(true_map$strs) != length(weights)) {
stop(cat("Dimensions of weights do not match:",
"m =", length(true_map$strs), "weights col:", length(weights),
sep = " "))
}
print("Distribution")
print(pdf)
# Computes the number of clients reporting each string
# according to the pre-specified distribution.
partition <- RandomPartition(num_reports, weights)
partition <- RandomPartition(num_clients, pdf)
print('PARTITION')
print(partition)
# Histogram
true_hist <- data.frame(string = true_map$strs, count = partition)
counts <- GenerateCounts(params, true_map, partition)
counts <- GenerateCounts(params, true_map, partition, reports_per_client)
# Now create a CSV file

View file

@ -15,9 +15,67 @@
# limitations under the License.
library(RUnit)
library(Matrix) # for sparse matrices
source('tests/gen_counts.R')
TestGenerateCounts <- function() {
report_params <- list(k = 4, m = 2) # 2 cohorts, 4 bits each
map <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values
map[1,] <- c(1, 0, 0)
map[2,] <- c(0, 1, 0)
map[3,] <- c(0, 0, 1)
map[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all three values
map[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3
strs <- c('v1', 'v2', 'v3')
maps <- list(map = map, strs = strs)
partition <- c(3, 2, 1) * 10000
v <- 100 # reports per client
noise0 <- list(p = 0, q = 1, f = 0) # no noise at all
counts0 <- GenerateCounts(c(report_params, noise0), maps, partition, v)
checkEqualsNumeric(sum(counts0[1,2:4]), counts0[1,1])
checkEqualsNumeric(counts0[1,5], counts0[1,1])
checkEqualsNumeric(partition[3] * v, counts0[1,4] + counts0[2,2])
checkEqualsNumeric(sum(partition) * v, counts0[1,1] + counts0[2,1])
pvalues <- chisq.test(counts0[,1] / v, p = c(.5, .5))$p.value
for(i in 2:4)
pvalues <- c(pvalues,
chisq.test(
c(counts0[1,i] / v, partition[i - 1] - counts0[1,i] / v),
p = c(.5, .5))$p.value)
noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs
counts1 <- GenerateCounts(c(report_params, noise1), maps, partition, v)
for(i in 2:4)
for(j in 1:2)
pvalues <- c(pvalues,
chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]),
p = c(.5, .5))$p.value)
noise2 <- list(p = 0, q = 1, f = 1.0) # truly random PRRs
counts2 <- GenerateCounts(c(report_params, noise2), maps, partition, v)
checkEqualsNumeric(0, max(counts2 %% v)) # all entries must be divisible by v
counts2 <- counts2 / v
for(i in 2:4)
for(j in 1:2)
pvalues <- c(pvalues,
chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]),
p = c(.5, .5))$p.value)
checkTrue(min(pvalues) > 1E-9 && max(pvalues) < 1 - 1E-9,
"Chi-squared test failed")
}
TestRandomPartition <- function() {
p1 <- RandomPartition(total = 100, dgeom(0:999, prob = .1))
@ -46,4 +104,9 @@ TestRandomPartition <- function() {
checkTrue((p.value > 1E-9) && (p.value < 1 - 1E-9))
}
TestRandomPartition()
TestAll <- function(){
TestRandomPartition()
TestGenerateCounts()
}
TestAll()

70  tests/gen_reports.R  Executable file
View file

@ -0,0 +1,70 @@
#!/usr/bin/env Rscript
#
# Copyright 2015 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
source('tests/gen_counts.R')
# Usage:
#
# $ ./gen_reports.R exp 100 10000 1 foo.csv
#
# Inputs:
# distribution name
# size of the distribution's support
# number of clients
# reports per client
# name of the output file
# Output:
# csv file with reports sampled according to the specified distribution.
main <- function(argv) {
distr <- argv[[1]]
distr_range <- as.integer(argv[[2]])
num_clients <- as.integer(argv[[3]])
reports_per_client <- as.integer(argv[[4]])
out_file <- argv[[5]]
pdf <- ComputePdf(distr, distr_range)
print("Distribution")
print(pdf)
# Computes the number of clients reporting each value, where the numbers are
# sampled according to pdf.
partition <- RandomPartition(num_clients, pdf)
print('PARTITION')
print(partition)
values <- rep(1:distr_range, partition) # expand partition
stopifnot(length(values) == num_clients)
# Shuffle values randomly (may take a few sec for > 10^8 inputs)
values <- sample(values)
# Obtain reports by prefixing values with "v"s. Even slower than shuffling.
reports <- paste("v", format(values, trim = TRUE), sep = "")
reports <- cbind(1:num_clients, reports) # paste together "1 v342"
reports <- reports[rep(1:nrow(reports), each = reports_per_client), ]
write.table(reports, file = out_file, row.names = FALSE, col.names = FALSE,
sep = ",", quote = FALSE)
}
if (length(sys.frames()) == 0) {
main(commandArgs(TRUE))
}
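To make the multiple-reports-per-client expansion concrete, here is a tiny
standalone sketch (editor's illustration, values made up) of the row
duplication that main() performs above:

  reports <- cbind(1:3, c("v2", "v1", "v3"))            # 3 clients
  expanded <- reports[rep(1:nrow(reports), each = 2), ] # 2 reports per client
  # expanded has 6 rows; each client id appears twice with the same value:
  #   "1" "v2"
  #   "1" "v2"
  #   "2" "v1"
  #   ...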

View file

@ -1,194 +0,0 @@
#!/usr/bin/python
#
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2014 Google Inc. All Rights Reserved.
"""Generated simulated input data for RAPPOR."""
import csv
import getopt
import math
import optparse
import os
import random
import sys
import time
def log(msg, *args):
if args:
msg = msg % args
print >>sys.stderr, msg
class RandUniform(object):
"""Returns a value drawn from the uniform distribution."""
def __init__(self, num_unique_values):
self.num_unique_values = num_unique_values
def __call__(self):
return random.randrange(1, self.num_unique_values + 1)
class RandGauss(object):
"""Returns a value drawn from a Gaussian."""
def __init__(self, num_unique_values, dist_param):
self.num_unique_values = num_unique_values
self.stddev = dist_param or float(num_unique_values) / 6
def __call__(self):
mean = float(self.num_unique_values + 1) / 2
while True:
r = random.normalvariate(mean, self.stddev)
value = int(round(r))
# Rejection sampling to cut off Gaussian to within [1, num_unique_values]
if 1 <= value <= self.num_unique_values:
break
return value # true client value
class RandExp(object):
"""Returns a value drawn from an exponential distribution."""
def __init__(self, num_unique_values, dist_param):
self.num_unique_values = num_unique_values
self.lambd = dist_param or 1
def __call__(self):
while True:
r = random.expovariate(self.lambd)
value = int(round(r))
# Rejection sampling to drop outputs outside [1, num_unique_values]
if 1 <= value <= self.num_unique_values:
break
return value # true client value
class RandZipf(object):
"""Returns a value drawn from a Zipf distribution."""
def __init__(self, num_unique_values, dist_param):
self.num_unique_values = num_unique_values
self.alpha = dist_param or 1
def __call__(self):
while True:
r = random.paretovariate(self.alpha) # Zipf is a discrete Pareto
value = int(round(r))
# Rejection sampling to drop outputs outside [1, num_unique_values]
if 1 <= value <= self.num_unique_values:
break
return value # true client value
def CreateOptionsParser():
p = optparse.OptionParser()
# This will be used for the C++ client
p.add_option(
'-l', type='int', metavar='INT', dest='num_lines', default=0,
help='Instead of a CSV file, output a text file with a value on each '
'line, and this number of lines.')
choices = ['exp', 'gauss', 'unif', 'zipf1', 'zipf1.5']
p.add_option(
'-d', type='choice', dest='dist', default='exp', choices=choices,
help='Distribution to draw values from (%s)' % '|'.join(choices))
p.add_option(
'-u', type='int', metavar='INT',
dest='num_unique_values', default=100,
help='Number of unique values to generate.')
p.add_option(
'-c', type='int', metavar='INT', dest='num_clients', default=100000,
help='Number of clients.')
p.add_option(
'-v', type='int', metavar='INT', dest='values_per_client', default=1,
help='Number of values to generate per client.')
p.add_option(
'-p', type='float', metavar='FLOAT', dest='dist_param', default=None,
help='Parameter to distribution. Ignored for uniform; Std-dev '
'for Gaussian; Lambda for Exponential; Alpha for Zipf.')
return p
def main(argv):
(opts, argv) = CreateOptionsParser().parse_args(argv)
if opts.num_unique_values < 2:
raise RuntimeError('-u should be at least 2.')
if opts.num_clients < 10:
raise RuntimeError("RAPPOR won't work with fewer than 10 clients")
random.seed()
# Choose a function that yields the desired distribution. Each of these
# functions returns a randomly sampled integer between 1 and
# opts.num_unique_values.
if opts.dist == 'unif':
rand_sample = RandUniform(opts.num_unique_values)
elif opts.dist == 'gauss':
rand_sample = RandGauss(opts.num_unique_values, opts.dist_param)
elif opts.dist == 'exp':
rand_sample = RandExp(opts.num_unique_values, opts.dist_param)
elif opts.dist == 'zipf1':
rand_sample = RandZipf(opts.num_unique_values, 1.0)
elif opts.dist == 'zipf1.5':
rand_sample = RandZipf(opts.num_unique_values, 1.5)
else:
raise AssertionError(opts.dist)
start_time = time.time()
# Printing values into file OUTFILE
f = sys.stdout
if opts.num_lines: # line mode, not writing the client column
for i in xrange(opts.num_lines):
if i % 10000 == 0:
elapsed = time.time() - start_time
log('Generated %d rows in %.2f seconds', i, elapsed)
true_value = 'v%d' % rand_sample()
print >>f, true_value
else: # csv mode
c = csv.writer(f)
c.writerow(('client', 'true_value'))
for i in xrange(1, opts.num_clients + 1):
if i % 10000 == 0:
elapsed = time.time() - start_time
log('Generated %d rows in %.2f seconds', i, elapsed)
# A fixed number of values per user
for _ in xrange(opts.values_per_client):
true_value = 'v%d' % rand_sample()
c.writerow((i, true_value))
if __name__ == "__main__":
try:
main(sys.argv)
except RuntimeError, e:
print >>sys.stderr, e.args[0]
sys.exit(1)

View file

@ -9,18 +9,6 @@ import re
import sys
# Simulation parameters and result metrics.
EMPTY_ROW = """\
<tr>
<td>
%(name)s
</td>
<td colspan = 3>
missing
</td>
</tr>
"""
SUMMARY_ROW = """\
<tfoot style="font-weight: bold; text-align: right">
<tr>
@ -84,10 +72,10 @@ def FormatFloat(x, percent):
return '{:.3f}'.format(x)
def FormatEstimate(m_std_error, percent=False):
def FormatMeanWithSem(m_std_error, percent=False):
"""Formats an estimate with standard error."""
if m_std_error is None:
return ""
return ''
m, std_error = m_std_error
if std_error is None:
return FormatFloat(m, percent)
@ -101,14 +89,18 @@ def Mean(l):
"""Computes the mean (average) for a list of numbers."""
if l:
return float(sum(l)) / len(l)
else:
return None
def SampleVar(l):
"""Computes the sample variance for a list of numbers."""
if len(l) > 1:
mean = Mean(l)
var = sum([(x - mean)**2 for x in l]) / (len(l) - 1)
var = sum([(x - mean) ** 2 for x in l]) / (len(l) - 1)
return var
else:
return None
def StandardErrorEstimate(l):
@ -117,56 +109,26 @@ def StandardErrorEstimate(l):
For a singleton the standard error is assumed to be 10% of its value.
"""
if len(l) > 1:
return (SampleVar(l) / len(l))**.5
return (SampleVar(l) / len(l)) ** .5
elif l:
return l[0] / 10.0
def WeightedAverageOfAverages(list_of_lists, cap):
"""Computes the average of averages, weighted by accuracy.
Given a list of lists of numbers, computes a weighted average of averages
together with the standard error of the estimate. Contribution from each list
is weighted inversely to the standard error of its sample mean. (Sublists
with lower accuracy contribute less to the total.) The cap limits the weight
of any one list.
Args:
list_of_lists: A list of lists of floats.
cap: Limit on any one list's weight.
Returns:
A pair of floats - average and its standard error.
"""
l = [sublist for sublist in list_of_lists if sublist]
if not l:
else:
return None
total = 0
total_weights = 0
total_sem = 0 # SEM - Standard Error of the Mean
for sublist in l:
std_error = StandardErrorEstimate(sublist)
weight = 1 / std_error if std_error > 1.0 / cap else cap
total += Mean(sublist) * weight
total_weights += weight
total_sem += std_error**2 * weight**2 # == 1 when the weight is < cap
std_error_estimate = total_sem**.5 / total_weights
return total / total_weights, std_error_estimate
def AverageOfAverages(list_of_lists):
def MeanOfMeans(dict_of_lists):
"""Returns the average of averages with the standard error of the estimate.
"""
means = [Mean(l) for l in list_of_lists if l]
means = [Mean(dict_of_lists[key]) for key in dict_of_lists
if dict_of_lists[key]]
if means:
# Compute variances of the estimate for each sublist.
se = [StandardErrorEstimate(l)**2 for l in list_of_lists if l]
se = [StandardErrorEstimate(dict_of_lists[key]) ** 2 for key
in dict_of_lists if dict_of_lists[key]]
return (Mean(means), # Mean over all sublists
sum(se)**.5 / len(se)) # Standard deviation of the mean
sum(se) ** .5 / len(se)) # Standard deviation of the mean
else:
return None
def ParseSpecFile(spec_filename):
@ -179,8 +141,6 @@ def ParseSpecFile(spec_filename):
with open(spec_filename) as s:
spec_row = s.readline().split()
spec_row.pop(1) # drop the run_id (must be 1 if correctly generated)
# Second to last column is 'num_additional' -- the number of bogus
# candidates added
num_additional = int(spec_row[-2])
@ -190,39 +150,37 @@ def ParseSpecFile(spec_filename):
return num_additional, spec_in_html
def ParseLogFile(log_filename):
def ExtractTime(log_filename):
"""Extracts the elapsed time information from the log file.
Returns:
A float or None in case of failure.
Elapsed time (in seconds) or None in case of failure.
"""
if os.path.isfile(log_filename):
with open(log_filename) as log:
log_str = log.read()
match = re.search(r'took ([0-9.]+) seconds', log_str)
# Matching a line output by analyze.R.
match = re.search(r'Inference took ([0-9.]+) seconds', log_str)
if match:
return float(match.group(1))
return None
def ParseMetrics(report_dir, num_additional, metrics_lists):
def ParseMetrics(metrics_file, log_file, num_additional):
"""Processes the metrics file.
Args:
report_dir: A directory name containing metrics.csv and log.txt.
num_additional: A number of bogus candidates added to the candidate list.
metrics_lists: A dictionary containing lists (one for each metric) of
lists (one for each test case) of metrics (one for each test run).
Returns:
Part of the report row formatted in HTML. metrics_lists is updated with
new metrics.
"""
metrics_filename = os.path.join(report_dir, 'metrics.csv')
with open(metrics_filename) as m:
Returns a pair:
- A dictionary of metrics (some can be []).
- An HTML-formatted portion of the report row.
"""
with open(metrics_file) as m:
m.readline()
metrics_row = m.readline().split(',')
# Format numbers and sum
(num_actual, num_rappor, num_false_pos, num_false_neg, total_variation,
allocated_mass) = metrics_row
@ -235,8 +193,7 @@ def ParseMetrics(report_dir, num_additional, metrics_lists):
total_variation = float(total_variation)
allocated_mass = float(allocated_mass)
log_filename = os.path.join(report_dir, 'log.txt')
elapsed_time = ParseLogFile(log_filename)
elapsed_time = ExtractTime(log_file)
# e.g. if there are 20 additional candidates added, and 1 false positive,
# the false positive rate is 5%.
@ -249,75 +206,86 @@ def ParseMetrics(report_dir, num_additional, metrics_lists):
metrics_row_str = [
str(num_actual),
str(num_rappor),
'%.1f%% (%d)' % (fp_rate * 100, num_false_pos)
if num_additional else '',
'%.1f%% (%d)' % (fp_rate * 100, num_false_pos) if num_additional else '',
'%.1f%% (%d)' % (fn_rate * 100, num_false_neg),
'%.3f' % total_variation,
'%.3f' % allocated_mass,
'%.2f' % elapsed_time if elapsed_time is not None else '',
]
if num_additional:
metrics_lists['fpr'][-1].append(fp_rate)
metrics_lists['fnr'][-1].append(fn_rate)
metrics_lists['tv'][-1].append(total_variation)
metrics_lists['am'][-1].append(allocated_mass)
if elapsed_time is not None:
metrics_lists['time'][-1].append(elapsed_time)
metrics_row_dict = {
'tv': [total_variation],
'fpr': [fp_rate] if num_additional else [],
'fnr': [fn_rate],
'am': [allocated_mass],
'time': [elapsed_time] if elapsed_time is not None else [],
}
# return metrics formatted as HTML table entries
return ' '.join('<td>%s</td>' % cell for cell in metrics_row_str)
return (metrics_row_dict,
' '.join('<td>%s</td>' % cell for cell in metrics_row_str))
def FormatRowName(case_name, run_id_str, metrics_name, link_to_plots):
"""Outputs an HTML table entry.
def FormatCell1(test_case, test_instance, metrics_file, log_file, plot_file,
link_to_plots):
"""Outputs an HTML table entry for the first cell of the row.
The row is filled if the metrics file exists. The first cell contains a link
that, for short tables, points to an inline plot and, for large tables, to an
external file.
If the metrics file is missing, the link points to the log file (if one
exists).
"""
relpath_report = '{}/{}_report'.format(case_name, run_id_str)
if os.path.isfile(metrics_name):
relpath_report = '{}/{}_report'.format(test_case, test_instance)
if os.path.isfile(metrics_file):
external_file = plot_file
if link_to_plots:
link = '#' + case_name + '_' + run_id_str # anchor
link = '#{}_{}'.format(test_case, test_instance) # anchor
else:
link = relpath_report + '/' + 'dist.png'
else: # no results likely due to an error, puts a link to the log file
external_file = log_file
link = relpath_report + '/' + 'log.txt'
return '<td><a href="{}">{}</a></td>'.format(link, case_name)
if os.path.isfile(external_file):
return '<td><a href="{}">{}</a></td>'.format(link, test_case)
else: # if no file to link to
return '<td>{}</td>'.format(test_case)
def FormatSummaryRows(metrics_lists):
"""Outputs an HTML-formatted summary row.
"""
def FormatSummaryRow(metrics_lists):
"""Outputs an HTML-formatted summary row."""
means_with_sem = {} # SEM - standard error of the mean
for key in metrics_lists:
means_with_sem[key] = AverageOfAverages(metrics_lists[key])
means_with_sem[key] = MeanOfMeans(metrics_lists[key])
# If none of the lists is longer than one element, drop the SEM component.
if means_with_sem[key] and max([len(l) for l in metrics_lists[key].values()]) < 2:
means_with_sem[key] = [means_with_sem[key][0], None]
summary = {
'name': 'Means',
'mean_fpr': FormatEstimate(means_with_sem['fpr'], percent=True),
'mean_fnr': FormatEstimate(means_with_sem['fnr'], percent=True),
'mean_tv': FormatEstimate(means_with_sem['tv'], percent=True),
'mean_am': FormatEstimate(means_with_sem['am'], percent=True),
'mean_time': FormatEstimate(means_with_sem['time']),
'mean_fpr': FormatMeanWithSem(means_with_sem['fpr'], percent=True),
'mean_fnr': FormatMeanWithSem(means_with_sem['fnr'], percent=True),
'mean_tv': FormatMeanWithSem(means_with_sem['tv'], percent=True),
'mean_am': FormatMeanWithSem(means_with_sem['am'], percent=True),
'mean_time': FormatMeanWithSem(means_with_sem['time']),
}
return SUMMARY_ROW % summary
def FormatPlots(base_dir, test_instances):
"""Outputs HTML-formatted plots.
"""
"""Outputs HTML-formatted plots."""
result = ''
for instance in test_instances:
# A test instance is idenfied by the test name and the test run.
case_name, run_id_str = instance.split(' ')
instance_dir = case_name + '/' + run_id_str + '_report'
# A test instance is identified by the test name and the test run.
test_case, test_instance, _ = instance.split(' ')
instance_dir = test_case + '/' + test_instance + '_report'
if os.path.isfile(os.path.join(base_dir, instance_dir, 'dist.png')):
result += DETAILS % {'anchor': case_name + '_' + run_id_str,
'name': '{} (instance {})'.format(case_name,
run_id_str),
result += DETAILS % {'anchor': test_case + '_' + test_instance,
'name': '{} (instance {})'.format(test_case,
test_instance),
'instance_dir': instance_dir}
return result
@ -327,16 +295,19 @@ def main(argv):
# This file has the test case names, in the order that they should be
# displayed.
path = os.path.join(base_dir, 'test-cases.txt')
path = os.path.join(base_dir, 'test-instances.txt')
with open(path) as f:
test_instances = [line.strip() for line in f]
metrics_lists = {
'tv': [], # total_variation for all test cases
'fpr': [], # list of false positive rates
'fnr': [], # list of false negative rates
'am': [], # list of total allocated masses
'time': [], # list of total elapsed time measurements
# Metrics are assembled into a dictionary of dictionaries. The top-level
# key is the metric name ('tv', 'fpr', etc.), the second level key is
# the test case. These keys reference a list of floats, which can be empty.
metrics = {
'tv': {}, # total_variation for all test cases
'fpr': {}, # dictionary of false positive rates
'fnr': {}, # dictionary of false negative rates
'am': {}, # dictionary of total allocated masses
'time': {}, # dictionary of total elapsed time measurements
}
# If there are too many tests, the plots are not included in the results
@ -345,36 +316,39 @@ def main(argv):
for instance in test_instances:
# A test instance is identified by the test name and the test run.
case_name, run_id_str = instance.split(' ')
# if this is the first run of a test case, start anew
if run_id_str == '1':
for metric in metrics_lists:
metrics_lists[metric].append([])
test_case, test_instance, _ = instance.split(' ')
spec = os.path.join(base_dir, case_name, 'spec.txt')
if os.path.isfile(spec):
num_additional, row_spec = ParseSpecFile(spec)
spec_file = os.path.join(base_dir, test_case, 'spec.txt')
if not os.path.isfile(spec_file):
raise RuntimeError('{} is missing'.format(spec_file))
report_dir = os.path.join(base_dir, case_name, run_id_str + '_report')
if os.path.isdir(report_dir):
metrics = os.path.join(report_dir, 'metrics.csv')
num_additional, spec_html = ParseSpecFile(spec_file)
metrics_html = '' # will be filled in later on, if metrics exist
row_name = FormatRowName(case_name, run_id_str, metrics, include_plots)
report_dir = os.path.join(base_dir, test_case, test_instance + '_report')
if os.path.isfile(metrics):
# ParseMetrics outputs an HTML table row and also updates lists
row_metrics = ParseMetrics(report_dir, num_additional, metrics_lists)
metrics_file = os.path.join(report_dir, 'metrics.csv')
log_file = os.path.join(report_dir, 'log.txt')
plot_file = os.path.join(report_dir, 'dist.png')
cell1_html = FormatCell1(test_case, test_instance, metrics_file, log_file,
plot_file, include_plots)
if os.path.isfile(metrics_file):
# ParseMetrics returns a dictionary of metric values and an HTML table row
metrics_dict, metrics_html = ParseMetrics(metrics_file, log_file,
num_additional)
# Update the metrics structure. Initialize dictionaries if necessary.
for m in metrics:
if not test_case in metrics[m]:
metrics[m][test_case] = metrics_dict[m]
else:
row_metrics = ''
else:
row_name = '<td>{}<td>'.format(case_name)
row_metrics = ''
metrics[m][test_case] += metrics_dict[m]
print '<tr>{}{}{}</tr>'.format(row_name, row_spec, row_metrics)
else:
print EMPTY_ROW % {'name': case_name}
print '<tr>{}{}{}</tr>'.format(cell1_html, spec_html, metrics_html)
print FormatSummaryRows(metrics_lists)
print FormatSummaryRow(metrics)
print '</tbody>'
print '</table>'
@ -384,8 +358,8 @@ def main(argv):
if include_plots:
print FormatPlots(base_dir, test_instances)
else:
print '<p>Too many tests to include plots.\
Click links within rows for details.</p>'
print ('<p>Too many tests to include plots. '
'Click links within rows for details.</p>')
if __name__ == '__main__':

View file

@ -99,30 +99,11 @@ def CreateOptionsParser():
return p
def print_params(params, csv_out, json_out):
"""Print Rappor parameters to a text file."""
c = csv.writer(csv_out)
c.writerow(('k', 'h', 'm', 'p', 'q', 'f')) # header
row = (
params.num_bloombits,
params.num_hashes,
params.num_cohorts,
params.prob_p,
params.prob_q,
params.prob_f)
c.writerow(row)
print >>json_out, params.to_json()
def make_histogram(csv_in):
"""Make a histogram of the simulated input file."""
# TODO: It would be better to share parsing with rappor_encode()
counter = collections.Counter()
for i, (_, word) in enumerate(csv_in):
if i == 0:
continue
for (_, word) in csv_in:
counter[word] += 1
return dict(counter.most_common())
@ -171,14 +152,6 @@ def main(argv):
outfile = prefix + "_out.csv"
histfile = prefix + "_hist.csv"
true_inputs_file = prefix + "_true_inputs.txt"
params_csv = prefix + "_params.csv"
params_json = prefix + '_params.json'
# Print parameters to parameters file -- needed for the R analysis tool.
with open(params_csv, 'w') as csv_out:
with open(params_json, 'w') as json_out:
print_params(params, csv_out, json_out)
with open(opts.infile) as f:
csv_in = csv.reader(f)
@ -190,12 +163,6 @@ def main(argv):
all_words = sorted(word_hist) # unique words
# Print all true values, one per line. This file can be further processed to
# simulate inaccurate candidate lists.
with open(true_inputs_file, 'w') as f:
for word in all_words:
print >>f, word
rand = random.Random() # default Mersenne Twister randomness
#rand = random.SystemRandom() # cryptographic randomness from OS
@ -228,9 +195,6 @@ def main(argv):
start_time = time.time()
for i, (client, true_value) in enumerate(csv_in):
if i == 0:
continue # skip header line
if i % 10000 == 0:
elapsed = time.time() - start_time
log('Processed %d inputs in %.2f seconds', i, elapsed)

View file

@ -14,6 +14,17 @@ import sys
# TEST CONFIGURATION
#
DEMO = (
# (case_name distr num_unique_values num_clients values_per_client)
# (num_bits num_hashes num_cohorts)
# (p q f) (num_additional regexp_to_remove)
('demo1 unif 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'),
('demo2 gauss 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'),
('demo3 exp 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'),
('demo4 zipf1 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'),
('demo5 zipf1.5 100 10000 10', '16 2 64', '0.1 0.9 0.2', '10 v[0-9]*9$'),
)
DISTRIBUTIONS = (
'unif',
'exp',
@ -24,7 +35,8 @@ DISTRIBUTIONS = (
DISTRIBUTION_PARAMS = (
# name, num unique values, num clients, values per client
('small', 100, 1000000, 1),
('tiny', 100, 1000, 1), # test for insufficient data
('small', 100, 100000, 1),
('medium', 1000, 10000000, 1),
('large', 10000, 100000000, 1),
)
@ -39,9 +51,8 @@ BLOOMFILTER_PARAMS = {
# 'p, q, f' as in params file.
PRIVACY_PARAMS = {
'eps_1_1': (0.44, 0.56, 0), # eps_1 = 1, no eps_inf
'eps_1_1': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5:
'eps_1_5': (0.225, 0.775, 0.0), # eps_1 = 5, no eps_inf
'eps_inf_5': (0.39, 0.61, 0.45), # eps_1 = 1, eps_inf = 5:
}
# For deriving candidates from true inputs.
@ -55,54 +66,42 @@ MAP_REGEX_MISSING = {
# regex missing)
TEST_CONFIGS = [
('typical', '128x128', 'eps_1_1', .2, '10%'),
('categorical', '128x128', 'eps_1_1', .0, 'sharp'), # no extra candidates
('sharp', '128x128', 'eps_1_1', .0, 'sharp'), # no extra candidates
('loose', '128x128', 'eps_1_5', .2, '10%'), # loose privacy
('over_x2', '128x128', 'eps_1_1', 2.0, '10%'), # overshoot by x2
('over_x10', '128x128', 'eps_1_1', 10.0, '10%'), # overshoot by x10
]
DEMO_CASES = [
# The 5 cases run by the demo.sh script
('demo-small-exp', 'exp_a', '8x128', 'eps_1_1', 20, '10%'),
('demo-small-gauss', 'gauss_a', '8x128', 'eps_1_1', 20, '10%'),
]
#
# END TEST CONFIGURATION
#
def CreateOptionsParser():
p = optparse.OptionParser()
p.add_option(
'-r', dest='runs', metavar='INT', type='int', default=1,
help='Number of runs for each test.')
return p
def main(argv):
(opts, argv) = CreateOptionsParser().parse_args(argv)
rows = []
test_case = []
for (distr_params, num_values, num_clients,
num_reports_per_client) in DISTRIBUTION_PARAMS:
for (distr_params, num_values, num_clients,
num_reports_per_client) in DISTRIBUTION_PARAMS:
for distribution in DISTRIBUTIONS:
for (config_name, bloom_name, privacy_params, fr_extra,
regex_missing) in TEST_CONFIGS:
test_name = '{}-{}-{}'.format(distribution,
distr_params, config_name)
for (config_name, bloom_name, privacy_params, fr_extra,
regex_missing) in TEST_CONFIGS:
test_name = 'r-{}-{}-{}'.format(distribution, distr_params,
config_name)
params = (BLOOMFILTER_PARAMS[bloom_name]
+ PRIVACY_PARAMS[privacy_params]
+ tuple([int(num_values * fr_extra)])
+ tuple([MAP_REGEX_MISSING[regex_missing]]))
+ PRIVACY_PARAMS[privacy_params]
+ tuple([int(num_values * fr_extra)])
+ tuple([MAP_REGEX_MISSING[regex_missing]]))
for r in range(1, opts.runs + 1):
test_run = (test_name, r, distribution, num_values, num_clients,
num_reports_per_client)
row_str = [str(element) for element in test_run + params]
rows.append(row_str)
test_case = (test_name, distribution, num_values, num_clients,
num_reports_per_client) + params
row_str = [str(element) for element in test_case]
rows.append(row_str)
for params in DEMO:
rows.append(params)
print >>sys.stderr, params
for row in rows:
print ' '.join(row)