Use multiprocessing in RAPPOR, and write the data needed to generate plots to a CSV file

This commit is contained in:
Alejandro Rodriguez Salamanca 2017-08-09 15:17:53 +02:00
Parent ea46c460af
Commit 32f7890c49
5 changed files: 102 additions and 26 deletions

View file

@ -26,6 +26,7 @@ import json
import struct
import sys
import os
import math
from random import SystemRandom
from hmac_drbg import HMAC_DRBG
@ -198,8 +199,15 @@ def get_prr_masks(secret, word, prob_f, num_bits):
#log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())
# Now go through each byte
digest_bytes = h.generate(num_bits)
assert len(digest_bytes) == num_bits
digest_bytes = []
iters = int(math.ceil(num_bits / 900.))
for i in range(iters):
if i == iters - 1:
digest_bytes += h.generate(num_bits % 900)
else:
digest_bytes += h.generate(900)
#assert len(digest_bytes) == num_bits
# Use 32 bits. If we want 64 bits, it may be fine to generate another 32
# bytes by repeated HMAC. For arbitrary numbers of bytes it's probably

View file

@ -167,7 +167,7 @@ _run-one-instance() {
< $case_dir/spec.txt
local instance_dir=$case_dir/$test_instance
mkdir -p $instance_dir
#mkdir -p $instance_dir
banner "Generating reports (gen_reports.R)"

View file

@ -236,6 +236,12 @@ WriteSummary <- function(metrics, outdir) {
Log('Wrote %s', filename)
}
WritePlotData <- function(plot_data, outdir) {
  # Persist the plot data frame as CSV under outdir so the plot can be
  # regenerated (or re-styled) later without re-running the analysis.
  out_path <- file.path(outdir, 'plot_data.csv')
  write.csv(plot_data, file = out_path, row.names = FALSE)
  Log('Wrote %s', out_path)
}
main <- function(parsed) {
args <- parsed$args
options <- parsed$options
@ -261,6 +267,8 @@ main <- function(parsed) {
WriteSummary(d$metrics, output_dir)
WritePlot(p, output_dir)
WritePlotData(d$plot_data, output_dir)
}
if (is_main) {

View file

@ -35,7 +35,7 @@ import os
import random
import sys
import time
from multiprocessing import Pool, Lock
import rappor # client library
try:
import fastrand
@ -44,7 +44,6 @@ except ImportError:
"Native fastrand module not imported; see README for speedups")
fastrand = None
def log(msg, *args):
if args:
msg = msg % args
@ -143,7 +142,7 @@ def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
irr2_str = rappor.bit_string(irr2, params2.num_bloombits)
csv_out.writerow((client_str, cohort, irr1_str, irr2_str))
print "NEVER"
report_index += 1
@ -155,7 +154,10 @@ def RapporClientSim(params, irr_rand, csv_in, csv_out, max_lines):
# TODO: It would be more instructive/efficient to construct an encoder
# instance up front per client, rather than one per row below.
start_time = time.time()
lock = Lock()
poolsize = 8
pool = Pool(processes=poolsize, initializer=init, initargs=(lock,))
args = []
for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
if i > max_lines:
break
@ -178,6 +180,22 @@ def RapporClientSim(params, irr_rand, csv_in, csv_out, max_lines):
cohort = int(cohort_str)
secret = client_str
args.append((params, cohort, secret, irr_rand, true_value))
rows = pool.map(create_report, args)
for r in rows:
csv_out.writerow(r)
def create_report(input):
params = input[0]
cohort = input[1]
secret = input[2]
irr_rand = input[3]
true_value = input[4]
#csv_out = input[4]
e = rappor.Encoder(params, cohort, secret, irr_rand)
# Real users should call e.encode(). For testing purposes, we also want
@ -188,10 +206,19 @@ def RapporClientSim(params, irr_rand, csv_in, csv_out, max_lines):
prr_str = rappor.bit_string(prr, params.num_bloombits)
irr_str = rappor.bit_string(irr, params.num_bloombits)
out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str)
csv_out.writerow(out_row)
#with lock:
# out_row = (secret, str(cohort), bloom_str, prr_str, irr_str)
# csv_out.writerow(out_row)
out_row = (secret, str(cohort), bloom_str, prr_str, irr_str)
return out_row
#return out_row
def init(l):
  """Pool worker initializer.

  Stores the shared multiprocessing Lock in a module-level global so each
  worker process can reference it (a Lock cannot be passed to workers via
  Pool.map arguments, so it is injected via initargs instead).
  """
  global lock
  lock = l
def main(argv):
(opts, argv) = CreateOptionsParser().parse_args(argv)
@ -212,7 +239,7 @@ def main(argv):
if opts.random_mode == 'simple':
irr_rand = rappor.SecureIrrRand(params)
elif opts.random_mode == 'fast':
if fastrand:
if False:
log('Using fastrand extension')
# NOTE: This doesn't take 'rand'. It's seeded in C with srand().
irr_rand = fastrand.FastIrrRand(params)
@ -228,6 +255,7 @@ def main(argv):
# - or srand(0) might do it.
csv_in = csv.reader(sys.stdin)
global csv_out
csv_out = csv.writer(sys.stdout)
if opts.assoc_testdata:

View file

@ -35,11 +35,13 @@ DISTRIBUTIONS = (
DISTRIBUTION_PARAMS = (
# name, num unique values, num clients, values per client
('tiny', 100, 1000, 1), # test for insufficient data
('tiny', 100, 10000, 1), # test for insufficient data
('small', 100, 1000000, 1),
('medium', 1000, 10000000, 1),
('medium2', 100, 10000000, 1),
('medium3', 10000, 10000000, 1),
('large', 10000, 100000000, 1),
# Params for testing how varying the number of clients affects the results
('clients1', 100, 10000000, 1),
('clients2', 100, 1000000, 1),
@ -74,6 +76,9 @@ DISTRIBUTION_PARAMS = (
('unique16', 1000, 10000000, 1),
('unique17', 2000, 10000000, 1),
('unique18', 5000, 10000000, 1),
('cohort', 10000, 10000000, 1),
)
# 'k, h, m' as in params file.
@ -83,9 +88,10 @@ BLOOMFILTER_PARAMS = {
'8x128x2': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each
'128x8x2': (128, 2, 8), # 8 cohorts, 128 bits each, 2 bits set in each
'32x64x1': (32, 1, 64), # 64 cohorts, 32 bit each, 1 bits set in each
'32x2x1': (32, 1, 2), # 1 cohort, 32 bit each, 1 bits set in each
'32x2x1': (32, 1, 2), # 2 cohort, 32 bit each, 1 bits set in each
'32x64x2': (32, 2, 64), # 64 cohorts, 32 bit each, 1 bits set in each
'10000x200x2': (128, 2, 100), # 64 cohorts, 32 bit each, 1 bits set in each
# params for testing the size of the bloom filter
'4x32x2': (4, 2, 32), # 32 cohorts, 4 bits each, 2 bits set in each
@ -113,27 +119,33 @@ BLOOMFILTER_PARAMS = {
'8x256x2': (8, 2, 256),
# with different number of hash functions
'4x32x4': (4, 4, 32), # 32 cohorts, 4 bits each, 2 bits set in each
'8x32x4': (8, 4, 32), # 32 cohorts, 8 bits each, 2 bits set in each
'16x32x4': (16, 4, 32), # 32 cohorts, 16 bits each, 2 bits set in each
'32x32x4': (32, 4, 32), # 32 cohorts, 32 bits each, 2 bits set in each
'64x32x4': (64, 4, 32), # 32 cohorts, 64 bits each, 2 bits set in each
'128x32x4': (128, 4, 32), # 32 cohorts, 128 bits each, 2 bits set in each
'256x32x4': (256, 4, 32), # 32 cohorts, 256 bits each, 2 bits set in each
'4x32x4': (4, 4, 32), # 32 cohorts, 4 bits each, 4 bits set in each
'8x32x4': (8, 4, 32), # 32 cohorts, 8 bits each, 4 bits set in each
'16x32x4': (16, 4, 32), # 32 cohorts, 16 bits each, 4 bits set in each
'32x32x4': (32, 4, 32), # 32 cohorts, 32 bits each, 4 bits set in each
'64x32x4': (64, 4, 32), # 32 cohorts, 64 bits each, 4 bits set in each
'128x32x4': (128, 4, 32), # 32 cohorts, 128 bits each, 4 bits set in each
'256x32x4': (256, 4, 32), # 32 cohorts, 256 bits each, 4 bits set in each
#
'4x128x4': (4, 4, 128), # 128 cohorts, 4 bits each, 2 bits set in each
'8x128x4': (8, 4, 128), # 128 cohorts, 8 bits each, 2 bits set in each
'16x128x4': (16, 4, 128), # 128 cohorts, 16 bits each, 2 bits set in each
'32x128x4': (32, 4, 128), # 128 cohorts, 32 bits each, 2 bits set in each
'64x128x4': (64, 4, 128), # 128 cohorts, 64 bits each, 2 bits set in each
'128x128x4': (128, 4, 128), # 128 cohorts, 128 bits each, 2 bits set in each
'256x128x4': (256, 4, 128), # 128 cohorts, 256 bits each, 2 bits set in each
'4x128x4': (4, 4, 128), # 128 cohorts, 4 bits each, 4 bits set in each
'8x128x4': (8, 4, 128), # 128 cohorts, 8 bits each, 4 bits set in each
'16x128x4': (16, 4, 128), # 128 cohorts, 16 bits each, 4 bits set in each
'32x128x4': (32, 4, 128), # 128 cohorts, 32 bits each, 4 bits set in each
'64x128x4': (64, 4, 128), # 128 cohorts, 64 bits each, 4 bits set in each
'128x128x4': (128, 4, 128), # 128 cohorts, 128 bits each, 4 bits set in each
'256x128x4': (256, 4, 128), # 128 cohorts, 256 bits each, 4 bits set in each
# params for testing the number of hash functions
'8x128x1' : (8, 1, 128),
'8x128x4' : (8, 4, 128),
'8x128x8' : (8, 8, 128),
'8x128x16' : (8, 16, 128),
'256x128x1':(256, 1, 128),
'256x128x4':(256, 4, 128),
'256x128x8':(256, 8, 128),
'256x128x16':(256, 16, 128),
}
# 'p, q, f' as in params file.
@ -160,6 +172,8 @@ PRIVACY_PARAMS = {
#
'params12': (0.5, 0.75, 0.75),
'params13': (0.25, 0.75, 0.5),
'params14': (0.35, 0.65, 0.0),
}
# For deriving candidates from true inputs.
@ -232,6 +246,13 @@ TEST_CONFIGS = [
('sim_hash1_4', '8x128x8', 'params6', .0, 'sharp'),
('sim_hash1_5', '8x128x16', 'params7', .0, 'sharp'),
('sim_hash2_1', '256x128x1', 'params3', .0, 'sharp'),
('sim_hash2_2', '256x128x2', 'params4', .0, 'sharp'),
('sim_hash2_3', '256x128x4', 'params5', .0, 'sharp'),
('sim_hash2_4', '256x128x8', 'params6', .0, 'sharp'),
('sim_hash2_5', '256x128x16', 'params7', .0, 'sharp'),
# configuration for testing the number of cohorts
('sim_cohort1_1', '8x2x2', 'params3', .0, 'sharp'),
('sim_cohort1_2', '8x4x2', 'params3', .0, 'sharp'),
@ -242,6 +263,14 @@ TEST_CONFIGS = [
('sim_cohort1_7', '8x128x2', 'params3', .0, 'sharp'),
('sim_cohort1_8', '8x256x2', 'params3', .0, 'sharp'),
('sim_cohort2_1', '8x2x2', 'params3', .0, 'sharp'),
('sim_cohort2_2', '8x4x2', 'params3', .0, 'sharp'),
('sim_cohort2_3', '8x8x2', 'params3', .0, 'sharp'),
('sim_cohort2_4', '8x16x2', 'params3', .0, 'sharp'),
('sim_cohort2_5', '8x32x2', 'params3', .0, 'sharp'),
('sim_cohort2_6', '8x64x2', 'params3', .0, 'sharp'),
('sim_cohort2_7', '8x128x2', 'params3', .0, 'sharp'),
('sim_cohort2_8', '8x256x2', 'params3', .0, 'sharp'),
# configuration for testing different probabilities p, q, f
('sim_probs1_1', '8x128x2', 'params3', .0, 'sharp'),
('sim_probs1_2', '8x128x2', 'params8', .0, 'sharp'),
@ -253,6 +282,9 @@ TEST_CONFIGS = [
('sim_case_scenario_1', '16x128x2', 'params12', .0, 'sharp'),
('sim_case_scenario_2', '16x128x2', 'params13', .0, 'sharp'),
#
('sim_final', '10000x200x2', 'params14', .0, 'sharp'),
]
#