Use multiprocessing in RAPPOR, and write the data needed to generate plots to a CSV file

This commit is contained in:
Alejandro Rodriguez Salamanca 2017-08-09 15:17:53 +02:00
Parent ea46c460af
Commit 32f7890c49
5 changed files: 102 additions and 26 deletions

View file

@ -26,6 +26,7 @@ import json
import struct
import sys
import os
import math
from random import SystemRandom
from hmac_drbg import HMAC_DRBG
@ -198,8 +199,15 @@ def get_prr_masks(secret, word, prob_f, num_bits):
#log('word %s, secret %s, HMAC-SHA256 %s', word, secret, h.hexdigest())
# Now go through each byte
digest_bytes = h.generate(num_bits)
assert len(digest_bytes) == num_bits
digest_bytes = []
iters = int(math.ceil(num_bits / 900.))
for i in range(iters):
if i == iters - 1:
digest_bytes += h.generate(num_bits % 900)
else:
digest_bytes += h.generate(900)
#assert len(digest_bytes) == num_bits
# Use 32 bits. If we want 64 bits, it may be fine to generate another 32
# bytes by repeated HMAC. For arbitrary numbers of bytes it's probably

View file

@ -167,7 +167,7 @@ _run-one-instance() {
< $case_dir/spec.txt
local instance_dir=$case_dir/$test_instance
mkdir -p $instance_dir
#mkdir -p $instance_dir
banner "Generating reports (gen_reports.R)"

View file

@ -236,6 +236,12 @@ WriteSummary <- function(metrics, outdir) {
Log('Wrote %s', filename)
}
WritePlotData <- function(plot_data, outdir) {
  # Persist the plot data frame as CSV under outdir so the plot can be
  # regenerated (or re-styled) later without re-running the analysis.
  out_path <- file.path(outdir, 'plot_data.csv')
  write.csv(plot_data, file = out_path, row.names = FALSE)
  Log('Wrote %s', out_path)
}
main <- function(parsed) {
args <- parsed$args
options <- parsed$options
@ -261,6 +267,8 @@ main <- function(parsed) {
WriteSummary(d$metrics, output_dir)
WritePlot(p, output_dir)
WritePlotData(d$plot_data, output_dir)
}
if (is_main) {

View file

@ -35,7 +35,7 @@ import os
import random
import sys
import time
from multiprocessing import Pool, Lock
import rappor # client library
try:
import fastrand
@ -44,7 +44,6 @@ except ImportError:
"Native fastrand module not imported; see README for speedups")
fastrand = None
def log(msg, *args):
if args:
msg = msg % args
@ -143,7 +142,7 @@ def GenAssocTestdata(params1, params2, irr_rand, assoc_testdata_count,
irr2_str = rappor.bit_string(irr2, params2.num_bloombits)
csv_out.writerow((client_str, cohort, irr1_str, irr2_str))
print "NEVER"
report_index += 1
@ -155,7 +154,10 @@ def RapporClientSim(params, irr_rand, csv_in, csv_out, max_lines):
# TODO: It would be more instructive/efficient to construct an encoder
# instance up front per client, rather than one per row below.
start_time = time.time()
lock = Lock()
poolsize = 8
pool = Pool(processes=poolsize, initializer=init, initargs=(lock,))
args = []
for i, (client_str, cohort_str, true_value) in enumerate(csv_in):
if i > max_lines:
break
@ -178,6 +180,22 @@ def RapporClientSim(params, irr_rand, csv_in, csv_out, max_lines):
cohort = int(cohort_str)
secret = client_str
args.append((params, cohort, secret, irr_rand, true_value))
rows = pool.map(create_report, args)
for r in rows:
csv_out.writerow(r)
def create_report(input):
params = input[0]
cohort = input[1]
secret = input[2]
irr_rand = input[3]
true_value = input[4]
#csv_out = input[4]
e = rappor.Encoder(params, cohort, secret, irr_rand)
# Real users should call e.encode(). For testing purposes, we also want
@ -188,10 +206,19 @@ def RapporClientSim(params, irr_rand, csv_in, csv_out, max_lines):
prr_str = rappor.bit_string(prr, params.num_bloombits)
irr_str = rappor.bit_string(irr, params.num_bloombits)
out_row = (client_str, cohort_str, bloom_str, prr_str, irr_str)
csv_out.writerow(out_row)
#with lock:
# out_row = (secret, str(cohort), bloom_str, prr_str, irr_str)
# csv_out.writerow(out_row)
out_row = (secret, str(cohort), bloom_str, prr_str, irr_str)
return out_row
#return out_row
def init(l):
  """Pool worker initializer.

  Stores the shared multiprocessing Lock in a module-level global so each
  worker process can reference it (a Lock cannot be passed to workers via
  Pool.map arguments, so it is injected via initargs instead).
  """
  global lock
  lock = l
def main(argv):
(opts, argv) = CreateOptionsParser().parse_args(argv)
@ -212,7 +239,7 @@ def main(argv):
if opts.random_mode == 'simple':
irr_rand = rappor.SecureIrrRand(params)
elif opts.random_mode == 'fast':
if fastrand:
if False:
log('Using fastrand extension')
# NOTE: This doesn't take 'rand'. It's seeded in C with srand().
irr_rand = fastrand.FastIrrRand(params)
@ -228,6 +255,7 @@ def main(argv):
# - or srand(0) might do it.
csv_in = csv.reader(sys.stdin)
global csv_out
csv_out = csv.writer(sys.stdout)
if opts.assoc_testdata:

View file

@ -35,11 +35,13 @@ DISTRIBUTIONS = (
DISTRIBUTION_PARAMS = (
# name, num unique values, num clients, values per client
('tiny', 100, 1000, 1), # test for insufficient data
('tiny', 100, 10000, 1), # test for insufficient data
('small', 100, 1000000, 1),
('medium', 1000, 10000000, 1),
('medium2', 100, 10000000, 1),
('medium3', 10000, 10000000, 1),
('large', 10000, 100000000, 1),
# Params for testing how varying the number of clients affects the results
('clients1', 100, 10000000, 1),
('clients2', 100, 1000000, 1),
@ -74,6 +76,9 @@ DISTRIBUTION_PARAMS = (
('unique16', 1000, 10000000, 1),
('unique17', 2000, 10000000, 1),
('unique18', 5000, 10000000, 1),
('cohort', 10000, 10000000, 1),
)
# 'k, h, m' as in params file.
@ -83,9 +88,10 @@ BLOOMFILTER_PARAMS = {
'8x128x2': (8, 2, 128), # 128 cohorts, 8 bits each, 2 bits set in each
'128x8x2': (128, 2, 8), # 8 cohorts, 128 bits each, 2 bits set in each
'32x64x1': (32, 1, 64), # 64 cohorts, 32 bit each, 1 bits set in each
'32x2x1': (32, 1, 2), # 1 cohort, 32 bit each, 1 bits set in each
'32x2x1': (32, 1, 2), # 2 cohort, 32 bit each, 1 bits set in each
'32x64x2': (32, 2, 64), # 64 cohorts, 32 bit each, 1 bits set in each
'10000x200x2': (128, 2, 100), # 64 cohorts, 32 bit each, 1 bits set in each
# params for testing the size of the bloom filter
'4x32x2': (4, 2, 32), # 32 cohorts, 4 bits each, 2 bits set in each
@ -113,27 +119,33 @@ BLOOMFILTER_PARAMS = {
'8x256x2': (8, 2, 256),
# with different number of hash functions
'4x32x4': (4, 4, 32), # 32 cohorts, 4 bits each, 2 bits set in each
'8x32x4': (8, 4, 32), # 32 cohorts, 8 bits each, 2 bits set in each
'16x32x4': (16, 4, 32), # 32 cohorts, 16 bits each, 2 bits set in each
'32x32x4': (32, 4, 32), # 32 cohorts, 32 bits each, 2 bits set in each
'64x32x4': (64, 4, 32), # 32 cohorts, 64 bits each, 2 bits set in each
'128x32x4': (128, 4, 32), # 32 cohorts, 128 bits each, 2 bits set in each
'256x32x4': (256, 4, 32), # 32 cohorts, 256 bits each, 2 bits set in each
'4x32x4': (4, 4, 32), # 32 cohorts, 4 bits each, 4 bits set in each
'8x32x4': (8, 4, 32), # 32 cohorts, 8 bits each, 4 bits set in each
'16x32x4': (16, 4, 32), # 32 cohorts, 16 bits each, 4 bits set in each
'32x32x4': (32, 4, 32), # 32 cohorts, 32 bits each, 4 bits set in each
'64x32x4': (64, 4, 32), # 32 cohorts, 64 bits each, 4 bits set in each
'128x32x4': (128, 4, 32), # 32 cohorts, 128 bits each, 4 bits set in each
'256x32x4': (256, 4, 32), # 32 cohorts, 256 bits each, 4 bits set in each
#
'4x128x4': (4, 4, 128), # 128 cohorts, 4 bits each, 2 bits set in each
'8x128x4': (8, 4, 128), # 128 cohorts, 8 bits each, 2 bits set in each
'16x128x4': (16, 4, 128), # 128 cohorts, 16 bits each, 2 bits set in each
'32x128x4': (32, 4, 128), # 128 cohorts, 32 bits each, 2 bits set in each
'64x128x4': (64, 4, 128), # 128 cohorts, 64 bits each, 2 bits set in each
'128x128x4': (128, 4, 128), # 128 cohorts, 128 bits each, 2 bits set in each
'256x128x4': (256, 4, 128), # 128 cohorts, 256 bits each, 2 bits set in each
'4x128x4': (4, 4, 128), # 128 cohorts, 4 bits each, 4 bits set in each
'8x128x4': (8, 4, 128), # 128 cohorts, 8 bits each, 4 bits set in each
'16x128x4': (16, 4, 128), # 128 cohorts, 16 bits each, 4 bits set in each
'32x128x4': (32, 4, 128), # 128 cohorts, 32 bits each, 4 bits set in each
'64x128x4': (64, 4, 128), # 128 cohorts, 64 bits each, 4 bits set in each
'128x128x4': (128, 4, 128), # 128 cohorts, 128 bits each, 4 bits set in each
'256x128x4': (256, 4, 128), # 128 cohorts, 256 bits each, 4 bits set in each
# params for testing the number of hash functions
'8x128x1' : (8, 1, 128),
'8x128x4' : (8, 4, 128),
'8x128x8' : (8, 8, 128),
'8x128x16' : (8, 16, 128),
'256x128x1':(256, 1, 128),
'256x128x4':(256, 4, 128),
'256x128x8':(256, 8, 128),
'256x128x16':(256, 16, 128),
}
# 'p, q, f' as in params file.
@ -160,6 +172,8 @@ PRIVACY_PARAMS = {
#
'params12': (0.5, 0.75, 0.75),
'params13': (0.25, 0.75, 0.5),
'params14': (0.35, 0.65, 0.0),
}
# For deriving candidates from true inputs.
@ -232,6 +246,13 @@ TEST_CONFIGS = [
('sim_hash1_4', '8x128x8', 'params6', .0, 'sharp'),
('sim_hash1_5', '8x128x16', 'params7', .0, 'sharp'),
('sim_hash2_1', '256x128x1', 'params3', .0, 'sharp'),
('sim_hash2_2', '256x128x2', 'params4', .0, 'sharp'),
('sim_hash2_3', '256x128x4', 'params5', .0, 'sharp'),
('sim_hash2_4', '256x128x8', 'params6', .0, 'sharp'),
('sim_hash2_5', '256x128x16', 'params7', .0, 'sharp'),
# configuration for testing the number of cohorts
('sim_cohort1_1', '8x2x2', 'params3', .0, 'sharp'),
('sim_cohort1_2', '8x4x2', 'params3', .0, 'sharp'),
@ -242,6 +263,14 @@ TEST_CONFIGS = [
('sim_cohort1_7', '8x128x2', 'params3', .0, 'sharp'),
('sim_cohort1_8', '8x256x2', 'params3', .0, 'sharp'),
('sim_cohort2_1', '8x2x2', 'params3', .0, 'sharp'),
('sim_cohort2_2', '8x4x2', 'params3', .0, 'sharp'),
('sim_cohort2_3', '8x8x2', 'params3', .0, 'sharp'),
('sim_cohort2_4', '8x16x2', 'params3', .0, 'sharp'),
('sim_cohort2_5', '8x32x2', 'params3', .0, 'sharp'),
('sim_cohort2_6', '8x64x2', 'params3', .0, 'sharp'),
('sim_cohort2_7', '8x128x2', 'params3', .0, 'sharp'),
('sim_cohort2_8', '8x256x2', 'params3', .0, 'sharp'),
# configuration for testing different probabilities p, q, f
('sim_probs1_1', '8x128x2', 'params3', .0, 'sharp'),
('sim_probs1_2', '8x128x2', 'params8', .0, 'sharp'),
@ -253,6 +282,9 @@ TEST_CONFIGS = [
('sim_case_scenario_1', '16x128x2', 'params12', .0, 'sharp'),
('sim_case_scenario_2', '16x128x2', 'params13', .0, 'sharp'),
#
('sim_final', '10000x200x2', 'params14', .0, 'sharp'),
]
#