From ea46c460af3e17f44fdb0365cab383b0abc48bfe Mon Sep 17 00:00:00 2001 From: Alejandro Rodriguez Salamanca Date: Tue, 1 Aug 2017 14:50:08 +0200 Subject: [PATCH] Clean repo --- .gitignore | 1 - analysis/R/decode_ngrams.R | 377 --------------------------------- analysis/R/fast_em.R | 137 ------------ analysis/R/ngrams_simulation.R | 271 ------------------------ analysis/R/unknowns_test.R | 139 ------------ calculate_epsilon.py | 29 +++ 6 files changed, 29 insertions(+), 925 deletions(-) delete mode 100755 analysis/R/decode_ngrams.R delete mode 100755 analysis/R/fast_em.R delete mode 100755 analysis/R/ngrams_simulation.R delete mode 100755 analysis/R/unknowns_test.R create mode 100644 calculate_epsilon.py diff --git a/.gitignore b/.gitignore index 7edcbec..54a1814 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,3 @@ pyvenv.cfg .venv pip-selfcheck.json save_results.sh -calculate_epsilon.py diff --git a/analysis/R/decode_ngrams.R b/analysis/R/decode_ngrams.R deleted file mode 100755 index e2585cb..0000000 --- a/analysis/R/decode_ngrams.R +++ /dev/null @@ -1,377 +0,0 @@ -# Copyright 2014 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# -# This file has functions that aid in the estimation of a distribution when the -# dictionary is unknown. There are functions for estimating pairwise joint -# ngram distributions, pruning out false positives, and combining the two -# steps. - -FindPairwiseCandidates <- function(report_data, N, ngram_params, params) { - # Finds the pairwise most likely ngrams. - # - # Args: - # report_data: Object containing data relevant to reports: - # $inds: The indices of reports collected using various pairs - # $cohorts: The cohort of each report - # $map: The map used for all the ngrams - # $reports: The reports used for each ngram and full string - # N: Number of reports collected - # ngram_params: Parameters related to ngram size - # params: Parameter list. - # - # Returns: - # List: list of matrices, list of pairwise distributions. 
- - inds <- report_data$inds - cohorts <- report_data$cohorts - num_ngrams_collected <- ngram_params$num_ngrams_collected - map <- report_data$map - reports <- report_data$reports - - # Cycle over all the unique pairs of ngrams being collected - found_candidates <- list() - - # Generate the map list to be used for all ngrams - maps <- lapply(1:num_ngrams_collected, function(x) map) - num_candidate_ngrams <- length(inds) - - .ComputeDist <- function(i, inds, cohorts, reports, maps, params, - num_ngrams_collected) { - library(glmnet) - ind <- inds[[i]] - cohort_subset <- lapply(1:num_ngrams_collected, function(x) - cohorts[ind]) - report_subset <- reports[[i]] - new_dist <- ComputeDistributionEM(report_subset, - cohort_subset, - maps, ignore_other = FALSE, - params = params, estimate_var = FALSE) - new_dist - } - - # Compute the pairwise distributions (could be parallelized) - dists <- lapply(seq(num_candidate_ngrams), function(i) - .ComputeDist(i, inds, cohorts, reports, maps, - params, num_ngrams_collected)) - - dists_null <- sapply(dists, function(x) is.null(x)) - if (any(dists_null)) { - return (list(found_candidates = list(), dists = dists)) - } - cat("Found the pairwise ngram distributions.\n") - - # Find the threshold for choosing "significant" ngram pairs - f <- params$f; q <- params$q; p <- params$p - q2 <- .5 * f * (p + q) + (1 - f) * q - p2 <- .5 * f * (p + q) + (1 - f) * p - std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2) - (threshold <- std_dev_counts / N) - threshold <- 0.04 # Empirical override of the analytic threshold above - - # Filter joints to remove infrequently co-occurring ngrams. - candidate_strs <- lapply(1:num_candidate_ngrams, function(i) { - fit <- dists[[i]]$fit - edges <- which(fit > threshold, arr.ind = TRUE, FALSE) - - # Recover the list of strings that seem significant - found_candidates <- sapply(1:ncol(edges), function(x) { - chunks <- sapply(edges[, x], - function(j) dimnames(fit)[[x]][j]) - chunks - }) - # sapply returns either "character" vector (for n=1) or a matrix. Convert - # it to a matrix. This can be seen as follows: - # - # > class(sapply(1:5, function(x) "a")) - # [1] "character" - # > class(sapply(1:5, function(x) c("a", "b"))) - # [1] "matrix" - found_candidates <- rbind(found_candidates) - - # Remove the "others" - others <- which(found_candidates == "Other") - if (length(others) > 0) { - other <- which(found_candidates == "Other", arr.ind = TRUE)[, 1] - # drop = FALSE necessary to keep it a matrix - found_candidates <- found_candidates[-other, , drop = FALSE] - } - - found_candidates - }) - if (any(lapply(candidate_strs, function(x) length(x)) == 0)) { - return (NULL) - } - - list(candidate_strs = candidate_strs, dists = dists) -} - -FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams, - ngram_size) { - # Uses the list of strings found by the pairwise comparisons to build - # a list of full feasible strings. This relies on the iterative, - # graph-based approach. - # - # Args: - # found_candidates: list of candidates found by each pairwise decoding - # pairings: Matrix of size 2x(num_ngrams choose 2) listing all the - # ngram position pairings. - # num_ngrams: The total number of ngrams per word. - # ngram_size: Number of characters per ngram - # - # Returns: - # List of full string candidates. - - # Which ngram pairs are adjacent, i.e.
of the form (i,i+1) - adjacent <- sapply(seq(num_ngrams - 1), function(x) { - c(1 + (x - 1) * ngram_size, x * ngram_size + 1) - }) - - adjacent_pairs <- apply(adjacent, 2, function(x) { - which(apply(pairings, 1, function(y) identical(y, x))) - }) - - # The first set of candidates are ngrams found in positions 1 and 2 - active_cands <- found_candidates[[adjacent_pairs[1]]] - if (class(active_cands) == "list") { - return (list()) - } else { - active_cands <- as.data.frame(active_cands) - } - - # Now check successive ngrams to find consistent combinations - # i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc. - for (i in 2:length(adjacent_pairs)) { - if (nrow(active_cands) == 0) { - return (list()) - } - new_cands <- found_candidates[[adjacent_pairs[i]]] - new_cands <- as.data.frame(new_cands) - # Builds the set of possible candidates based only on ascending - # candidate pairs - active_cands <- BuildCandidates(active_cands, new_cands) - } - - if (nrow(active_cands) == 0) { - return (list()) - } - # Now refine these candidates using non-adjacent bigrams - remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)] - # For each non-adjacent pair, make sure that all the candidates are - # consistent (in this phase, candidates can ONLY be eliminated) - - for (i in remaining) { - new_cands <- found_candidates[[i]] - new_cands <- as.data.frame(new_cands) - # Prune out all candidates that do not agree with new_cands - active_cands <- PruneCandidates(active_cands, pairings[i, ], - ngram_size, - new_cands = new_cands) - } - # Consolidate the string ngrams into a full string representation - if (length(active_cands) > 0) { - active_cands <- sort(apply(active_cands, 1, - function(x) paste0(x, collapse = ""))) - } - unname(active_cands) -} - -BuildCandidates <- function(active_cands, new_cands) { - # Takes in a data frame where each row is a valid sequence of ngrams and - # checks which of the new_cands ngram pairs are consistent with - # the original active_cands ngram sequence. - # - # Args: - # active_cands: data frame of ngram sequence candidates (1 candidate - # sequence per row) - # new_cands: An rx2 data frame with a new list of candidate ngram - # pairs that might fit in with the previous list of candidates - # - # Returns: - # Updated active_cands, with another column if valid extensions are - # found.
- - # Get the trailing ngrams from the current candidates - to_check <- as.vector(tail(t(active_cands), n = 1)) - # Check which of the elements in to_check are leading ngrams among the - # new candidates - present <- sapply(to_check, function(x) any(x == new_cands)) - # Remove the strings that are not represented among the new candidates - to_check <- to_check[present] - # Now insert the new candidates where they belong - active_cands <- active_cands[present, , drop = FALSE] - active_cands <- cbind(active_cands, col = NA) - num_cands <- new_num_cands <- nrow(active_cands) # new_num_cands tracks rows as they are appended - hit_list <- c() - for (j in 1:num_cands) { - inds <- which(new_cands[, 1] == to_check[j]) - if (length(inds) == 0) { - hit_list <- c(hit_list, j) - next - } - # If there are multiple candidates fitting with an ngram, include - # each /full/ string as a candidate - extra <- length(inds) - 1 - if (extra > 0) { - rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra)) - to_paste <- active_cands[j, ] - # Add the new candidates to the bottom - for (p in 1:extra) { - active_cands <- rbind(active_cands, to_paste) - } - } else { - rep_inds <- c(j) - } - active_cands[rep_inds, ncol(active_cands)] <- - as.vector(new_cands[inds, 2]) - new_num_cands <- nrow(active_cands) - } - # If there were some false candidates in the original set, remove them - if (length(hit_list) > 0) { - active_cands <- active_cands[-hit_list, , drop = FALSE] - } - active_cands -} - -PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) { - # Takes in a data frame where each row is a valid sequence of ngrams and - # checks which of the new_cands ngram pairs are consistent with - # the original active_cands ngram sequence. This can ONLY remove - # candidates present in active_cands. - # - # Args: - # active_cands: data frame of ngram sequence candidates (1 candidate - # sequence per row) - # pairing: A length-2 list storing which two ngrams are measured - # ngram_size: Number of characters per ngram - # new_cands: An rx2 data frame with a new list of candidate ngram - # pairs that might fit in with the previous list of candidates - # - # Returns: - # Updated active_cands, with a reduced number of rows. - - # Convert the pairing to an ngram index - cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1) - - cands_to_check <- active_cands[, cols, drop = FALSE] - # Find the candidates that are inconsistent with the new data - hit_list <- sapply(1:nrow(cands_to_check), function(j) { - to_kill <- FALSE - if (nrow(new_cands) == 0) { - return (TRUE) - } - if (!any(apply(new_cands, 1, function(x) - all(cands_to_check[j, , drop = FALSE] == x)))) { - to_kill <- TRUE - } - to_kill - }) - - # Determine which rows are false positives - hit_indices <- which(hit_list) - # Remove the false positives - if (length(hit_indices) > 0) { - active_cands <- active_cands[-hit_indices, , drop = FALSE] - } - active_cands -} - -EstimateDictionary <- function(report_data, N, ngram_params, params) { - # Takes in a list of report data and returns a list of string - # estimates of the dictionary.
- # - # Args: - # report_data: Object containing data relevant to reports: - # $inds: The indices of reports collected using various pairs - # $cohorts: The cohort of each report - # $map: The map used for all the ngrams - # $reports: The reports used for each ngram and full string - # N: The number of individuals sending reports - # ngram_params: Parameters related to ngram length, etc. - # params: Parameter vector with RAPPOR noise levels, cohorts, etc. - # - # Returns: - # List: list of found candidates, list of pairwise candidates - - pairwise_candidates <- FindPairwiseCandidates(report_data, N, - ngram_params, - params)$candidate_strs - cat("Found the pairwise candidates. \n") - if (is.null(pairwise_candidates)) { - return (list()) - } - found_candidates <- FindFeasibleStrings(pairwise_candidates, - report_data$pairings, - ngram_params$num_ngrams, - ngram_params$ngram_size) - cat("Found all the candidates. \n") - list(found_candidates = found_candidates, - pairwise_candidates = pairwise_candidates) -} - -WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams, - ngram_size) { - # Args: - # conn: R connection to write to. Should be opened with mode w+. - # pairwise_candidates: list of matrices. Each matrix represents a subgraph; - # it contains the edges between partitions i and j, so there are (k choose - # 2) matrices. Each matrix has dimension 2 x E, where E is the number of - # edges. - # pairings: 2 x (k choose 2) matrix of character positions. Each row - # corresponds to a subgraph; it has 1-based character index of partitions - # i and j. - # num_ngrams: length of pairwise_candidates, or the number of partitions in - # the k-partite graph - # ngram_size: Number of characters per ngram - - # File Format: - # - # num_partitions 3 - # ngram_size 2 - # edge 0.ab 1.cd - # edge 0.ab 2.ef - # - # The first line specifies the number of partitions (k). - # The remaining lines are edges, where each node is <partition>.<ngram>. - # - # Partitions are numbered from 0. The partition of the left node will be - # less than the partition of the right node. - - # First two lines are metadata - cat(sprintf('num_partitions %d\n', num_ngrams), file = conn) - cat(sprintf('ngram_size %d\n', ngram_size), file = conn) - - for (i in 1:length(pairwise_candidates)) { - # The two pairwise_candidates for this subgraph. - # Turn 1-based character positions into 0-based partition numbers, - # e.g. (3, 5) -> (1, 2) - - pos1 <- pairings[[i, 1]] - pos2 <- pairings[[i, 2]] - part1 <- (pos1 - 1) / ngram_size - part2 <- (pos2 - 1) / ngram_size - cat(sprintf("Writing partition (%d, %d)\n", part1, part2)) - - p <- pairwise_candidates[[i]] - # each row is an edge - for (j in 1:nrow(p)) { - n1 <- p[[j, 1]] - n2 <- p[[j, 2]] - line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2) - # NOTE: It would be faster to preallocate 'lines', but we would have to - # make two passes through pairwise_candidates. - cat(line, file = conn) - } - } -} - diff --git a/analysis/R/fast_em.R b/analysis/R/fast_em.R deleted file mode 100755 index c19862c..0000000 --- a/analysis/R/fast_em.R +++ /dev/null @@ -1,137 +0,0 @@ -# fast_em.R: Wrapper around analysis/cpp/fast_em.cc. -# -# This serializes the input, shells out, and deserializes the output. - -.Flatten <- function(list_of_matrices) { - listOfVectors <- lapply(list_of_matrices, as.vector) - #print(listOfVectors) - - # unlist takes list to vector.
- unlist(listOfVectors) -} - -.WriteListOfMatrices <- function(list_of_matrices, f) { - flattened <- .Flatten(list_of_matrices) - - # NOTE: UpdateJointConditional does outer product of dimensions! - - # 3-letter tags; writeBin() NUL-terminates them, so each one is 4 bytes - writeBin('ne ', con = f) - num_entries <- length(list_of_matrices) - writeBin(num_entries, con = f) - - Log('Wrote num_entries = %d', num_entries) - - # For 2x3, this is 6 - writeBin('es ', con = f) - - entry_size <- as.integer(prod(dim(list_of_matrices[[1]]))) - writeBin(entry_size, con = f) - - Log('Wrote entry_size = %d', entry_size) - - # now write the data - writeBin('dat', con = f) - writeBin(flattened, con = f) -} - -.ExpectTag <- function(f, tag) { - # Read a single NUL-terminated character string. - actual <- readBin(con = f, what = "char", n = 1) - - # Assert that we got what was expected. - if (length(actual) != 1) { - stop(sprintf("Failed to read a tag '%s'", tag)) - } - if (actual != tag) { - stop(sprintf("Expected '%s', got '%s'", tag, actual)) - } -} - -.ReadResult <- function(f, entry_size, matrix_dims) { - .ExpectTag(f, "emi") - # NOTE: assuming R integers are 4 bytes (uint32_t) - num_em_iters <- readBin(con = f, what = "int", n = 1) - - .ExpectTag(f, "pij") - pij <- readBin(con = f, what = "double", n = entry_size) - - # Adjust dimensions - dim(pij) <- matrix_dims - - Log("Number of EM iterations: %d", num_em_iters) - Log("PIJ read from external implementation:") - print(pij) - - # est, sd, var_cov, hist - list(est = pij, num_em_iters = num_em_iters) -} - -.SanityChecks <- function(joint_conditional) { - # Display some stats before sending it over to C++. - - inf_counts <- lapply(joint_conditional, function(m) { - sum(m == Inf) - }) - total_inf <- sum(as.numeric(inf_counts)) - - nan_counts <- lapply(joint_conditional, function(m) { - sum(is.nan(m)) - }) - total_nan <- sum(as.numeric(nan_counts)) - - zero_counts <- lapply(joint_conditional, function(m) { - sum(m == 0.0) - }) - total_zero <- sum(as.numeric(zero_counts)) - - #sum(joint_conditional[joint_conditional == Inf, ]) - Log('total inf: %s', total_inf) - Log('total nan: %s', total_nan) - Log('total zero: %s', total_zero) -} - -ConstructFastEM <- function(em_executable, tmp_dir) { - - return(function(joint_conditional, max_em_iters = 1000, - epsilon = 10 ^ -6, verbose = FALSE, - estimate_var = FALSE) { - matrix_dims <- dim(joint_conditional[[1]]) - # Check that the number of dimensions is 2.
- if (length(matrix_dims) != 2) { - Log('FATAL: Expected 2 dimensions, got %d', length(matrix_dims)) - stop() - } - - entry_size <- prod(matrix_dims) - Log('entry size: %d', entry_size) - - .SanityChecks(joint_conditional) - - input_path <- file.path(tmp_dir, 'list_of_matrices.bin') - Log("Writing flattened list of matrices to %s", input_path) - f <- file(input_path, 'wb') # binary file - .WriteListOfMatrices(joint_conditional, f) - close(f) - Log("Done writing %s", input_path) - - output_path <- file.path(tmp_dir, 'pij.bin') - - cmd <- sprintf("%s %s %s %s", em_executable, input_path, output_path, - max_em_iters) - - Log("Shell command: %s", cmd) - exit_code <- system(cmd) - - Log("Done running shell command") - if (exit_code != 0) { - stop(sprintf("Command failed with code %d", exit_code)) - } - - f <- file(output_path, 'rb') - result <- .ReadResult(f, entry_size, matrix_dims) - close(f) - - result - }) -} diff --git a/analysis/R/ngrams_simulation.R b/analysis/R/ngrams_simulation.R deleted file mode 100755 index ca7ce49..0000000 --- a/analysis/R/ngrams_simulation.R +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright 2014 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Authors: vpihur@google.com (Vasyl Pihur) and fanti@google.com (Giulia Fanti) -# -# Tools used to simulate sending partial ngrams to the server for estimating the -# dictionary of terms over which we want to learn a distribution. This -# mostly contains functions that aid in the generation of synthetic data. - -library(RUnit) -library(parallel) - -source("analysis/R/encode.R") -source("analysis/R/decode.R") -source("analysis/R/simulation.R") -source("analysis/R/association.R") -source("analysis/R/decode_ngrams.R") - -# The alphabet is the set of all possible characters that will appear in a -# string. Here we use the English alphabet, but one might want to include -# numbers or punctuation marks. -alphabet <- letters - -GenerateCandidates <- function(alphabet, ngram_size = 2) { - # Enumerates all possible ngrams of a given size over the alphabet. - # - # Args: - # alphabet: Vector of characters from which ngrams are built - # ngram_size: Number of characters per ngram - # - # Returns: - # Vector of all possible ngram strings - - cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet)) - apply(cands, 1, function(x) paste0(x, collapse = "")) -} - -GenerateString <- function(n) { - # Generates a string of a given length from the alphabet. - # - # Args: - # n: Number of characters in the string - # - # Returns: - # String of length n - paste0(sample(alphabet, n, replace = TRUE), collapse = "") -} - -GeneratePopulation <- function(N, num_strs, str_len = 10, - distribution = 1) { - # Generates a string for each individual in the population from distribution.
- # - # Args: - # N: Number of individuals in the population - # num_strs: Number of strings from which to draw strings - # str_len: Length of each string - # distribution: Which type of distribution to use - # 1: Zipfian - # 2: Geometric (exponential) - # 3: Uniform - # - # Returns: - # Vector of strings for each individual in the population - - strs <- sapply(1:num_strs, function(i) GenerateString(str_len)) - - if (distribution == 1) { - # Zipfian-ish distribution - prob <- (1:num_strs)^20 - prob <- prob / sum(prob) + 0.001 - prob <- prob / sum(prob) - } else if (distribution == 2) { - # Geometric distribution (discrete approximation to exponential) - p <- 0.3 - prob <- p * (1 - p)^(1:num_strs - 1) - prob <- prob / sum(prob) - } else { - # Uniform - prob <- rep(1 / num_strs, num_strs) - } - - sample(strs, N, replace = TRUE, prob = prob) -} - -SelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) { - # Selects which ngrams each user will encode and then submit. - # - # Args: - # str: String from which ngram is built. - # num_ngrams: Number of ngrams to choose - # size: Number of characters per ngram - # max_str_len: Maximum number of characters in the string - # - # Returns: - # List of each individual's ngrams and which positions the ngrams - # were drawn from. - - start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams)) - ngrams <- mapply(function(x, y, str) substr(str, x, y), - start, start + size - 1, - MoreArgs = list(str = str)) - list(ngrams = ngrams, starts = start) -} - -UpdateMapWithCandidates <- function(str_candidates, sim, params) { - # Generates a new map based on the returned candidates. - # Normally this would be created on the spot by having the - # aggregator hash the string candidates. But since we already have - # the map from simulation, we'll just choose the appropriate - # column. - # - # Args: - # str_candidates: Vector of string candidates - # sim: Simulation object containing the original map - # params: RAPPOR parameter list - # - # Returns: - # Updated map object with columns for the new candidate strings - - k <- params$k - h <- params$h - m <- params$m - - # First add the real candidates to the map - valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) - updated_map <- sim$full_map - updated_map$map_by_cohort <- lapply(1:m, function(i) { - sim$full_map$map_by_cohort[[i]][, valid_cands] - }) - - # Now add the false positives (we can just draw random strings for - # these since they didn't appear in the original dataset anyway) - new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]])) - M <- length(new_cands) - if (M > 0) { - for (i in 1:m) { - ones <- sample(1:k, M * h, replace = TRUE) - cols <- rep(1:M, each = h) - strs <- c(sort(valid_cands), new_cands) - updated_map$map_by_cohort[[i]] <- - do.call(cBind, list(updated_map$map_by_cohort[[i]], - sparseMatrix(ones, cols, dims = c(k, M)))) - colnames(updated_map$map_by_cohort[[i]]) <- strs - } - } - if (class(updated_map$map_by_cohort[[1]]) == "logical") { - updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort) - updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE) - colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands) - } else { - updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort) - } - updated_map -} - -SimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10, - alphabet, params, distribution = 1) { - # Simulates the creation and encoding of ngrams for each individual.
- # - # Args: - # N: Number of individuals in the population - # ngram_params: Parameters about ngram size, etc. - # str_len: Length of each string - # num_strs: Number of strings in the dictionary - # alphabet: Alphabet used to generate strings - # params: RAPPOR parameters, like noise and cohorts - # distribution: Which type of distribution to use (see GeneratePopulation) - # - # Returns: - # List containing all the information needed for estimating and - # verifying the results. - - # Get the list of strings for each user - strs <- GeneratePopulation(N, num_strs = num_strs, - str_len = str_len, - distribution = distribution) - - # Split them into ngrams and encode - ngram <- lapply(strs, function(i) - SelectNGrams(i, - num_ngrams = ngram_params$num_ngrams_collected, - size = ngram_params$ngram_size, - max_str_len = str_len)) - - cands <- GenerateCandidates(alphabet, ngram_params$ngram_size) - map <- CreateMap(cands, params, FALSE) - cohorts <- sample(1:params$m, N, replace = TRUE) - - g <- sapply(ngram, function(x) paste(x$starts, sep = "_", - collapse = "_")) - ug <- sort(unique(g)) - pairings <- t(sapply(ug, function(x) - sapply(strsplit(x, "_"), function(y) as.numeric(y)))) - - inds <- lapply(1:length(ug), function(i) which(g == ug[i])) - - reports <- lapply(1:length(ug), function(k) { - # Generate the ngram reports - lapply(1:ngram_params$num_ngrams_collected, function(x) { - EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]), - cohorts[inds[[k]]], map$map_by_cohort, params)}) - }) - cat("Encoded the ngrams.\n") - # Now generate the full string reports - full_map <- CreateMap(sort(unique(strs)), params, FALSE) - full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params) - - list(reports = reports, cohorts = cohorts, ngram = ngram, map = map, - strs = strs, pairings = pairings, inds = inds, cands = cands, - full_reports = full_reports, full_map = full_map) - -} - - -EstimateDictionaryTrial <- function(N, str_len, num_strs, - params, ngram_params, - distribution = 3) { - # Runs a single simulation trial. Generates simulated reports, - # decodes them, and returns the result. - # - # Args: - # N: Number of users to simulate - # str_len: The length of strings to estimate - # num_strs: The number of strings in the dictionary - # params: RAPPOR parameter list - # ngram_params: Parameters related to the size of ngrams - # distribution: Tells what kind of distribution to use: - # 1: Zipfian - # 2: Geometric - # 3: Uniform (default) - # - # Returns: - # List with recovered and true marginals. - - # We call the needed libraries here in order to make them available when this - # function gets called by BorgApply. Otherwise, they do not get included.
- library(glmnet) - library(parallel) - sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, - alphabet, params, distribution) - - res <- EstimateDictionary(sim, N, ngram_params, params) - str_candidates <- res$found_candidates - pairwise_candidates <- res$pairwise_candidates - - if (length(str_candidates) == 0) { - return (NULL) - } - updated_map <- UpdateMapWithCandidates(str_candidates, sim, params) - - # Compute the marginal for this new set of strings - variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params) - # Our dictionary estimate - marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit - # Estimate given full dictionary knowledge - marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit - # The true (sampled) data distribution - truth <- sort(table(sim$strs)) / N - - list(marginal = marginal, marginal_full = marginal_full, - truth = truth, pairwise_candidates = pairwise_candidates) -} diff --git a/analysis/R/unknowns_test.R b/analysis/R/unknowns_test.R deleted file mode 100755 index 5efd738..0000000 --- a/analysis/R/unknowns_test.R +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright 2014 Google Inc. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Author: fanti@google.com (Giulia Fanti) -# -# Tests the unknown unknowns dictionary estimation functions. -# There are two main components involved in estimating this unknown -# distribution: -# a) Find the pairwise ngrams that co-occur often. -# b) Determine which full strings are consistent with all pairwise -# relations. -# -# TestEstimateDictionary() tests the full pipeline, including parts (a) -# and (b). -# TestFindFeasibleStrings() tests only part (b). -# Both tests generate their own data. - -library(parallel) -source("analysis/R/encode.R") -source("analysis/R/decode.R") -source("analysis/R/simulation.R") -source("analysis/R/association.R") -source("analysis/R/decode_ngrams.R") -source("analysis/R/ngrams_simulation.R") -alphabet <- letters -options(warn = -1) - -GeneratePopulation <- function(N, num_strs, str_len = 10, - distribution = NULL) { - # Generates a /deterministic/ string for each individual in the - # population from distribution. - # - # Args: - # N: Number of individuals in the population - # num_strs: Number of strings from which to draw strings - # str_len: Length of each string - # distribution: Just here for compatibility with original - # GeneratePopulation function in ngrams_simulation.R - # - # Returns: - # Vector of strings for each individual in the population - - strs <- sapply(1:num_strs, function(i) { - paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "") - }) - - # Uniform distribution - prob <- rep(1 / num_strs, num_strs) - sample(strs, N, replace = TRUE, prob = prob) -} - -TestEstimateDictionary <- function() { - # Tests that the algorithm without noise recovers a uniform - # string population correctly.
- - # Compute the strings from measuring only 2 ngrams - N <- 100 - str_len <- 6 - ngram_size <- 2 - num_ngrams <- str_len / ngram_size - num_strs <- 1 - - params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0) - - ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams, - num_ngrams_collected = 2) - - sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, - alphabet, params, distribution = 3) - - res <- EstimateDictionary(sim, N, ngram_params, params) - - # Check that the correct strings are found - if (num_strs == 1) { - checkTrue(res$found_candidates == sort(unique(sim$strs))) - } else { - checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs)))) - } -} - -TestFindFeasibleStrings <- function() { - # Tests that FindFeasibleStrings weeds out false positives. - # We test this by adding false positives to the pairwise estimates. - N <- 100 - str_len <- 6 - ngram_size <- 2 - num_ngrams <- str_len / ngram_size - num_strs <- 2 - - params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0) - - ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams, - num_ngrams_collected = 2) - - sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs, - alphabet, params) - - pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params, - params)$candidate_strs - cat("Found the pairwise candidates. \n") - - if (is.null(pairwise_candidates)) { - return (FALSE) - } - - # Inject a false positive that FindFeasibleStrings should prune - pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le")) - - conn <- file('graph.txt', 'w+') - WriteKPartiteGraph(conn, - pairwise_candidates, - sim$pairings, - ngram_params$num_ngrams, - ngram_params$ngram_size) - - close(conn) - cat("Wrote graph.txt\n") - - found_candidates <- FindFeasibleStrings(pairwise_candidates, - sim$pairings, - ngram_params$num_ngrams, - ngram_params$ngram_size) - # Check that the correct strings are found - if (num_strs == 1) { - checkTrue(found_candidates == sort(unique(sim$strs))) - } else { - checkTrue(all.equal(found_candidates, sort(unique(sim$strs)))) - } -} diff --git a/calculate_epsilon.py b/calculate_epsilon.py new file mode 100644 index 0000000..19e62d4 --- /dev/null +++ b/calculate_epsilon.py @@ -0,0 +1,29 @@ +import sys +from math import log + +def p_star(f, p, q): + return (f / 2.) * (p + q) + (1 - f) * p + +def q_star(f, p, q): + return (f / 2.) * (p + q) + (1 - f) * q + +def inner_factor(p_s, q_s): + return (q_s * (1. - p_s)) / (p_s * (1. - q_s)) + +def epsilon(h, inner): + return h * log(inner) + +def main(args): + f = float(args[1]) + p = float(args[2]) + q = float(args[3]) + h = int(args[4]) + + p_s = p_star(f, p, q) + q_s = q_star(f, p, q) + inner = inner_factor(p_s, q_s) + e = epsilon(h, inner) + print("{} ln({}) = {}".format(h, inner, e)) + +if __name__ == "__main__": + main(sys.argv)
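A quick sanity check for calculate_epsilon.py; the parameter values below are purely illustrative and not fixed anywhere in the repository. With f = 0.5, p = 0.5, q = 0.75 and h = 2, the definitions in the script give p* = (f/2)(p + q) + (1 - f)p = 0.5625 and q* = (f/2)(p + q) + (1 - f)q = 0.6875, so the inner factor is (0.6875 * 0.4375) / (0.5625 * 0.3125) = 77/45, roughly 1.711, and epsilon = 2 * ln(77/45), roughly 1.074. Running

    $ python calculate_epsilon.py 0.5 0.5 0.75 2

should therefore print something close to "2 ln(1.7111...) = 1.0743...".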