зеркало из https://github.com/mozilla/rappor.git
Clean repo
This commit is contained in:
Родитель
7b4de6077f
Коммит
ea46c460af
|
@ -19,4 +19,3 @@ pyvenv.cfg
|
|||
.venv
|
||||
pip-selfcheck.json
|
||||
save_results.sh
|
||||
calculate_epsilon.py
|
||||
|
|
|
@ -1,377 +0,0 @@
|
|||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#
|
||||
# This file has functions that aid in the estimation of a distribution when the
|
||||
# dictionary is unknown. There are functions for estimating pairwise joint
|
||||
# ngram distributions, pruning out false positives, and combining the two
|
||||
# steps.
|
||||
|
||||
FindPairwiseCandidates <- function(report_data, N, ngram_params, params) {
|
||||
# Finds the pairwise most likely ngrams.
|
||||
#
|
||||
# Args:
|
||||
# report_data: Object containing data relevant to reports:
|
||||
# $inds: The indices of reports collected using various pairs
|
||||
# $cohorts: The cohort of each report
|
||||
# $map: The map used for all the ngrams
|
||||
# $reports: The reports used for each ngram and full string
|
||||
# N: Number of reports collected
|
||||
# ngram_params: Parameters related to ngram size
|
||||
# params: Parameter list.
|
||||
#
|
||||
# Returns:
|
||||
# List: list of matrices, list of pairwise distributions.
|
||||
|
||||
inds <- report_data$inds
|
||||
cohorts <- report_data$cohorts
|
||||
num_ngrams_collected <- ngram_params$num_ngrams_collected
|
||||
map <- report_data$map
|
||||
reports <- report_data$reports
|
||||
|
||||
# Cycle over all the unique pairs of ngrams being collected
|
||||
found_candidates <- list()
|
||||
|
||||
# Generate the map list to be used for all ngrams
|
||||
maps <- lapply(1:num_ngrams_collected, function(x) map)
|
||||
num_candidate_ngrams <- length(inds)
|
||||
|
||||
.ComputeDist <- function(i, inds, cohorts, reports, maps, params,
|
||||
num_ngrams_collected) {
|
||||
library(glmnet)
|
||||
ind <- inds[[i]]
|
||||
cohort_subset <- lapply(1:num_ngrams_collected, function(x)
|
||||
cohorts[ind])
|
||||
report_subset <- reports[[i]]
|
||||
new_dist <- ComputeDistributionEM(report_subset,
|
||||
cohort_subset,
|
||||
maps, ignore_other = FALSE,
|
||||
params = params, estimate_var = FALSE)
|
||||
new_dist
|
||||
}
|
||||
|
||||
# Compute the pairwise distributions (could be parallelized)
|
||||
dists <- lapply(seq(num_candidate_ngrams), function(i)
|
||||
.ComputeDist(i, inds, cohorts, reports, maps,
|
||||
params, num_ngrams_collected))
|
||||
|
||||
dists_null <- sapply(dists, function(x) is.null(x))
|
||||
if (any(dists_null)) {
|
||||
return (list(found_candidates = list(), dists = dists))
|
||||
}
|
||||
cat("Found the pairwise ngram distributions.\n")
|
||||
|
||||
# Find the threshold for choosing "significant" ngram pairs
|
||||
f <- params$f; q <- params$q; p <- params$p
|
||||
q2 <- .5 * f * (p + q) + (1 - f) * q
|
||||
p2 <- .5 * f * (p + q) + (1 - f) * p
|
||||
std_dev_counts <- sqrt(p2 * (1 - p2) * N) / (q2 - p2)
|
||||
(threshold <- std_dev_counts / N)
|
||||
threshold <- 0.04
|
||||
|
||||
# Filter joints to remove infrequently co-occurring ngrams.
|
||||
candidate_strs <- lapply(1:num_candidate_ngrams, function(i) {
|
||||
fit <- dists[[i]]$fit
|
||||
edges <- which(fit > threshold, arr.ind = TRUE, FALSE)
|
||||
|
||||
# Recover the list of strings that seem significant
|
||||
found_candidates <- sapply(1:ncol(edges), function(x) {
|
||||
chunks <- sapply(edges[, x],
|
||||
function(j) dimnames(fit)[[x]][j])
|
||||
chunks
|
||||
})
|
||||
# sapply returns either "character" vector (for n=1) or a matrix. Convert
|
||||
# it to a matrix. This can be seen as follows:
|
||||
#
|
||||
# > class(sapply(1:5, function(x) "a"))
|
||||
# [1] "character"
|
||||
# > class(sapply(1:5, function(x) c("a", "b")))
|
||||
# [1] "matrix"
|
||||
found_candidates <- rbind(found_candidates)
|
||||
|
||||
# Remove the "others"
|
||||
others <- which(found_candidates == "Other")
|
||||
if (length(others) > 0) {
|
||||
other <- which(found_candidates == "Other", arr.ind = TRUE)[, 1]
|
||||
# drop = FALSE necessary to keep it a matrix
|
||||
found_candidates <- found_candidates[-other, , drop = FALSE]
|
||||
}
|
||||
|
||||
found_candidates
|
||||
})
|
||||
if (any(lapply(found_candidates, function(x) length(x)) == 0)) {
|
||||
return (NULL)
|
||||
}
|
||||
|
||||
list(candidate_strs = candidate_strs, dists = dists)
|
||||
}
|
||||
|
||||
FindFeasibleStrings <- function(found_candidates, pairings, num_ngrams,
|
||||
ngram_size) {
|
||||
# Uses the list of strings found by the pairwise comparisons to build
|
||||
# a list of full feasible strings. This relies on the iterative,
|
||||
# graph-based approach.
|
||||
#
|
||||
# Args:
|
||||
# found_candidates: list of candidates found by each pairwise decoding
|
||||
# pairings: Matrix of size 2x(num_ngrams choose 2) listing all the
|
||||
# ngram position pairings.
|
||||
# num_ngrams: The total number of ngrams per word.
|
||||
# ngram_size: Number of characters per ngram
|
||||
#
|
||||
# Returns:
|
||||
# List of full string candidates.
|
||||
|
||||
# Which ngram pairs are adjacent, i.e. of the form (i,i+1)
|
||||
adjacent <- sapply(seq(num_ngrams - 1), function(x) {
|
||||
c(1 + (x - 1) * ngram_size, x * ngram_size + 1)
|
||||
})
|
||||
|
||||
adjacent_pairs <- apply(adjacent, 2, function(x) {
|
||||
which(apply(pairings, 1, function(y) identical(y, x)))
|
||||
})
|
||||
|
||||
# The first set of candidates are ngrams found in positions 1 and 2
|
||||
active_cands <- found_candidates[[adjacent_pairs[1]]]
|
||||
if (class(active_cands) == "list") {
|
||||
return (list())
|
||||
} else {
|
||||
active_cands <- as.data.frame(active_cands)
|
||||
}
|
||||
|
||||
# Now check successive ngrams to find consistent combinations
|
||||
# i.e. after ngrams 1-2, check 2-3, 3-4, 4-5, etc.
|
||||
for (i in 2:length(adjacent_pairs)) {
|
||||
if (nrow(active_cands) == 0) {
|
||||
return (list())
|
||||
}
|
||||
new_cands <- found_candidates[[adjacent_pairs[i]]]
|
||||
new_cands <- as.data.frame(new_cands)
|
||||
# Builds the set of possible candidates based only on ascending
|
||||
# candidate pairs
|
||||
active_cands <- BuildCandidates(active_cands, new_cands)
|
||||
}
|
||||
|
||||
if (nrow(active_cands) == 0) {
|
||||
return (list())
|
||||
}
|
||||
# Now refine these candidates using non-adjacent bigrams
|
||||
remaining <- (1:(num_ngrams * (num_ngrams - 1) / 2))[-c(1, adjacent_pairs)]
|
||||
# For each non-adjacent pair, make sure that all the candidates are
|
||||
# consistent (in this phase, candidates can ONLY be eliminated)
|
||||
|
||||
for (i in remaining) {
|
||||
new_cands <- found_candidates[[i]]
|
||||
new_cands <- as.data.frame(new_cands)
|
||||
# Prune out all candidates that do not agree with new_cands
|
||||
active_cands <- PruneCandidates(active_cands, pairings[i, ],
|
||||
ngram_size,
|
||||
new_cands = new_cands)
|
||||
}
|
||||
# Consolidate the string ngrams into a full string representation
|
||||
if (length(active_cands) > 0) {
|
||||
active_cands <- sort(apply(active_cands, 1,
|
||||
function(x) paste0(x, collapse = "")))
|
||||
}
|
||||
unname(active_cands)
|
||||
}
|
||||
|
||||
BuildCandidates <- function(active_cands, new_cands) {
|
||||
# Takes in a data frame where each row is a valid sequence of ngrams
|
||||
# checks which of the new_cands ngram pairs are consistent with
|
||||
# the original active_cands ngram sequence.
|
||||
#
|
||||
# Args:
|
||||
# active_cands: data frame of ngram sequence candidates (1 candidate
|
||||
# sequence per row)
|
||||
# new_cands: An rx2 data frame with a new list of candidate ngram
|
||||
# pairs that might fit in with the previous list of candidates
|
||||
#
|
||||
# Returns:
|
||||
# Updated active_cands, with another column if valid extensions are
|
||||
# found.
|
||||
|
||||
# Get the trailing ngrams from the current candidates
|
||||
to_check <- as.vector(tail(t(active_cands), n = 1))
|
||||
# Check which of the elements in to_check are leading ngrams among the
|
||||
# new candidates
|
||||
present <- sapply(to_check, function(x) any(x == new_cands))
|
||||
# Remove the strings that are not represented among the new candidates
|
||||
to_check <- to_check[present]
|
||||
# Now insert the new candidates where they belong
|
||||
active_cands <- active_cands[present, , drop = FALSE]
|
||||
active_cands <- cbind(active_cands, col = NA)
|
||||
num_cands <- nrow(active_cands)
|
||||
hit_list <- c()
|
||||
for (j in 1:num_cands) {
|
||||
inds <- which(new_cands[, 1] == to_check[j])
|
||||
if (length(inds) == 0) {
|
||||
hit_list <- c(hit_list, j)
|
||||
next
|
||||
}
|
||||
# If there are multiple candidates fitting with an ngram, include
|
||||
# each /full/ string as a candidate
|
||||
extra <- length(inds) - 1
|
||||
if (extra > 0) {
|
||||
rep_inds <- c(j, (new_num_cands + 1):(new_num_cands + extra))
|
||||
to_paste <- active_cands[j, ]
|
||||
# Add the new candidates to the bottom
|
||||
for (p in 1:extra) {
|
||||
active_cands <- rbind(active_cands, to_paste)
|
||||
}
|
||||
} else {
|
||||
rep_inds <- c(j)
|
||||
}
|
||||
active_cands[rep_inds, ncol(active_cands)] <-
|
||||
as.vector(new_cands[inds, 2])
|
||||
new_num_cands <- nrow(active_cands)
|
||||
}
|
||||
# If there were some false candidates in the original set, remove them
|
||||
if (length(hit_list) > 0) {
|
||||
active_cands <- active_cands[-hit_list, , drop = FALSE]
|
||||
}
|
||||
active_cands
|
||||
}
|
||||
|
||||
PruneCandidates <- function(active_cands, pairing, ngram_size, new_cands) {
|
||||
# Takes in a data frame where each row is a valid sequence of ngrams
|
||||
# checks which of the new_cands ngram pairs are consistent with
|
||||
# the original active_cands ngram sequence. This can ONLY remove
|
||||
# candidates presented in active_cands.
|
||||
#
|
||||
# Args:
|
||||
# active_cands: data frame of ngram sequence candidates (1 candidate
|
||||
# sequence per row)
|
||||
# pairing: A length-2 list storing which two ngrams are measured
|
||||
# ngram_size: Number of characters per ngram
|
||||
# new_cands: An rx2 data frame with a new list of candidate ngram
|
||||
# pairs that might fit in with the previous list of candidates
|
||||
#
|
||||
# Returns:
|
||||
# Updated active_cands, with a reduced number of rows.
|
||||
|
||||
# Convert the pairing to an ngram index
|
||||
cols <- sapply(pairing, function(x) (x - 1) / ngram_size + 1)
|
||||
|
||||
cands_to_check <- active_cands[, cols, drop = FALSE]
|
||||
# Find the candidates that are inconsistent with the new data
|
||||
hit_list <- sapply(1:nrow(cands_to_check), function(j) {
|
||||
to_kill <- FALSE
|
||||
if (nrow(new_cands) == 0) {
|
||||
return (TRUE)
|
||||
}
|
||||
if (!any(apply(new_cands, 1, function(x)
|
||||
all(cands_to_check[j, , drop = FALSE] == x)))) {
|
||||
to_kill <- TRUE
|
||||
}
|
||||
to_kill
|
||||
})
|
||||
|
||||
# Determine which rows are false positives
|
||||
hit_indices <- which(hit_list)
|
||||
# Remove the false positives
|
||||
if (length(hit_indices) > 0) {
|
||||
active_cands <- active_cands[-hit_indices, ]
|
||||
}
|
||||
active_cands
|
||||
}
|
||||
|
||||
EstimateDictionary <- function(report_data, N, ngram_params, params) {
|
||||
# Takes in a list of report data and returns a list of string
|
||||
# estimates of the dictionary.
|
||||
#
|
||||
# Args:
|
||||
# report_data: Object containing data relevant to reports:
|
||||
# $inds: The indices of reports collected using various pairs
|
||||
# $cohorts: The cohort of each report
|
||||
# $map: THe map used for all the ngrams
|
||||
# $reports: The reports used for each ngram and full string
|
||||
# N: the number of individuals sending reports
|
||||
# ngram_params: Parameters related to ngram length, etc
|
||||
# params: Parameter vector with RAPPOR noise levels, cohorts, etc
|
||||
#
|
||||
# Returns:
|
||||
# List: list of found candidates, list of pairwise candidates
|
||||
|
||||
pairwise_candidates <- FindPairwiseCandidates(report_data, N,
|
||||
ngram_params,
|
||||
params)$candidate_strs
|
||||
cat("Found the pairwise candidates. \n")
|
||||
if (is.null(pairwise_candidates)) {
|
||||
return (list())
|
||||
}
|
||||
found_candidates <- FindFeasibleStrings(pairwise_candidates,
|
||||
report_data$pairings,
|
||||
ngram_params$num_ngrams,
|
||||
ngram_params$ngram_size)
|
||||
cat("Found all the candidates. \n")
|
||||
list(found_candidates = found_candidates,
|
||||
pairwise_candidates = pairwise_candidates)
|
||||
}
|
||||
|
||||
WriteKPartiteGraph <- function(conn, pairwise_candidates, pairings, num_ngrams,
|
||||
ngram_size) {
|
||||
# Args:
|
||||
# conn: R connection to write to. Should be opened with mode w+.
|
||||
# pairwise_candidates: list of matrices. Each matrix represents a subgraph;
|
||||
# it contains the edges between partitions i and j, so there are (k choose
|
||||
# 2) matrices. Each matrix has dimension 2 x E, where E is the number of
|
||||
# edges.
|
||||
# pairings: 2 x (k choose 2) matrix of character positions. Each row
|
||||
# corresponds to a subgraph; it has 1-based character index of partitions
|
||||
# i and j.
|
||||
# num_ngrams: length of pairwise_candidates, or the number of partitions in
|
||||
# the k-partite graph
|
||||
|
||||
# File Format:
|
||||
#
|
||||
# num_partitions 3
|
||||
# ngram_size 2
|
||||
# 0.ab 1.cd
|
||||
# 0.ab 2.ef
|
||||
#
|
||||
# The first line specifies the number of partitions (k).
|
||||
# The remaining lines are edges, where each node is <partition>.<bigram>.
|
||||
#
|
||||
# Partitions are numbered from 0. The partition of the left node will be
|
||||
# less than the partition of the right node.
|
||||
|
||||
# First two lines are metadata
|
||||
cat(sprintf('num_partitions %d\n', num_ngrams), file = conn)
|
||||
cat(sprintf('ngram_size %d\n', ngram_size), file = conn)
|
||||
|
||||
for (i in 1:length(pairwise_candidates)) {
|
||||
# The two pairwise_candidates for this subgraph.
|
||||
# Turn 1-based character positions into 0-based partition numbers,
|
||||
# e.g. (3, 5) -> (1, 2)
|
||||
|
||||
pos1 <- pairings[[i, 1]]
|
||||
pos2 <- pairings[[i, 2]]
|
||||
part1 <- (pos1 - 1) / ngram_size
|
||||
part2 <- (pos2 - 1) / ngram_size
|
||||
cat(sprintf("Writing partition (%d, %d)\n", part1, part2))
|
||||
|
||||
p <- pairwise_candidates[[i]]
|
||||
# each row is an edge
|
||||
for (j in 1:nrow(p)) {
|
||||
n1 <- p[[j, 1]]
|
||||
n2 <- p[[j, 2]]
|
||||
line <- sprintf('edge %d.%s %d.%s\n', part1, n1, part2, n2)
|
||||
# NOTE: It would be faster to preallocate 'lines', but we would have to
|
||||
# make a two passes through pairwise_candidates.
|
||||
cat(line, file = conn)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
# fast_em.R: Wrapper around analysis/cpp/fast_em.cc.
|
||||
#
|
||||
# This serializes the input, shells out, and deserializes the output.
|
||||
|
||||
.Flatten <- function(list_of_matrices) {
|
||||
listOfVectors <- lapply(list_of_matrices, as.vector)
|
||||
#print(listOfVectors)
|
||||
|
||||
# unlist takes list to vector.
|
||||
unlist(listOfVectors)
|
||||
}
|
||||
|
||||
.WriteListOfMatrices <- function(list_of_matrices, f) {
|
||||
flattened <- .Flatten(list_of_matrices)
|
||||
|
||||
# NOTE: UpdateJointConditional does outer product of dimensions!
|
||||
|
||||
# 3 letter strings are null terminated
|
||||
writeBin('ne ', con = f)
|
||||
num_entries <- length(list_of_matrices)
|
||||
writeBin(num_entries, con = f)
|
||||
|
||||
Log('Wrote num_entries = %d', num_entries)
|
||||
|
||||
# For 2x3, this is 6
|
||||
writeBin('es ', con = f)
|
||||
|
||||
entry_size <- as.integer(prod(dim(list_of_matrices[[1]])))
|
||||
writeBin(entry_size, con = f)
|
||||
|
||||
Log('Wrote entry_size = %d', entry_size)
|
||||
|
||||
# now write the data
|
||||
writeBin('dat', con = f)
|
||||
writeBin(flattened, con = f)
|
||||
}
|
||||
|
||||
.ExpectTag <- function(f, tag) {
|
||||
# Read a single NUL-terminated character string.
|
||||
actual <- readBin(con = f, what = "char", n = 1)
|
||||
|
||||
# Assert that we got what was expected.
|
||||
if (length(actual) != 1) {
|
||||
stop(sprintf("Failed to read a tag '%s'", tag))
|
||||
}
|
||||
if (actual != tag) {
|
||||
stop(sprintf("Expected '%s', got '%s'", tag, actual))
|
||||
}
|
||||
}
|
||||
|
||||
.ReadResult <- function (f, entry_size, matrix_dims) {
|
||||
.ExpectTag(f, "emi")
|
||||
# NOTE: assuming R integers are 4 bytes (uint32_t)
|
||||
num_em_iters <- readBin(con = f, what = "int", n = 1)
|
||||
|
||||
.ExpectTag(f, "pij")
|
||||
pij <- readBin(con = f, what = "double", n = entry_size)
|
||||
|
||||
# Adjust dimensions
|
||||
dim(pij) <- matrix_dims
|
||||
|
||||
Log("Number of EM iterations: %d", num_em_iters)
|
||||
Log("PIJ read from external implementation:")
|
||||
print(pij)
|
||||
|
||||
# est, sd, var_cov, hist
|
||||
list(est = pij, num_em_iters = num_em_iters)
|
||||
}
|
||||
|
||||
.SanityChecks <- function(joint_conditional) {
|
||||
# Display some stats before sending it over to C++.
|
||||
|
||||
inf_counts <- lapply(joint_conditional, function(m) {
|
||||
sum(m == Inf)
|
||||
})
|
||||
total_inf <- sum(as.numeric(inf_counts))
|
||||
|
||||
nan_counts <- lapply(joint_conditional, function(m) {
|
||||
sum(is.nan(m))
|
||||
})
|
||||
total_nan <- sum(as.numeric(nan_counts))
|
||||
|
||||
zero_counts <- lapply(joint_conditional, function(m) {
|
||||
sum(m == 0.0)
|
||||
})
|
||||
total_zero <- sum(as.numeric(zero_counts))
|
||||
|
||||
#sum(joint_conditional[joint_conditional == Inf, ])
|
||||
Log('total inf: %s', total_inf)
|
||||
Log('total nan: %s', total_nan)
|
||||
Log('total zero: %s', total_zero)
|
||||
}
|
||||
|
||||
ConstructFastEM <- function(em_executable, tmp_dir) {
|
||||
|
||||
return(function(joint_conditional, max_em_iters = 1000,
|
||||
epsilon = 10 ^ -6, verbose = FALSE,
|
||||
estimate_var = FALSE) {
|
||||
matrix_dims <- dim(joint_conditional[[1]])
|
||||
# Check that number of dimensions is 2.
|
||||
if (length(matrix_dims) != 2) {
|
||||
Log('FATAL: Expected 2 dimensions, got %d', length(matrix_dims))
|
||||
stop()
|
||||
}
|
||||
|
||||
entry_size <- prod(matrix_dims)
|
||||
Log('entry size: %d', entry_size)
|
||||
|
||||
.SanityChecks(joint_conditional)
|
||||
|
||||
input_path <- file.path(tmp_dir, 'list_of_matrices.bin')
|
||||
Log("Writing flattened list of matrices to %s", input_path)
|
||||
f <- file(input_path, 'wb') # binary file
|
||||
.WriteListOfMatrices(joint_conditional, f)
|
||||
close(f)
|
||||
Log("Done writing %s", input_path)
|
||||
|
||||
output_path <- file.path(tmp_dir, 'pij.bin')
|
||||
|
||||
cmd <- sprintf("%s %s %s %s", em_executable, input_path, output_path,
|
||||
max_em_iters)
|
||||
|
||||
Log("Shell command: %s", cmd)
|
||||
exit_code <- system(cmd)
|
||||
|
||||
Log("Done running shell command")
|
||||
if (exit_code != 0) {
|
||||
stop(sprintf("Command failed with code %d", exit_code))
|
||||
}
|
||||
|
||||
f <- file(output_path, 'rb')
|
||||
result <- .ReadResult(f, entry_size, matrix_dims)
|
||||
close(f)
|
||||
|
||||
result
|
||||
})
|
||||
}
|
|
@ -1,271 +0,0 @@
|
|||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Authors: vpihur@google.com (Vasyl Pihur) and fanti@google.com (Giulia Fanti)
|
||||
#
|
||||
# Tools used to simulate sending partial ngrams to the server for estimating the
|
||||
# dictionary of terms over which we want to learn a distribution. This
|
||||
# mostly contains functions that aid in the generation of synthetic data.
|
||||
|
||||
library(RUnit)
|
||||
library(parallel)
|
||||
|
||||
source("analysis/R/encode.R")
|
||||
source("analysis/R/decode.R")
|
||||
source("analysis/R/simulation.R")
|
||||
source("analysis/R/association.R")
|
||||
source("analysis/R/decode_ngrams.R")
|
||||
|
||||
# The alphabet is the set of all possible characters that will appear in a
|
||||
# string. Here we use the English alphabet, but one might want to include
|
||||
# numbers or punctuation marks.
|
||||
alphabet <- letters
|
||||
|
||||
GenerateCandidates <- function(alphabet, ngram_size = 2) {
|
||||
# Draws a random string for each individual in the
|
||||
# population from distribution.
|
||||
#
|
||||
# Args:
|
||||
# N: Number of individuals in the population
|
||||
# num_strs: Number of strings from which to draw strings
|
||||
# str_len: Length of each string
|
||||
#
|
||||
# Returns:
|
||||
# Vector of strings for each individual in the population
|
||||
|
||||
cands <- do.call(expand.grid, lapply(seq(ngram_size), function(i) alphabet))
|
||||
apply(cands, 1, function(x) paste0(x, collapse = ""))
|
||||
}
|
||||
|
||||
GenerateString <- function(n) {
|
||||
# Generates a string of a given length from the alphabet.
|
||||
#
|
||||
# Args:
|
||||
# n: Number of characters in the string
|
||||
#
|
||||
# Returns:
|
||||
# String of length n
|
||||
paste0(sample(alphabet, n, replace = TRUE), collapse = "")
|
||||
}
|
||||
|
||||
GeneratePopulation <- function(N, num_strs, str_len = 10,
|
||||
distribution = 1) {
|
||||
# Generates a string for each individual in the population from distribution.
|
||||
#
|
||||
# Args:
|
||||
# N: Number of individuals in the population
|
||||
# num_strs: Number of strings from which to draw strings
|
||||
# str_len: Length of each string
|
||||
# distribution: which type of distribution to use
|
||||
# 1: Zipfian
|
||||
# 2: Geometric (exponential)
|
||||
# 3: Step function
|
||||
#
|
||||
# Returns:
|
||||
# Vector of strings for each individual in the population
|
||||
|
||||
strs <- sapply(1:num_strs, function(i) GenerateString(str_len))
|
||||
|
||||
if (distribution == 1) {
|
||||
# Zipfian-ish distribution
|
||||
prob <- (1:num_strs)^20
|
||||
prob <- prob / sum(prob) + 0.001
|
||||
prob <- prob / sum(prob)
|
||||
} else if (distribution == 2) {
|
||||
# Geometric distribution (discrete approximation to exponential)
|
||||
p <- 0.3
|
||||
prob <- p * (1 - p)^(1:num_strs - 1)
|
||||
prob <- prob / sum(prob)
|
||||
} else {
|
||||
# Uniform
|
||||
prob <- rep(1 / num_strs, num_strs)
|
||||
}
|
||||
|
||||
sample(strs, N, replace = TRUE, prob = prob)
|
||||
}
|
||||
|
||||
SelectNGrams <- function(str, num_ngrams, size, max_str_len = 6) {
|
||||
# Selects which ngrams each user will encode and then submit.
|
||||
#
|
||||
# Args:
|
||||
# str: String from which ngram is built.
|
||||
# num_ngrams: Number of ngrams to choose
|
||||
# size: Number of characters per ngram
|
||||
# max_str_len: Maximum number of characters in the string
|
||||
#
|
||||
# Returns:
|
||||
# List of each individual's ngrams and which positions the ngrams
|
||||
# were drawn from.
|
||||
|
||||
start <- sort(sample(seq(1, max_str_len, by = size), num_ngrams))
|
||||
ngrams <- mapply(function(x, y, str) substr(str, x, y),
|
||||
start, start + size - 1,
|
||||
MoreArgs = list(str = str))
|
||||
list(ngrams = ngrams, starts = start)
|
||||
}
|
||||
|
||||
UpdateMapWithCandidates <- function(str_candidates, sim, params) {
|
||||
# Generates a new map based on the returned candidates.
|
||||
# Normally this would be created on the spot by having the
|
||||
# aggregator hash the string candidates. But since we already have
|
||||
# the map from simulation, we'll just choose the appropriate
|
||||
# column
|
||||
#
|
||||
# Arguments:
|
||||
# str_candidates: Vector of string candidates
|
||||
# sim: Simulation object containing the original map
|
||||
# params: RAPPOR parameter list
|
||||
|
||||
k <- params$k
|
||||
h <- params$h
|
||||
m <- params$m
|
||||
|
||||
# First add the real candidates to the map
|
||||
valid_cands <- intersect(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
|
||||
updated_map <- sim$full_map
|
||||
updated_map$map_by_cohort <- lapply(1:m, function(i) {
|
||||
sim$full_map$map_by_cohort[[i]][, valid_cands]
|
||||
})
|
||||
|
||||
# Now add the false positives (we can just draw random strings for
|
||||
# these since they didn't appear in the original dataset anyway)
|
||||
new_cands <- setdiff(str_candidates, colnames(sim$full_map$map_by_cohort[[1]]))
|
||||
M <- length(new_cands)
|
||||
if (M > 0) {
|
||||
for (i in 1:m) {
|
||||
ones <- sample(1:k, M * h, replace = TRUE)
|
||||
cols <- rep(1:M, each = h)
|
||||
strs <- c(sort(valid_cands), new_cands)
|
||||
updated_map$map_by_cohort[[i]] <-
|
||||
do.call(cBind, list(updated_map$map_by_cohort[[i]],
|
||||
sparseMatrix(ones, cols, dims = c(k, M))))
|
||||
colnames(updated_map$map_by_cohort[[i]]) <- strs
|
||||
}
|
||||
}
|
||||
if (class(updated_map$map_by_cohort[[1]]) == "logical") {
|
||||
updated_map$all_cohorts_map <- unlist(updated_map$map_by_cohort)
|
||||
updated_map$all_cohorts_map <- Matrix(updated_map$all_cohorts_map, sparse = TRUE)
|
||||
colnames(updated_map$all_cohorts_map) <- c(valid_cands, new_cands)
|
||||
} else {
|
||||
updated_map$all_cohorts_map <- do.call("rBind", updated_map$map_by_cohort)
|
||||
}
|
||||
updated_map
|
||||
}
|
||||
|
||||
SimulateNGrams <- function(N, ngram_params, str_len, num_strs = 10,
|
||||
alphabet, params, distribution = 1) {
|
||||
# Simulates the creation and encoding of ngrams for each individual.
|
||||
#
|
||||
# Args:
|
||||
# N: Number of individuals in the population
|
||||
# ngram_params: Parameters about ngram size, etc.
|
||||
# str_len: Length of each string
|
||||
# num_strs: NUmber of strings in the dictionary
|
||||
# alphabet: Alphabet used to generate strings
|
||||
# params: RAPPOR parameters, like noise and cohorts
|
||||
#
|
||||
# Returns:
|
||||
# List containing all the information needed for estimating and
|
||||
# verifying the results.
|
||||
|
||||
# Get the list of strings for each user
|
||||
strs <- GeneratePopulation(N, num_strs = num_strs,
|
||||
str_len = str_len,
|
||||
distribution)
|
||||
|
||||
# Split them into ngrams and encode
|
||||
ngram <- lapply(strs, function(i)
|
||||
SelectNGrams(i,
|
||||
num_ngrams = ngram_params$num_ngrams_collected,
|
||||
size = ngram_params$ngram_size,
|
||||
max_str_len = str_len))
|
||||
|
||||
cands <- GenerateCandidates(alphabet, ngram_params$ngram_size)
|
||||
map <- CreateMap(cands, params, FALSE)
|
||||
cohorts <- sample(1:params$m, N, replace = TRUE)
|
||||
|
||||
g <- sapply(ngram, function(x) paste(x$starts, sep = "_",
|
||||
collapse = "_"))
|
||||
ug <- sort(unique(g))
|
||||
pairings <- t(sapply(ug, function(x)
|
||||
sapply(strsplit(x, "_"), function(y) as.numeric(y))))
|
||||
|
||||
inds <- lapply(1:length(ug), function(i) ind <- which(g == ug[i]))
|
||||
|
||||
reports <- lapply(1:length(ug), function(k) {
|
||||
# Generate the ngram reports
|
||||
lapply(1:ngram_params$num_ngrams_collected, function(x) {
|
||||
EncodeAll(sapply(inds[[k]], function(j) ngram[[j]]$ngrams[x]),
|
||||
cohorts[inds[[k]]], map$map_by_cohort, params)})
|
||||
})
|
||||
cat("Encoded the ngrams.\n")
|
||||
# Now generate the full string reports
|
||||
full_map <- CreateMap(sort(unique(strs)), params, FALSE)
|
||||
full_reports <- EncodeAll(strs, cohorts, full_map$map_by_cohort, params)
|
||||
|
||||
list(reports = reports, cohorts = cohorts, ngram = ngram, map = map,
|
||||
strs = strs, pairings = pairings, inds = inds, cands = cands,
|
||||
full_reports = full_reports, full_map = full_map)
|
||||
|
||||
}
|
||||
|
||||
|
||||
EstimateDictionaryTrial <- function(N, str_len, num_strs,
|
||||
params, ngram_params,
|
||||
distribution = 3) {
|
||||
# Runs a single trial for simulation. Generates simulated reports,
|
||||
# decodes them, and returns the result.
|
||||
#
|
||||
# Arguments:
|
||||
# N: Number of users to simulation
|
||||
# str_len: The length of strings to estimate
|
||||
# num_strs: The number of strings in the dictionary
|
||||
# params: RAPPOR parameter list
|
||||
# ngram_params: Parameters related to the size of ngrams
|
||||
# distribution: Tells what kind of distribution to use:
|
||||
# 1: Zipfian
|
||||
# 2: Geometric
|
||||
# 3: Uniform (default)
|
||||
#
|
||||
# Returns:
|
||||
# List with recovered and true marginals.
|
||||
|
||||
# We call the needed libraries here in order to make them available when this
|
||||
# function gets called by BorgApply. Otherwise, they do not get included.
|
||||
library(glmnet)
|
||||
library(parallel)
|
||||
sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
|
||||
alphabet, params, distribution)
|
||||
|
||||
res <- EstimateDictionary(sim, N, ngram_params, params)
|
||||
str_candidates <- res$found_candidates
|
||||
pairwise_candidates <- res$pairwise_candidates
|
||||
|
||||
if (length(str_candidates) == 0) {
|
||||
return (NULL)
|
||||
}
|
||||
updated_map <- UpdateMapWithCandidates(str_candidates, sim, params)
|
||||
|
||||
# Compute the marginal for this new set of strings
|
||||
variable_counts <- ComputeCounts(sim$full_reports, sim$cohorts, params)
|
||||
# Our dictionary estimate
|
||||
marginal <- Decode(variable_counts, updated_map$all_cohorts_map, params)$fit
|
||||
# Estimate given full dictionary knowledge
|
||||
marginal_full <- Decode(variable_counts, sim$full_map$all_cohorts_map, params)$fit
|
||||
# The true (sampled) data distribution
|
||||
truth <- sort(table(sim$strs)) / N
|
||||
|
||||
list(marginal = marginal, marginal_full = marginal_full,
|
||||
truth = truth, pairwise_candidates = pairwise_candidates)
|
||||
}
|
|
@ -1,139 +0,0 @@
|
|||
# Copyright 2014 Google Inc. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Author: fanti@google.com (Giulia Fanti)
|
||||
#
|
||||
# Tests the unknown unknowns dictionary estimation functions.
|
||||
# There are two main components involved in estimating this unknown
|
||||
# distribution:
|
||||
# a) Find the pairwise ngrams that co-occur often.
|
||||
# b) Determine which full strings are consisted with all pairwise
|
||||
# relations.
|
||||
#
|
||||
# TestEstimateDictionary() tests the full pipeline, including parts (a)
|
||||
# and (b).
|
||||
# TestFindFeasibleStrings() tests only part (b).
|
||||
# Both tests generate their own data.
|
||||
|
||||
library(parallel)
|
||||
source("analysis/R/encode.R")
|
||||
source("analysis/R/decode.R")
|
||||
source("analysis/R/simulation.R")
|
||||
source("analysis/R/association.R")
|
||||
source("analysis/R/decode_ngrams.R")
|
||||
source("analysis/R/ngrams_simulation.R")
|
||||
alphabet <- letters
|
||||
options(warn = -1)
|
||||
|
||||
GeneratePopulation <- function(N, num_strs, str_len = 10,
|
||||
distribution = NULL) {
|
||||
# Generates a /deterministic/ string for each individual in the
|
||||
# population from distribution.
|
||||
#
|
||||
# Args:
|
||||
# N: Number of individuals in the population
|
||||
# num_strs: Number of strings from which to draw strings
|
||||
# str_len: Length of each string
|
||||
# distribution: Just here for compatibility with original
|
||||
# GeneratePopulation function in ngrams_simulation.R
|
||||
#
|
||||
# Returns:
|
||||
# Vector of strings for each individual in the population
|
||||
|
||||
strs <- sapply(1:num_strs, function(i) {
|
||||
paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
|
||||
})
|
||||
|
||||
# Uniform distribution
|
||||
prob <- rep(1 / num_strs, num_strs)
|
||||
sample(strs, N, replace = TRUE, prob = prob)
|
||||
}
|
||||
|
||||
TestEstimateDictionary <- function() {
|
||||
# Tests that the algorithm without noise recovers a uniform
|
||||
# string population correctly.
|
||||
|
||||
# Compute the strings from measuring only 2 ngrams
|
||||
N <- 100
|
||||
str_len <- 6
|
||||
ngram_size <- 2
|
||||
num_ngrams <- str_len / ngram_size
|
||||
num_strs <- 1
|
||||
|
||||
params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
|
||||
|
||||
ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
|
||||
num_ngrams_collected = 2)
|
||||
|
||||
sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
|
||||
alphabet, params, distribution = 3)
|
||||
|
||||
res <- EstimateDictionary(sim, N, ngram_params, params)
|
||||
|
||||
# Check that the correct strings are found
|
||||
if (num_strs == 1) {
|
||||
checkTrue(res$found_candidates == sort(unique(sim$strs)))
|
||||
} else {
|
||||
checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs))))
|
||||
}
|
||||
}
|
||||
|
||||
TestFindFeasibleStrings <- function() {
|
||||
# Tests that FindPairwiseCandidates weeds out false positives.
|
||||
# We test this by adding false positives to the pairwise estimates.
|
||||
N <- 100
|
||||
str_len <- 6
|
||||
ngram_size <- 2
|
||||
num_ngrams <- str_len / ngram_size
|
||||
num_strs <- 2
|
||||
|
||||
params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
|
||||
|
||||
ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
|
||||
num_ngrams_collected = 2)
|
||||
|
||||
sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
|
||||
alphabet, params)
|
||||
|
||||
pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
|
||||
params)$candidate_strs
|
||||
cat("Found the pairwise candidates. \n")
|
||||
|
||||
pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
|
||||
|
||||
if (is.null(pairwise_candidates)) {
|
||||
return (FALSE)
|
||||
}
|
||||
|
||||
conn <- file('graph.txt', 'w+')
|
||||
WriteKPartiteGraph(conn,
|
||||
pairwise_candidates,
|
||||
sim$pairings,
|
||||
ngram_params$num_ngrams,
|
||||
ngram_params$ngram_size)
|
||||
|
||||
close(conn)
|
||||
cat("Wrote graph.txt\n")
|
||||
|
||||
found_candidates <- FindFeasibleStrings(pairwise_candidates,
|
||||
sim$pairings,
|
||||
ngram_params$num_ngrams,
|
||||
ngram_params$ngram_size)
|
||||
# Check that the correct strings are found
|
||||
if (num_strs == 1) {
|
||||
checkTrue(found_candidates == sort(unique(sim$strs)))
|
||||
} else {
|
||||
checkTrue(all.equal(found_candidates, sort(unique(sim$strs))))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
import sys
|
||||
from math import log
|
||||
|
||||
def p_star(f, p, q):
|
||||
return (f / 2.) * (p + q) + (1 - f) * p
|
||||
|
||||
def q_star(f, p, q):
|
||||
return (f / 2.) * (p + q) + (1 - f) * q
|
||||
|
||||
def inner_factor(p_star, q_star):
|
||||
return (q_star * (1. - p_star)) / (p_star * (1. - q_star ))
|
||||
|
||||
def epsilon(h, inner):
|
||||
return h * log(inner)
|
||||
|
||||
def main(args):
|
||||
f = float(args[1])
|
||||
p = float(args[2])
|
||||
q = float(args[3])
|
||||
h = int(args[4])
|
||||
|
||||
p_s = p_star(f, p, q)
|
||||
q_s = q_star(f, p, q)
|
||||
inner = inner_factor(p_s, q_s)
|
||||
e = epsilon(h, inner)
|
||||
print "{} ln({}) = {}".format(h, inner, e)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main(sys.argv)
|
Загрузка…
Ссылка в новой задаче