Merge pull request #120 from microsoft/update/core
Core lib, dependencies and documentation updates
This commit is contained in:
Коммит
f93f02fca6
|
@ -50,8 +50,7 @@ jobs:
|
|||
uses: actions/checkout@v3
|
||||
|
||||
- name: Run cargo clippy
|
||||
# add "-D warnings" so clippy warnings will result in pipeline failures
|
||||
run: cargo clippy --
|
||||
run: cargo clippy -- -D warnings
|
||||
|
||||
cargo_test:
|
||||
name: Run cargo tests
|
||||
|
|
|
@ -27,5 +27,5 @@
|
|||
"[python]": {
|
||||
"editor.defaultFormatter": "ms-python.python"
|
||||
},
|
||||
"cSpell.enabled": true
|
||||
"rust-analyzer.check.command": "clippy"
|
||||
}
|
||||
|
|
|
@ -4,11 +4,11 @@ version = 3
|
|||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.7.6"
|
||||
version = "0.8.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
|
||||
checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
]
|
||||
|
@ -46,7 +46,7 @@ version = "0.2.14"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"hermit-abi 0.1.19",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
@ -93,6 +93,12 @@ version = "1.12.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.79"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
|
@ -154,7 +160,7 @@ dependencies = [
|
|||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"memoffset",
|
||||
"memoffset 0.6.5",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
|
@ -215,6 +221,40 @@ dependencies = [
|
|||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
|
||||
dependencies = [
|
||||
"humantime",
|
||||
"is-terminal",
|
||||
"log",
|
||||
"regex",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
|
||||
dependencies = [
|
||||
"errno-dragonfly",
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno-dragonfly"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fnv"
|
||||
version = "1.0.7"
|
||||
|
@ -236,9 +276,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
@ -261,6 +301,12 @@ dependencies = [
|
|||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
|
@ -286,6 +332,28 @@ dependencies = [
|
|||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "io-lifetimes"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfa919a82ea574332e2de6e74b4c36e74d41982b335080fa59d4ef31be20fdf3"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
|
||||
dependencies = [
|
||||
"hermit-abi 0.3.1",
|
||||
"io-lifetimes",
|
||||
"rustix",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itertools"
|
||||
version = "0.10.1"
|
||||
|
@ -318,9 +386,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
|||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.108"
|
||||
version = "0.2.140"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119"
|
||||
checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c"
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
|
@ -328,6 +396,12 @@ version = "0.2.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7d73b3f436185384286bd8098d17ec07c9a7d2388a6599f824d8502b529702a"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.5"
|
||||
|
@ -348,9 +422,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.8.1"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909"
|
||||
checksum = "03f1160296536f10c833a82dca22267d5486734230d47bf00bf435885814ba1e"
|
||||
dependencies = [
|
||||
"hashbrown",
|
||||
]
|
||||
|
@ -379,6 +453,15 @@ dependencies = [
|
|||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nalgebra"
|
||||
version = "0.29.0"
|
||||
|
@ -454,19 +537,19 @@ version = "1.13.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"hermit-abi 0.1.19",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.8.0"
|
||||
version = "1.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56"
|
||||
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
||||
|
||||
[[package]]
|
||||
name = "pac-synth"
|
||||
version = "0.0.7"
|
||||
version = "0.0.8"
|
||||
dependencies = [
|
||||
"log",
|
||||
"pyo3",
|
||||
|
@ -547,14 +630,14 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "pyo3"
|
||||
version = "0.17.2"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "201b6887e5576bf2f945fe65172c1fcbf3fcf285b23e4d71eb171d9736e38d32"
|
||||
checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"indoc",
|
||||
"libc",
|
||||
"memoffset",
|
||||
"memoffset 0.8.0",
|
||||
"parking_lot",
|
||||
"pyo3-build-config",
|
||||
"pyo3-ffi",
|
||||
|
@ -564,9 +647,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "pyo3-build-config"
|
||||
version = "0.17.2"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf0708c9ed01692635cbf056e286008e5a2927ab1a5e48cdd3aeb1ba5a6fef47"
|
||||
checksum = "75439f995d07ddfad42b192dfcf3bc66a7ecfd8b4a1f5f6f046aa5c2c5d7677d"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"target-lexicon",
|
||||
|
@ -574,9 +657,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "pyo3-ffi"
|
||||
version = "0.17.2"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90352dea4f486932b72ddf776264d293f85b79a1d214de1d023927b41461132d"
|
||||
checksum = "839526a5c07a17ff44823679b68add4a58004de00512a95b6c1c98a6dcac0ee5"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"pyo3-build-config",
|
||||
|
@ -584,9 +667,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "pyo3-macros"
|
||||
version = "0.17.2"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7eb24b804a2d9e88bfcc480a5a6dd76f006c1e3edaf064e8250423336e2cd79d"
|
||||
checksum = "bd44cf207476c6a9760c4653559be4f206efafb924d3e4cbf2721475fc0d6cc5"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"pyo3-macros-backend",
|
||||
|
@ -596,9 +679,9 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "pyo3-macros-backend"
|
||||
version = "0.17.2"
|
||||
version = "0.18.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f22bb49f6a7348c253d7ac67a6875f2dc65f36c2ae64a82c381d528972bea6d6"
|
||||
checksum = "dc1f43d8e30460f36350d18631ccf85ded64c059829208fe680904c65bcd0a4c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
|
@ -736,6 +819,20 @@ dependencies = [
|
|||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.36.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
"io-lifetimes",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.5"
|
||||
|
@ -765,10 +862,10 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
|||
|
||||
[[package]]
|
||||
name = "sds-cli"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
dependencies = [
|
||||
"csv",
|
||||
"env_logger",
|
||||
"env_logger 0.9.0",
|
||||
"log",
|
||||
"sds-core",
|
||||
"statrs",
|
||||
|
@ -777,7 +874,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sds-core"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
dependencies = [
|
||||
"csv",
|
||||
"fnv",
|
||||
|
@ -796,10 +893,10 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sds-pyo3"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
dependencies = [
|
||||
"csv",
|
||||
"env_logger",
|
||||
"env_logger 0.10.0",
|
||||
"log",
|
||||
"pyo3",
|
||||
"sds-core",
|
||||
|
@ -807,7 +904,7 @@ dependencies = [
|
|||
|
||||
[[package]]
|
||||
name = "sds-wasm"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
dependencies = [
|
||||
"console_error_panic_hook",
|
||||
"csv",
|
||||
|
@ -1058,9 +1155,9 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
|||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.3"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
|
||||
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
|
@ -1210,3 +1307,69 @@ name = "winapi-x86_64-pc-windows-gnu"
|
|||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.45.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.42.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
|
||||
|
|
|
@ -163,17 +163,26 @@ In order to decrease the noise, we can use a differentially-private percentile t
|
|||
|
||||
From [Differentially Private Marginals](./dp_marginals.pdf), to satisfy $(\varepsilon, \delta)$-DP, the following inequality needs to hold:
|
||||
|
||||
$0.5 * R\varepsilon_Q^2 + 0.5 * \varepsilon_N^2 + 0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_k^2 \leq \sqrt{\varepsilon + \ln(2/\delta)} - \sqrt{\ln(2/\delta)}$, where the reported aggregate count is `real_aggregate_count + ` $\sigma_{k} * \sqrt{\Delta_k} * N(0, 1)$ and the reported number of records is `real_number_of_records + ` $Laplace(1 / \varepsilon_N)$.
|
||||
(EQ1) $0.5 * R\varepsilon_Q^2 + 0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_k^2 \leq (\sqrt{\varepsilon_M + \ln(2/\delta)} - \sqrt{\ln(2/\delta)})^2$, where the reported aggregate count will have noise added by $\sigma_{k} * \sqrt{\Delta_k} * N(0, 1)$.
|
||||
|
||||
Based on the given inequality we can:
|
||||
Assuming the total privacy budget to be $\varepsilon$, and $n$ to be the total number of records in the dataset, we then define:
|
||||
|
||||
1. Call $\rho=\sqrt{\varepsilon + \ln(2/\delta)} - \sqrt{\ln(2/\delta)}$
|
||||
2. Define $Q_{p}$ as the proportion of the total privacy budget dedicated for finding $Q^{th}$ percentiles
|
||||
(EQ2) $\varepsilon = \varepsilon_M + \varepsilon_N$, where $\varepsilon_M$ is the portion of the privacy budget dedicated to the marginals (EQ1) and $\varepsilon_N$ is the portion dedicated to protecting the number of records: $protected(n) = n + Laplace(1 / \varepsilon_N)$.
|
||||
|
||||
If a $\delta$ value is not provided, it will be inferred from the protected number of records:
|
||||
|
||||
$\delta = \frac{1}{protected(n) * \ln(protected(n))}$
|
||||
|
||||
Besides, based on EQ1 and EQ2 we can:
|
||||
|
||||
1. Call $\rho=(\sqrt{\varepsilon_M + \ln(2/\delta)} - \sqrt{\ln(2/\delta)})^2$
|
||||
2. Define $Q_{p}$ as the proportion of the privacy budget dedicated for finding $Q^{th}$ percentiles
|
||||
3. Define $N_{p}$ as the proportion of the total privacy budget dedicated to finding the protected number of records
|
||||
4. Call $\varepsilon_N = N_{p} * \varepsilon$ and $\varepsilon_M = \varepsilon - \varepsilon_N$
|
||||
|
||||
Then, in order to find $\varepsilon_Q$, $\varepsilon_N$ and $\sigma_k$, we need to solve: (i) $0.5 * R\varepsilon_Q^2 = \rho * Q_{p}$; (ii) $0.5 * \varepsilon_N^2 = \rho * N_{p}$; and (iii) $0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_i^2 = \rho * (1 - Q_{p} - N_{p})$.
|
||||
From the above assumptions we know that (i) $\varepsilon_M = \varepsilon - N_{p} * \varepsilon = \varepsilon * (1 - N_{p})$. Then, in order to find $\varepsilon_Q$ and $\sigma_k$, we need to solve: (ii) $0.5 * R\varepsilon_Q^2 = \rho * Q_{p}$; and (iii) $0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_k^2 = \rho * (1 - Q_{p})$.
|
||||
|
||||
(i) directly tells us that $\varepsilon_Q = \sqrt{(2 * \rho * Q_{p}) / R}$ [3] and (ii) that $\varepsilon_N = \sqrt{2 * \rho * N_{p}}$.
|
||||
(ii) directly tells us that $\varepsilon_Q = \sqrt{(2 * \rho * Q_{p}) / R}$ [3].
|
||||
|
||||
On the other hand, to solve (iii) and find the $\sigma_k$ values, SDS will proportionally split the privacy budget such that:
|
||||
|
||||
|
@ -184,22 +193,24 @@ On the other hand, to solve (iii) and find the $\sigma_k$ values, SDS will propo
|
|||
|
||||
Thus:
|
||||
|
||||
$(\frac{1}{\sigma_1^2} + \frac{1}{\sigma_2^2}+ ... + \frac{1}{\sigma_k^2}) = 2 * \rho * (1 - Q_{p} - N_{p})$
|
||||
$(\frac{1}{\sigma_1^2} + \frac{1}{\sigma_2^2}+ ... + \frac{1}{\sigma_k^2}) = 2 * \rho * (1 - Q_{p})$
|
||||
|
||||
$(\frac{1}{p_1^2*\sigma^2} + \frac{1}{p_2^2*\sigma^2} + ... + \frac{1}{p_k^2*\sigma^2}) = 2 * \rho * (1 - Q_{p} - N_{p})$
|
||||
$(\frac{1}{p_1^2*\sigma^2} + \frac{1}{p_2^2*\sigma^2} + ... + \frac{1}{p_k^2*\sigma^2}) = 2 * \rho * (1 - Q_{p})$
|
||||
|
||||
$\frac{1}{\sigma^2} * (\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2}) = 2 * \rho * (1 - Q_{p} - N_{p})$
|
||||
$\frac{1}{\sigma^2} * (\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2}) = 2 * \rho * (1 - Q_{p})$
|
||||
|
||||
$\frac{1}{\sigma^2} = \frac{2 * \rho * (1 - Q_{p} - N_{p})}{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}$
|
||||
$\frac{1}{\sigma^2} = \frac{2 * \rho * (1 - Q_{p})}{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}$
|
||||
|
||||
$\sigma = \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p} - N_{p})}}$
|
||||
$\sigma = \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p})}}$
|
||||
|
||||
$\sigma_k = p_k * \sigma = p_k * \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p} - N_{p})}}$ [4]
|
||||
$\sigma_k = p_k * \sigma = p_k * \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p})}}$ [4]
|
||||
|
||||
To summarize, to control the allocation of the privacy budget $\varepsilon$, SDS expects the following inputs:
|
||||
|
||||
- `Percentile epsilon proportion` = $Q_p$, where $0 < Q_p < 1$ and $0 < Q_p + N_p < 1$
|
||||
- `Number of records epsilon proportion` = $N_p$, where $0 < N_p < 1$ and $0 < Q_p + N_p < 1$
|
||||
- `Total privacy budget` = $\varepsilon$
|
||||
- `Delta` = $\delta$, which can also be inferred from the protected number of records
|
||||
- `Percentile epsilon proportion` = $Q_p$, where $0 < Q_p < 1$
|
||||
- `Number of records epsilon proportion` = $N_p$, where $0 < N_p < 1$
|
||||
- `Sigma proportions` = $[p_1, p_2, ..., p_k]$, where $p_k > 0$
|
||||
|
||||
To illustrate the sigma proportions, let's assume a reporting length of $3$. Then we could set:
|
||||
|
|
Двоичные данные
docs/dp/dp_marginals.pdf
Двоичные данные
docs/dp/dp_marginals.pdf
Двоичный файл не отображается.
44
package.json
44
package.json
|
@ -17,35 +17,37 @@
|
|||
"git:precommit": "lint-staged",
|
||||
"git:ci": "run-s format:check lint:check test build",
|
||||
"release:wasm": "npm-publish --access public --token ${NPM_TOKEN} ./target/wasm/package.json",
|
||||
"release": "run-p release:wasm"
|
||||
"release": "run-p release:wasm",
|
||||
"rust:format": "cargo fmt --all --",
|
||||
"rust:check": "cargo clippy -- -D warnings && cargo fmt --all -- --check && cargo test"
|
||||
},
|
||||
"prettier": "@essex/prettier-config",
|
||||
"devDependencies": {
|
||||
"@essex/eslint-config": "^20.3.5",
|
||||
"@essex/eslint-plugin": "^20.3.12",
|
||||
"@essex/jest-config": "^21.0.17",
|
||||
"@essex/prettier-config": "^18.0.4",
|
||||
"@essex/scripts": "^22.2.0",
|
||||
"@essex/eslint-config": "^20.5.1",
|
||||
"@essex/eslint-plugin": "^20.5.1",
|
||||
"@essex/jest-config": "^21.0.20",
|
||||
"@essex/prettier-config": "^18.0.7",
|
||||
"@essex/scripts": "^24.0.3",
|
||||
"@jsdevtools/npm-publish": "^1.4.3",
|
||||
"@types/eslint": "^8.4.10",
|
||||
"@types/prettier": "^2.7.1",
|
||||
"@typescript-eslint/eslint-plugin": "^5.46.1",
|
||||
"@typescript-eslint/parser": "^5.46.1",
|
||||
"eslint": "^8.29.0",
|
||||
"eslint-import-resolver-node": "^0.3.6",
|
||||
"@types/eslint": "^8.21.2",
|
||||
"@types/prettier": "^2.7.2",
|
||||
"@typescript-eslint/eslint-plugin": "^5.55.0",
|
||||
"@typescript-eslint/parser": "^5.55.0",
|
||||
"eslint": "^8.36.0",
|
||||
"eslint-import-resolver-node": "^0.3.7",
|
||||
"eslint-plugin-header": "^3.1.1",
|
||||
"eslint-plugin-import": "^2.26.0",
|
||||
"eslint-plugin-jest": "^27.1.7",
|
||||
"eslint-plugin-jsx-a11y": "^6.6.1",
|
||||
"eslint-plugin-react": "^7.31.11",
|
||||
"eslint-plugin-import": "^2.27.5",
|
||||
"eslint-plugin-jest": "^27.2.1",
|
||||
"eslint-plugin-jsx-a11y": "^6.7.1",
|
||||
"eslint-plugin-react": "^7.32.2",
|
||||
"eslint-plugin-react-hooks": "^4.6.0",
|
||||
"eslint-plugin-simple-import-sort": "^8.0.0",
|
||||
"husky": "^8.0.2",
|
||||
"lint-staged": "^13.1.0",
|
||||
"eslint-plugin-simple-import-sort": "^10.0.0",
|
||||
"husky": "^8.0.3",
|
||||
"lint-staged": "^13.2.0",
|
||||
"npm-run-all": "^4.1.5",
|
||||
"prettier": "^2.8.1",
|
||||
"prettier": "^2.8.4",
|
||||
"replace": "^1.2.2",
|
||||
"typescript": "^4.8.4"
|
||||
"typescript": "^5.0.2"
|
||||
},
|
||||
"workspaces": [
|
||||
"packages/webapp",
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "sds-cli"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
license = "MIT"
|
||||
description = "Command line interface for the sds-core library"
|
||||
repository = "https://github.com/microsoft/synthetic-data-showcase"
|
||||
|
|
|
@ -403,8 +403,6 @@ fn main() {
|
|||
} => {
|
||||
let mut aggregator = Aggregator::new(data_block.clone());
|
||||
let aggregated_data = if dp {
|
||||
let n_records_f64 = data_block.number_of_records() as f64;
|
||||
let delta = noise_delta.unwrap_or(1.0 / (n_records_f64.ln() * n_records_f64));
|
||||
let thresholds_map = noise_threshold_values
|
||||
.unwrap()
|
||||
.iter()
|
||||
|
@ -425,9 +423,9 @@ fn main() {
|
|||
reporting_length,
|
||||
&DpParameters::new(
|
||||
noise_epsilon.unwrap(),
|
||||
delta,
|
||||
sensitivities_percentile.unwrap(),
|
||||
sensitivities_epsilon_proportion.unwrap(),
|
||||
noise_delta,
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon_proportion,
|
||||
),
|
||||
|
|
|
@ -22,19 +22,19 @@
|
|||
"license": "MIT",
|
||||
"devDependencies": {
|
||||
"@types/mime": "^3.0.1",
|
||||
"@types/node": "^16.18.9",
|
||||
"@types/react": "^17.0.52",
|
||||
"@types/node": "^16.18.16",
|
||||
"@types/react": "^17.0.53",
|
||||
"npm-run-all": "^4.1.5",
|
||||
"react": "^17.0.2",
|
||||
"shx": "^0.3.4",
|
||||
"ts-node": "^10.9.1",
|
||||
"typescript": "^4.8.4"
|
||||
"typescript": "^5.0.2"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"react": "^17.0.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"@griffel/react": "^1.5.1",
|
||||
"@griffel/react": "^1.5.5",
|
||||
"mime": "^3.0.0",
|
||||
"react-dropzone": "^14.2.3"
|
||||
}
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
* Copyright (c) Microsoft. All rights reserved.
|
||||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import type { FC, PropsWithChildren } from 'react'
|
||||
import React, { forwardRef, memo, useImperativeHandle, useMemo } from 'react'
|
||||
import type { FC } from 'react'
|
||||
import React, { forwardRef, memo, useImperativeHandle } from 'react'
|
||||
|
||||
import { FileDropContext } from './FileDrop.context.js'
|
||||
import { useFileDrop } from './FileDrop.hooks.js'
|
||||
|
@ -15,25 +15,6 @@ export const FileDrop: FC<FileDropProps> = memo(
|
|||
const { getRootProps, getInputProps, isDragActive, open } =
|
||||
useFileDrop(props)
|
||||
const classes = useFileDropStyles(props.slotClassNames)
|
||||
const DivOverlay = useMemo(() => {
|
||||
return (
|
||||
props.divOverlay ??
|
||||
((({ children }) => {
|
||||
return <div className={classes.Overlay}>{children}</div>
|
||||
}) as FC<
|
||||
PropsWithChildren<{
|
||||
/* nothing */
|
||||
}>
|
||||
>)
|
||||
)
|
||||
}, [props.divOverlay, classes])
|
||||
|
||||
const DragMessage = useMemo(() => {
|
||||
return (
|
||||
props.onDragMessage ??
|
||||
(() => <span className={classes.OverlayMessage}>Drop files.</span>)
|
||||
)
|
||||
}, [props.onDragMessage, classes])
|
||||
|
||||
useImperativeHandle(ref, () => ({ open }), [open])
|
||||
|
||||
|
@ -42,9 +23,11 @@ export const FileDrop: FC<FileDropProps> = memo(
|
|||
<span {...getRootProps()}>
|
||||
<input {...getInputProps()} />
|
||||
{isDragActive && (
|
||||
<DivOverlay>
|
||||
<DragMessage />
|
||||
</DivOverlay>
|
||||
<div className={classes.Overlay}>
|
||||
<span className={classes.OverlayMessage}>
|
||||
{props.onDragMessage ?? 'Drop files.'}
|
||||
</span>
|
||||
</div>
|
||||
)}
|
||||
{props.children}
|
||||
</span>
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
* Copyright (c) Microsoft. All rights reserved.
|
||||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import type { ElementType, PropsWithChildren, RefObject } from 'react'
|
||||
import type { PropsWithChildren, RefObject } from 'react'
|
||||
import type { DropzoneOptions } from 'react-dropzone'
|
||||
|
||||
import type { Expand } from '../types/expand.js'
|
||||
|
@ -16,8 +16,7 @@ export type FileDropProps = Expand<
|
|||
Omit<DropzoneOptions, 'accept'> &
|
||||
PropsWithChildren<{
|
||||
accept?: string | string[]
|
||||
onDragMessage?: ElementType
|
||||
divOverlay?: ElementType
|
||||
onDragMessage?: string
|
||||
ref?: RefObject<FileDropRef>
|
||||
slotClassNames?: FileDropSlotClassNames
|
||||
}>
|
||||
|
|
|
@ -3,21 +3,20 @@
|
|||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import type { CSSProperties, FC } from 'react'
|
||||
import React, { memo, useMemo } from 'react'
|
||||
import React, { memo } from 'react'
|
||||
|
||||
import { useFlexContainerStyles } from './FlexContainer.hooks.js'
|
||||
import type { FlexContainerProps } from './FlexContainer.types.js'
|
||||
|
||||
export const FlexContainer: FC<FlexContainerProps> = memo(
|
||||
function FlexContainer(props) {
|
||||
const { className, as, children } = props
|
||||
const { className, children } = props
|
||||
const inlineStyles: CSSProperties = useFlexContainerStyles(props)
|
||||
const Element = useMemo(() => as ?? 'div', [as])
|
||||
|
||||
return (
|
||||
<Element className={className} style={inlineStyles}>
|
||||
<div className={className} style={inlineStyles}>
|
||||
{children}
|
||||
</Element>
|
||||
</div>
|
||||
)
|
||||
},
|
||||
)
|
||||
|
|
|
@ -3,10 +3,9 @@
|
|||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
|
||||
import type { CSSProperties, ElementType, PropsWithChildren } from 'react'
|
||||
import type { CSSProperties, PropsWithChildren } from 'react'
|
||||
|
||||
export type FlexContainerProps = PropsWithChildren<{
|
||||
as?: ElementType
|
||||
vertical?: boolean
|
||||
wrap?: boolean
|
||||
justify?:
|
||||
|
|
|
@ -4,20 +4,19 @@
|
|||
*/
|
||||
|
||||
import type { CSSProperties, FC } from 'react'
|
||||
import React, { memo, useMemo } from 'react'
|
||||
import React, { memo } from 'react'
|
||||
|
||||
import { useFlexItemStyles } from './FlexItem.hooks.js'
|
||||
import type { FlexItemProps } from './FlexItem.types.js'
|
||||
|
||||
export const FlexItem: FC<FlexItemProps> = memo(function FlexItem(props) {
|
||||
const { as, children, className } = props
|
||||
const { children, className } = props
|
||||
const inlineStyles: CSSProperties = useFlexItemStyles(props)
|
||||
const Element = useMemo(() => as ?? 'div', [as])
|
||||
|
||||
return (
|
||||
<Element className={className} style={inlineStyles}>
|
||||
<div className={className} style={inlineStyles}>
|
||||
{children}
|
||||
</Element>
|
||||
</div>
|
||||
)
|
||||
})
|
||||
FlexItem.displayName = 'FlexItem'
|
||||
|
|
|
@ -2,10 +2,9 @@
|
|||
* Copyright (c) Microsoft. All rights reserved.
|
||||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import type { CSSProperties, ElementType, PropsWithChildren } from 'react'
|
||||
import type { CSSProperties, PropsWithChildren } from 'react'
|
||||
|
||||
export type FlexItemProps = PropsWithChildren<{
|
||||
as?: ElementType
|
||||
order?: number
|
||||
shrink?: number
|
||||
basis?: string
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "sds-core"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
license = "MIT"
|
||||
description = "Synthetic data showcase core library"
|
||||
repository = "https://github.com/microsoft/synthetic-data-showcase"
|
||||
|
@ -13,12 +13,12 @@ crate-type = ["rlib"]
|
|||
rand = { version = "0.8" }
|
||||
fnv = { version = "1.0" }
|
||||
itertools = { version = "0.10" }
|
||||
lru = { version = "0.8" }
|
||||
lru = { version = "0.10" }
|
||||
getrandom = { version = "0.2", features = ["js"] }
|
||||
log = { version = "0.4", features = ["std"] }
|
||||
csv = { version = "1.1" }
|
||||
instant = { version = "0.1", features = [ "stdweb", "wasm-bindgen" ] }
|
||||
pyo3 = { version = "0.17", features = ["extension-module"], optional = true }
|
||||
pyo3 = { version = "0.18", features = ["extension-module"], optional = true }
|
||||
rayon = { version = "1.5", optional = true }
|
||||
serde = { version = "1.0", features = [ "derive", "rc" ] }
|
||||
serde_json = { version = "1.0" }
|
||||
|
|
|
@ -33,16 +33,19 @@ pub struct DataBlock {
|
|||
pub records: DataBlockRecords,
|
||||
}
|
||||
|
||||
impl DataBlock {
|
||||
impl Default for DataBlock {
|
||||
/// Returns a new DataBlock with default values
|
||||
pub fn default() -> DataBlock {
|
||||
DataBlock {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
headers: DataBlockHeaders::default(),
|
||||
multi_value_column_metadata_map: MultiValueColumnMetadataMap::default(),
|
||||
records: DataBlockRecords::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DataBlock {
|
||||
/// Returns a new DataBlock
|
||||
/// # Arguments
|
||||
/// * `headers` - Vector of string representing the data headers
|
||||
|
|
|
@ -24,7 +24,6 @@ impl<T: Read> DataBlockCreator for CsvDataBlockCreator<T> {
|
|||
fn get_records(reader: &mut Self::InputType) -> Result<Vec<CsvRecord>, Error> {
|
||||
reader
|
||||
.records()
|
||||
.into_iter()
|
||||
.map(|record_result: Result<StringRecord, Error>| {
|
||||
Ok(record_result?
|
||||
.into_iter()
|
||||
|
|
|
@ -1,122 +0,0 @@
|
|||
use super::stats_error::StatsError;
|
||||
use statrs::distribution::{ContinuousCDF, Normal};
|
||||
|
||||
/// Default tolerance used to calculate sigma for the gaussian noise
|
||||
pub const DEFAULT_TOLERANCE: f64 = 1e-8;
|
||||
|
||||
/// Bisects `[lower_bound, upper_bound]` to locate the point where the
/// predicate `f` changes value.
///
/// `f` is evaluated once at each bound; the interval is then halved, keeping
/// the half whose ends still disagree, until it is no wider than `tolerance`
/// (or cannot be narrowed any further in `f64` precision).
///
/// # Arguments
/// * `f` - predicate to search over
/// * `lower_bound` - start of the search interval
/// * `upper_bound` - end of the search interval
/// * `tolerance` - maximum width of the final interval
///
/// # Returns
/// The end of the final interval on which `f` is `true`: the upper end when
/// `f(upper_bound)` is `true`, the lower end otherwise.
///
/// # Panics
/// Panics if `f(lower_bound) == f(upper_bound)`.
fn binary_search(
    f: &dyn Fn(f64) -> bool,
    lower_bound: f64,
    upper_bound: f64,
    tolerance: f64,
) -> f64 {
    let lower_res = f(lower_bound);
    let upper_res = f(upper_bound);

    assert!(
        lower_res != upper_res,
        "upper and lower bound predicates should have different values for binary search"
    );

    let mut lower = lower_bound;
    let mut upper = upper_bound;

    while upper - lower > tolerance {
        let mid = lower + ((upper - lower) / 2.0);

        // once the bounds are adjacent floats the midpoint rounds onto one of
        // them and the interval can no longer shrink; stop here instead of
        // looping forever when `tolerance` is below the float resolution
        if mid <= lower || mid >= upper {
            break;
        }

        if f(mid) == upper_res {
            upper = mid;
        } else {
            lower = mid;
        }
    }

    if upper_res {
        upper
    } else {
        lower
    }
}
|
||||
|
||||
pub trait DpAnalyticGaussianContinuousCDFScale
|
||||
where
|
||||
Self: ContinuousCDF<f64, f64> + Sized,
|
||||
{
|
||||
fn calc_alpha_increasing_beta(&self, epsilon: f64, delta: f64, tolerance: f64) -> f64 {
|
||||
let beta = |v: f64| {
|
||||
self.cdf(f64::sqrt(epsilon * v))
|
||||
- (f64::exp(epsilon) * self.cdf(-f64::sqrt(epsilon * (v + 2.0))))
|
||||
};
|
||||
let mut upper_bound: f64 = 2.0;
|
||||
|
||||
// this is monotonically increasing, so find the upper bound
|
||||
// for the binary search
|
||||
while beta(upper_bound) <= delta {
|
||||
upper_bound *= 2.0;
|
||||
}
|
||||
|
||||
let v_star = binary_search(&|v| beta(v) <= delta, 0.0, upper_bound, tolerance);
|
||||
|
||||
f64::sqrt(1.0 + (v_star / 2.0)) - f64::sqrt(v_star / 2.0)
|
||||
}
|
||||
|
||||
fn calc_alpha_decreasing_beta(&self, epsilon: f64, delta: f64, tolerance: f64) -> f64 {
|
||||
let beta = |u: f64| {
|
||||
self.cdf(-f64::sqrt(epsilon * u))
|
||||
- (f64::exp(epsilon) * self.cdf(-f64::sqrt(epsilon * (u + 2.0))))
|
||||
};
|
||||
let mut upper_bound: f64 = 2.0;
|
||||
|
||||
// this is monotonically increasing, so find the upper bound
|
||||
// for the binary search
|
||||
while beta(upper_bound) >= delta {
|
||||
upper_bound *= 2.0;
|
||||
}
|
||||
let u_star = binary_search(&|u| beta(u) <= delta, 0.0, upper_bound, tolerance);
|
||||
|
||||
f64::sqrt(1.0 + (u_star / 2.0)) + f64::sqrt(u_star / 2.0)
|
||||
}
|
||||
|
||||
/// Using the Analytic Gaussian Mechanism, calculates the standard deviation
|
||||
/// (`sigma`) for a `(epsilon, delta)-DP` normal distribution to be used as noise.
|
||||
/// # Arguments:
|
||||
/// * `sensitivity` - L2 sensitivity
|
||||
/// * `epsilon` - privacy budget
|
||||
/// * `delta` - probability of information being leaked
|
||||
/// * `tolerance` - tolerance used to find sigma
|
||||
fn calc_sigma_dp(&self, sensitivity: f64, epsilon: f64, delta: f64, tolerance: f64) -> f64 {
|
||||
let delta_zero = self.cdf(0.0) - (f64::exp(epsilon) * self.cdf(-f64::sqrt(2.0 * epsilon)));
|
||||
let alpha = if delta >= delta_zero {
|
||||
self.calc_alpha_increasing_beta(epsilon, delta, tolerance)
|
||||
} else {
|
||||
self.calc_alpha_decreasing_beta(epsilon, delta, tolerance)
|
||||
};
|
||||
alpha * sensitivity / f64::sqrt(2.0 * epsilon)
|
||||
}
|
||||
|
||||
/// Using the Analytic Gaussian Mechanism, creates a normal distribution
|
||||
/// that is `(epsilon, delta)-DP` to be used as noise.
|
||||
/// # Arguments:
|
||||
/// * `sensitivity` - L2 sensitivity
|
||||
/// * `epsilon` - privacy budget
|
||||
/// * `delta` - probability of information being leaked
|
||||
/// * `tolerance` - tolerance used to find sigma used to build the normal distribution
|
||||
fn new_analytic_gaussian(
|
||||
sensitivity: f64,
|
||||
epsilon: f64,
|
||||
delta: f64,
|
||||
tolerance: f64,
|
||||
) -> Result<Self, StatsError>;
|
||||
}
|
||||
|
||||
impl DpAnalyticGaussianContinuousCDFScale for Normal {
|
||||
fn new_analytic_gaussian(
|
||||
sensitivity: f64,
|
||||
epsilon: f64,
|
||||
delta: f64,
|
||||
tolerance: f64,
|
||||
) -> Result<Self, StatsError> {
|
||||
let n = Normal::new(0.0, 1.0).map_err(StatsError::new)?;
|
||||
Normal::new(0.0, n.calc_sigma_dp(sensitivity, epsilon, delta, tolerance))
|
||||
.map_err(StatsError::new)
|
||||
}
|
||||
}
|
|
@ -2,10 +2,6 @@
|
|||
use pyo3::prelude::*;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Default epsilon proportion used to add noise to the protected number of records
|
||||
/// in the aggregated data
|
||||
pub const DEFAULT_NUMBER_OF_RECORDS_EPSILON_PROPORTION: f64 = 0.005;
|
||||
|
||||
/// Parameters for aggregate generation with differential privacy
|
||||
#[cfg_attr(feature = "pyo3", pyclass)]
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
|
@ -14,13 +10,14 @@ pub struct DpParameters {
|
|||
/// Overall privacy budget used between
|
||||
/// percentile filtering and noisy generation by combination length
|
||||
pub epsilon: f64,
|
||||
/// Delta value used for noisy generation by combination length
|
||||
pub delta: f64,
|
||||
/// Percentage used to calculate the percentile that filters sensitivity
|
||||
pub percentile_percentage: usize,
|
||||
/// Maximum proportion to consume of the total privacy budget (0.1 means 10%)
|
||||
/// during the sensitivity filter stage
|
||||
pub percentile_epsilon_proportion: f64,
|
||||
/// Delta value used for noisy generation by combination length, if None will be set
|
||||
/// in runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
|
||||
pub delta: Option<f64>,
|
||||
/// `epsilon` and `percentile_epsilon_proportion` will be used to infer the
|
||||
/// sigma value by combination length. This parameter
|
||||
/// controls how the budget is split across combination lengths
|
||||
|
@ -28,7 +25,7 @@ pub struct DpParameters {
|
|||
/// - If `None` all the sigma values will be the same
|
||||
pub sigma_proportions: Option<Vec<f64>>,
|
||||
/// Proportion of epsilon used to add noise to the protected number of records in
|
||||
/// the aggregated data (default is 0.005)
|
||||
/// the aggregated data (if None, no noise is added)
|
||||
pub number_of_records_epsilon_proportion: Option<f64>,
|
||||
}
|
||||
|
||||
|
@ -41,30 +38,31 @@ impl DpParameters {
|
|||
/// # Arguments
|
||||
/// * `epsilon` - Overall privacy budget used between
|
||||
/// percentile filtering and noisy generation by combination length
|
||||
/// * `delta` - Delta value used for noisy generation by combination length
|
||||
/// * `percentile_percentage` - Percentage used to calculate the percentile that filters sensitivity
|
||||
/// * `percentile_epsilon_proportion` - Maximum proportion to consume of the total privacy budget (0.1 means 10%)
|
||||
/// during the sensitivity filter stage
|
||||
/// * `delta` - Delta value used for noisy generation by combination length, if None will be set
|
||||
/// in runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
|
||||
/// * `sigma_proportions` - `epsilon` and `percentile_epsilon_proportion` will be used to infer the
|
||||
/// sigma value by combination length. This parameter
|
||||
/// controls how the budget is split across combination lengths
|
||||
/// (e.g. \[1.0, 2.0, 3.0\] means that `sigma_2 = 2.0 * sigma_1` and `sigma_3 = 3.0 * sigma_1`)
|
||||
/// - If `None` all the sigma values will be the same
|
||||
/// * `number_of_records_epsilon_proportion` - Proportion of epsilon used to add noise to the protected number of records
|
||||
/// in the aggregated data (default is 0.005)
|
||||
/// in the aggregated data (if None, no noise is added)
|
||||
pub fn new(
|
||||
epsilon: f64,
|
||||
delta: f64,
|
||||
percentile_percentage: usize,
|
||||
percentile_epsilon_proportion: f64,
|
||||
delta: Option<f64>,
|
||||
sigma_proportions: Option<Vec<f64>>,
|
||||
number_of_records_epsilon_proportion: Option<f64>,
|
||||
) -> Self {
|
||||
DpParameters {
|
||||
epsilon,
|
||||
delta,
|
||||
percentile_percentage,
|
||||
percentile_epsilon_proportion,
|
||||
delta,
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon_proportion,
|
||||
}
|
||||
|
@ -76,30 +74,31 @@ impl DpParameters {
|
|||
/// # Arguments
|
||||
/// * `epsilon` - Overall privacy budget used between
|
||||
/// percentile filtering and noisy generation by combination length
|
||||
/// * `delta` - Delta value used for noisy generation by combination length
|
||||
/// * `percentile_percentage` - Percentage used to calculate the percentile that filters sensitivity
|
||||
/// * `percentile_epsilon_proportion` - Maximum proportion to consume of the total privacy budget (0.1 means 10%)
|
||||
/// during the sensitivity filter stage
|
||||
/// * `delta` - Delta value used for noisy generation by combination length, if None will be set
|
||||
/// in runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
|
||||
/// * `sigma_proportions` - `epsilon` and `percentile_epsilon_proportion` will be used to infer the
|
||||
/// sigma value by combination length. This parameter
|
||||
/// controls how the budget is split across combination lengths
|
||||
/// (e.g. \[1.0, 2.0, 3.0\] means that `sigma_2 = 2.0 * sigma_1` and `sigma_3 = 3.0 * sigma_1`)
|
||||
/// - If `None` all the sigma values will be the same
|
||||
/// * `number_of_records_epsilon_proportion` - Proportion of epsilon used to add noise to the protected number of records
|
||||
/// in the aggregated data (default is 0.005)
|
||||
/// in the aggregated data (if None, no noise is added)
|
||||
pub fn new(
|
||||
epsilon: f64,
|
||||
delta: f64,
|
||||
percentile_percentage: usize,
|
||||
percentile_epsilon_proportion: f64,
|
||||
delta: Option<f64>,
|
||||
sigma_proportions: Option<Vec<f64>>,
|
||||
number_of_records_epsilon_proportion: Option<f64>,
|
||||
) -> Self {
|
||||
DpParameters {
|
||||
epsilon,
|
||||
delta,
|
||||
percentile_percentage,
|
||||
percentile_epsilon_proportion,
|
||||
delta,
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon_proportion,
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
mod analytic_gaussian;
|
||||
mod dp_parameters;
|
||||
mod noise_aggregator;
|
||||
mod noise_parameters;
|
||||
mod noisy_count_threshold;
|
||||
mod percentile;
|
||||
mod stats_error;
|
||||
|
@ -9,7 +9,6 @@ mod typedefs;
|
|||
#[cfg(feature = "pyo3")]
|
||||
mod register_pyo3;
|
||||
|
||||
pub use analytic_gaussian::*;
|
||||
pub use dp_parameters::*;
|
||||
pub use noise_aggregator::*;
|
||||
pub use noisy_count_threshold::*;
|
||||
|
|
|
@ -1,7 +1,4 @@
|
|||
use super::{
|
||||
CombinationsByRecord, DpParameters, DpPercentile, NoisyCountThreshold,
|
||||
DEFAULT_NUMBER_OF_RECORDS_EPSILON_PROPORTION,
|
||||
};
|
||||
use super::{CombinationsByRecord, DpParameters, DpPercentile, NoisyCountThreshold};
|
||||
use fnv::FnvHashSet;
|
||||
use itertools::Itertools;
|
||||
use log::{debug, info, warn};
|
||||
|
@ -9,14 +6,14 @@ use rand::{
|
|||
prelude::{Distribution as rand_dist, IteratorRandom},
|
||||
thread_rng,
|
||||
};
|
||||
use statrs::distribution::{ContinuousCDF, Laplace, Normal};
|
||||
use statrs::distribution::{ContinuousCDF, Normal};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::{
|
||||
data_block::{DataBlock, DataBlockValue},
|
||||
dp::{
|
||||
noise_parameters::NoiseParameters,
|
||||
typedefs::{CombinationsCountMap, CombinationsCountMapByLen},
|
||||
DEFAULT_TOLERANCE,
|
||||
},
|
||||
processing::aggregator::{
|
||||
AggregatedCount, AggregatedData, AggregatesCountMap, RecordsSensitivityByLen, RecordsSet,
|
||||
|
@ -39,82 +36,10 @@ pub struct NoiseAggregator {
|
|||
delta: f64,
|
||||
sigmas: Vec<f64>,
|
||||
threshold: NoisyCountThreshold,
|
||||
number_of_records_epsilon: f64,
|
||||
protected_number_of_records: Option<usize>,
|
||||
}
|
||||
|
||||
impl NoiseAggregator {
|
||||
#[inline]
|
||||
fn calc_percentile_epsilon_number_of_records_epsilon_and_sigma_by_len(
|
||||
reporting_length: usize,
|
||||
epsilon: f64,
|
||||
delta: f64,
|
||||
percentile_epsilon_proportion: f64,
|
||||
number_of_records_proportion: f64,
|
||||
sigma_proportions: &Option<Vec<f64>>,
|
||||
) -> (f64, f64, Vec<f64>) {
|
||||
let proportions = match sigma_proportions {
|
||||
Some(proportions) => proportions.clone(),
|
||||
None => {
|
||||
let mut v = Vec::default();
|
||||
v.resize_with(reporting_length, || 1.0);
|
||||
v
|
||||
}
|
||||
};
|
||||
|
||||
info!(
|
||||
"calculating percentile epsilon, number of records epsilon and sigma by len: total epsilon = {}, delta = {}, percentile_epsilon_proportion = {}, number_of_records_proportion = {}, sigma_proportions = {:?}",
|
||||
epsilon,
|
||||
delta,
|
||||
percentile_epsilon_proportion,
|
||||
number_of_records_proportion,
|
||||
proportions
|
||||
);
|
||||
|
||||
assert!(
|
||||
reporting_length == proportions.len(),
|
||||
"sigma proportions array size should match the reporting length",
|
||||
);
|
||||
|
||||
assert!(
|
||||
percentile_epsilon_proportion < 1.0 && percentile_epsilon_proportion > 0.0,
|
||||
"percentile_epsilon_proportion must be > 0 and < 1"
|
||||
);
|
||||
|
||||
assert!(
|
||||
number_of_records_proportion < 1.0 && number_of_records_proportion > 0.0,
|
||||
"number_of_records_proportion must be > 0 and < 1"
|
||||
);
|
||||
|
||||
assert!(
|
||||
number_of_records_proportion + percentile_epsilon_proportion < 1.0,
|
||||
"(percentile_epsilon_proportion + number_of_records_proportion) must be > 0 and < 1"
|
||||
);
|
||||
|
||||
let t = reporting_length as f64;
|
||||
let rho = (epsilon + (2.0 / delta).ln()).sqrt() - (2.0 / delta).ln().sqrt();
|
||||
let k: f64 = proportions.iter().map(|p| 1.0 / (p * p)).sum();
|
||||
let percentile_epsilon = (2.0 * rho * percentile_epsilon_proportion / t).sqrt();
|
||||
let number_of_records_epsilon = (2.0 * rho * number_of_records_proportion).sqrt();
|
||||
let base_sigma = (k
|
||||
/ (2.0 * rho * (1.0 - percentile_epsilon_proportion - number_of_records_proportion)))
|
||||
.sqrt();
|
||||
let sigmas: Vec<f64> = proportions.iter().map(|p| p * base_sigma).collect();
|
||||
let lhs = ((t * percentile_epsilon * percentile_epsilon) / 2.0)
|
||||
+ ((number_of_records_epsilon * number_of_records_epsilon) / 2.0)
|
||||
+ (sigmas.iter().map(|s| 1.0 / (s * s)).sum::<f64>() / 2.0);
|
||||
|
||||
info!("percentile epsilon = {}", percentile_epsilon);
|
||||
info!("number of records epsilon = {}", number_of_records_epsilon);
|
||||
info!("calculated sigmas = {:?}", sigmas);
|
||||
|
||||
assert!(
|
||||
(lhs - rho).abs() <= DEFAULT_TOLERANCE,
|
||||
"something went wrong calculating DP sigmas"
|
||||
);
|
||||
|
||||
(percentile_epsilon, number_of_records_epsilon, sigmas)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn gen_sorted_records(&self) -> Vec<Vec<Arc<DataBlockValue>>> {
|
||||
self.data_block
|
||||
|
@ -243,7 +168,7 @@ impl NoiseAggregator {
|
|||
let percentile_selector = DpPercentile::new(sensitivities);
|
||||
let allowed_sensitivity = percentile_selector
|
||||
.kth_percentile_quality_scores_iter(self.percentile_percentage)
|
||||
.get_noisy_max(self.percentile_epsilon / (self.reporting_length as f64))
|
||||
.get_noisy_max(self.percentile_epsilon)
|
||||
.unwrap_or(0);
|
||||
|
||||
(max_sensitivity, allowed_sensitivity)
|
||||
|
@ -262,11 +187,15 @@ impl NoiseAggregator {
|
|||
.choose_multiple(&mut thread_rng(), l1_sensitivity)
|
||||
.drain(..)
|
||||
{
|
||||
(*all_current_aggregates.get_mut(comb).unwrap()) += 1.0;
|
||||
(*all_current_aggregates
|
||||
.get_mut(comb)
|
||||
.expect("error getting combination count")) += 1.0;
|
||||
}
|
||||
} else {
|
||||
for comb in combinations.iter() {
|
||||
(*all_current_aggregates.get_mut(comb).unwrap()) += 1.0;
|
||||
(*all_current_aggregates
|
||||
.get_mut(comb)
|
||||
.expect("error getting combination count")) += 1.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -274,7 +203,7 @@ impl NoiseAggregator {
|
|||
|
||||
#[inline]
|
||||
fn add_gaussian_noise(all_current_aggregates: &mut CombinationsCountMap, current_sigma: f64) {
|
||||
let noise = Normal::new(0.0, 1.0).unwrap();
|
||||
let noise = Normal::new(0.0, 1.0).expect("error generating Normal noise");
|
||||
|
||||
for count in all_current_aggregates.values_mut() {
|
||||
(*count) += current_sigma * noise.sample(&mut thread_rng());
|
||||
|
@ -287,7 +216,7 @@ impl NoiseAggregator {
|
|||
1.0 + (self.sigmas[0]
|
||||
* l1_sensitivity.sqrt()
|
||||
* Normal::new(0.0, 1.0)
|
||||
.unwrap()
|
||||
.expect("error creating Normal for inverse CDF")
|
||||
.inverse_cdf((1.0 - (self.delta / 2.0)).powf(1.0 / l1_sensitivity)))
|
||||
} else {
|
||||
// thresholds should start at index 2 (1-counts needs to be fixed to guarantee DP)
|
||||
|
@ -301,7 +230,7 @@ impl NoiseAggregator {
|
|||
* l1_sensitivity.sqrt()
|
||||
// threshold values should be between 0 and 0.5
|
||||
// we are dividing by 2 here to normalize it between 0 and 1.0
|
||||
* Normal::new(0.0, 1.0).unwrap().inverse_cdf(
|
||||
* Normal::new(0.0, 1.0).expect("error creating Normal for inverse CDF").inverse_cdf(
|
||||
1.0 - (
|
||||
thresholds.get(&comb_len).cloned().unwrap_or(1.0) / 2.0
|
||||
).min(0.5),
|
||||
|
@ -364,25 +293,6 @@ impl NoiseAggregator {
|
|||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn protect_number_of_records(&self, number_of_records: usize) -> usize {
|
||||
info!(
|
||||
"protecting reported number of records with epsilon = {}",
|
||||
self.number_of_records_epsilon
|
||||
);
|
||||
|
||||
assert!(
|
||||
self.number_of_records_epsilon > 0.0,
|
||||
"number of records epsilon should be > 0"
|
||||
);
|
||||
|
||||
((number_of_records as f64)
|
||||
+ Laplace::new(0.0, 1.0 / self.number_of_records_epsilon)
|
||||
.unwrap()
|
||||
.sample(&mut thread_rng()))
|
||||
.round() as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn build_aggregated_data(
|
||||
&self,
|
||||
|
@ -406,7 +316,7 @@ impl NoiseAggregator {
|
|||
self.data_block.headers.clone(),
|
||||
self.data_block.multi_value_column_metadata_map.clone(),
|
||||
self.data_block.number_of_records(),
|
||||
Some(self.protect_number_of_records(self.data_block.number_of_records())),
|
||||
self.protected_number_of_records,
|
||||
aggregates_count,
|
||||
RecordsSensitivityByLen::default(),
|
||||
self.reporting_length,
|
||||
|
@ -449,27 +359,27 @@ impl NoiseAggregator {
|
|||
dp_parameters: &DpParameters,
|
||||
threshold: NoisyCountThreshold,
|
||||
) -> NoiseAggregator {
|
||||
let (percentile_epsilon, number_of_records_epsilon, sigmas) =
|
||||
NoiseAggregator::calc_percentile_epsilon_number_of_records_epsilon_and_sigma_by_len(
|
||||
reporting_length,
|
||||
dp_parameters.epsilon,
|
||||
dp_parameters.delta,
|
||||
dp_parameters.percentile_epsilon_proportion,
|
||||
dp_parameters
|
||||
.number_of_records_epsilon_proportion
|
||||
.unwrap_or(DEFAULT_NUMBER_OF_RECORDS_EPSILON_PROPORTION),
|
||||
&dp_parameters.sigma_proportions,
|
||||
);
|
||||
let noise_parameters = NoiseParameters::new(
|
||||
reporting_length,
|
||||
dp_parameters.epsilon,
|
||||
&dp_parameters.delta,
|
||||
dp_parameters.percentile_epsilon_proportion,
|
||||
&dp_parameters.number_of_records_epsilon_proportion,
|
||||
&dp_parameters.sigma_proportions,
|
||||
data_block.number_of_records(),
|
||||
);
|
||||
|
||||
info!("resulting noise parameters = {noise_parameters:?}");
|
||||
|
||||
NoiseAggregator {
|
||||
data_block,
|
||||
reporting_length,
|
||||
percentile_percentage: dp_parameters.percentile_percentage,
|
||||
percentile_epsilon,
|
||||
delta: dp_parameters.delta,
|
||||
sigmas,
|
||||
percentile_epsilon: noise_parameters.percentile_epsilon,
|
||||
delta: noise_parameters.delta,
|
||||
sigmas: noise_parameters.sigmas,
|
||||
threshold,
|
||||
number_of_records_epsilon,
|
||||
protected_number_of_records: noise_parameters.protected_number_of_records,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,244 @@
|
|||
use log::info;
|
||||
use rand::{prelude::Distribution as rand_dist, thread_rng};
|
||||
use statrs::distribution::Laplace;
|
||||
|
||||
// Default tolerance used to calculate sigma for the gaussian noise
|
||||
const DEFAULT_TOLERANCE: f64 = 1e-8;
|
||||
|
||||
/// Noise parameters resolved by `NoiseParameters::new` from the privacy
/// budget, delta and proportions it receives.
#[derive(Debug)]
|
||||
pub(crate) struct NoiseParameters {
|
||||
/// Epsilon for the percentile selection, as returned by
/// `calc_percentile_epsilon_and_sigmas`
pub(crate) percentile_epsilon: f64,
|
||||
/// One sigma per sigma proportion (each proportion multiplied by the base sigma)
pub(crate) sigmas: Vec<f64>,
|
||||
/// Delta in use: the supplied value or, when none was supplied,
/// `1 / (ln(n) * n)` for the (possibly protected) number of records `n`
pub(crate) delta: f64,
|
||||
/// Noisy number of records; `None` when no epsilon proportion was
/// given to protect the number of records
pub(crate) protected_number_of_records: Option<usize>,
|
||||
}
|
||||
|
||||
impl NoiseParameters {
|
||||
#[inline]
|
||||
fn split_budget_for_records_and_marginals(
|
||||
total_epsilon: f64,
|
||||
number_of_records_epsilon_proportion: f64,
|
||||
) -> (f64, f64) {
|
||||
assert!(
|
||||
number_of_records_epsilon_proportion < 1.0
|
||||
&& number_of_records_epsilon_proportion > 0.0,
|
||||
"number_of_records_epsilon_proportion must be > 0 and < 1"
|
||||
);
|
||||
|
||||
// total_epsilon = marginals_epsilon + number_of_records_epsilon
|
||||
let number_of_records_epsilon = number_of_records_epsilon_proportion * total_epsilon;
|
||||
let marginals_epsilon = total_epsilon - number_of_records_epsilon;
|
||||
|
||||
(number_of_records_epsilon, marginals_epsilon)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn delta_value_or_default(delta_opt: &Option<f64>, number_of_records: usize) -> f64 {
|
||||
assert!(
|
||||
number_of_records > 0,
|
||||
"number_of_records must be greater than 0"
|
||||
);
|
||||
|
||||
let number_of_records_f64 = number_of_records as f64;
|
||||
let delta = delta_opt.unwrap_or(1.0 / (number_of_records_f64.ln() * number_of_records_f64));
|
||||
|
||||
assert!(
|
||||
delta > 0.0 && delta < 1.0,
|
||||
"delta value must be between 0 and 1"
|
||||
);
|
||||
|
||||
delta
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn unwrap_sigma_proportions_or_default(
|
||||
sigma_proportions_opt: &Option<Vec<f64>>,
|
||||
reporting_length: usize,
|
||||
) -> Vec<f64> {
|
||||
let sigma_proportions = match sigma_proportions_opt {
|
||||
Some(proportions) => proportions.clone(),
|
||||
None => {
|
||||
let mut v = Vec::default();
|
||||
v.resize_with(reporting_length, || 1.0);
|
||||
v
|
||||
}
|
||||
};
|
||||
|
||||
assert!(
|
||||
reporting_length == sigma_proportions.len(),
|
||||
"sigma proportions array size should match the reporting length",
|
||||
);
|
||||
|
||||
sigma_proportions
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn protect_number_of_records(
|
||||
number_of_records_epsilon: f64,
|
||||
number_of_records: usize,
|
||||
) -> usize {
|
||||
info!(
|
||||
"protecting reported number of records with epsilon = {}",
|
||||
number_of_records_epsilon
|
||||
);
|
||||
|
||||
assert!(
|
||||
number_of_records_epsilon > 0.0,
|
||||
"number of records epsilon should be > 0"
|
||||
);
|
||||
|
||||
let protected_number_of_records = ((number_of_records as f64)
|
||||
+ Laplace::new(0.0, 1.0 / number_of_records_epsilon)
|
||||
.expect("error generating Laplace noise")
|
||||
.sample(&mut thread_rng()))
|
||||
.round();
|
||||
|
||||
assert!(
|
||||
protected_number_of_records > 0.0,
|
||||
"adding noise to number of records resulted in a negative number"
|
||||
);
|
||||
|
||||
protected_number_of_records as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn calc_percentile_epsilon_and_sigmas(
|
||||
reporting_length: usize,
|
||||
marginals_epsilon: f64,
|
||||
delta: f64,
|
||||
sigma_proportions: &[f64],
|
||||
percentile_epsilon_proportion: f64,
|
||||
) -> (f64, Vec<f64>) {
|
||||
assert!(
|
||||
percentile_epsilon_proportion < 1.0 && percentile_epsilon_proportion > 0.0,
|
||||
"percentile_epsilon_proportion must be > 0 and < 1"
|
||||
);
|
||||
|
||||
let t = reporting_length as f64;
|
||||
let rho_sqrt = (marginals_epsilon + (2.0 / delta).ln()).sqrt() - (2.0 / delta).ln().sqrt();
|
||||
let rho = rho_sqrt * rho_sqrt;
|
||||
let k: f64 = sigma_proportions.iter().map(|p| 1.0 / (p * p)).sum();
|
||||
let percentile_epsilon = (2.0 * rho * percentile_epsilon_proportion / t).sqrt();
|
||||
let base_sigma = (k / (2.0 * rho * (1.0 - percentile_epsilon_proportion))).sqrt();
|
||||
let sigmas: Vec<f64> = sigma_proportions.iter().map(|p| p * base_sigma).collect();
|
||||
let lhs = ((t * percentile_epsilon * percentile_epsilon) / 2.0)
|
||||
+ (sigmas.iter().map(|s| 1.0 / (s * s)).sum::<f64>() / 2.0);
|
||||
|
||||
info!("percentile epsilon = {percentile_epsilon}, calculated sigmas = {sigmas:?}");
|
||||
|
||||
assert!(
|
||||
(lhs - rho).abs() <= DEFAULT_TOLERANCE,
|
||||
"something went wrong calculating DP sigmas"
|
||||
);
|
||||
|
||||
(percentile_epsilon, sigmas)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn calc_marginals_parameters(
|
||||
reporting_length: usize,
|
||||
total_epsilon: f64,
|
||||
delta_opt: &Option<f64>,
|
||||
number_of_records_epsilon_proportion_opt: &Option<f64>,
|
||||
sigma_proportions_opt: &Option<Vec<f64>>,
|
||||
number_of_records: usize,
|
||||
) -> (Vec<f64>, f64, f64, Option<usize>, f64) {
|
||||
let sigma_proportions = NoiseParameters::unwrap_sigma_proportions_or_default(
|
||||
sigma_proportions_opt,
|
||||
reporting_length,
|
||||
);
|
||||
let number_of_records_epsilon: f64;
|
||||
let marginals_epsilon: f64;
|
||||
let protected_number_of_records: Option<usize>;
|
||||
let delta: f64;
|
||||
|
||||
if let Some(number_of_records_epsilon_proportion) = number_of_records_epsilon_proportion_opt
|
||||
{
|
||||
// get a fraction of the budget to protect the number of records
|
||||
(number_of_records_epsilon, marginals_epsilon) =
|
||||
NoiseParameters::split_budget_for_records_and_marginals(
|
||||
total_epsilon,
|
||||
*number_of_records_epsilon_proportion,
|
||||
);
|
||||
|
||||
// consume budget to protect number of records
|
||||
protected_number_of_records = Some(NoiseParameters::protect_number_of_records(
|
||||
number_of_records_epsilon,
|
||||
number_of_records,
|
||||
));
|
||||
|
||||
// build delta from the protected number of records
|
||||
delta = NoiseParameters::delta_value_or_default(
|
||||
delta_opt,
|
||||
protected_number_of_records.unwrap(),
|
||||
);
|
||||
} else {
|
||||
// we don't want to protect the number of records, use all
|
||||
// the budget for the marginals
|
||||
number_of_records_epsilon = 0.0;
|
||||
marginals_epsilon = total_epsilon;
|
||||
protected_number_of_records = None;
|
||||
delta = NoiseParameters::delta_value_or_default(delta_opt, number_of_records);
|
||||
}
|
||||
|
||||
(
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon,
|
||||
marginals_epsilon,
|
||||
protected_number_of_records,
|
||||
delta,
|
||||
)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new(
|
||||
reporting_length: usize,
|
||||
total_epsilon: f64,
|
||||
delta_opt: &Option<f64>,
|
||||
percentile_epsilon_proportion: f64,
|
||||
number_of_records_epsilon_proportion_opt: &Option<f64>,
|
||||
sigma_proportions_opt: &Option<Vec<f64>>,
|
||||
number_of_records: usize,
|
||||
) -> NoiseParameters {
|
||||
let (
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon,
|
||||
marginals_epsilon,
|
||||
protected_number_of_records,
|
||||
delta,
|
||||
) = NoiseParameters::calc_marginals_parameters(
|
||||
reporting_length,
|
||||
total_epsilon,
|
||||
delta_opt,
|
||||
number_of_records_epsilon_proportion_opt,
|
||||
sigma_proportions_opt,
|
||||
number_of_records,
|
||||
);
|
||||
|
||||
info!(
|
||||
"calculating percentile epsilon and sigmas with:
|
||||
total_epsilon = {total_epsilon},
|
||||
number_of_records_epsilon = {number_of_records_epsilon},
|
||||
marginals_epsilon = {marginals_epsilon},
|
||||
delta = {delta},
|
||||
percentile_epsilon_proportion = {percentile_epsilon_proportion},
|
||||
number_of_records_epsilon_proportion = {number_of_records_epsilon_proportion_opt:?},
|
||||
sigma_proportions = {sigma_proportions:?}"
|
||||
);
|
||||
|
||||
let (percentile_epsilon, sigmas) = NoiseParameters::calc_percentile_epsilon_and_sigmas(
|
||||
reporting_length,
|
||||
marginals_epsilon,
|
||||
delta,
|
||||
&sigma_proportions,
|
||||
percentile_epsilon_proportion,
|
||||
);
|
||||
|
||||
NoiseParameters {
|
||||
percentile_epsilon,
|
||||
sigmas,
|
||||
delta,
|
||||
protected_number_of_records,
|
||||
}
|
||||
}
|
||||
}
|
|
@ -54,11 +54,11 @@ pub struct AggregatedData {
|
|||
pub reporting_length: usize,
|
||||
}
|
||||
|
||||
impl AggregatedData {
|
||||
impl Default for AggregatedData {
|
||||
/// Creates a new AggregatedData struct with default values
|
||||
#[inline]
|
||||
pub fn default() -> AggregatedData {
|
||||
AggregatedData {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
headers: DataBlockHeaders::default(),
|
||||
multi_value_column_metadata_map: MultiValueColumnMetadataMap::default(),
|
||||
number_of_records: 0,
|
||||
|
@ -68,7 +68,9 @@ impl AggregatedData {
|
|||
reporting_length: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AggregatedData {
|
||||
/// Creates a new AggregatedData struct
|
||||
/// # Arguments:
|
||||
/// * `headers` - Vector of strings representing the data headers
|
||||
|
|
|
@ -27,11 +27,11 @@ pub struct RecordsAnalysis {
|
|||
pub percentage_of_records_with_risky_combinations: f64,
|
||||
}
|
||||
|
||||
impl RecordsAnalysis {
|
||||
#[inline]
|
||||
impl Default for RecordsAnalysis {
|
||||
/// Creates a new RecordsAnalysis with default values
|
||||
pub fn default() -> RecordsAnalysis {
|
||||
RecordsAnalysis {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
number_of_records_with_unique_combinations: 0,
|
||||
percentage_of_records_with_unique_combinations: 0.0,
|
||||
number_of_records_with_rare_combinations: 0,
|
||||
|
@ -118,43 +118,52 @@ impl RecordsAnalysisData {
|
|||
} as f64;
|
||||
let records_analysis_by_len: RecordsAnalysisByLenMap = (1..=reporting_length)
|
||||
.map(|l| {
|
||||
let mut ra = RecordsAnalysis::default();
|
||||
|
||||
ra.number_of_records_with_unique_combinations = records_with_unique_combs_by_len
|
||||
.get(&l)
|
||||
.map_or(0, |records| records.len());
|
||||
ra.number_of_records_with_rare_combinations = records_with_rare_combs_by_len
|
||||
let mut number_of_records_with_unique_combinations =
|
||||
records_with_unique_combs_by_len
|
||||
.get(&l)
|
||||
.map_or(0, |records| records.len());
|
||||
let mut number_of_records_with_rare_combinations = records_with_rare_combs_by_len
|
||||
.get(&l)
|
||||
.map_or(0, |records| records.len());
|
||||
|
||||
if protect {
|
||||
ra.number_of_records_with_unique_combinations = uround_down(
|
||||
ra.number_of_records_with_unique_combinations as f64,
|
||||
number_of_records_with_unique_combinations = uround_down(
|
||||
number_of_records_with_unique_combinations as f64,
|
||||
resolution as f64,
|
||||
);
|
||||
ra.number_of_records_with_rare_combinations = uround_down(
|
||||
ra.number_of_records_with_rare_combinations as f64,
|
||||
number_of_records_with_rare_combinations = uround_down(
|
||||
number_of_records_with_rare_combinations as f64,
|
||||
resolution as f64,
|
||||
)
|
||||
}
|
||||
|
||||
ra.percentage_of_records_with_unique_combinations = calc_percentage(
|
||||
ra.number_of_records_with_unique_combinations as f64,
|
||||
let percentage_of_records_with_unique_combinations = calc_percentage(
|
||||
number_of_records_with_unique_combinations as f64,
|
||||
total_number_of_records_f64,
|
||||
);
|
||||
ra.percentage_of_records_with_rare_combinations = calc_percentage(
|
||||
ra.number_of_records_with_rare_combinations as f64,
|
||||
let percentage_of_records_with_rare_combinations = calc_percentage(
|
||||
number_of_records_with_rare_combinations as f64,
|
||||
total_number_of_records_f64,
|
||||
);
|
||||
ra.number_of_records_with_risky_combinations = ra
|
||||
.number_of_records_with_unique_combinations
|
||||
+ ra.number_of_records_with_rare_combinations;
|
||||
ra.percentage_of_records_with_risky_combinations = calc_percentage(
|
||||
ra.number_of_records_with_risky_combinations as f64,
|
||||
let number_of_records_with_risky_combinations =
|
||||
number_of_records_with_unique_combinations
|
||||
+ number_of_records_with_rare_combinations;
|
||||
let percentage_of_records_with_risky_combinations = calc_percentage(
|
||||
number_of_records_with_risky_combinations as f64,
|
||||
total_number_of_records_f64,
|
||||
);
|
||||
|
||||
(l, ra)
|
||||
(
|
||||
l,
|
||||
RecordsAnalysis {
|
||||
number_of_records_with_unique_combinations,
|
||||
percentage_of_records_with_unique_combinations,
|
||||
number_of_records_with_rare_combinations,
|
||||
percentage_of_records_with_rare_combinations,
|
||||
number_of_records_with_risky_combinations,
|
||||
percentage_of_records_with_risky_combinations,
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
RecordsAnalysisData {
|
||||
|
|
|
@ -19,13 +19,15 @@ pub struct ValueCombination {
|
|||
combination: Vec<Arc<DataBlockValue>>,
|
||||
}
|
||||
|
||||
impl ValueCombination {
|
||||
#[inline]
|
||||
impl Default for ValueCombination {
|
||||
/// Creates a new ValueCombination with default values
|
||||
pub fn default() -> ValueCombination {
|
||||
ValueCombination::new(Vec::default())
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self::new(Vec::default())
|
||||
}
|
||||
}
|
||||
|
||||
impl ValueCombination {
|
||||
#[inline]
|
||||
/// Creates a new ValueCombination
|
||||
/// # Arguments
|
||||
|
|
|
@ -16,12 +16,15 @@ use crate::processing::evaluator::preservation_by_length::PreservationByLengthBu
|
|||
/// Evaluates aggregated, sensitive and synthesized data
|
||||
pub struct Evaluator {}
|
||||
|
||||
impl Evaluator {
|
||||
impl Default for Evaluator {
|
||||
/// Returns a new Evaluator
|
||||
pub fn default() -> Evaluator {
|
||||
Evaluator {}
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self {}
|
||||
}
|
||||
}
|
||||
|
||||
impl Evaluator {
|
||||
fn calc_combinations_abs_error_sum_count_by_len(
|
||||
&self,
|
||||
sensitive_aggregated_data: &AggregatedData,
|
||||
|
|
|
@ -18,10 +18,11 @@ pub struct PreservationBucket {
|
|||
pub proportional_error_sum: f64,
|
||||
}
|
||||
|
||||
impl PreservationBucket {
|
||||
impl Default for PreservationBucket {
|
||||
/// Return a new PreservationBucket with default values
|
||||
pub fn default() -> PreservationBucket {
|
||||
PreservationBucket {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
size: 0,
|
||||
preservation_sum: 0.0,
|
||||
length_sum: 0,
|
||||
|
@ -29,7 +30,9 @@ impl PreservationBucket {
|
|||
proportional_error_sum: 0.0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PreservationBucket {
|
||||
/// Adds a new value to the bucket
|
||||
/// # Arguments
|
||||
/// * `preservation` - Preservation related to the value
|
||||
|
|
|
@ -27,15 +27,17 @@ pub struct PreservationByCountBuckets {
|
|||
buckets_map: PreservationBucketsMap,
|
||||
}
|
||||
|
||||
impl PreservationByCountBuckets {
|
||||
impl Default for PreservationByCountBuckets {
|
||||
/// Returns a new default PreservationByCountBuckets
|
||||
#[inline]
|
||||
pub fn default() -> PreservationByCountBuckets {
|
||||
PreservationByCountBuckets {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
buckets_map: PreservationBucketsMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PreservationByCountBuckets {
|
||||
#[inline]
|
||||
pub(super) fn populate(
|
||||
&mut self,
|
||||
|
|
|
@ -17,15 +17,17 @@ pub struct PreservationByLengthBuckets {
|
|||
buckets_map: PreservationBucketsMap,
|
||||
}
|
||||
|
||||
impl PreservationByLengthBuckets {
|
||||
impl Default for PreservationByLengthBuckets {
|
||||
/// Returns a new default PreservationByLengthBuckets
|
||||
#[inline]
|
||||
pub fn default() -> PreservationByLengthBuckets {
|
||||
PreservationByLengthBuckets {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
buckets_map: PreservationBucketsMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PreservationByLengthBuckets {
|
||||
#[inline]
|
||||
pub(super) fn populate(
|
||||
&mut self,
|
||||
|
|
|
@ -27,17 +27,19 @@ pub struct GeneratedData {
|
|||
pub multi_value_column_metadata_map: MultiValueColumnMetadataMap,
|
||||
}
|
||||
|
||||
impl GeneratedData {
|
||||
impl Default for GeneratedData {
|
||||
/// Returns a new GeneratedData struct with default values
|
||||
#[inline]
|
||||
pub fn default() -> GeneratedData {
|
||||
GeneratedData {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
synthetic_data: RawData::default(),
|
||||
expansion_ratio: 0.0,
|
||||
multi_value_column_metadata_map: MultiValueColumnMetadataMap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GeneratedData {
|
||||
/// Returns a new GeneratedData struct
|
||||
/// # Arguments
|
||||
/// * `synthetic_data` - Synthesized data headers (index 0) and records indexes 1...
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
use sds_core::dp::{DpAnalyticGaussianContinuousCDFScale, DEFAULT_TOLERANCE};
|
||||
use statrs::distribution::Normal;
|
||||
|
||||
#[test]
|
||||
pub fn validate_sigma() {
|
||||
let n = Normal::new(0.0, 1.0).unwrap();
|
||||
|
||||
assert!(
|
||||
(n.calc_sigma_dp(f64::sqrt(30.0), 6.0, 0.5, DEFAULT_TOLERANCE) - 1.4659731497780966).abs()
|
||||
<= DEFAULT_TOLERANCE
|
||||
);
|
||||
assert!(
|
||||
(n.calc_sigma_dp(f64::sqrt(30.0), 6.0, 1.0 / 100000.0, DEFAULT_TOLERANCE)
|
||||
- 4.182602139814776)
|
||||
.abs()
|
||||
<= DEFAULT_TOLERANCE
|
||||
);
|
||||
assert!(
|
||||
(n.calc_sigma_dp(f64::sqrt(100.0), 0.1, 1.0 / 100000.0, DEFAULT_TOLERANCE)
|
||||
.abs()
|
||||
- 307.49566132862844)
|
||||
<= DEFAULT_TOLERANCE
|
||||
);
|
||||
assert!(
|
||||
(n.calc_sigma_dp(f64::sqrt(100.0), 0.1, 0.5, DEFAULT_TOLERANCE) - 7.016745810753165).abs()
|
||||
<= DEFAULT_TOLERANCE
|
||||
);
|
||||
assert!(
|
||||
(n.calc_sigma_dp(f64::sqrt(0.1), 0.1, 0.5, DEFAULT_TOLERANCE) - 0.221888985244248).abs()
|
||||
<= DEFAULT_TOLERANCE
|
||||
);
|
||||
assert!(
|
||||
(n.calc_sigma_dp(f64::sqrt(0.1), 0.1, 1.0 / 20000.0, DEFAULT_TOLERANCE)
|
||||
- 8.370597761781507)
|
||||
.abs()
|
||||
<= DEFAULT_TOLERANCE
|
||||
);
|
||||
}
|
|
@ -1,5 +1,3 @@
|
|||
mod analytic_gaussian;
|
||||
|
||||
mod noise_aggregator;
|
||||
|
||||
mod percentile;
|
||||
|
|
|
@ -23,7 +23,7 @@ fn get_noise_aggregator() -> NoiseAggregator {
|
|||
0,
|
||||
),
|
||||
3,
|
||||
&DpParameters::new(1.0, 0.001, 99, 0.1, None, None),
|
||||
&DpParameters::new(1.0, 99, 0.1, Some(0.001), None, None),
|
||||
NoisyCountThreshold::Fixed(InputValueByLen::default()),
|
||||
)
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "pac-synth"
|
||||
version = "0.0.7"
|
||||
version = "0.0.8"
|
||||
license = "MIT"
|
||||
description = "Private Accurate Combination (PAC) Synthesizers"
|
||||
repository = "https://github.com/microsoft/synthetic-data-showcase"
|
||||
|
@ -12,7 +12,7 @@ crate-type = ["cdylib"]
|
|||
|
||||
[dependencies]
|
||||
log = { version = "0.4", features = ["std"] }
|
||||
pyo3 = { version = "0.17", features = ["extension-module", "abi3-py37"] }
|
||||
pyo3 = { version = "0.18", features = ["extension-module", "abi3-py37"] }
|
||||
sds-core = { path = "../core", features = ["pyo3", "rayon"] }
|
||||
serde = { version = "1.0", features = [ "derive", "rc" ] }
|
||||
serde_json = { version = "1.0" }
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -161,7 +161,7 @@
|
|||
" <th>count</th>\n",
|
||||
" <td>6000.000000</td>\n",
|
||||
" <td>6000.000000</td>\n",
|
||||
" <td>6000.000000</td>\n",
|
||||
" <td>6000.00000</td>\n",
|
||||
" <td>6000.000000</td>\n",
|
||||
" <td>6000.000000</td>\n",
|
||||
" <td>6000.000000</td>\n",
|
||||
|
@ -172,35 +172,35 @@
|
|||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>1.003000</td>\n",
|
||||
" <td>2.612500</td>\n",
|
||||
" <td>4.557333</td>\n",
|
||||
" <td>0.494000</td>\n",
|
||||
" <td>0.517833</td>\n",
|
||||
" <td>0.501667</td>\n",
|
||||
" <td>0.497333</td>\n",
|
||||
" <td>0.494333</td>\n",
|
||||
" <td>0.513833</td>\n",
|
||||
" <td>0.496167</td>\n",
|
||||
" <td>0.992333</td>\n",
|
||||
" <td>2.646333</td>\n",
|
||||
" <td>4.57900</td>\n",
|
||||
" <td>0.502333</td>\n",
|
||||
" <td>0.492667</td>\n",
|
||||
" <td>0.497000</td>\n",
|
||||
" <td>0.498833</td>\n",
|
||||
" <td>0.501500</td>\n",
|
||||
" <td>0.487500</td>\n",
|
||||
" <td>0.493000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>0.810206</td>\n",
|
||||
" <td>2.123228</td>\n",
|
||||
" <td>3.324338</td>\n",
|
||||
" <td>0.500006</td>\n",
|
||||
" <td>0.499724</td>\n",
|
||||
" <td>0.816324</td>\n",
|
||||
" <td>2.107129</td>\n",
|
||||
" <td>3.32665</td>\n",
|
||||
" <td>0.500036</td>\n",
|
||||
" <td>0.499988</td>\n",
|
||||
" <td>0.500033</td>\n",
|
||||
" <td>0.500040</td>\n",
|
||||
" <td>0.500039</td>\n",
|
||||
" <td>0.500035</td>\n",
|
||||
" <td>0.500010</td>\n",
|
||||
" <td>0.499850</td>\n",
|
||||
" <td>0.500027</td>\n",
|
||||
" <td>0.499885</td>\n",
|
||||
" <td>0.499993</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.00000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
|
@ -212,8 +212,8 @@
|
|||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.00000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
|
@ -225,21 +225,21 @@
|
|||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>4.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>3.000000</td>\n",
|
||||
" <td>5.00000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" <td>0.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>5.000000</td>\n",
|
||||
" <td>7.000000</td>\n",
|
||||
" <td>7.00000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
|
@ -252,7 +252,7 @@
|
|||
" <th>max</th>\n",
|
||||
" <td>2.000000</td>\n",
|
||||
" <td>6.000000</td>\n",
|
||||
" <td>10.000000</td>\n",
|
||||
" <td>10.00000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
" <td>1.000000</td>\n",
|
||||
|
@ -266,23 +266,23 @@
|
|||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" H1 H2 H3 H4 H5 \\\n",
|
||||
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
|
||||
"mean 1.003000 2.612500 4.557333 0.494000 0.517833 \n",
|
||||
"std 0.810206 2.123228 3.324338 0.500006 0.499724 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 2.000000 4.000000 0.000000 1.000000 \n",
|
||||
"75% 2.000000 5.000000 7.000000 1.000000 1.000000 \n",
|
||||
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
|
||||
" H1 H2 H3 H4 H5 \\\n",
|
||||
"count 6000.000000 6000.000000 6000.00000 6000.000000 6000.000000 \n",
|
||||
"mean 0.992333 2.646333 4.57900 0.502333 0.492667 \n",
|
||||
"std 0.816324 2.107129 3.32665 0.500036 0.499988 \n",
|
||||
"min 0.000000 0.000000 0.00000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 1.000000 1.00000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 3.000000 5.00000 1.000000 0.000000 \n",
|
||||
"75% 2.000000 5.000000 7.00000 1.000000 1.000000 \n",
|
||||
"max 2.000000 6.000000 10.00000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" H6 H7 H8 H9 H10 \n",
|
||||
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
|
||||
"mean 0.501667 0.497333 0.494333 0.513833 0.496167 \n",
|
||||
"std 0.500039 0.500035 0.500010 0.499850 0.500027 \n",
|
||||
"mean 0.497000 0.498833 0.501500 0.487500 0.493000 \n",
|
||||
"std 0.500033 0.500040 0.500039 0.499885 0.499993 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 0.000000 0.000000 1.000000 0.000000 \n",
|
||||
"50% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
|
||||
"75% 1.000000 1.000000 1.000000 1.000000 1.000000 \n",
|
||||
"max 1.000000 1.000000 1.000000 1.000000 1.000000 "
|
||||
]
|
||||
|
@ -337,42 +337,42 @@
|
|||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6001.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" <td>6030.000000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td>0.976671</td>\n",
|
||||
" <td>2.495084</td>\n",
|
||||
" <td>4.334778</td>\n",
|
||||
" <td>0.453091</td>\n",
|
||||
" <td>0.478587</td>\n",
|
||||
" <td>0.457424</td>\n",
|
||||
" <td>0.461256</td>\n",
|
||||
" <td>0.462923</td>\n",
|
||||
" <td>0.473254</td>\n",
|
||||
" <td>0.465756</td>\n",
|
||||
" <td>0.937977</td>\n",
|
||||
" <td>2.462023</td>\n",
|
||||
" <td>4.250083</td>\n",
|
||||
" <td>0.477944</td>\n",
|
||||
" <td>0.470149</td>\n",
|
||||
" <td>0.462355</td>\n",
|
||||
" <td>0.478441</td>\n",
|
||||
" <td>0.475788</td>\n",
|
||||
" <td>0.465008</td>\n",
|
||||
" <td>0.469818</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td>0.815346</td>\n",
|
||||
" <td>2.140020</td>\n",
|
||||
" <td>3.378620</td>\n",
|
||||
" <td>0.497836</td>\n",
|
||||
" <td>0.499583</td>\n",
|
||||
" <td>0.498225</td>\n",
|
||||
" <td>0.498538</td>\n",
|
||||
" <td>0.498665</td>\n",
|
||||
" <td>0.499326</td>\n",
|
||||
" <td>0.498868</td>\n",
|
||||
" <td>0.825132</td>\n",
|
||||
" <td>2.132173</td>\n",
|
||||
" <td>3.401991</td>\n",
|
||||
" <td>0.499555</td>\n",
|
||||
" <td>0.499150</td>\n",
|
||||
" <td>0.498622</td>\n",
|
||||
" <td>0.499576</td>\n",
|
||||
" <td>0.499455</td>\n",
|
||||
" <td>0.498815</td>\n",
|
||||
" <td>0.499130</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
|
@ -445,9 +445,9 @@
|
|||
],
|
||||
"text/plain": [
|
||||
" H1 H2 H3 H4 H5 \\\n",
|
||||
"count 6001.000000 6001.000000 6001.000000 6001.000000 6001.000000 \n",
|
||||
"mean 0.976671 2.495084 4.334778 0.453091 0.478587 \n",
|
||||
"std 0.815346 2.140020 3.378620 0.497836 0.499583 \n",
|
||||
"count 6030.000000 6030.000000 6030.000000 6030.000000 6030.000000 \n",
|
||||
"mean 0.937977 2.462023 4.250083 0.477944 0.470149 \n",
|
||||
"std 0.825132 2.132173 3.401991 0.499555 0.499150 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
|
||||
"50% 1.000000 2.000000 4.000000 0.000000 0.000000 \n",
|
||||
|
@ -455,9 +455,9 @@
|
|||
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
|
||||
"\n",
|
||||
" H6 H7 H8 H9 H10 \n",
|
||||
"count 6001.000000 6001.000000 6001.000000 6001.000000 6001.000000 \n",
|
||||
"mean 0.457424 0.461256 0.462923 0.473254 0.465756 \n",
|
||||
"std 0.498225 0.498538 0.498665 0.499326 0.498868 \n",
|
||||
"count 6030.000000 6030.000000 6030.000000 6030.000000 6030.000000 \n",
|
||||
"mean 0.462355 0.478441 0.475788 0.465008 0.469818 \n",
|
||||
"std 0.498622 0.499576 0.499455 0.498815 0.499130 \n",
|
||||
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
"50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
|
||||
|
@ -491,7 +491,7 @@
|
|||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.4"
|
||||
"version": "3.10.6"
|
||||
},
|
||||
"orig_nbformat": 4,
|
||||
"vscode": {
|
||||
|
|
|
@ -11,7 +11,7 @@ use serde::Serialize;
|
|||
/// By default, the builder will be constructed with default values:
|
||||
/// - reporting_length: 3
|
||||
/// - epsilon: 4.0
|
||||
/// - delta: will be set in runtime to 1 / (ln(number_of_records) * number_of_records)
|
||||
/// - delta: will be set in runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
|
||||
/// - percentile_percentage: 99
|
||||
/// - percentile_epsilon_proportion: 0.01
|
||||
/// - accuracy_mode: AccuracyMode.prioritize_long_combinations()
|
||||
|
@ -307,8 +307,8 @@ impl DpAggregateSeededParametersBuilder {
|
|||
}
|
||||
|
||||
if let Some(delta) = self._delta {
|
||||
if delta <= 0.0 {
|
||||
return Err(PyValueError::new_err("delta must be > 0"));
|
||||
if delta <= 0.0 || delta >= 1.0 {
|
||||
return Err(PyValueError::new_err("delta must be > 0 and < 1"));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -333,12 +333,6 @@ impl DpAggregateSeededParametersBuilder {
|
|||
));
|
||||
}
|
||||
|
||||
if self._percentile_epsilon_proportion + self._number_of_records_epsilon_proportion >= 1.0 {
|
||||
return Err(PyValueError::new_err(
|
||||
"percentile_epsilon_proportion + number_of_records_epsilon_proportion must be < 1",
|
||||
));
|
||||
}
|
||||
|
||||
self._fabrication_mode.validate(self._reporting_length)?;
|
||||
|
||||
if self._weight_selection_percentile > 100 {
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
use super::{DpAggregateSeededParameters, DpAggregateSeededParametersBuilder};
|
||||
use pyo3::{exceptions::PyRuntimeError, prelude::*};
|
||||
use sds_core::{
|
||||
data_block::DataBlock,
|
||||
dp::DpParameters,
|
||||
processing::{
|
||||
aggregator::{AggregatedData, AggregatesCountStringMap, Aggregator},
|
||||
|
@ -72,9 +71,9 @@ impl DpAggregateSeededSynthesizer {
|
|||
self._parameters.reporting_length,
|
||||
&DpParameters::new(
|
||||
self._parameters.epsilon,
|
||||
self.delta_value_or_default(&dataset.data_block),
|
||||
self._parameters.percentile_percentage,
|
||||
self._parameters.percentile_epsilon_proportion,
|
||||
self._parameters.delta,
|
||||
Some(self._parameters.sigma_proportions.clone()),
|
||||
Some(self._parameters.number_of_records_epsilon_proportion),
|
||||
),
|
||||
|
@ -167,20 +166,6 @@ impl DpAggregateSeededSynthesizer {
|
|||
}
|
||||
}
|
||||
|
||||
impl DpAggregateSeededSynthesizer {
|
||||
#[inline]
|
||||
fn delta_value_or_default(&self, data_block: &DataBlock) -> f64 {
|
||||
let number_of_records = data_block.number_of_records();
|
||||
let number_of_records_f64 = number_of_records as f64;
|
||||
|
||||
self._parameters.delta.unwrap_or(if number_of_records > 0 {
|
||||
1.0 / (number_of_records_f64.ln() * number_of_records_f64)
|
||||
} else {
|
||||
0.0
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn register(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<DpAggregateSeededSynthesizer>()?;
|
||||
Ok(())
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "sds-pyo3"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
license = "MIT"
|
||||
description = "Python bindings for the sds-core library"
|
||||
repository = "https://github.com/microsoft/synthetic-data-showcase"
|
||||
|
@ -13,6 +13,6 @@ crate-type = ["cdylib"]
|
|||
[dependencies]
|
||||
log = { version = "0.4", features = ["std"] }
|
||||
csv = { version = "1.1" }
|
||||
pyo3 = { version = "0.17", features = ["extension-module"] }
|
||||
pyo3 = { version = "0.18", features = ["extension-module"] }
|
||||
sds-core = { path = "../core", features = ["pyo3", "rayon"] }
|
||||
env_logger = { version = "0.9" }
|
||||
env_logger = { version = "0.10" }
|
|
@ -48,11 +48,11 @@ impl SDSProcessor {
|
|||
pub fn new(
|
||||
path: &str,
|
||||
delimiter: char,
|
||||
subject_id: Option<String>,
|
||||
use_columns: Vec<String>,
|
||||
multi_value_columns: HashMap<String, String>,
|
||||
sensitive_zeros: Vec<String>,
|
||||
record_limit: usize,
|
||||
subject_id: Option<String>,
|
||||
) -> Result<SDSProcessor, CsvDataBlockCreatorError> {
|
||||
CsvDataBlockCreator::create(
|
||||
ReaderBuilder::new()
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
[package]
|
||||
name = "sds-wasm"
|
||||
version = "1.8.6"
|
||||
version = "1.9.0"
|
||||
license = "MIT"
|
||||
description = "Web Assembly bindings for the sds-core library"
|
||||
repository = "https://github.com/microsoft/synthetic-data-showcase"
|
||||
|
|
|
@ -14,14 +14,16 @@ pub struct WasmAggregateResult {
|
|||
pub(crate) aggregated_data: Arc<AggregatedData>,
|
||||
}
|
||||
|
||||
impl WasmAggregateResult {
|
||||
impl Default for WasmAggregateResult {
|
||||
#[inline]
|
||||
pub fn default() -> WasmAggregateResult {
|
||||
WasmAggregateResult {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
aggregated_data: Arc::new(AggregatedData::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl WasmAggregateResult {
|
||||
#[inline]
|
||||
pub fn new(aggregated_data: Arc<AggregatedData>) -> WasmAggregateResult {
|
||||
WasmAggregateResult { aggregated_data }
|
||||
|
|
|
@ -5,6 +5,7 @@ use wasm_bindgen::{prelude::*, JsCast};
|
|||
|
||||
use crate::utils::js::{JsGenerateResult, JsResult};
|
||||
|
||||
#[derive(Default)]
|
||||
#[wasm_bindgen]
|
||||
pub struct WasmGenerateResult {
|
||||
generated_data: GeneratedData,
|
||||
|
@ -12,14 +13,6 @@ pub struct WasmGenerateResult {
|
|||
}
|
||||
|
||||
impl WasmGenerateResult {
|
||||
#[inline]
|
||||
pub fn default() -> WasmGenerateResult {
|
||||
WasmGenerateResult {
|
||||
generated_data: GeneratedData::default(),
|
||||
resolution: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn new(generated_data: GeneratedData, resolution: usize) -> WasmGenerateResult {
|
||||
WasmGenerateResult {
|
||||
|
|
|
@ -36,10 +36,10 @@ pub struct WasmNavigateResult {
|
|||
column_index_by_name: ColumnIndexByName,
|
||||
}
|
||||
|
||||
impl WasmNavigateResult {
|
||||
impl Default for WasmNavigateResult {
|
||||
#[inline]
|
||||
pub fn default() -> WasmNavigateResult {
|
||||
WasmNavigateResult::new(
|
||||
fn default() -> Self {
|
||||
Self::new(
|
||||
HeaderNames::default(),
|
||||
Arc::new(DataBlock::default()),
|
||||
AttributeRowsByColumnMap::default(),
|
||||
|
@ -49,7 +49,9 @@ impl WasmNavigateResult {
|
|||
ColumnIndexByName::default(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl WasmNavigateResult {
|
||||
#[inline]
|
||||
pub fn new(
|
||||
header_names: HeaderNames,
|
||||
|
|
|
@ -28,6 +28,7 @@ use crate::{
|
|||
},
|
||||
};
|
||||
|
||||
#[derive(Default)]
|
||||
#[wasm_bindgen]
|
||||
pub struct WasmSdsContext {
|
||||
sensitive_data_params: Option<WasmCsvDataParameters>,
|
||||
|
@ -45,19 +46,8 @@ pub struct WasmSdsContext {
|
|||
#[wasm_bindgen]
|
||||
impl WasmSdsContext {
|
||||
#[wasm_bindgen(constructor)]
|
||||
pub fn default() -> Self {
|
||||
WasmSdsContext {
|
||||
sensitive_data_params: None,
|
||||
sensitive_processor: None,
|
||||
synthetic_processor: None,
|
||||
sensitive_aggregate_result: None,
|
||||
reportable_aggregate_result: None,
|
||||
synthetic_aggregate_result: None,
|
||||
generate_result: None,
|
||||
pre_computed_aggregates: false,
|
||||
evaluate_result: None,
|
||||
navigate_result: None,
|
||||
}
|
||||
pub fn new_default() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
#[wasm_bindgen(js_name = "clear")]
|
||||
|
|
|
@ -29,9 +29,10 @@ pub struct WasmSdsProcessor {
|
|||
pub(crate) data_block: Arc<DataBlock>,
|
||||
}
|
||||
|
||||
impl WasmSdsProcessor {
|
||||
pub fn default() -> WasmSdsProcessor {
|
||||
WasmSdsProcessor {
|
||||
impl Default for WasmSdsProcessor {
|
||||
#[inline]
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
data_block: Arc::new(DataBlock::default()),
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,11 +46,11 @@ export interface INoisyCountThreshold {
|
|||
|
||||
export interface IDpParameters {
|
||||
epsilon: number
|
||||
delta: number
|
||||
percentilePercentage: number
|
||||
percentileEpsilonProportion: number
|
||||
numberOfRecordsEpsilonProportion?: number
|
||||
delta?: number
|
||||
sigmaProportions?: number[]
|
||||
numberOfRecordsEpsilonProportion?: number
|
||||
}
|
||||
|
||||
export interface IOversamplingParameters {
|
||||
|
|
|
@ -53,11 +53,11 @@ def aggregate(config):
|
|||
sds_processor = sds.SDSProcessor(
|
||||
sensitive_microdata_path,
|
||||
sensitive_microdata_delimiter,
|
||||
subject_id,
|
||||
use_columns,
|
||||
multi_value_columns,
|
||||
sensitive_zeros,
|
||||
max(record_limit, 0)
|
||||
max(record_limit, 0),
|
||||
subject_id
|
||||
)
|
||||
|
||||
aggregated_data = sds_processor.aggregate(
|
||||
|
@ -75,20 +75,20 @@ def aggregate(config):
|
|||
aggregated_data.write_to_json(sensitive_aggregated_data_json)
|
||||
|
||||
if dp_aggregates:
|
||||
if not delta_factor:
|
||||
delta_factor = math.log(sds_processor.number_of_records())
|
||||
|
||||
noise_delta = 1 / \
|
||||
(delta_factor * sds_processor.number_of_records())
|
||||
if delta_factor:
|
||||
noise_delta = 1 / \
|
||||
(delta_factor * sds_processor.number_of_records())
|
||||
else:
|
||||
noise_delta = None
|
||||
|
||||
if noise_threshold_type == 'fixed':
|
||||
aggregated_data = sds_processor.aggregate_with_dp_fixed_threshold(
|
||||
reporting_length,
|
||||
sds.DpParameters(
|
||||
noise_epsilon,
|
||||
noise_delta,
|
||||
percentile_percentage,
|
||||
percentile_epsilon_proportion,
|
||||
noise_delta,
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon_proportion
|
||||
),
|
||||
|
@ -99,9 +99,9 @@ def aggregate(config):
|
|||
reporting_length,
|
||||
sds.DpParameters(
|
||||
noise_epsilon,
|
||||
noise_delta,
|
||||
percentile_percentage,
|
||||
percentile_epsilon_proportion,
|
||||
noise_delta,
|
||||
sigma_proportions,
|
||||
number_of_records_epsilon_proportion
|
||||
),
|
||||
|
|
|
@ -84,11 +84,11 @@ class Evaluator:
|
|||
self.syn_sds_processor = sds.SDSProcessor(
|
||||
self.synthetic_microdata_path,
|
||||
"\t",
|
||||
None, # the synthetic data does not have an ID
|
||||
[], # use all columns from synthetic file
|
||||
[], # use all columns from synthetic file
|
||||
self.multi_value_columns,
|
||||
self.sensitive_zeros,
|
||||
0 # use all records from synthetic file
|
||||
0, # use all records from synthetic file
|
||||
None # the synthetic data does not have an ID
|
||||
)
|
||||
self.syn_aggregated_data = self.syn_sds_processor.aggregate(
|
||||
self.reporting_length
|
||||
|
|
|
@ -46,11 +46,11 @@ def generate(config):
|
|||
sds_processor = sds.SDSProcessor(
|
||||
sensitive_microdata_path,
|
||||
sensitive_microdata_delimiter,
|
||||
subject_id,
|
||||
use_columns,
|
||||
multi_value_columns,
|
||||
sensitive_zeros,
|
||||
max(record_limit, 0)
|
||||
max(record_limit, 0),
|
||||
subject_id
|
||||
)
|
||||
|
||||
if synthesis_mode == 'unseeded':
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
{
|
||||
"name": "webapp",
|
||||
"version": "1.8.6",
|
||||
"version": "1.9.0",
|
||||
"private": true,
|
||||
"license": "MIT",
|
||||
"main": "src/index.ts",
|
||||
|
@ -22,51 +22,50 @@
|
|||
"@essex/arquero": "^2.0.3",
|
||||
"@essex/arquero-react": "^1.1.0",
|
||||
"@essex/sds-core": "workspace:^",
|
||||
"@fluentui/font-icons-mdl2": "^8.5.4",
|
||||
"@fluentui/react": "^8.103.9",
|
||||
"@fluentui/react-hooks": "^8.6.14",
|
||||
"@fluentui/utilities": "^8.13.4",
|
||||
"@fluentui/font-icons-mdl2": "^8.5.13",
|
||||
"@fluentui/react": "^8.106.7",
|
||||
"@fluentui/react-hooks": "^8.6.20",
|
||||
"@fluentui/utilities": "^8.13.9",
|
||||
"@sds/components": "workspace:^",
|
||||
"@thematic/core": "^3.1.0",
|
||||
"@thematic/d3": "^2.0.13",
|
||||
"@thematic/fluent": "^4.1.0",
|
||||
"@thematic/react": "^2.1.0",
|
||||
"@thematic/core": "^4.0.4",
|
||||
"@thematic/d3": "^2.0.19",
|
||||
"@thematic/fluent": "^5.0.5",
|
||||
"@thematic/react": "^2.1.6",
|
||||
"@types/mime": "^3.0.1",
|
||||
"@uifabric/icons": "7.9.5",
|
||||
"arquero": "^5.1.0",
|
||||
"chart.js": "^3.9.1",
|
||||
"chart.js": "^4.2.1",
|
||||
"chartjs-plugin-datalabels": "^2.2.0",
|
||||
"comlink": "^4.3.1",
|
||||
"dompurify": "^2.4.1",
|
||||
"comlink": "^4.4.1",
|
||||
"dompurify": "^3.0.1",
|
||||
"formik": "^2.2.9",
|
||||
"lodash": "^4.17.21",
|
||||
"marked": "^4.2.4",
|
||||
"marked": "^4.2.12",
|
||||
"mime": "^3.0.0",
|
||||
"react": "^17.0.2",
|
||||
"react-chartjs-2": "^4.3.1",
|
||||
"react-chartjs-2": "^5.2.0",
|
||||
"react-dom": "^17.0.2",
|
||||
"react-is": "^17.0.2",
|
||||
"react-router-dom": "^6.4.5",
|
||||
"recoil": "^0.7.6",
|
||||
"styled-components": "^5.3.6",
|
||||
"uuid": "^9.0.0",
|
||||
"yup": "^0.32.11"
|
||||
"react-router-dom": "^6.9.0",
|
||||
"recoil": "^0.7.7",
|
||||
"styled-components": "^5.3.9",
|
||||
"uuid": "^9.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/dompurify": "^2.4.0",
|
||||
"@types/dompurify": "^3.0.0",
|
||||
"@types/lodash": "^4.14.191",
|
||||
"@types/marked": "^4.0.8",
|
||||
"@types/node": "^16.18.9",
|
||||
"@types/react": "^17.0.52",
|
||||
"@types/react-dom": "^17.0.18",
|
||||
"@types/node": "^16.18.16",
|
||||
"@types/react": "^17.0.53",
|
||||
"@types/react-dom": "^17.0.19",
|
||||
"@types/react-is": "^17.0.3",
|
||||
"@types/recoil": "^0.0.9",
|
||||
"@types/styled-components": "^5.1.26",
|
||||
"@types/uuid": "^9.0.0",
|
||||
"@vitejs/plugin-react": "^3.0.0",
|
||||
"@types/uuid": "^9.0.1",
|
||||
"@vitejs/plugin-react": "^3.1.0",
|
||||
"ts-node": "^10.9.1",
|
||||
"typescript": "^4.8.4",
|
||||
"vite": "^4.0.1",
|
||||
"vite-tsconfig-paths": "^4.0.3"
|
||||
"typescript": "^5.0.2",
|
||||
"vite": "^4.2.0",
|
||||
"vite-tsconfig-paths": "^4.0.7"
|
||||
}
|
||||
}
|
||||
|
|
Двоичные данные
packages/webapp/public/synthetic_data_showcase.pbit
Двоичные данные
packages/webapp/public/synthetic_data_showcase.pbit
Двоичный файл не отображается.
|
@ -20,6 +20,7 @@ export const StyleContext: React.FC<
|
|||
> = memo(function StyleContext({ children }) {
|
||||
const theme = useThematic()
|
||||
const fluentTheme = useMemo(() => loadFluentTheme(theme), [theme])
|
||||
|
||||
return (
|
||||
<>
|
||||
{/* core thematic for charting colors and imperative use */}
|
||||
|
|
|
@ -2,9 +2,14 @@
|
|||
* Copyright (c) Microsoft. All rights reserved.
|
||||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import { useThematic } from '@thematic/react'
|
||||
import { useMemo } from 'react'
|
||||
|
||||
import {
|
||||
useNominalBoldScale,
|
||||
useNominalMutedScale,
|
||||
useNominalScale,
|
||||
} from '~utils'
|
||||
|
||||
export type BarColors = {
|
||||
normal: string
|
||||
selected: string
|
||||
|
@ -12,26 +17,32 @@ export type BarColors = {
|
|||
}
|
||||
|
||||
export function useEstimatedBarChartColors(): BarColors {
|
||||
const thematic = useThematic()
|
||||
const nominalScale = useNominalScale()
|
||||
const nominalBoldScale = useNominalBoldScale()
|
||||
const nominalMutedScale = useNominalMutedScale()
|
||||
|
||||
return useMemo(
|
||||
() => ({
|
||||
normal: thematic.scales().nominal().toArray()[0],
|
||||
selected: thematic.scales().nominalBold().toArray()[0],
|
||||
suppressed: thematic.scales().nominalMuted().toArray()[0],
|
||||
normal: nominalScale[0],
|
||||
selected: nominalBoldScale[0],
|
||||
suppressed: nominalMutedScale[0],
|
||||
}),
|
||||
[thematic],
|
||||
[nominalScale, nominalBoldScale, nominalMutedScale],
|
||||
)
|
||||
}
|
||||
|
||||
export function useActualBarChartColors(): BarColors {
|
||||
const thematic = useThematic()
|
||||
const nominalScale = useNominalScale()
|
||||
const nominalBoldScale = useNominalBoldScale()
|
||||
const nominalMutedScale = useNominalMutedScale()
|
||||
|
||||
return useMemo(
|
||||
() => ({
|
||||
normal: thematic.scales().nominal().toArray()[1],
|
||||
selected: thematic.scales().nominalBold().toArray()[1],
|
||||
suppressed: thematic.scales().nominalMuted().toArray()[1],
|
||||
normal: nominalScale[1],
|
||||
selected: nominalBoldScale[1],
|
||||
suppressed: nominalMutedScale[1],
|
||||
}),
|
||||
[thematic],
|
||||
[nominalScale, nominalBoldScale, nominalMutedScale],
|
||||
)
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,8 @@ import type { IMetricByKey } from '@essex/sds-core'
|
|||
import { memo } from 'react'
|
||||
import { Chart } from 'react-chartjs-2'
|
||||
|
||||
import { useNominalScale } from '~utils'
|
||||
|
||||
export interface IMetricsChart {
|
||||
label: string
|
||||
metrics: IMetricByKey
|
||||
|
@ -28,12 +30,14 @@ function add_chart(
|
|||
chart: IMetricsChart,
|
||||
position: 'left' | 'right',
|
||||
labels: number[],
|
||||
color: string,
|
||||
) {
|
||||
datasets.push({
|
||||
label: chart.label,
|
||||
type: chart.type,
|
||||
data: labels.map(l => chart.metrics[l] ?? 0),
|
||||
yAxisID: position,
|
||||
backgroundColor: color,
|
||||
})
|
||||
scales[position] = {
|
||||
type: 'linear',
|
||||
|
@ -55,13 +59,14 @@ export const MetricsChart: React.FC<MetricsChartProps> = memo(
|
|||
}: MetricsChartProps) {
|
||||
const datasets = []
|
||||
const scales = {}
|
||||
const color = useNominalScale()[0]
|
||||
|
||||
if (leftChart) {
|
||||
add_chart(datasets, scales, leftChart, 'left', labels)
|
||||
add_chart(datasets, scales, leftChart, 'left', labels, color)
|
||||
}
|
||||
|
||||
if (rightChart) {
|
||||
add_chart(datasets, scales, rightChart, 'right', labels)
|
||||
add_chart(datasets, scales, rightChart, 'right', labels, color)
|
||||
}
|
||||
|
||||
return (
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import { useThematic } from '@thematic/react'
|
||||
import type { _DeepPartialObject } from 'chart.js/types/utils'
|
||||
import type { Options } from 'chartjs-plugin-datalabels/types/options'
|
||||
import type { BaseSyntheticEvent, WheelEvent } from 'react'
|
||||
import { useCallback, useMemo } from 'react'
|
||||
|
@ -26,7 +25,7 @@ export interface ChartJsDatasetConfig {
|
|||
}
|
||||
|
||||
export interface DataLabelsConfig {
|
||||
datalabels?: _DeepPartialObject<Options>
|
||||
datalabels?: Options
|
||||
}
|
||||
|
||||
function useBarConfig(
|
||||
|
|
|
@ -73,7 +73,7 @@ const Container = styled.div`
|
|||
const NavBarStack = styled(Stack)`
|
||||
height: 100%;
|
||||
margin-left: ${({ theme }: { theme: FluentTheme }) => theme.spacing?.l2};
|
||||
margin-right: ${({ theme }: { theme: FluentTheme }) => theme.spacing?.l2}; ;
|
||||
margin-right: ${({ theme }: { theme: FluentTheme }) => theme.spacing?.l2};
|
||||
`
|
||||
|
||||
const NavBarStackItem = styled(Stack.Item)`
|
||||
|
|
|
@ -121,7 +121,7 @@ export const AggregateStatistics: FC = memo(function AggregateStatistics() {
|
|||
}
|
||||
label={'Most linkable columns'}
|
||||
containerHeight={220}
|
||||
barHeight={10}
|
||||
barHeight={30}
|
||||
tooltipFormatter={columnTooltipFormatter}
|
||||
/>
|
||||
</ChartItem>
|
||||
|
@ -132,7 +132,7 @@ export const AggregateStatistics: FC = memo(function AggregateStatistics() {
|
|||
}
|
||||
label={'Most linkable attributes'}
|
||||
containerHeight={220}
|
||||
barHeight={10}
|
||||
barHeight={30}
|
||||
tooltipFormatter={attributeTooltipFormatter}
|
||||
/>
|
||||
</ChartItem>
|
||||
|
|
|
@ -11,6 +11,8 @@ import type { FC } from 'react'
|
|||
import { memo, useCallback, useMemo } from 'react'
|
||||
import { Bar } from 'react-chartjs-2'
|
||||
|
||||
import { useNominalBoldScale, useNominalScale } from '~utils'
|
||||
|
||||
import { ChartContainer } from './ContributionChart.styles.js'
|
||||
import type { ContributionChartProps } from './ContributionChart.types.js'
|
||||
|
||||
|
@ -47,12 +49,14 @@ export const ContributionChart: FC<ContributionChartProps> = memo(
|
|||
[labels, onClick],
|
||||
)
|
||||
const thematic = useThematic()
|
||||
const nominalScale = useNominalScale()
|
||||
const nominalBoldScale = useNominalBoldScale()
|
||||
const backgroundColor = useMemo(() => {
|
||||
const normalColor = thematic.scales().nominal().toArray()[0]
|
||||
const selectedColor = thematic.scales().nominalBold().toArray()[0]
|
||||
const normalColor = nominalScale[0]
|
||||
const selectedColor = nominalBoldScale[0]
|
||||
|
||||
return labels.map(l => (l === selectedKey ? selectedColor : normalColor))
|
||||
}, [labels, thematic, selectedKey])
|
||||
}, [labels, nominalScale, nominalBoldScale, selectedKey])
|
||||
const labelColors = useMemo(() => {
|
||||
const greys = thematic.scales().greys().toArray()
|
||||
|
||||
|
@ -67,70 +71,77 @@ export const ContributionChart: FC<ContributionChartProps> = memo(
|
|||
maxHeight: containerHeight,
|
||||
}}
|
||||
>
|
||||
<Bar
|
||||
height={Math.max(barHeight * data.length, barHeight)}
|
||||
data={{
|
||||
labels: labels,
|
||||
datasets: [
|
||||
<div
|
||||
style={{
|
||||
height: Math.max(barHeight * data.length, barHeight),
|
||||
}}
|
||||
>
|
||||
<Bar
|
||||
data={{
|
||||
labels: labels,
|
||||
datasets: [
|
||||
{
|
||||
label: label,
|
||||
data: data,
|
||||
xAxisID: 'xAxis',
|
||||
yAxisID: 'yAxis',
|
||||
backgroundColor,
|
||||
},
|
||||
],
|
||||
}}
|
||||
plugins={[
|
||||
ChartDataLabels as Plugin<'bar'>,
|
||||
{
|
||||
label: label,
|
||||
data: data,
|
||||
xAxisID: 'xAxis',
|
||||
yAxisID: 'yAxis',
|
||||
backgroundColor,
|
||||
},
|
||||
],
|
||||
}}
|
||||
plugins={[
|
||||
ChartDataLabels as Plugin<'bar'>,
|
||||
{
|
||||
id: 'event-catcher',
|
||||
beforeEvent(chart, args, _pluginOptions) {
|
||||
// on hover at options will not handle well the case
|
||||
// where the mouse leaves the bar
|
||||
if (args.event.type === 'mousemove') {
|
||||
const elements = chart.getActiveElements()
|
||||
chart.canvas.style.cursor =
|
||||
elements && elements[0] ? 'pointer' : 'default'
|
||||
}
|
||||
},
|
||||
},
|
||||
]}
|
||||
options={{
|
||||
plugins: {
|
||||
legend: {
|
||||
display: false,
|
||||
},
|
||||
datalabels: {
|
||||
anchor: 'start',
|
||||
align: 'end',
|
||||
offset: 5,
|
||||
formatter: value => `${value} %`,
|
||||
color: labelColors,
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: tooltipFormatter,
|
||||
id: 'event-catcher',
|
||||
beforeEvent(chart, args, _pluginOptions) {
|
||||
// on hover at options will not handle well the case
|
||||
// where the mouse leaves the bar
|
||||
if (args.event.type === 'mousemove') {
|
||||
const elements = chart.getActiveElements()
|
||||
chart.canvas.style.cursor =
|
||||
elements && elements[0] ? 'pointer' : 'default'
|
||||
}
|
||||
},
|
||||
},
|
||||
},
|
||||
indexAxis: 'y',
|
||||
scales: {
|
||||
xAxis: {
|
||||
display: false,
|
||||
grid: {
|
||||
]}
|
||||
options={{
|
||||
responsive: true,
|
||||
maintainAspectRatio: false,
|
||||
plugins: {
|
||||
legend: {
|
||||
display: false,
|
||||
},
|
||||
},
|
||||
yAxis: {
|
||||
grid: {
|
||||
display: false,
|
||||
datalabels: {
|
||||
anchor: 'start',
|
||||
align: 'end',
|
||||
offset: 5,
|
||||
formatter: value => `${value} %`,
|
||||
color: labelColors,
|
||||
},
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: tooltipFormatter,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
onClick: handleClick,
|
||||
}}
|
||||
/>
|
||||
indexAxis: 'y',
|
||||
scales: {
|
||||
xAxis: {
|
||||
display: false,
|
||||
grid: {
|
||||
display: false,
|
||||
},
|
||||
},
|
||||
yAxis: {
|
||||
grid: {
|
||||
display: false,
|
||||
},
|
||||
},
|
||||
},
|
||||
onClick: handleClick,
|
||||
}}
|
||||
/>
|
||||
</div>
|
||||
</ChartContainer>
|
||||
</FlexContainer>
|
||||
)
|
||||
|
|
|
@ -177,27 +177,24 @@ function convertRawToSynthesisParameters(
|
|||
} as IAggregateSeededSynthesisParameters
|
||||
break
|
||||
case SynthesisMode.DP: {
|
||||
const deltaFactor =
|
||||
rawParams.deltaFactor === 0 && rawParams.recordLimit > 0
|
||||
? Math.log(rawParams.recordLimit)
|
||||
: rawParams.deltaFactor
|
||||
const noiseDelta =
|
||||
rawParams.deltaFactor > 0 && rawParams.recordLimit > 0
|
||||
? 1.0 / (rawParams.deltaFactor * rawParams.recordLimit)
|
||||
: undefined
|
||||
|
||||
ret = {
|
||||
...ret,
|
||||
dpParameters: {
|
||||
epsilon: rawParams.noiseEpsilon,
|
||||
delta:
|
||||
rawParams.recordLimit > 0
|
||||
? 1.0 / (deltaFactor * rawParams.recordLimit)
|
||||
: 0.0,
|
||||
percentilePercentage: rawParams.percentilePercentage,
|
||||
percentileEpsilonProportion: rawParams.percentileEpsilonProportion,
|
||||
numberOfRecordsEpsilonProportion:
|
||||
rawParams.numberOfRecordsEpsilonProportion,
|
||||
delta: noiseDelta,
|
||||
sigmaProportions: generateSigmaProportions(
|
||||
rawParams.reportingLength,
|
||||
rawParams.accuracyMode,
|
||||
),
|
||||
numberOfRecordsEpsilonProportion:
|
||||
rawParams.numberOfRecordsEpsilonProportion,
|
||||
},
|
||||
noiseThreshold: {
|
||||
type: 'Adaptive',
|
||||
|
|
|
@ -5,3 +5,5 @@ Factor used to calculate the delta DP parameter.
|
|||
If set to `0`, it will default at runtime to `ln(record limit)`, resulting in:
|
||||
|
||||
`Delta = 1 / (ln(record limit) * [record limit])`
|
||||
|
||||
When set to `0`, the record limit will also be protected with differential privacy, consuming a portion of the privacy budget. See the `Number of records epsilon proportion` parameter.
|
||||
|
|
|
@ -1 +1 @@
|
|||
Percentile selection is performed with differential privacy. This defines the proportion of the overall epsilon dedicated to this stage (0.1 means 10%).
|
||||
Percentile selection is performed with differential privacy. This defines the proportion of the overall epsilon dedicated to this stage (0.01 means 1%).
|
||||
|
|
|
@ -4,3 +4,4 @@
|
|||
*/
|
||||
export * from './arquero.js'
|
||||
export * from './env.js'
|
||||
export * from './thematic.js'
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
/*!
|
||||
* Copyright (c) Microsoft. All rights reserved.
|
||||
* Licensed under the MIT license. See LICENSE file in the project.
|
||||
*/
|
||||
import { useThematic } from '@thematic/react'
|
||||
import { useMemo } from 'react'
|
||||
|
||||
export function useNominalScale(): string[] {
|
||||
const themetic = useThematic()
|
||||
|
||||
return useMemo(() => {
|
||||
// adding blue from previous versions to keep consistency
|
||||
return ['rgb(128 172 247)', ...themetic.scales().nominal().toArray()]
|
||||
}, [themetic])
|
||||
}
|
||||
|
||||
export function useNominalBoldScale(): string[] {
|
||||
const themetic = useThematic()
|
||||
|
||||
return useMemo(() => {
|
||||
// adding blue from previous versions to keep consistency
|
||||
return ['rgb(0 95 174)', ...themetic.scales().nominalBold().toArray()]
|
||||
}, [themetic])
|
||||
}
|
||||
|
||||
export function useNominalMutedScale(): string[] {
|
||||
const themetic = useThematic()
|
||||
|
||||
return useMemo(() => {
|
||||
// adding blue from previous versions to keep consistency
|
||||
return ['rgb(207 212 228)', ...themetic.scales().nominalMuted().toArray()]
|
||||
}, [themetic])
|
||||
}
|
|
@ -2,7 +2,7 @@
|
|||
ARG REGISTRY
|
||||
|
||||
# --- compile wasm bindings from rust ---
|
||||
FROM ${REGISTRY}rust:1.60 as wasm-builder
|
||||
FROM ${REGISTRY}rust:1.64 as wasm-builder
|
||||
|
||||
# install wasm-pack to build wasm bindings
|
||||
RUN cargo install wasm-pack
|
||||
|
|
8121
yarn.lock
8121
yarn.lock
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Загрузка…
Ссылка в новой задаче