Merge pull request #120 from microsoft/update/core

Core lib, dependencies and documentation updates
This commit is contained in:
Rodrigo Martins Racanicci 2023-03-17 17:17:31 -03:00 committed by GitHub
Parent 7f7b25b849 b9d59ccf6c
Commit f93f02fca6
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
67 changed files: 4655 additions and 5814 deletions

3
.github/workflows/rust-ci.yml vendored
View file

@@ -50,8 +50,7 @@ jobs:
uses: actions/checkout@v3
- name: Run cargo clippy
# add "-D warning" so clippy warnings will result in pipeline failures
run: cargo clippy --
run: cargo clippy -- -D warnings
cargo_test:
name: Run cargo tests

2
.vscode/settings.json vendored
View file

@@ -27,5 +27,5 @@
"[python]": {
"editor.defaultFormatter": "ms-python.python"
},
"cSpell.enabled": true
"rust-analyzer.check.command": "clippy"
}

231
Cargo.lock generated
View file

@@ -4,11 +4,11 @@ version = 3
[[package]]
name = "ahash"
version = "0.7.6"
version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47"
checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
dependencies = [
"getrandom",
"cfg-if",
"once_cell",
"version_check",
]
@@ -46,7 +46,7 @@ version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8"
dependencies = [
"hermit-abi",
"hermit-abi 0.1.19",
"libc",
"winapi",
]
@@ -93,6 +93,12 @@ version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f5715e491b5a1598fc2bef5a606847b5dc1d48ea625bd3c02c00de8285591da"
[[package]]
name = "cc"
version = "1.0.79"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f"
[[package]]
name = "cfg-if"
version = "1.0.0"
@@ -154,7 +160,7 @@ dependencies = [
"cfg-if",
"crossbeam-utils",
"lazy_static",
"memoffset",
"memoffset 0.6.5",
"scopeguard",
]
@@ -215,6 +221,40 @@ dependencies = [
"termcolor",
]
[[package]]
name = "env_logger"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
dependencies = [
"humantime",
"is-terminal",
"log",
"regex",
"termcolor",
]
[[package]]
name = "errno"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1"
dependencies = [
"errno-dragonfly",
"libc",
"winapi",
]
[[package]]
name = "errno-dragonfly"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf"
dependencies = [
"cc",
"libc",
]
[[package]]
name = "fnv"
version = "1.0.7"
@@ -236,9 +276,9 @@ dependencies = [
[[package]]
name = "hashbrown"
version = "0.12.3"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
dependencies = [
"ahash",
]
@@ -261,6 +301,12 @@ dependencies = [
"libc",
]
[[package]]
name = "hermit-abi"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
[[package]]
name = "humantime"
version = "2.1.0"
@@ -286,6 +332,28 @@ dependencies = [
"web-sys",
]
[[package]]
name = "io-lifetimes"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfa919a82ea574332e2de6e74b4c36e74d41982b335080fa59d4ef31be20fdf3"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "is-terminal"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21b6b32576413a8e69b90e952e4a026476040d81017b80445deda5f2d3921857"
dependencies = [
"hermit-abi 0.3.1",
"io-lifetimes",
"rustix",
"windows-sys",
]
[[package]]
name = "itertools"
version = "0.10.1"
@@ -318,9 +386,9 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]]
name = "libc"
version = "0.2.108"
version = "0.2.140"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8521a1b57e76b1ec69af7599e75e38e7b7fad6610f037db8c79b127201b5d119"
checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c"
[[package]]
name = "libm"
@@ -328,6 +396,12 @@ version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7d73b3f436185384286bd8098d17ec07c9a7d2388a6599f824d8502b529702a"
[[package]]
name = "linux-raw-sys"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4"
[[package]]
name = "lock_api"
version = "0.4.5"
@@ -348,9 +422,9 @@ dependencies = [
[[package]]
name = "lru"
version = "0.8.1"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6e8aaa3f231bb4bd57b84b2d5dc3ae7f350265df8aa96492e0bc394a1571909"
checksum = "03f1160296536f10c833a82dca22267d5486734230d47bf00bf435885814ba1e"
dependencies = [
"hashbrown",
]
@@ -379,6 +453,15 @@ dependencies = [
"autocfg",
]
[[package]]
name = "memoffset"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
dependencies = [
"autocfg",
]
[[package]]
name = "nalgebra"
version = "0.29.0"
@@ -454,19 +537,19 @@ version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
dependencies = [
"hermit-abi",
"hermit-abi 0.1.19",
"libc",
]
[[package]]
name = "once_cell"
version = "1.8.0"
version = "1.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "692fcb63b64b1758029e0a96ee63e049ce8c5948587f2f7208df04625e5f6b56"
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
[[package]]
name = "pac-synth"
version = "0.0.7"
version = "0.0.8"
dependencies = [
"log",
"pyo3",
@@ -547,14 +630,14 @@ dependencies = [
[[package]]
name = "pyo3"
version = "0.17.2"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "201b6887e5576bf2f945fe65172c1fcbf3fcf285b23e4d71eb171d9736e38d32"
checksum = "06a3d8e8a46ab2738109347433cb7b96dffda2e4a218b03ef27090238886b147"
dependencies = [
"cfg-if",
"indoc",
"libc",
"memoffset",
"memoffset 0.8.0",
"parking_lot",
"pyo3-build-config",
"pyo3-ffi",
@@ -564,9 +647,9 @@ dependencies = [
[[package]]
name = "pyo3-build-config"
version = "0.17.2"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf0708c9ed01692635cbf056e286008e5a2927ab1a5e48cdd3aeb1ba5a6fef47"
checksum = "75439f995d07ddfad42b192dfcf3bc66a7ecfd8b4a1f5f6f046aa5c2c5d7677d"
dependencies = [
"once_cell",
"target-lexicon",
@@ -574,9 +657,9 @@ dependencies = [
[[package]]
name = "pyo3-ffi"
version = "0.17.2"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90352dea4f486932b72ddf776264d293f85b79a1d214de1d023927b41461132d"
checksum = "839526a5c07a17ff44823679b68add4a58004de00512a95b6c1c98a6dcac0ee5"
dependencies = [
"libc",
"pyo3-build-config",
@@ -584,9 +667,9 @@ dependencies = [
[[package]]
name = "pyo3-macros"
version = "0.17.2"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eb24b804a2d9e88bfcc480a5a6dd76f006c1e3edaf064e8250423336e2cd79d"
checksum = "bd44cf207476c6a9760c4653559be4f206efafb924d3e4cbf2721475fc0d6cc5"
dependencies = [
"proc-macro2",
"pyo3-macros-backend",
@@ -596,9 +679,9 @@ dependencies = [
[[package]]
name = "pyo3-macros-backend"
version = "0.17.2"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f22bb49f6a7348c253d7ac67a6875f2dc65f36c2ae64a82c381d528972bea6d6"
checksum = "dc1f43d8e30460f36350d18631ccf85ded64c059829208fe680904c65bcd0a4c"
dependencies = [
"proc-macro2",
"quote",
@@ -736,6 +819,20 @@ dependencies = [
"semver",
]
[[package]]
name = "rustix"
version = "0.36.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fd5c6ff11fecd55b40746d1995a02f2eb375bf8c00d192d521ee09f42bef37bc"
dependencies = [
"bitflags",
"errno",
"io-lifetimes",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]]
name = "ryu"
version = "1.0.5"
@@ -765,10 +862,10 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
[[package]]
name = "sds-cli"
version = "1.8.6"
version = "1.9.0"
dependencies = [
"csv",
"env_logger",
"env_logger 0.9.0",
"log",
"sds-core",
"statrs",
@@ -777,7 +874,7 @@ dependencies = [
[[package]]
name = "sds-core"
version = "1.8.6"
version = "1.9.0"
dependencies = [
"csv",
"fnv",
@@ -796,10 +893,10 @@ dependencies = [
[[package]]
name = "sds-pyo3"
version = "1.8.6"
version = "1.9.0"
dependencies = [
"csv",
"env_logger",
"env_logger 0.10.0",
"log",
"pyo3",
"sds-core",
@@ -807,7 +904,7 @@ dependencies = [
[[package]]
name = "sds-wasm"
version = "1.8.6"
version = "1.9.0"
dependencies = [
"console_error_panic_hook",
"csv",
@@ -1058,9 +1155,9 @@ checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "version_check"
version = "0.9.3"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "wasi"
@@ -1210,3 +1307,69 @@ name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.45.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
[[package]]
name = "windows_aarch64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
[[package]]
name = "windows_i686_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
[[package]]
name = "windows_i686_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
[[package]]
name = "windows_x86_64_gnu"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
[[package]]
name = "windows_x86_64_msvc"
version = "0.42.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"

View file

@@ -163,17 +163,26 @@ In order to decrease the noise, we can use a differentially-private percentile t
From [Differentially Private Marginals](./dp_marginals.pdf), to satisfy $(\varepsilon, \delta)$-DP, the following inequality needs to hold:
$0.5 * R\varepsilon_Q^2 + 0.5 * \varepsilon_N^2 + 0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_k^2 \leq \sqrt{\varepsilon + \ln(2/\delta)} - \sqrt{\ln(2/\delta)}$, where the reported aggregate count is `real_aggregate_count + ` $\sigma_{k} * \sqrt{\Delta_k} * N(0, 1)$ and the reported number of records is `real_number_of_records + ` $Laplace(1 / \varepsilon_N)$.
(EQ1) $0.5 * R\varepsilon_Q^2 + 0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_k^2 \leq (\sqrt{\varepsilon_M + \ln(2/\delta)} - \sqrt{\ln(2/\delta)})^2$, where the reported aggregate count will have noise added by $\sigma_{k} * \sqrt{\Delta_k} * N(0, 1)$.
Based on the given inequality we can:
Assuming $\varepsilon$ is the total privacy budget and $n$ is the total number of records in the dataset, we then define:
1. Call $\rho=\sqrt{\varepsilon + \ln(2/\delta)} - \sqrt{\ln(2/\delta)}$
2. Define $Q_{p}$ as the proportion of the total privacy budget dedicated to finding the $Q^{th}$ percentiles
(EQ2) $\varepsilon = \varepsilon_M + \varepsilon_N$, where $\varepsilon_M$ is the portion of the privacy budget we dedicate to the marginals equation (EQ1) and $\varepsilon_N$ is what we dedicate to protecting the number of records: $protected(n) = n + Laplace(1 / \varepsilon_N)$.
If a $\delta$ value is not provided, it will be inferred from the protected number of records:
$\delta = \frac{1}{protected(n) * \ln(protected(n))}$
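For example, with $protected(n) = 10000$ (an illustrative value, not a library default), this yields $\delta = \frac{1}{10000 * \ln(10000)} \approx 1.09 \times 10^{-5}$.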
Furthermore, based on EQ1 and EQ2, we can:
1. Call $\rho=(\sqrt{\varepsilon_M + \ln(2/\delta)} - \sqrt{\ln(2/\delta)})^2$
2. Define $Q_{p}$ as the proportion of the privacy budget dedicated to finding the $Q^{th}$ percentiles
3. Define $N_{p}$ as the proportion of the total privacy budget dedicated to protecting the number of records
4. Call $\varepsilon_N = N_{p} * \varepsilon$ and $\varepsilon_M = \varepsilon - \varepsilon_N$
Then, in order to find $\varepsilon_Q$, $\varepsilon_N$ and $\sigma_k$, we need to solve: (i) $0.5 * R\varepsilon_Q^2 = \rho * Q_{p}$; (ii) $0.5 * \varepsilon_N^2 = \rho * N_{p}$; and (iii) $0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_i^2 = \rho * (1 - Q_{p} - N_{p})$.
From the above assumptions, we know that (i) $\varepsilon_M = \varepsilon - N_{p} * \varepsilon = \varepsilon * (1 - N_{p})$. Then, in order to find $\varepsilon_Q$ and $\sigma_k$, we need to solve: (ii) $0.5 * R\varepsilon_Q^2 = \rho * Q_{p}$; and (iii) $0.5 *\displaystyle\sum_{1}^{R} 1/\sigma_i^2 = \rho * (1 - Q_{p})$.
(i) directly tells us that $\varepsilon_Q = \sqrt{(2 * \rho * Q_{p}) / R}$ [3] and (ii) that $\varepsilon_N = \sqrt{2 * \rho * N_{p}}$.
(ii) directly tells us that $\varepsilon_Q = \sqrt{(2 * \rho * Q_{p}) / R}$ [3].
On the other hand, to solve (iii) and find the $\sigma_k$ values, SDS will proportionally split the privacy budget such that:
@@ -184,22 +193,24 @@ On the other hand, to solve (iii) and find the $\sigma_k$ values, SDS will propo
Thus:
$(\frac{1}{\sigma_1^2} + \frac{1}{\sigma_2^2}+ ... + \frac{1}{\sigma_k^2}) = 2 * \rho * (1 - Q_{p} - N_{p})$
$(\frac{1}{\sigma_1^2} + \frac{1}{\sigma_2^2}+ ... + \frac{1}{\sigma_k^2}) = 2 * \rho * (1 - Q_{p})$
$(\frac{1}{p_1^2*\sigma^2} + \frac{1}{p_2^2*\sigma^2} + ... + \frac{1}{p_k^2*\sigma^2}) = 2 * \rho * (1 - Q_{p} - N_{p})$
$(\frac{1}{p_1^2*\sigma^2} + \frac{1}{p_2^2*\sigma^2} + ... + \frac{1}{p_k^2*\sigma^2}) = 2 * \rho * (1 - Q_{p})$
$\frac{1}{\sigma^2} * (\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2}) = 2 * \rho * (1 - Q_{p} - N_{p})$
$\frac{1}{\sigma^2} * (\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2}) = 2 * \rho * (1 - Q_{p})$
$\frac{1}{\sigma^2} = \frac{2 * \rho * (1 - Q_{p} - N_{p})}{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}$
$\frac{1}{\sigma^2} = \frac{2 * \rho * (1 - Q_{p})}{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}$
$\sigma = \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p} - N_{p})}}$
$\sigma = \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p})}}$
$\sigma_k = p_k * \sigma = p_k * \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p} - N_{p})}}$ [4]
$\sigma_k = p_k * \sigma = p_k * \sqrt{\frac{(\frac{1}{p_1^2} + \frac{1}{p_2^2} + ... + \frac{1}{p_k^2})}{2 * \rho * (1 - Q_{p})}}$ [4]
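As a minimal sketch of closed forms [3] and [4] (function and parameter names here are illustrative; the shipped logic lives in `NoiseParameters::calc_percentile_epsilon_and_sigmas`, added to `sds-core` in this commit):

```rust
/// Sketch: split the marginals budget into the percentile epsilon [3]
/// and the per-length sigmas [4]; names are illustrative.
fn split_marginals_budget(
    marginals_epsilon: f64,    // epsilon_M
    delta: f64,                // delta
    q_p: f64,                  // percentile epsilon proportion Q_p
    sigma_proportions: &[f64], // [p_1, ..., p_R]
) -> (f64, Vec<f64>) {
    let r = sigma_proportions.len() as f64; // reporting length R
    // rho = (sqrt(epsilon_M + ln(2/delta)) - sqrt(ln(2/delta)))^2, from EQ1
    let rho_sqrt = (marginals_epsilon + (2.0 / delta).ln()).sqrt() - (2.0 / delta).ln().sqrt();
    let rho = rho_sqrt * rho_sqrt;
    // epsilon_Q = sqrt(2 * rho * Q_p / R), equation [3]
    let percentile_epsilon = (2.0 * rho * q_p / r).sqrt();
    // sigma = sqrt((sum 1/p_k^2) / (2 * rho * (1 - Q_p))), sigma_k = p_k * sigma, equation [4]
    let sum_inv_p2: f64 = sigma_proportions.iter().map(|p| 1.0 / (p * p)).sum();
    let base_sigma = (sum_inv_p2 / (2.0 * rho * (1.0 - q_p))).sqrt();
    let sigmas: Vec<f64> = sigma_proportions.iter().map(|p| p * base_sigma).collect();
    (percentile_epsilon, sigmas)
}
```

Note that as $Q_p$ approaches $1$, the denominator in [4] goes to zero and the sigmas blow up, since almost no budget remains for the marginals themselves.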
To summarize, to control the allocation of the privacy budget $\varepsilon$, SDS expects the following inputs:
- `Percentile epsilon proportion` = $Q_p$, where $0 < Q_p < 1$ and $0 < Q_p + N_p < 1$
- `Number of records epsilon proportion` = $N_p$, where $0 < N_p < 1$ and $0 < Q_p + N_p < 1$
- `Total privacy budget` = $\varepsilon$
- `Delta` = $\delta$, which can also be inferred from the protected number of records
- `Percentile epsilon proportion` = $Q_p$, where $0 < Q_p < 1$
- `Number of records epsilon proportion` = $N_p$, where $0 < N_p < 1$
- `Sigma proportions` = $[p_1, p_2, ..., p_k]$, where $p_k > 0$
To illustrate the sigma proportions, let's assume a reporting length of $3$. Then we could set:
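One illustrative choice, borrowed from the library's doc comments, is `Sigma proportions` = $[1.0, 2.0, 3.0]$, meaning that $\sigma_2 = 2.0 * \sigma_1$ and $\sigma_3 = 3.0 * \sigma_1$.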

Binary data
docs/dp/dp_marginals.pdf

Binary file not shown.

View file

@@ -17,35 +17,37 @@
"git:precommit": "lint-staged",
"git:ci": "run-s format:check lint:check test build",
"release:wasm": "npm-publish --access public --token ${NPM_TOKEN} ./target/wasm/package.json",
"release": "run-p release:wasm"
"release": "run-p release:wasm",
"rust:format": "cargo fmt --all --",
"rust:check": "cargo clippy -- -D warnings && cargo fmt --all -- --check && cargo test"
},
"prettier": "@essex/prettier-config",
"devDependencies": {
"@essex/eslint-config": "^20.3.5",
"@essex/eslint-plugin": "^20.3.12",
"@essex/jest-config": "^21.0.17",
"@essex/prettier-config": "^18.0.4",
"@essex/scripts": "^22.2.0",
"@essex/eslint-config": "^20.5.1",
"@essex/eslint-plugin": "^20.5.1",
"@essex/jest-config": "^21.0.20",
"@essex/prettier-config": "^18.0.7",
"@essex/scripts": "^24.0.3",
"@jsdevtools/npm-publish": "^1.4.3",
"@types/eslint": "^8.4.10",
"@types/prettier": "^2.7.1",
"@typescript-eslint/eslint-plugin": "^5.46.1",
"@typescript-eslint/parser": "^5.46.1",
"eslint": "^8.29.0",
"eslint-import-resolver-node": "^0.3.6",
"@types/eslint": "^8.21.2",
"@types/prettier": "^2.7.2",
"@typescript-eslint/eslint-plugin": "^5.55.0",
"@typescript-eslint/parser": "^5.55.0",
"eslint": "^8.36.0",
"eslint-import-resolver-node": "^0.3.7",
"eslint-plugin-header": "^3.1.1",
"eslint-plugin-import": "^2.26.0",
"eslint-plugin-jest": "^27.1.7",
"eslint-plugin-jsx-a11y": "^6.6.1",
"eslint-plugin-react": "^7.31.11",
"eslint-plugin-import": "^2.27.5",
"eslint-plugin-jest": "^27.2.1",
"eslint-plugin-jsx-a11y": "^6.7.1",
"eslint-plugin-react": "^7.32.2",
"eslint-plugin-react-hooks": "^4.6.0",
"eslint-plugin-simple-import-sort": "^8.0.0",
"husky": "^8.0.2",
"lint-staged": "^13.1.0",
"eslint-plugin-simple-import-sort": "^10.0.0",
"husky": "^8.0.3",
"lint-staged": "^13.2.0",
"npm-run-all": "^4.1.5",
"prettier": "^2.8.1",
"prettier": "^2.8.4",
"replace": "^1.2.2",
"typescript": "^4.8.4"
"typescript": "^5.0.2"
},
"workspaces": [
"packages/webapp",

View file

@@ -1,6 +1,6 @@
[package]
name = "sds-cli"
version = "1.8.6"
version = "1.9.0"
license = "MIT"
description = "Command line interface for the sds-core library"
repository = "https://github.com/microsoft/synthetic-data-showcase"

View file

@@ -403,8 +403,6 @@ fn main() {
} => {
let mut aggregator = Aggregator::new(data_block.clone());
let aggregated_data = if dp {
let n_records_f64 = data_block.number_of_records() as f64;
let delta = noise_delta.unwrap_or(1.0 / (n_records_f64.ln() * n_records_f64));
let thresholds_map = noise_threshold_values
.unwrap()
.iter()
@@ -425,9 +423,9 @@ fn main() {
reporting_length,
&DpParameters::new(
noise_epsilon.unwrap(),
delta,
sensitivities_percentile.unwrap(),
sensitivities_epsilon_proportion.unwrap(),
noise_delta,
sigma_proportions,
number_of_records_epsilon_proportion,
),

View file

@@ -22,19 +22,19 @@
"license": "MIT",
"devDependencies": {
"@types/mime": "^3.0.1",
"@types/node": "^16.18.9",
"@types/react": "^17.0.52",
"@types/node": "^16.18.16",
"@types/react": "^17.0.53",
"npm-run-all": "^4.1.5",
"react": "^17.0.2",
"shx": "^0.3.4",
"ts-node": "^10.9.1",
"typescript": "^4.8.4"
"typescript": "^5.0.2"
},
"peerDependencies": {
"react": "^17.0.2"
},
"dependencies": {
"@griffel/react": "^1.5.1",
"@griffel/react": "^1.5.5",
"mime": "^3.0.0",
"react-dropzone": "^14.2.3"
}

View file

@@ -2,8 +2,8 @@
* Copyright (c) Microsoft. All rights reserved.
* Licensed under the MIT license. See LICENSE file in the project.
*/
import type { FC, PropsWithChildren } from 'react'
import React, { forwardRef, memo, useImperativeHandle, useMemo } from 'react'
import type { FC } from 'react'
import React, { forwardRef, memo, useImperativeHandle } from 'react'
import { FileDropContext } from './FileDrop.context.js'
import { useFileDrop } from './FileDrop.hooks.js'
@@ -15,25 +15,6 @@ export const FileDrop: FC<FileDropProps> = memo(
const { getRootProps, getInputProps, isDragActive, open } =
useFileDrop(props)
const classes = useFileDropStyles(props.slotClassNames)
const DivOverlay = useMemo(() => {
return (
props.divOverlay ??
((({ children }) => {
return <div className={classes.Overlay}>{children}</div>
}) as FC<
PropsWithChildren<{
/* nothing */
}>
>)
)
}, [props.divOverlay, classes])
const DragMessage = useMemo(() => {
return (
props.onDragMessage ??
(() => <span className={classes.OverlayMessage}>Drop files.</span>)
)
}, [props.onDragMessage, classes])
useImperativeHandle(ref, () => ({ open }), [open])
@@ -42,9 +23,11 @@ export const FileDrop: FC<FileDropProps> = memo(
<span {...getRootProps()}>
<input {...getInputProps()} />
{isDragActive && (
<DivOverlay>
<DragMessage />
</DivOverlay>
<div className={classes.Overlay}>
<span className={classes.OverlayMessage}>
{props.onDragMessage ?? 'Drop files.'}
</span>
</div>
)}
{props.children}
</span>

View file

@@ -2,7 +2,7 @@
* Copyright (c) Microsoft. All rights reserved.
* Licensed under the MIT license. See LICENSE file in the project.
*/
import type { ElementType, PropsWithChildren, RefObject } from 'react'
import type { PropsWithChildren, RefObject } from 'react'
import type { DropzoneOptions } from 'react-dropzone'
import type { Expand } from '../types/expand.js'
@@ -16,8 +16,7 @@ export type FileDropProps = Expand<
Omit<DropzoneOptions, 'accept'> &
PropsWithChildren<{
accept?: string | string[]
onDragMessage?: ElementType
divOverlay?: ElementType
onDragMessage?: string
ref?: RefObject<FileDropRef>
slotClassNames?: FileDropSlotClassNames
}>

View file

@@ -3,21 +3,20 @@
* Licensed under the MIT license. See LICENSE file in the project.
*/
import type { CSSProperties, FC } from 'react'
import React, { memo, useMemo } from 'react'
import React, { memo } from 'react'
import { useFlexContainerStyles } from './FlexContainer.hooks.js'
import type { FlexContainerProps } from './FlexContainer.types.js'
export const FlexContainer: FC<FlexContainerProps> = memo(
function FlexContainer(props) {
const { className, as, children } = props
const { className, children } = props
const inlineStyles: CSSProperties = useFlexContainerStyles(props)
const Element = useMemo(() => as ?? 'div', [as])
return (
<Element className={className} style={inlineStyles}>
<div className={className} style={inlineStyles}>
{children}
</Element>
</div>
)
},
)

View file

@@ -3,10 +3,9 @@
* Licensed under the MIT license. See LICENSE file in the project.
*/
import type { CSSProperties, ElementType, PropsWithChildren } from 'react'
import type { CSSProperties, PropsWithChildren } from 'react'
export type FlexContainerProps = PropsWithChildren<{
as?: ElementType
vertical?: boolean
wrap?: boolean
justify?:

View file

@@ -4,20 +4,19 @@
*/
import type { CSSProperties, FC } from 'react'
import React, { memo, useMemo } from 'react'
import React, { memo } from 'react'
import { useFlexItemStyles } from './FlexItem.hooks.js'
import type { FlexItemProps } from './FlexItem.types.js'
export const FlexItem: FC<FlexItemProps> = memo(function FlexItem(props) {
const { as, children, className } = props
const { children, className } = props
const inlineStyles: CSSProperties = useFlexItemStyles(props)
const Element = useMemo(() => as ?? 'div', [as])
return (
<Element className={className} style={inlineStyles}>
<div className={className} style={inlineStyles}>
{children}
</Element>
</div>
)
})
FlexItem.displayName = 'FlexItem'

View file

@@ -2,10 +2,9 @@
* Copyright (c) Microsoft. All rights reserved.
* Licensed under the MIT license. See LICENSE file in the project.
*/
import type { CSSProperties, ElementType, PropsWithChildren } from 'react'
import type { CSSProperties, PropsWithChildren } from 'react'
export type FlexItemProps = PropsWithChildren<{
as?: ElementType
order?: number
shrink?: number
basis?: string

View file

@@ -1,6 +1,6 @@
[package]
name = "sds-core"
version = "1.8.6"
version = "1.9.0"
license = "MIT"
description = "Synthetic data showcase core library"
repository = "https://github.com/microsoft/synthetic-data-showcase"
@@ -13,12 +13,12 @@ crate-type = ["rlib"]
rand = { version = "0.8" }
fnv = { version = "1.0" }
itertools = { version = "0.10" }
lru = { version = "0.8" }
lru = { version = "0.10" }
getrandom = { version = "0.2", features = ["js"] }
log = { version = "0.4", features = ["std"] }
csv = { version = "1.1" }
instant = { version = "0.1", features = [ "stdweb", "wasm-bindgen" ] }
pyo3 = { version = "0.17", features = ["extension-module"], optional = true }
pyo3 = { version = "0.18", features = ["extension-module"], optional = true }
rayon = { version = "1.5", optional = true }
serde = { version = "1.0", features = [ "derive", "rc" ] }
serde_json = { version = "1.0" }

View file

@@ -33,16 +33,19 @@ pub struct DataBlock {
pub records: DataBlockRecords,
}
impl DataBlock {
impl Default for DataBlock {
/// Returns a new DataBlock with default values
pub fn default() -> DataBlock {
DataBlock {
#[inline]
fn default() -> Self {
Self {
headers: DataBlockHeaders::default(),
multi_value_column_metadata_map: MultiValueColumnMetadataMap::default(),
records: DataBlockRecords::default(),
}
}
}
impl DataBlock {
/// Returns a new DataBlock
/// # Arguments
/// * `headers` - Vector of string representing the data headers

View file

@@ -24,7 +24,6 @@ impl<T: Read> DataBlockCreator for CsvDataBlockCreator<T> {
fn get_records(reader: &mut Self::InputType) -> Result<Vec<CsvRecord>, Error> {
reader
.records()
.into_iter()
.map(|record_result: Result<StringRecord, Error>| {
Ok(record_result?
.into_iter()

View file

@@ -1,122 +0,0 @@
use super::stats_error::StatsError;
use statrs::distribution::{ContinuousCDF, Normal};
/// Default tolerance used to calculate sigma for the gaussian noise
pub const DEFAULT_TOLERANCE: f64 = 1e-8;
fn binary_search(
f: &dyn Fn(f64) -> bool,
lower_bound: f64,
upper_bound: f64,
tolerance: f64,
) -> f64 {
let lower_res = f(lower_bound);
let upper_res = f(upper_bound);
assert!(
lower_res != upper_res,
"upper and lower bound predicates should have different values for binary search"
);
let mut lower = lower_bound;
let mut upper = upper_bound;
while upper - lower > tolerance {
let mid = lower + ((upper - lower) / 2.0);
if f(mid) == upper_res {
upper = mid
} else {
lower = mid;
}
}
if upper_res {
upper
} else {
lower
}
}
pub trait DpAnalyticGaussianContinuousCDFScale
where
Self: ContinuousCDF<f64, f64> + Sized,
{
fn calc_alpha_increasing_beta(&self, epsilon: f64, delta: f64, tolerance: f64) -> f64 {
let beta = |v: f64| {
self.cdf(f64::sqrt(epsilon * v))
- (f64::exp(epsilon) * self.cdf(-f64::sqrt(epsilon * (v + 2.0))))
};
let mut upper_bound: f64 = 2.0;
// this is monotonically increasing, so find the upper bound
// for the binary search
while beta(upper_bound) <= delta {
upper_bound *= 2.0;
}
let v_star = binary_search(&|v| beta(v) <= delta, 0.0, upper_bound, tolerance);
f64::sqrt(1.0 + (v_star / 2.0)) - f64::sqrt(v_star / 2.0)
}
fn calc_alpha_decreasing_beta(&self, epsilon: f64, delta: f64, tolerance: f64) -> f64 {
let beta = |u: f64| {
self.cdf(-f64::sqrt(epsilon * u))
- (f64::exp(epsilon) * self.cdf(-f64::sqrt(epsilon * (u + 2.0))))
};
let mut upper_bound: f64 = 2.0;
// this is monotonically decreasing, so find the upper bound
// for the binary search
while beta(upper_bound) >= delta {
upper_bound *= 2.0;
}
let u_star = binary_search(&|u| beta(u) <= delta, 0.0, upper_bound, tolerance);
f64::sqrt(1.0 + (u_star / 2.0)) + f64::sqrt(u_star / 2.0)
}
/// Using the Analytic Gaussian Mechanism, calculates the standard deviation
/// (`sigma`) for a `(epsilon, delta)-DP` normal distribution to be used as noise.
/// # Arguments:
/// * `sensitivity` - L2 sensitivity
/// * `epsilon` - privacy budget
/// * `delta` - probability of information being leaked
/// * `tolerance` - tolerance used to find sigma
fn calc_sigma_dp(&self, sensitivity: f64, epsilon: f64, delta: f64, tolerance: f64) -> f64 {
let delta_zero = self.cdf(0.0) - (f64::exp(epsilon) * self.cdf(-f64::sqrt(2.0 * epsilon)));
let alpha = if delta >= delta_zero {
self.calc_alpha_increasing_beta(epsilon, delta, tolerance)
} else {
self.calc_alpha_decreasing_beta(epsilon, delta, tolerance)
};
alpha * sensitivity / f64::sqrt(2.0 * epsilon)
}
/// Using the Analytic Gaussian Mechanism, creates a normal distribution
/// that is `(epsilon, delta)-DP` to be used as noise.
/// # Arguments:
/// * `sensitivity` - L2 sensitivity
/// * `epsilon` - privacy budget
/// * `delta` - probability of information being leaked
/// * `tolerance` - tolerance used to find sigma used to build the normal distribution
fn new_analytic_gaussian(
sensitivity: f64,
epsilon: f64,
delta: f64,
tolerance: f64,
) -> Result<Self, StatsError>;
}
impl DpAnalyticGaussianContinuousCDFScale for Normal {
fn new_analytic_gaussian(
sensitivity: f64,
epsilon: f64,
delta: f64,
tolerance: f64,
) -> Result<Self, StatsError> {
let n = Normal::new(0.0, 1.0).map_err(StatsError::new)?;
Normal::new(0.0, n.calc_sigma_dp(sensitivity, epsilon, delta, tolerance))
.map_err(StatsError::new)
}
}

View file

@@ -2,10 +2,6 @@
use pyo3::prelude::*;
use serde::{Deserialize, Serialize};
/// Default epsilon proportion used to add noise to the protected number of records
/// in the aggregated data
pub const DEFAULT_NUMBER_OF_RECORDS_EPSILON_PROPORTION: f64 = 0.005;
/// Parameters for aggregate generation with differential privacy
#[cfg_attr(feature = "pyo3", pyclass)]
#[derive(Clone, Debug, Serialize, Deserialize)]
@@ -14,13 +10,14 @@ pub struct DpParameters {
/// Overall privacy budget used between
/// percentile filtering and noisy generation by combination length
pub epsilon: f64,
/// Delta value used for noisy generation by combination length
pub delta: f64,
/// Percentage used to calculate the percentile that filters sensitivity
pub percentile_percentage: usize,
/// Maximum proportion to consume of the total privacy budget (0.1 means 10%)
/// during the sensitivity filter stage
pub percentile_epsilon_proportion: f64,
/// Delta value used for noisy generation by combination length; if `None`, it will be set
/// at runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
pub delta: Option<f64>,
/// `epsilon` and `percentile_epsilon_proportion` will be used to infer the
/// sigma value by combination length. This parameter
/// controls how the budget is split across combination lengths
@@ -28,7 +25,7 @@ pub struct DpParameters {
/// - If `None` all the sigma values will be the same
pub sigma_proportions: Option<Vec<f64>>,
/// Proportion of epsilon used to add noise to the protected number of records in
/// the aggregated data (default is 0.005)
/// the aggregated data (if None, no noise is added)
pub number_of_records_epsilon_proportion: Option<f64>,
}
@@ -41,30 +38,31 @@ impl DpParameters {
/// # Arguments
/// * `epsilon` - Overall privacy budget used between
/// percentile filtering and noisy generation by combination length
/// * `delta` - Delta value used for noisy generation by combination length
/// * `percentile_percentage` - Percentage used to calculate the percentile that filters sensitivity
/// * `percentile_epsilon_proportion` - Maximum proportion to consume of the total privacy budget (0.1 means 10%)
/// during the sensitivity filter stage
/// * `delta` - Delta value used for noisy generation by combination length; if `None`, it will be set
/// at runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
/// * `sigma_proportions` - `epsilon` and `percentile_epsilon_proportion` will be used to infer the
/// sigma value by combination length. This parameter
/// controls how the budget is split across combination lengths
/// (e.g. \[1.0, 2.0, 3.0\] means that `sigma_2 = 2.0 * sigma_1` and `sigma_3 = 3.0 * sigma_1`)
/// - If `None` all the sigma values will be the same
/// * `number_of_records_epsilon_proportion` - Proportion of epsilon used to add noise to the protected number of records
/// in the aggregated data (default is 0.005)
/// in the aggregated data (if None, no noise is added)
pub fn new(
epsilon: f64,
delta: f64,
percentile_percentage: usize,
percentile_epsilon_proportion: f64,
delta: Option<f64>,
sigma_proportions: Option<Vec<f64>>,
number_of_records_epsilon_proportion: Option<f64>,
) -> Self {
DpParameters {
epsilon,
delta,
percentile_percentage,
percentile_epsilon_proportion,
delta,
sigma_proportions,
number_of_records_epsilon_proportion,
}
@@ -76,30 +74,31 @@ impl DpParameters {
/// # Arguments
/// * `epsilon` - Overall privacy budget used between
/// percentile filtering and noisy generation by combination length
/// * `delta` - Delta value used for noisy generation by combination length
/// * `percentile_percentage` - Percentage used to calculate the percentile that filters sensitivity
/// * `percentile_epsilon_proportion` - Maximum proportion to consume of the total privacy budget (0.1 means 10%)
/// during the sensitivity filter stage
/// * `delta` - Delta value used for noisy generation by combination length; if `None`, it will be set
/// at runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
/// * `sigma_proportions` - `epsilon` and `percentile_epsilon_proportion` will be used to infer the
/// sigma value by combination length. This parameter
/// controls how the budget is split across combination lengths
/// (e.g. \[1.0, 2.0, 3.0\] means that `sigma_2 = 2.0 * sigma_1` and `sigma_3 = 3.0 * sigma_1`)
/// - If `None` all the sigma values will be the same
/// * `number_of_records_epsilon_proportion` - Proportion of epsilon used to add noise to the protected number of records
/// in the aggregated data (default is 0.005)
/// in the aggregated data (if None, no noise is added)
pub fn new(
epsilon: f64,
delta: f64,
percentile_percentage: usize,
percentile_epsilon_proportion: f64,
delta: Option<f64>,
sigma_proportions: Option<Vec<f64>>,
number_of_records_epsilon_proportion: Option<f64>,
) -> Self {
DpParameters {
epsilon,
delta,
percentile_percentage,
percentile_epsilon_proportion,
delta,
sigma_proportions,
number_of_records_epsilon_proportion,
}
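For reference, a call with the reworked signature might look as follows (argument values are illustrative; `Some(0.005)` reuses what used to be the hard-coded default proportion):

```rust
// epsilon = 1.0, percentile_percentage = 99, percentile_epsilon_proportion = 0.1,
// delta = None (inferred at runtime from the protected number of records),
// sigma_proportions = None (uniform sigmas),
// number_of_records_epsilon_proportion = Some(0.005).
let dp_parameters = DpParameters::new(1.0, 99, 0.1, None, None, Some(0.005));
```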

View file

@@ -1,6 +1,6 @@
mod analytic_gaussian;
mod dp_parameters;
mod noise_aggregator;
mod noise_parameters;
mod noisy_count_threshold;
mod percentile;
mod stats_error;
@@ -9,7 +9,6 @@ mod typedefs;
#[cfg(feature = "pyo3")]
mod register_pyo3;
pub use analytic_gaussian::*;
pub use dp_parameters::*;
pub use noise_aggregator::*;
pub use noisy_count_threshold::*;

View file

@@ -1,7 +1,4 @@
use super::{
CombinationsByRecord, DpParameters, DpPercentile, NoisyCountThreshold,
DEFAULT_NUMBER_OF_RECORDS_EPSILON_PROPORTION,
};
use super::{CombinationsByRecord, DpParameters, DpPercentile, NoisyCountThreshold};
use fnv::FnvHashSet;
use itertools::Itertools;
use log::{debug, info, warn};
@@ -9,14 +6,14 @@ use rand::{
prelude::{Distribution as rand_dist, IteratorRandom},
thread_rng,
};
use statrs::distribution::{ContinuousCDF, Laplace, Normal};
use statrs::distribution::{ContinuousCDF, Normal};
use std::sync::Arc;
use crate::{
data_block::{DataBlock, DataBlockValue},
dp::{
noise_parameters::NoiseParameters,
typedefs::{CombinationsCountMap, CombinationsCountMapByLen},
DEFAULT_TOLERANCE,
},
processing::aggregator::{
AggregatedCount, AggregatedData, AggregatesCountMap, RecordsSensitivityByLen, RecordsSet,
@@ -39,82 +36,10 @@ pub struct NoiseAggregator {
delta: f64,
sigmas: Vec<f64>,
threshold: NoisyCountThreshold,
number_of_records_epsilon: f64,
protected_number_of_records: Option<usize>,
}
impl NoiseAggregator {
#[inline]
fn calc_percentile_epsilon_number_of_records_epsilon_and_sigma_by_len(
reporting_length: usize,
epsilon: f64,
delta: f64,
percentile_epsilon_proportion: f64,
number_of_records_proportion: f64,
sigma_proportions: &Option<Vec<f64>>,
) -> (f64, f64, Vec<f64>) {
let proportions = match sigma_proportions {
Some(proportions) => proportions.clone(),
None => {
let mut v = Vec::default();
v.resize_with(reporting_length, || 1.0);
v
}
};
info!(
"calculating percentile epsilon, number of records epsilon and sigma by len: total epsilon = {}, delta = {}, percentile_epsilon_proportion = {}, number_of_records_proportion = {}, sigma_proportions = {:?}",
epsilon,
delta,
percentile_epsilon_proportion,
number_of_records_proportion,
proportions
);
assert!(
reporting_length == proportions.len(),
"sigma proportions array size should match the reporting length",
);
assert!(
percentile_epsilon_proportion < 1.0 && percentile_epsilon_proportion > 0.0,
"percentile_epsilon_proportion must be > 0 and < 1"
);
assert!(
number_of_records_proportion < 1.0 && number_of_records_proportion > 0.0,
"number_of_records_proportion must be > 0 and < 1"
);
assert!(
number_of_records_proportion + percentile_epsilon_proportion < 1.0,
"(percentile_epsilon_proportion + number_of_records_proportion) must be > 0 and < 1"
);
let t = reporting_length as f64;
let rho = (epsilon + (2.0 / delta).ln()).sqrt() - (2.0 / delta).ln().sqrt();
let k: f64 = proportions.iter().map(|p| 1.0 / (p * p)).sum();
let percentile_epsilon = (2.0 * rho * percentile_epsilon_proportion / t).sqrt();
let number_of_records_epsilon = (2.0 * rho * number_of_records_proportion).sqrt();
let base_sigma = (k
/ (2.0 * rho * (1.0 - percentile_epsilon_proportion - number_of_records_proportion)))
.sqrt();
let sigmas: Vec<f64> = proportions.iter().map(|p| p * base_sigma).collect();
let lhs = ((t * percentile_epsilon * percentile_epsilon) / 2.0)
+ ((number_of_records_epsilon * number_of_records_epsilon) / 2.0)
+ (sigmas.iter().map(|s| 1.0 / (s * s)).sum::<f64>() / 2.0);
info!("percentile epsilon = {}", percentile_epsilon);
info!("number of records epsilon = {}", number_of_records_epsilon);
info!("calculated sigmas = {:?}", sigmas);
assert!(
(lhs - rho).abs() <= DEFAULT_TOLERANCE,
"something went wrong calculating DP sigmas"
);
(percentile_epsilon, number_of_records_epsilon, sigmas)
}
#[inline]
fn gen_sorted_records(&self) -> Vec<Vec<Arc<DataBlockValue>>> {
self.data_block
@@ -243,7 +168,7 @@ impl NoiseAggregator {
let percentile_selector = DpPercentile::new(sensitivities);
let allowed_sensitivity = percentile_selector
.kth_percentile_quality_scores_iter(self.percentile_percentage)
.get_noisy_max(self.percentile_epsilon / (self.reporting_length as f64))
.get_noisy_max(self.percentile_epsilon)
.unwrap_or(0);
(max_sensitivity, allowed_sensitivity)
@@ -262,11 +187,15 @@ impl NoiseAggregator {
.choose_multiple(&mut thread_rng(), l1_sensitivity)
.drain(..)
{
(*all_current_aggregates.get_mut(comb).unwrap()) += 1.0;
(*all_current_aggregates
.get_mut(comb)
.expect("error getting combination count")) += 1.0;
}
} else {
for comb in combinations.iter() {
(*all_current_aggregates.get_mut(comb).unwrap()) += 1.0;
(*all_current_aggregates
.get_mut(comb)
.expect("error getting combination count")) += 1.0;
}
}
}
@@ -274,7 +203,7 @@
#[inline]
fn add_gaussian_noise(all_current_aggregates: &mut CombinationsCountMap, current_sigma: f64) {
let noise = Normal::new(0.0, 1.0).unwrap();
let noise = Normal::new(0.0, 1.0).expect("error generating Normal noise");
for count in all_current_aggregates.values_mut() {
(*count) += current_sigma * noise.sample(&mut thread_rng());
@@ -287,7 +216,7 @@ impl NoiseAggregator {
1.0 + (self.sigmas[0]
* l1_sensitivity.sqrt()
* Normal::new(0.0, 1.0)
.unwrap()
.expect("error creating Normal for inverse CDF")
.inverse_cdf((1.0 - (self.delta / 2.0)).powf(1.0 / l1_sensitivity)))
} else {
// thresholds should start at index 2 (1-counts needs to be fixed to guarantee DP)
@@ -301,7 +230,7 @@ impl NoiseAggregator {
* l1_sensitivity.sqrt()
// threshold values should be between 0 and 0.5
// we are dividing by 2 here to normalize it between 0 and 1.0
* Normal::new(0.0, 1.0).unwrap().inverse_cdf(
* Normal::new(0.0, 1.0).expect("error creating Normal for inverse CDF").inverse_cdf(
1.0 - (
thresholds.get(&comb_len).cloned().unwrap_or(1.0) / 2.0
).min(0.5),
@@ -364,25 +293,6 @@ impl NoiseAggregator {
}
}
#[inline]
pub fn protect_number_of_records(&self, number_of_records: usize) -> usize {
info!(
"protecting reported number of records with epsilon = {}",
self.number_of_records_epsilon
);
assert!(
self.number_of_records_epsilon > 0.0,
"number of records epsilon should be > 0"
);
((number_of_records as f64)
+ Laplace::new(0.0, 1.0 / self.number_of_records_epsilon)
.unwrap()
.sample(&mut thread_rng()))
.round() as usize
}
#[inline]
pub fn build_aggregated_data(
&self,
@@ -406,7 +316,7 @@ impl NoiseAggregator {
self.data_block.headers.clone(),
self.data_block.multi_value_column_metadata_map.clone(),
self.data_block.number_of_records(),
Some(self.protect_number_of_records(self.data_block.number_of_records())),
self.protected_number_of_records,
aggregates_count,
RecordsSensitivityByLen::default(),
self.reporting_length,
@@ -449,27 +359,27 @@ impl NoiseAggregator {
dp_parameters: &DpParameters,
threshold: NoisyCountThreshold,
) -> NoiseAggregator {
let (percentile_epsilon, number_of_records_epsilon, sigmas) =
NoiseAggregator::calc_percentile_epsilon_number_of_records_epsilon_and_sigma_by_len(
reporting_length,
dp_parameters.epsilon,
dp_parameters.delta,
dp_parameters.percentile_epsilon_proportion,
dp_parameters
.number_of_records_epsilon_proportion
.unwrap_or(DEFAULT_NUMBER_OF_RECORDS_EPSILON_PROPORTION),
&dp_parameters.sigma_proportions,
);
let noise_parameters = NoiseParameters::new(
reporting_length,
dp_parameters.epsilon,
&dp_parameters.delta,
dp_parameters.percentile_epsilon_proportion,
&dp_parameters.number_of_records_epsilon_proportion,
&dp_parameters.sigma_proportions,
data_block.number_of_records(),
);
info!("resulting noise parameters = {noise_parameters:?}");
NoiseAggregator {
data_block,
reporting_length,
percentile_percentage: dp_parameters.percentile_percentage,
percentile_epsilon,
delta: dp_parameters.delta,
sigmas,
percentile_epsilon: noise_parameters.percentile_epsilon,
delta: noise_parameters.delta,
sigmas: noise_parameters.sigmas,
threshold,
number_of_records_epsilon,
protected_number_of_records: noise_parameters.protected_number_of_records,
}
}

View file

@@ -0,0 +1,244 @@
use log::info;
use rand::{prelude::Distribution as rand_dist, thread_rng};
use statrs::distribution::Laplace;
// Default tolerance used to calculate sigma for the gaussian noise
const DEFAULT_TOLERANCE: f64 = 1e-8;
#[derive(Debug)]
pub(crate) struct NoiseParameters {
pub(crate) percentile_epsilon: f64,
pub(crate) sigmas: Vec<f64>,
pub(crate) delta: f64,
pub(crate) protected_number_of_records: Option<usize>,
}
impl NoiseParameters {
#[inline]
fn split_budget_for_records_and_marginals(
total_epsilon: f64,
number_of_records_epsilon_proportion: f64,
) -> (f64, f64) {
assert!(
number_of_records_epsilon_proportion < 1.0
&& number_of_records_epsilon_proportion > 0.0,
"number_of_records_epsilon_proportion must be > 0 and < 1"
);
// total_epsilon = marginals_epsilon + number_of_records_epsilon
let number_of_records_epsilon = number_of_records_epsilon_proportion * total_epsilon;
let marginals_epsilon = total_epsilon - number_of_records_epsilon;
(number_of_records_epsilon, marginals_epsilon)
}
#[inline]
fn delta_value_or_default(delta_opt: &Option<f64>, number_of_records: usize) -> f64 {
assert!(
number_of_records > 0,
"number_of_records must be greater than 0"
);
let number_of_records_f64 = number_of_records as f64;
let delta = delta_opt.unwrap_or(1.0 / (number_of_records_f64.ln() * number_of_records_f64));
assert!(
delta > 0.0 && delta < 1.0,
"delta value must be between 0 and 1"
);
delta
}
#[inline]
fn unwrap_sigma_proportions_or_default(
sigma_proportions_opt: &Option<Vec<f64>>,
reporting_length: usize,
) -> Vec<f64> {
let sigma_proportions = match sigma_proportions_opt {
Some(proportions) => proportions.clone(),
None => {
let mut v = Vec::default();
v.resize_with(reporting_length, || 1.0);
v
}
};
assert!(
reporting_length == sigma_proportions.len(),
"sigma proportions array size should match the reporting length",
);
sigma_proportions
}
#[inline]
pub fn protect_number_of_records(
number_of_records_epsilon: f64,
number_of_records: usize,
) -> usize {
info!(
"protecting reported number of records with epsilon = {}",
number_of_records_epsilon
);
assert!(
number_of_records_epsilon > 0.0,
"number of records epsilon should be > 0"
);
let protected_number_of_records = ((number_of_records as f64)
+ Laplace::new(0.0, 1.0 / number_of_records_epsilon)
.expect("error generating Laplace noise")
.sample(&mut thread_rng()))
.round();
assert!(
protected_number_of_records > 0.0,
"adding noise to number of records resulted in a negative number"
);
protected_number_of_records as usize
}
#[inline]
fn calc_percentile_epsilon_and_sigmas(
reporting_length: usize,
marginals_epsilon: f64,
delta: f64,
sigma_proportions: &[f64],
percentile_epsilon_proportion: f64,
) -> (f64, Vec<f64>) {
assert!(
percentile_epsilon_proportion < 1.0 && percentile_epsilon_proportion > 0.0,
"percentile_epsilon_proportion must be > 0 and < 1"
);
let t = reporting_length as f64;
let rho_sqrt = (marginals_epsilon + (2.0 / delta).ln()).sqrt() - (2.0 / delta).ln().sqrt();
let rho = rho_sqrt * rho_sqrt;
let k: f64 = sigma_proportions.iter().map(|p| 1.0 / (p * p)).sum();
let percentile_epsilon = (2.0 * rho * percentile_epsilon_proportion / t).sqrt();
let base_sigma = (k / (2.0 * rho * (1.0 - percentile_epsilon_proportion))).sqrt();
let sigmas: Vec<f64> = sigma_proportions.iter().map(|p| p * base_sigma).collect();
let lhs = ((t * percentile_epsilon * percentile_epsilon) / 2.0)
+ (sigmas.iter().map(|s| 1.0 / (s * s)).sum::<f64>() / 2.0);
info!("percentile epsilon = {percentile_epsilon}, calculated sigmas = {sigmas:?}");
assert!(
(lhs - rho).abs() <= DEFAULT_TOLERANCE,
"something went wrong calculating DP sigmas"
);
(percentile_epsilon, sigmas)
}
#[inline]
fn calc_marginals_parameters(
reporting_length: usize,
total_epsilon: f64,
delta_opt: &Option<f64>,
number_of_records_epsilon_proportion_opt: &Option<f64>,
sigma_proportions_opt: &Option<Vec<f64>>,
number_of_records: usize,
) -> (Vec<f64>, f64, f64, Option<usize>, f64) {
let sigma_proportions = NoiseParameters::unwrap_sigma_proportions_or_default(
sigma_proportions_opt,
reporting_length,
);
let number_of_records_epsilon: f64;
let marginals_epsilon: f64;
let protected_number_of_records: Option<usize>;
let delta: f64;
if let Some(number_of_records_epsilon_proportion) = number_of_records_epsilon_proportion_opt
{
// get a fraction of the budget to protect the number of records
(number_of_records_epsilon, marginals_epsilon) =
NoiseParameters::split_budget_for_records_and_marginals(
total_epsilon,
*number_of_records_epsilon_proportion,
);
// consume budget to protect number of records
protected_number_of_records = Some(NoiseParameters::protect_number_of_records(
number_of_records_epsilon,
number_of_records,
));
// build delta from the protected number of records
delta = NoiseParameters::delta_value_or_default(
delta_opt,
protected_number_of_records.unwrap(),
);
} else {
// we don't want to protect the number of records, use all
// the budget for the marginals
number_of_records_epsilon = 0.0;
marginals_epsilon = total_epsilon;
protected_number_of_records = None;
delta = NoiseParameters::delta_value_or_default(delta_opt, number_of_records);
}
(
sigma_proportions,
number_of_records_epsilon,
marginals_epsilon,
protected_number_of_records,
delta,
)
}
#[inline]
pub fn new(
reporting_length: usize,
total_epsilon: f64,
delta_opt: &Option<f64>,
percentile_epsilon_proportion: f64,
number_of_records_epsilon_proportion_opt: &Option<f64>,
sigma_proportions_opt: &Option<Vec<f64>>,
number_of_records: usize,
) -> NoiseParameters {
let (
sigma_proportions,
number_of_records_epsilon,
marginals_epsilon,
protected_number_of_records,
delta,
) = NoiseParameters::calc_marginals_parameters(
reporting_length,
total_epsilon,
delta_opt,
number_of_records_epsilon_proportion_opt,
sigma_proportions_opt,
number_of_records,
);
info!(
"calculating percentile epsilon and sigmas with:
total_epsilon = {total_epsilon},
number_of_records_epsilon = {number_of_records_epsilon},
marginals_epsilon = {marginals_epsilon},
delta = {delta},
percentile_epsilon_proportion = {percentile_epsilon_proportion},
number_of_records_epsilon_proportion = {number_of_records_epsilon_proportion_opt:?},
sigma_proportions = {sigma_proportions:?}"
);
let (percentile_epsilon, sigmas) = NoiseParameters::calc_percentile_epsilon_and_sigmas(
reporting_length,
marginals_epsilon,
delta,
&sigma_proportions,
percentile_epsilon_proportion,
);
NoiseParameters {
percentile_epsilon,
sigmas,
delta,
protected_number_of_records,
}
}
}

View file

@@ -54,11 +54,11 @@ pub struct AggregatedData {
pub reporting_length: usize,
}
impl AggregatedData {
impl Default for AggregatedData {
/// Creates a new AggregatedData struct with default values
#[inline]
pub fn default() -> AggregatedData {
AggregatedData {
fn default() -> Self {
Self {
headers: DataBlockHeaders::default(),
multi_value_column_metadata_map: MultiValueColumnMetadataMap::default(),
number_of_records: 0,
@@ -68,7 +68,9 @@ impl AggregatedData {
reporting_length: 0,
}
}
}
impl AggregatedData {
/// Creates a new AggregatedData struct
/// # Arguments:
/// * `headers` - Vector of strings representing the data headers

View file

@@ -27,11 +27,11 @@ pub struct RecordsAnalysis {
pub percentage_of_records_with_risky_combinations: f64,
}
impl RecordsAnalysis {
#[inline]
impl Default for RecordsAnalysis {
/// Creates a new RecordsAnalysis with default values
pub fn default() -> RecordsAnalysis {
RecordsAnalysis {
#[inline]
fn default() -> Self {
Self {
number_of_records_with_unique_combinations: 0,
percentage_of_records_with_unique_combinations: 0.0,
number_of_records_with_rare_combinations: 0,
@@ -118,43 +118,52 @@ impl RecordsAnalysisData {
} as f64;
let records_analysis_by_len: RecordsAnalysisByLenMap = (1..=reporting_length)
.map(|l| {
let mut ra = RecordsAnalysis::default();
ra.number_of_records_with_unique_combinations = records_with_unique_combs_by_len
.get(&l)
.map_or(0, |records| records.len());
ra.number_of_records_with_rare_combinations = records_with_rare_combs_by_len
let mut number_of_records_with_unique_combinations =
records_with_unique_combs_by_len
.get(&l)
.map_or(0, |records| records.len());
let mut number_of_records_with_rare_combinations = records_with_rare_combs_by_len
.get(&l)
.map_or(0, |records| records.len());
if protect {
ra.number_of_records_with_unique_combinations = uround_down(
ra.number_of_records_with_unique_combinations as f64,
number_of_records_with_unique_combinations = uround_down(
number_of_records_with_unique_combinations as f64,
resolution as f64,
);
ra.number_of_records_with_rare_combinations = uround_down(
ra.number_of_records_with_rare_combinations as f64,
number_of_records_with_rare_combinations = uround_down(
number_of_records_with_rare_combinations as f64,
resolution as f64,
)
}
ra.percentage_of_records_with_unique_combinations = calc_percentage(
ra.number_of_records_with_unique_combinations as f64,
let percentage_of_records_with_unique_combinations = calc_percentage(
number_of_records_with_unique_combinations as f64,
total_number_of_records_f64,
);
ra.percentage_of_records_with_rare_combinations = calc_percentage(
ra.number_of_records_with_rare_combinations as f64,
let percentage_of_records_with_rare_combinations = calc_percentage(
number_of_records_with_rare_combinations as f64,
total_number_of_records_f64,
);
ra.number_of_records_with_risky_combinations = ra
.number_of_records_with_unique_combinations
+ ra.number_of_records_with_rare_combinations;
ra.percentage_of_records_with_risky_combinations = calc_percentage(
ra.number_of_records_with_risky_combinations as f64,
let number_of_records_with_risky_combinations =
number_of_records_with_unique_combinations
+ number_of_records_with_rare_combinations;
let percentage_of_records_with_risky_combinations = calc_percentage(
number_of_records_with_risky_combinations as f64,
total_number_of_records_f64,
);
(l, ra)
(
l,
RecordsAnalysis {
number_of_records_with_unique_combinations,
percentage_of_records_with_unique_combinations,
number_of_records_with_rare_combinations,
percentage_of_records_with_rare_combinations,
number_of_records_with_risky_combinations,
percentage_of_records_with_risky_combinations,
},
)
})
.collect();
RecordsAnalysisData {

View file

@@ -19,13 +19,15 @@ pub struct ValueCombination {
combination: Vec<Arc<DataBlockValue>>,
}
impl ValueCombination {
#[inline]
impl Default for ValueCombination {
/// Creates a new ValueCombination with default values
pub fn default() -> ValueCombination {
ValueCombination::new(Vec::default())
#[inline]
fn default() -> Self {
Self::new(Vec::default())
}
}
impl ValueCombination {
#[inline]
/// Creates a new ValueCombination
/// # Arguments

View file

@@ -16,12 +16,15 @@ use crate::processing::evaluator::preservation_by_length::PreservationByLengthBu
/// Evaluates aggregated, sensitive and synthesized data
pub struct Evaluator {}
impl Evaluator {
impl Default for Evaluator {
/// Returns a new Evaluator
pub fn default() -> Evaluator {
Evaluator {}
#[inline]
fn default() -> Self {
Self {}
}
}
impl Evaluator {
fn calc_combinations_abs_error_sum_count_by_len(
&self,
sensitive_aggregated_data: &AggregatedData,

View file

@@ -18,10 +18,11 @@ pub struct PreservationBucket {
pub proportional_error_sum: f64,
}
impl PreservationBucket {
impl Default for PreservationBucket {
/// Returns a new PreservationBucket with default values
pub fn default() -> PreservationBucket {
PreservationBucket {
#[inline]
fn default() -> Self {
Self {
size: 0,
preservation_sum: 0.0,
length_sum: 0,
@@ -29,7 +30,9 @@ impl PreservationBucket {
proportional_error_sum: 0.0,
}
}
}
impl PreservationBucket {
/// Adds a new value to the bucket
/// # Arguments
/// * `preservation` - Preservation related to the value

View file

@@ -27,15 +27,17 @@ pub struct PreservationByCountBuckets {
buckets_map: PreservationBucketsMap,
}
impl PreservationByCountBuckets {
impl Default for PreservationByCountBuckets {
/// Returns a new default PreservationByCountBuckets
#[inline]
pub fn default() -> PreservationByCountBuckets {
PreservationByCountBuckets {
fn default() -> Self {
Self {
buckets_map: PreservationBucketsMap::default(),
}
}
}
impl PreservationByCountBuckets {
#[inline]
pub(super) fn populate(
&mut self,

View file

@@ -17,15 +17,17 @@ pub struct PreservationByLengthBuckets {
buckets_map: PreservationBucketsMap,
}
impl PreservationByLengthBuckets {
impl Default for PreservationByLengthBuckets {
/// Returns a new default PreservationByLengthBuckets
#[inline]
pub fn default() -> PreservationByLengthBuckets {
PreservationByLengthBuckets {
fn default() -> Self {
Self {
buckets_map: PreservationBucketsMap::default(),
}
}
}
impl PreservationByLengthBuckets {
#[inline]
pub(super) fn populate(
&mut self,

View file

@@ -27,17 +27,19 @@ pub struct GeneratedData {
pub multi_value_column_metadata_map: MultiValueColumnMetadataMap,
}
impl GeneratedData {
impl Default for GeneratedData {
/// Returns a new GeneratedData struct with default values
#[inline]
pub fn default() -> GeneratedData {
GeneratedData {
fn default() -> Self {
Self {
synthetic_data: RawData::default(),
expansion_ratio: 0.0,
multi_value_column_metadata_map: MultiValueColumnMetadataMap::default(),
}
}
}
impl GeneratedData {
/// Returns a new GeneratedData struct
/// # Arguments
/// * `synthetic_data` - Synthesized data headers (index 0) and records indexes 1...

View file

@@ -1,38 +0,0 @@
use sds_core::dp::{DpAnalyticGaussianContinuousCDFScale, DEFAULT_TOLERANCE};
use statrs::distribution::Normal;
#[test]
pub fn validate_sigma() {
let n = Normal::new(0.0, 1.0).unwrap();
assert!(
(n.calc_sigma_dp(f64::sqrt(30.0), 6.0, 0.5, DEFAULT_TOLERANCE) - 1.4659731497780966).abs()
<= DEFAULT_TOLERANCE
);
assert!(
(n.calc_sigma_dp(f64::sqrt(30.0), 6.0, 1.0 / 100000.0, DEFAULT_TOLERANCE)
- 4.182602139814776)
.abs()
<= DEFAULT_TOLERANCE
);
assert!(
(n.calc_sigma_dp(f64::sqrt(100.0), 0.1, 1.0 / 100000.0, DEFAULT_TOLERANCE)
.abs()
- 307.49566132862844)
<= DEFAULT_TOLERANCE
);
assert!(
(n.calc_sigma_dp(f64::sqrt(100.0), 0.1, 0.5, DEFAULT_TOLERANCE) - 7.016745810753165).abs()
<= DEFAULT_TOLERANCE
);
assert!(
(n.calc_sigma_dp(f64::sqrt(0.1), 0.1, 0.5, DEFAULT_TOLERANCE) - 0.221888985244248).abs()
<= DEFAULT_TOLERANCE
);
assert!(
(n.calc_sigma_dp(f64::sqrt(0.1), 0.1, 1.0 / 20000.0, DEFAULT_TOLERANCE)
- 8.370597761781507)
.abs()
<= DEFAULT_TOLERANCE
);
}

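Note: the deleted test pinned `calc_sigma_dp` to reference values. For context, a hedged reconstruction of what it exercised, assuming `calc_sigma_dp(sensitivity, epsilon, delta, tolerance)` implements the analytic Gaussian calibration of Balle & Wang (2018): it searches for the smallest noise scale sigma such that the Gaussian mechanism with L2 sensitivity Delta is (epsilon, delta)-differentially private, i.e.

\Phi\!\left(\frac{\Delta}{2\sigma} - \frac{\varepsilon\sigma}{\Delta}\right) - e^{\varepsilon}\,\Phi\!\left(-\frac{\Delta}{2\sigma} - \frac{\varepsilon\sigma}{\Delta}\right) \le \delta

where \Phi is the standard normal CDF. Under that reading, the test's first arguments (sqrt(30), sqrt(100), sqrt(0.1)) are sensitivities and the asserted constants are precomputed sigmas.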
View file

@ -1,5 +1,3 @@
mod analytic_gaussian;
mod noise_aggregator;
mod percentile;

View file

@ -23,7 +23,7 @@ fn get_noise_aggregator() -> NoiseAggregator {
0,
),
3,
&DpParameters::new(1.0, 0.001, 99, 0.1, None, None),
&DpParameters::new(1.0, 99, 0.1, Some(0.001), None, None),
NoisyCountThreshold::Fixed(InputValueByLen::default()),
)
}

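Note: the updated call reflects a reordered `DpParameters::new`: `delta` is no longer the second positional argument but an `Option<f64>` placed after the percentile settings. A hedged sketch of the shape this implies; field names are inferred from the call sites in this PR, not taken from the crate:

pub struct DpParametersSketch {
    pub epsilon: f64,
    pub percentile_percentage: usize,
    pub percentile_epsilon_proportion: f64,
    pub delta: Option<f64>, // None => derived at runtime from the record count
    pub sigma_proportions: Option<Vec<f64>>,
    pub number_of_records_epsilon_proportion: Option<f64>,
}

impl DpParametersSketch {
    pub fn new(
        epsilon: f64,
        percentile_percentage: usize,
        percentile_epsilon_proportion: f64,
        delta: Option<f64>,
        sigma_proportions: Option<Vec<f64>>,
        number_of_records_epsilon_proportion: Option<f64>,
    ) -> Self {
        Self {
            epsilon,
            percentile_percentage,
            percentile_epsilon_proportion,
            delta,
            sigma_proportions,
            number_of_records_epsilon_proportion,
        }
    }
}

fn main() {
    // Mirrors the updated call in the hunk above.
    let _p = DpParametersSketch::new(1.0, 99, 0.1, Some(0.001), None, None);
}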
View file

@ -1,6 +1,6 @@
[package]
name = "pac-synth"
version = "0.0.7"
version = "0.0.8"
license = "MIT"
description = "Private Accurate Combination (PAC) Synthesizers"
repository = "https://github.com/microsoft/synthetic-data-showcase"
@ -12,7 +12,7 @@ crate-type = ["cdylib"]
[dependencies]
log = { version = "0.4", features = ["std"] }
pyo3 = { version = "0.17", features = ["extension-module", "abi3-py37"] }
pyo3 = { version = "0.18", features = ["extension-module", "abi3-py37"] }
sds-core = { path = "../core", features = ["pyo3", "rayon"] }
serde = { version = "1.0", features = [ "derive", "rc" ] }
serde_json = { version = "1.0" }

File diff not shown because of its large size. Load diff

View file

@ -161,7 +161,7 @@
" <th>count</th>\n",
" <td>6000.000000</td>\n",
" <td>6000.000000</td>\n",
" <td>6000.000000</td>\n",
" <td>6000.00000</td>\n",
" <td>6000.000000</td>\n",
" <td>6000.000000</td>\n",
" <td>6000.000000</td>\n",
@ -172,35 +172,35 @@
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.003000</td>\n",
" <td>2.612500</td>\n",
" <td>4.557333</td>\n",
" <td>0.494000</td>\n",
" <td>0.517833</td>\n",
" <td>0.501667</td>\n",
" <td>0.497333</td>\n",
" <td>0.494333</td>\n",
" <td>0.513833</td>\n",
" <td>0.496167</td>\n",
" <td>0.992333</td>\n",
" <td>2.646333</td>\n",
" <td>4.57900</td>\n",
" <td>0.502333</td>\n",
" <td>0.492667</td>\n",
" <td>0.497000</td>\n",
" <td>0.498833</td>\n",
" <td>0.501500</td>\n",
" <td>0.487500</td>\n",
" <td>0.493000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.810206</td>\n",
" <td>2.123228</td>\n",
" <td>3.324338</td>\n",
" <td>0.500006</td>\n",
" <td>0.499724</td>\n",
" <td>0.816324</td>\n",
" <td>2.107129</td>\n",
" <td>3.32665</td>\n",
" <td>0.500036</td>\n",
" <td>0.499988</td>\n",
" <td>0.500033</td>\n",
" <td>0.500040</td>\n",
" <td>0.500039</td>\n",
" <td>0.500035</td>\n",
" <td>0.500010</td>\n",
" <td>0.499850</td>\n",
" <td>0.500027</td>\n",
" <td>0.499885</td>\n",
" <td>0.499993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
@ -212,8 +212,8 @@
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.00000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
@ -225,21 +225,21 @@
" <tr>\n",
" <th>50%</th>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>4.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>5.00000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2.000000</td>\n",
" <td>5.000000</td>\n",
" <td>7.000000</td>\n",
" <td>7.00000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
@ -252,7 +252,7 @@
" <th>max</th>\n",
" <td>2.000000</td>\n",
" <td>6.000000</td>\n",
" <td>10.000000</td>\n",
" <td>10.00000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
@ -266,23 +266,23 @@
"</div>"
],
"text/plain": [
" H1 H2 H3 H4 H5 \\\n",
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
"mean 1.003000 2.612500 4.557333 0.494000 0.517833 \n",
"std 0.810206 2.123228 3.324338 0.500006 0.499724 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
"50% 1.000000 2.000000 4.000000 0.000000 1.000000 \n",
"75% 2.000000 5.000000 7.000000 1.000000 1.000000 \n",
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
" H1 H2 H3 H4 H5 \\\n",
"count 6000.000000 6000.000000 6000.00000 6000.000000 6000.000000 \n",
"mean 0.992333 2.646333 4.57900 0.502333 0.492667 \n",
"std 0.816324 2.107129 3.32665 0.500036 0.499988 \n",
"min 0.000000 0.000000 0.00000 0.000000 0.000000 \n",
"25% 0.000000 1.000000 1.00000 0.000000 0.000000 \n",
"50% 1.000000 3.000000 5.00000 1.000000 0.000000 \n",
"75% 2.000000 5.000000 7.00000 1.000000 1.000000 \n",
"max 2.000000 6.000000 10.00000 1.000000 1.000000 \n",
"\n",
" H6 H7 H8 H9 H10 \n",
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
"mean 0.501667 0.497333 0.494333 0.513833 0.496167 \n",
"std 0.500039 0.500035 0.500010 0.499850 0.500027 \n",
"mean 0.497000 0.498833 0.501500 0.487500 0.493000 \n",
"std 0.500033 0.500040 0.500039 0.499885 0.499993 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"50% 1.000000 0.000000 0.000000 1.000000 0.000000 \n",
"50% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 1.000000 "
]
@ -337,42 +337,42 @@
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6001.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" <td>6030.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.976671</td>\n",
" <td>2.495084</td>\n",
" <td>4.334778</td>\n",
" <td>0.453091</td>\n",
" <td>0.478587</td>\n",
" <td>0.457424</td>\n",
" <td>0.461256</td>\n",
" <td>0.462923</td>\n",
" <td>0.473254</td>\n",
" <td>0.465756</td>\n",
" <td>0.937977</td>\n",
" <td>2.462023</td>\n",
" <td>4.250083</td>\n",
" <td>0.477944</td>\n",
" <td>0.470149</td>\n",
" <td>0.462355</td>\n",
" <td>0.478441</td>\n",
" <td>0.475788</td>\n",
" <td>0.465008</td>\n",
" <td>0.469818</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.815346</td>\n",
" <td>2.140020</td>\n",
" <td>3.378620</td>\n",
" <td>0.497836</td>\n",
" <td>0.499583</td>\n",
" <td>0.498225</td>\n",
" <td>0.498538</td>\n",
" <td>0.498665</td>\n",
" <td>0.499326</td>\n",
" <td>0.498868</td>\n",
" <td>0.825132</td>\n",
" <td>2.132173</td>\n",
" <td>3.401991</td>\n",
" <td>0.499555</td>\n",
" <td>0.499150</td>\n",
" <td>0.498622</td>\n",
" <td>0.499576</td>\n",
" <td>0.499455</td>\n",
" <td>0.498815</td>\n",
" <td>0.499130</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
@ -445,9 +445,9 @@
],
"text/plain": [
" H1 H2 H3 H4 H5 \\\n",
"count 6001.000000 6001.000000 6001.000000 6001.000000 6001.000000 \n",
"mean 0.976671 2.495084 4.334778 0.453091 0.478587 \n",
"std 0.815346 2.140020 3.378620 0.497836 0.499583 \n",
"count 6030.000000 6030.000000 6030.000000 6030.000000 6030.000000 \n",
"mean 0.937977 2.462023 4.250083 0.477944 0.470149 \n",
"std 0.825132 2.132173 3.401991 0.499555 0.499150 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
"50% 1.000000 2.000000 4.000000 0.000000 0.000000 \n",
@ -455,9 +455,9 @@
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
"\n",
" H6 H7 H8 H9 H10 \n",
"count 6001.000000 6001.000000 6001.000000 6001.000000 6001.000000 \n",
"mean 0.457424 0.461256 0.462923 0.473254 0.465756 \n",
"std 0.498225 0.498538 0.498665 0.499326 0.498868 \n",
"count 6030.000000 6030.000000 6030.000000 6030.000000 6030.000000 \n",
"mean 0.462355 0.478441 0.475788 0.465008 0.469818 \n",
"std 0.498622 0.499576 0.499455 0.498815 0.499130 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
@ -491,7 +491,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {

View file

@ -11,7 +11,7 @@ use serde::Serialize;
/// By default, the builder will be constructed with default values:
/// - reporting_length: 3
/// - epsilon: 4.0
/// - delta: will be set in runtime to 1 / (ln(number_of_records) * number_of_records)
/// - delta: will be set at runtime to `1 / (ln(protected_number_of_records) * protected_number_of_records)`
/// - percentile_percentage: 99
/// - percentile_epsilon_proportion: 0.01
/// - accuracy_mode: AccuracyMode.prioritize_long_combinations()
@ -307,8 +307,8 @@ impl DpAggregateSeededParametersBuilder {
}
if let Some(delta) = self._delta {
if delta <= 0.0 {
return Err(PyValueError::new_err("delta must be > 0"));
if delta <= 0.0 || delta >= 1.0 {
return Err(PyValueError::new_err("delta must be > 0 and < 1"));
}
}
@ -333,12 +333,6 @@ impl DpAggregateSeededParametersBuilder {
));
}
if self._percentile_epsilon_proportion + self._number_of_records_epsilon_proportion >= 1.0 {
return Err(PyValueError::new_err(
"percentile_epsilon_proportion + number_of_records_epsilon_proportion must be < 1",
));
}
self._fabrication_mode.validate(self._reporting_length)?;
if self._weight_selection_percentile > 100 {

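Note: two behavioral changes land in this builder. First, `delta` may now be omitted and defaults at runtime to `1 / (ln(n) * n)` for the protected record count `n`; second, an explicit `delta` must lie strictly inside (0, 1) instead of merely being positive. The hunk also drops the combined epsilon-proportion check from this builder. A self-contained sketch of the delta logic; the function names are illustrative, not the builder's actual API:

// Mirrors the documented runtime default; meaningful only for n > 1,
// since ln(1) == 0 would divide by zero.
fn delta_or_default(delta: Option<f64>, protected_number_of_records: usize) -> f64 {
    delta.unwrap_or_else(|| {
        let n = protected_number_of_records as f64;
        if protected_number_of_records > 1 {
            1.0 / (n.ln() * n)
        } else {
            0.0
        }
    })
}

// Mirrors the tightened validation: the upper bound is new.
fn validate_delta(delta: f64) -> Result<(), String> {
    if delta <= 0.0 || delta >= 1.0 {
        return Err("delta must be > 0 and < 1".into());
    }
    Ok(())
}

fn main() {
    assert!(validate_delta(delta_or_default(None, 6000)).is_ok());
    assert!(validate_delta(1.0).is_err()); // now rejected
}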
View file

@ -1,7 +1,6 @@
use super::{DpAggregateSeededParameters, DpAggregateSeededParametersBuilder};
use pyo3::{exceptions::PyRuntimeError, prelude::*};
use sds_core::{
data_block::DataBlock,
dp::DpParameters,
processing::{
aggregator::{AggregatedData, AggregatesCountStringMap, Aggregator},
@ -72,9 +71,9 @@ impl DpAggregateSeededSynthesizer {
self._parameters.reporting_length,
&DpParameters::new(
self._parameters.epsilon,
self.delta_value_or_default(&dataset.data_block),
self._parameters.percentile_percentage,
self._parameters.percentile_epsilon_proportion,
self._parameters.delta,
Some(self._parameters.sigma_proportions.clone()),
Some(self._parameters.number_of_records_epsilon_proportion),
),
@ -167,20 +166,6 @@ impl DpAggregateSeededSynthesizer {
}
}
impl DpAggregateSeededSynthesizer {
#[inline]
fn delta_value_or_default(&self, data_block: &DataBlock) -> f64 {
let number_of_records = data_block.number_of_records();
let number_of_records_f64 = number_of_records as f64;
self._parameters.delta.unwrap_or(if number_of_records > 0 {
1.0 / (number_of_records_f64.ln() * number_of_records_f64)
} else {
0.0
})
}
}
pub(crate) fn register(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
m.add_class::<DpAggregateSeededSynthesizer>()?;
Ok(())

View file

@ -1,6 +1,6 @@
[package]
name = "sds-pyo3"
version = "1.8.6"
version = "1.9.0"
license = "MIT"
description = "Python bindings for the sds-core library"
repository = "https://github.com/microsoft/synthetic-data-showcase"
@ -13,6 +13,6 @@ crate-type = ["cdylib"]
[dependencies]
log = { version = "0.4", features = ["std"] }
csv = { version = "1.1" }
pyo3 = { version = "0.17", features = ["extension-module"] }
pyo3 = { version = "0.18", features = ["extension-module"] }
sds-core = { path = "../core", features = ["pyo3", "rayon"] }
env_logger = { version = "0.9" }
env_logger = { version = "0.10" }

View file

@ -48,11 +48,11 @@ impl SDSProcessor {
pub fn new(
path: &str,
delimiter: char,
subject_id: Option<String>,
use_columns: Vec<String>,
multi_value_columns: HashMap<String, String>,
sensitive_zeros: Vec<String>,
record_limit: usize,
subject_id: Option<String>,
) -> Result<SDSProcessor, CsvDataBlockCreatorError> {
CsvDataBlockCreator::create(
ReaderBuilder::new()

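Note: `SDSProcessor::new` moves `subject_id` from the third positional parameter to the last, so every positional caller (including the Python pipeline further down) must reorder its arguments. A stand-in sketch of the new order; `ProcessorSketch` is illustrative, not the real type:

use std::collections::HashMap;

struct ProcessorSketch;

impl ProcessorSketch {
    fn new(
        _path: &str,
        _delimiter: char,
        _use_columns: Vec<String>,
        _multi_value_columns: HashMap<String, String>,
        _sensitive_zeros: Vec<String>,
        _record_limit: usize,
        _subject_id: Option<String>, // was parameter #3, now last
    ) -> Self {
        ProcessorSketch
    }
}

fn main() {
    // record_limit 0 = use all records; None = no subject id column.
    let _p = ProcessorSketch::new(
        "sensitive.tsv",
        '\t',
        Vec::new(),
        HashMap::new(),
        Vec::new(),
        0,
        None,
    );
}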
View file

@ -1,6 +1,6 @@
[package]
name = "sds-wasm"
version = "1.8.6"
version = "1.9.0"
license = "MIT"
description = "Web Assembly bindings for the sds-core library"
repository = "https://github.com/microsoft/synthetic-data-showcase"

View file

@ -14,14 +14,16 @@ pub struct WasmAggregateResult {
pub(crate) aggregated_data: Arc<AggregatedData>,
}
impl WasmAggregateResult {
impl Default for WasmAggregateResult {
#[inline]
pub fn default() -> WasmAggregateResult {
WasmAggregateResult {
fn default() -> Self {
Self {
aggregated_data: Arc::new(AggregatedData::default()),
}
}
}
impl WasmAggregateResult {
#[inline]
pub fn new(aggregated_data: Arc<AggregatedData>) -> WasmAggregateResult {
WasmAggregateResult { aggregated_data }

View file

@ -5,6 +5,7 @@ use wasm_bindgen::{prelude::*, JsCast};
use crate::utils::js::{JsGenerateResult, JsResult};
#[derive(Default)]
#[wasm_bindgen]
pub struct WasmGenerateResult {
generated_data: GeneratedData,
@ -12,14 +13,6 @@ pub struct WasmGenerateResult {
}
impl WasmGenerateResult {
#[inline]
pub fn default() -> WasmGenerateResult {
WasmGenerateResult {
generated_data: GeneratedData::default(),
resolution: 0,
}
}
#[inline]
pub fn new(generated_data: GeneratedData, resolution: usize) -> WasmGenerateResult {
WasmGenerateResult {

View file

@ -36,10 +36,10 @@ pub struct WasmNavigateResult {
column_index_by_name: ColumnIndexByName,
}
impl WasmNavigateResult {
impl Default for WasmNavigateResult {
#[inline]
pub fn default() -> WasmNavigateResult {
WasmNavigateResult::new(
fn default() -> Self {
Self::new(
HeaderNames::default(),
Arc::new(DataBlock::default()),
AttributeRowsByColumnMap::default(),
@ -49,7 +49,9 @@ impl WasmNavigateResult {
ColumnIndexByName::default(),
)
}
}
impl WasmNavigateResult {
#[inline]
pub fn new(
header_names: HeaderNames,

View file

@ -28,6 +28,7 @@ use crate::{
},
};
#[derive(Default)]
#[wasm_bindgen]
pub struct WasmSdsContext {
sensitive_data_params: Option<WasmCsvDataParameters>,
@ -45,19 +46,8 @@ pub struct WasmSdsContext {
#[wasm_bindgen]
impl WasmSdsContext {
#[wasm_bindgen(constructor)]
pub fn default() -> Self {
WasmSdsContext {
sensitive_data_params: None,
sensitive_processor: None,
synthetic_processor: None,
sensitive_aggregate_result: None,
reportable_aggregate_result: None,
synthetic_aggregate_result: None,
generate_result: None,
pre_computed_aggregates: false,
evaluate_result: None,
navigate_result: None,
}
pub fn new_default() -> Self {
Self::default()
}
#[wasm_bindgen(js_name = "clear")]

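Note: here the hand-written `default()` disappears entirely in favor of `#[derive(Default)]`, with the exported constructor renamed to `new_default()` and reduced to a shim over the derive, so it no longer shadows the trait method. A minimal sketch with an abbreviated field set:

#[derive(Default)]
struct SdsContextSketch {
    pre_computed_aggregates: bool,
    evaluate_result: Option<String>, // stand-in for the many Option<...> fields
}

impl SdsContextSketch {
    // What the wasm-bindgen constructor now forwards to.
    fn new_default() -> Self {
        Self::default()
    }
}

fn main() {
    let ctx = SdsContextSketch::new_default();
    assert!(!ctx.pre_computed_aggregates);
    assert!(ctx.evaluate_result.is_none());
}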
View file

@ -29,9 +29,10 @@ pub struct WasmSdsProcessor {
pub(crate) data_block: Arc<DataBlock>,
}
impl WasmSdsProcessor {
pub fn default() -> WasmSdsProcessor {
WasmSdsProcessor {
impl Default for WasmSdsProcessor {
#[inline]
fn default() -> Self {
Self {
data_block: Arc::new(DataBlock::default()),
}
}

View file

@ -46,11 +46,11 @@ export interface INoisyCountThreshold {
export interface IDpParameters {
epsilon: number
delta: number
percentilePercentage: number
percentileEpsilonProportion: number
numberOfRecordsEpsilonProportion?: number
delta?: number
sigmaProportions?: number[]
numberOfRecordsEpsilonProportion?: number
}
export interface IOversamplingParameters {

View file

@ -53,11 +53,11 @@ def aggregate(config):
sds_processor = sds.SDSProcessor(
sensitive_microdata_path,
sensitive_microdata_delimiter,
subject_id,
use_columns,
multi_value_columns,
sensitive_zeros,
max(record_limit, 0)
max(record_limit, 0),
subject_id
)
aggregated_data = sds_processor.aggregate(
@ -75,20 +75,20 @@ def aggregate(config):
aggregated_data.write_to_json(sensitive_aggregated_data_json)
if dp_aggregates:
if not delta_factor:
delta_factor = math.log(sds_processor.number_of_records())
noise_delta = 1 / \
(delta_factor * sds_processor.number_of_records())
if delta_factor:
noise_delta = 1 / \
(delta_factor * sds_processor.number_of_records())
else:
noise_delta = None
if noise_threshold_type == 'fixed':
aggregated_data = sds_processor.aggregate_with_dp_fixed_threshold(
reporting_length,
sds.DpParameters(
noise_epsilon,
noise_delta,
percentile_percentage,
percentile_epsilon_proportion,
noise_delta,
sigma_proportions,
number_of_records_epsilon_proportion
),
@ -99,9 +99,9 @@ def aggregate(config):
reporting_length,
sds.DpParameters(
noise_epsilon,
noise_delta,
percentile_percentage,
percentile_epsilon_proportion,
noise_delta,
sigma_proportions,
number_of_records_epsilon_proportion
),

View file

@ -84,11 +84,11 @@ class Evaluator:
self.syn_sds_processor = sds.SDSProcessor(
self.synthetic_microdata_path,
"\t",
None, # the synthetic data does not have an ID
[], # use all columns from synthetic file
[], # use all columns from synthetic file
self.multi_value_columns,
self.sensitive_zeros,
0 # use all records from synthetic file
0, # use all records from synthetic file
None # the synthetic data does not have an ID
)
self.syn_aggregated_data = self.syn_sds_processor.aggregate(
self.reporting_length

View file

@ -46,11 +46,11 @@ def generate(config):
sds_processor = sds.SDSProcessor(
sensitive_microdata_path,
sensitive_microdata_delimiter,
subject_id,
use_columns,
multi_value_columns,
sensitive_zeros,
max(record_limit, 0)
max(record_limit, 0),
subject_id
)
if synthesis_mode == 'unseeded':

View file

@ -1,6 +1,6 @@
{
"name": "webapp",
"version": "1.8.6",
"version": "1.9.0",
"private": true,
"license": "MIT",
"main": "src/index.ts",
@ -22,51 +22,50 @@
"@essex/arquero": "^2.0.3",
"@essex/arquero-react": "^1.1.0",
"@essex/sds-core": "workspace:^",
"@fluentui/font-icons-mdl2": "^8.5.4",
"@fluentui/react": "^8.103.9",
"@fluentui/react-hooks": "^8.6.14",
"@fluentui/utilities": "^8.13.4",
"@fluentui/font-icons-mdl2": "^8.5.13",
"@fluentui/react": "^8.106.7",
"@fluentui/react-hooks": "^8.6.20",
"@fluentui/utilities": "^8.13.9",
"@sds/components": "workspace:^",
"@thematic/core": "^3.1.0",
"@thematic/d3": "^2.0.13",
"@thematic/fluent": "^4.1.0",
"@thematic/react": "^2.1.0",
"@thematic/core": "^4.0.4",
"@thematic/d3": "^2.0.19",
"@thematic/fluent": "^5.0.5",
"@thematic/react": "^2.1.6",
"@types/mime": "^3.0.1",
"@uifabric/icons": "7.9.5",
"arquero": "^5.1.0",
"chart.js": "^3.9.1",
"chart.js": "^4.2.1",
"chartjs-plugin-datalabels": "^2.2.0",
"comlink": "^4.3.1",
"dompurify": "^2.4.1",
"comlink": "^4.4.1",
"dompurify": "^3.0.1",
"formik": "^2.2.9",
"lodash": "^4.17.21",
"marked": "^4.2.4",
"marked": "^4.2.12",
"mime": "^3.0.0",
"react": "^17.0.2",
"react-chartjs-2": "^4.3.1",
"react-chartjs-2": "^5.2.0",
"react-dom": "^17.0.2",
"react-is": "^17.0.2",
"react-router-dom": "^6.4.5",
"recoil": "^0.7.6",
"styled-components": "^5.3.6",
"uuid": "^9.0.0",
"yup": "^0.32.11"
"react-router-dom": "^6.9.0",
"recoil": "^0.7.7",
"styled-components": "^5.3.9",
"uuid": "^9.0.0"
},
"devDependencies": {
"@types/dompurify": "^2.4.0",
"@types/dompurify": "^3.0.0",
"@types/lodash": "^4.14.191",
"@types/marked": "^4.0.8",
"@types/node": "^16.18.9",
"@types/react": "^17.0.52",
"@types/react-dom": "^17.0.18",
"@types/node": "^16.18.16",
"@types/react": "^17.0.53",
"@types/react-dom": "^17.0.19",
"@types/react-is": "^17.0.3",
"@types/recoil": "^0.0.9",
"@types/styled-components": "^5.1.26",
"@types/uuid": "^9.0.0",
"@vitejs/plugin-react": "^3.0.0",
"@types/uuid": "^9.0.1",
"@vitejs/plugin-react": "^3.1.0",
"ts-node": "^10.9.1",
"typescript": "^4.8.4",
"vite": "^4.0.1",
"vite-tsconfig-paths": "^4.0.3"
"typescript": "^5.0.2",
"vite": "^4.2.0",
"vite-tsconfig-paths": "^4.0.7"
}
}

Binary file not shown.

View file

@ -20,6 +20,7 @@ export const StyleContext: React.FC<
> = memo(function StyleContext({ children }) {
const theme = useThematic()
const fluentTheme = useMemo(() => loadFluentTheme(theme), [theme])
return (
<>
{/* core thematic for charting colors and imperative use */}

View file

@ -2,9 +2,14 @@
* Copyright (c) Microsoft. All rights reserved.
* Licensed under the MIT license. See LICENSE file in the project.
*/
import { useThematic } from '@thematic/react'
import { useMemo } from 'react'
import {
useNominalBoldScale,
useNominalMutedScale,
useNominalScale,
} from '~utils'
export type BarColors = {
normal: string
selected: string
@ -12,26 +17,32 @@ export type BarColors = {
}
export function useEstimatedBarChartColors(): BarColors {
const thematic = useThematic()
const nominalScale = useNominalScale()
const nominalBoldScale = useNominalBoldScale()
const nominalMutedScale = useNominalMutedScale()
return useMemo(
() => ({
normal: thematic.scales().nominal().toArray()[0],
selected: thematic.scales().nominalBold().toArray()[0],
suppressed: thematic.scales().nominalMuted().toArray()[0],
normal: nominalScale[0],
selected: nominalBoldScale[0],
suppressed: nominalMutedScale[0],
}),
[thematic],
[nominalScale, nominalBoldScale, nominalMutedScale],
)
}
export function useActualBarChartColors(): BarColors {
const thematic = useThematic()
const nominalScale = useNominalScale()
const nominalBoldScale = useNominalBoldScale()
const nominalMutedScale = useNominalMutedScale()
return useMemo(
() => ({
normal: thematic.scales().nominal().toArray()[1],
selected: thematic.scales().nominalBold().toArray()[1],
suppressed: thematic.scales().nominalMuted().toArray()[1],
normal: nominalScale[1],
selected: nominalBoldScale[1],
suppressed: nominalMutedScale[1],
}),
[thematic],
[nominalScale, nominalBoldScale, nominalMutedScale],
)
}

View file

@ -6,6 +6,8 @@ import type { IMetricByKey } from '@essex/sds-core'
import { memo } from 'react'
import { Chart } from 'react-chartjs-2'
import { useNominalScale } from '~utils'
export interface IMetricsChart {
label: string
metrics: IMetricByKey
@ -28,12 +30,14 @@ function add_chart(
chart: IMetricsChart,
position: 'left' | 'right',
labels: number[],
color: string,
) {
datasets.push({
label: chart.label,
type: chart.type,
data: labels.map(l => chart.metrics[l] ?? 0),
yAxisID: position,
backgroundColor: color,
})
scales[position] = {
type: 'linear',
@ -55,13 +59,14 @@ export const MetricsChart: React.FC<MetricsChartProps> = memo(
}: MetricsChartProps) {
const datasets = []
const scales = {}
const color = useNominalScale()[0]
if (leftChart) {
add_chart(datasets, scales, leftChart, 'left', labels)
add_chart(datasets, scales, leftChart, 'left', labels, color)
}
if (rightChart) {
add_chart(datasets, scales, rightChart, 'right', labels)
add_chart(datasets, scales, rightChart, 'right', labels, color)
}
return (

View file

@ -3,7 +3,6 @@
* Licensed under the MIT license. See LICENSE file in the project.
*/
import { useThematic } from '@thematic/react'
import type { _DeepPartialObject } from 'chart.js/types/utils'
import type { Options } from 'chartjs-plugin-datalabels/types/options'
import type { BaseSyntheticEvent, WheelEvent } from 'react'
import { useCallback, useMemo } from 'react'
@ -26,7 +25,7 @@ export interface ChartJsDatasetConfig {
}
export interface DataLabelsConfig {
datalabels?: _DeepPartialObject<Options>
datalabels?: Options
}
function useBarConfig(

View file

@ -73,7 +73,7 @@ const Container = styled.div`
const NavBarStack = styled(Stack)`
height: 100%;
margin-left: ${({ theme }: { theme: FluentTheme }) => theme.spacing?.l2};
margin-right: ${({ theme }: { theme: FluentTheme }) => theme.spacing?.l2}; ;
margin-right: ${({ theme }: { theme: FluentTheme }) => theme.spacing?.l2};
`
const NavBarStackItem = styled(Stack.Item)`

View file

@ -121,7 +121,7 @@ export const AggregateStatistics: FC = memo(function AggregateStatistics() {
}
label={'Most linkable columns'}
containerHeight={220}
barHeight={10}
barHeight={30}
tooltipFormatter={columnTooltipFormatter}
/>
</ChartItem>
@ -132,7 +132,7 @@ export const AggregateStatistics: FC = memo(function AggregateStatistics() {
}
label={'Most linkable attributes'}
containerHeight={220}
barHeight={10}
barHeight={30}
tooltipFormatter={attributeTooltipFormatter}
/>
</ChartItem>

View file

@ -11,6 +11,8 @@ import type { FC } from 'react'
import { memo, useCallback, useMemo } from 'react'
import { Bar } from 'react-chartjs-2'
import { useNominalBoldScale, useNominalScale } from '~utils'
import { ChartContainer } from './ContributionChart.styles.js'
import type { ContributionChartProps } from './ContributionChart.types.js'
@ -47,12 +49,14 @@ export const ContributionChart: FC<ContributionChartProps> = memo(
[labels, onClick],
)
const thematic = useThematic()
const nominalScale = useNominalScale()
const nominalBoldScale = useNominalBoldScale()
const backgroundColor = useMemo(() => {
const normalColor = thematic.scales().nominal().toArray()[0]
const selectedColor = thematic.scales().nominalBold().toArray()[0]
const normalColor = nominalScale[0]
const selectedColor = nominalBoldScale[0]
return labels.map(l => (l === selectedKey ? selectedColor : normalColor))
}, [labels, thematic, selectedKey])
}, [labels, nominalScale, nominalBoldScale, selectedKey])
const labelColors = useMemo(() => {
const greys = thematic.scales().greys().toArray()
@ -67,70 +71,77 @@ export const ContributionChart: FC<ContributionChartProps> = memo(
maxHeight: containerHeight,
}}
>
<Bar
height={Math.max(barHeight * data.length, barHeight)}
data={{
labels: labels,
datasets: [
<div
style={{
height: Math.max(barHeight * data.length, barHeight),
}}
>
<Bar
data={{
labels: labels,
datasets: [
{
label: label,
data: data,
xAxisID: 'xAxis',
yAxisID: 'yAxis',
backgroundColor,
},
],
}}
plugins={[
ChartDataLabels as Plugin<'bar'>,
{
label: label,
data: data,
xAxisID: 'xAxis',
yAxisID: 'yAxis',
backgroundColor,
},
],
}}
plugins={[
ChartDataLabels as Plugin<'bar'>,
{
id: 'event-catcher',
beforeEvent(chart, args, _pluginOptions) {
// the onHover option does not handle the case where
// the mouse leaves the bar well
if (args.event.type === 'mousemove') {
const elements = chart.getActiveElements()
chart.canvas.style.cursor =
elements && elements[0] ? 'pointer' : 'default'
}
},
},
]}
options={{
plugins: {
legend: {
display: false,
},
datalabels: {
anchor: 'start',
align: 'end',
offset: 5,
formatter: value => `${value} %`,
color: labelColors,
},
tooltip: {
callbacks: {
label: tooltipFormatter,
id: 'event-catcher',
beforeEvent(chart, args, _pluginOptions) {
// the onHover option does not handle the case where
// the mouse leaves the bar well
if (args.event.type === 'mousemove') {
const elements = chart.getActiveElements()
chart.canvas.style.cursor =
elements && elements[0] ? 'pointer' : 'default'
}
},
},
},
indexAxis: 'y',
scales: {
xAxis: {
display: false,
grid: {
]}
options={{
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: {
display: false,
},
},
yAxis: {
grid: {
display: false,
datalabels: {
anchor: 'start',
align: 'end',
offset: 5,
formatter: value => `${value} %`,
color: labelColors,
},
tooltip: {
callbacks: {
label: tooltipFormatter,
},
},
},
},
onClick: handleClick,
}}
/>
indexAxis: 'y',
scales: {
xAxis: {
display: false,
grid: {
display: false,
},
},
yAxis: {
grid: {
display: false,
},
},
},
onClick: handleClick,
}}
/>
</div>
</ChartContainer>
</FlexContainer>
)

View file

@ -177,27 +177,24 @@ function convertRawToSynthesisParameters(
} as IAggregateSeededSynthesisParameters
break
case SynthesisMode.DP: {
const deltaFactor =
rawParams.deltaFactor === 0 && rawParams.recordLimit > 0
? Math.log(rawParams.recordLimit)
: rawParams.deltaFactor
const noiseDelta =
rawParams.deltaFactor > 0 && rawParams.recordLimit > 0
? 1.0 / (rawParams.deltaFactor * rawParams.recordLimit)
: undefined
ret = {
...ret,
dpParameters: {
epsilon: rawParams.noiseEpsilon,
delta:
rawParams.recordLimit > 0
? 1.0 / (deltaFactor * rawParams.recordLimit)
: 0.0,
percentilePercentage: rawParams.percentilePercentage,
percentileEpsilonProportion: rawParams.percentileEpsilonProportion,
numberOfRecordsEpsilonProportion:
rawParams.numberOfRecordsEpsilonProportion,
delta: noiseDelta,
sigmaProportions: generateSigmaProportions(
rawParams.reportingLength,
rawParams.accuracyMode,
),
numberOfRecordsEpsilonProportion:
rawParams.numberOfRecordsEpsilonProportion,
},
noiseThreshold: {
type: 'Adaptive',

View file

@ -5,3 +5,5 @@ Factor used to calculate the delta DP parameter.
If set to `0`, it will default at runtime to `ln(record limit)`, resulting in:
`Delta = 1 / (ln(record limit) * [record limit])`
When set to `0`, the record limit will also be protected with differential privacy, consuming a portion of the privacy budget. See the `Number of records epsilon proportion` parameter.

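As a worked example of the documented default, with a record limit of 6000:

\delta = \frac{1}{\ln(6000)\cdot 6000} \approx \frac{1}{8.70\times 6000} \approx 1.9\times 10^{-5}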
View file

@ -1 +1 @@
Percentile selection is performed with differential privacy. This defines the proportion of the overall epsilon dedicated to this stage (0.1 means 10%).
Percentile selection is performed with differential privacy. This defines the proportion of the overall epsilon dedicated to this stage (0.01 means 1%).

View file

@ -4,3 +4,4 @@
*/
export * from './arquero.js'
export * from './env.js'
export * from './thematic.js'

View file

@ -0,0 +1,33 @@
/*!
* Copyright (c) Microsoft. All rights reserved.
* Licensed under the MIT license. See LICENSE file in the project.
*/
import { useThematic } from '@thematic/react'
import { useMemo } from 'react'
export function useNominalScale(): string[] {
const themetic = useThematic()
return useMemo(() => {
// adding blue from previous versions to keep consistency
return ['rgb(128 172 247)', ...themetic.scales().nominal().toArray()]
}, [themetic])
}
export function useNominalBoldScale(): string[] {
const themetic = useThematic()
return useMemo(() => {
// adding blue from previous versions to keep consistency
return ['rgb(0 95 174)', ...themetic.scales().nominalBold().toArray()]
}, [themetic])
}
export function useNominalMutedScale(): string[] {
const themetic = useThematic()
return useMemo(() => {
// adding blue from previous versions to keep consistency
return ['rgb(207 212 228)', ...themetic.scales().nominalMuted().toArray()]
}, [themetic])
}

View file

@ -2,7 +2,7 @@
ARG REGISTRY
# --- compile wasm bindings from rust ---
FROM ${REGISTRY}rust:1.60 as wasm-builder
FROM ${REGISTRY}rust:1.64 as wasm-builder
# install wasm-pack to build wasm bindings
RUN cargo install wasm-pack

8121 yarn.lock

File diff not shown because of its large size. Load diff