- creating to_raw_data and to_data_frame methods for Dataset

- bug fix for empty value mapping with multi value columns
This commit is contained in:
Rodrigo Racanicci 2022-07-26 12:42:35 -03:00
Parent 167ce16107
Commit d017bffa2b
14 changed files with 1035 additions and 412 deletions

.vscode/settings.json (vendored)
View File

@@ -26,5 +26,6 @@
},
"[python]": {
"editor.defaultFormatter": "ms-python.python"
}
},
"cSpell.enabled": true
}

View File

@@ -375,6 +375,7 @@ fn main() {
gd.write_synthetic_data(
&synthetic_path,
synthetic_delimiter.chars().next().unwrap(),
"",
join_multi_value_columns,
long_form,
)

View File

@@ -4,14 +4,17 @@ use super::{
DataBlockHeaders, DataBlockRecords,
},
value::DataBlockValue,
MultiValueColumnMetadataMap,
MultiValueColumnMetadataMap, RawData, RawDataMultiValueColumnJoiner,
};
use fnv::FnvHashMap;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use crate::{processing::aggregator::RecordsSet, utils::math::uround_down};
use crate::{
processing::{aggregator::RecordsSet, generator::SynthesizerCacheKey},
utils::math::uround_down,
};
#[cfg(feature = "pyo3")]
use pyo3::prelude::*;
@@ -194,4 +197,73 @@ impl DataBlock {
usize::min(reporting_length, self.headers.len())
}
}
#[inline]
/// Creates a `RawData` vector for the stored data,
/// where the first entry is the headers
/// # Arguments
/// * `empty_value` - Empty values will be replaced by this
pub fn to_raw_data(&self, empty_value: &Arc<String>) -> RawData {
let mut raw_data: RawData = vec![self.headers.iter().map(|h| (*h).clone()).collect()];
let n_headers = self.headers.len();
raw_data.append(
&mut self
.records
.iter()
.map(|r| SynthesizerCacheKey::new(n_headers, &r.values).format_record(empty_value))
.collect(),
);
raw_data
}
#[inline]
/// Clones the data block data to a `Vec<Vec<String>>`,
/// where the first entry is the headers
/// # Arguments
/// * `empty_value` - Empty values will be replaced by this
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
pub fn to_raw_data_vec(
&self,
empty_value: &Arc<String>,
join_multi_value_columns: bool,
) -> Vec<Vec<String>> {
Self::raw_data_to_vec(
&self.to_raw_data(empty_value),
empty_value,
&self.multi_value_column_metadata_map,
join_multi_value_columns,
)
}
#[inline]
/// Clones the `raw_data` data to a `Vec<Vec<String>>`,
/// where the first entry is the headers
/// # Arguments
/// * `raw_data` - Raw data to be cloned
/// * `empty_value` - Empty values will be replaced by this
/// * `multi_value_column_metadata_map` - Maps a normalized multi-value header name (such as A_a1)
/// to its corresponding metadata
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
pub fn raw_data_to_vec(
raw_data: &RawData,
empty_value: &Arc<String>,
multi_value_column_metadata_map: &MultiValueColumnMetadataMap,
join_multi_value_columns: bool,
) -> Vec<Vec<String>> {
let mut raw_data_vec = if join_multi_value_columns {
RawDataMultiValueColumnJoiner::new(
raw_data,
multi_value_column_metadata_map,
empty_value,
)
.join()
} else {
raw_data.clone()
};
raw_data_vec
.drain(..)
.map(|mut record| record.drain(..).map(|value| (*value).clone()).collect())
.collect()
}
}

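For reference, a minimal sketch of how the new DataBlock conversion helpers added above might be called, assuming a data_block: DataBlock value is already in scope (the variable name is illustrative):

use std::sync::Arc;

// keep multi-value columns spread out and leave empty cells as ""
let rows: Vec<Vec<String>> = data_block.to_raw_data_vec(&Arc::new("".to_owned()), false);

// the first entry holds the headers, the remaining entries hold the records
let headers = &rows[0];
let records = &rows[1..];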
View File

@@ -17,13 +17,15 @@ enum JoinSpec {
}
/// Helper to join columns in the raw data that were spread using multiple values
pub struct RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map> {
pub struct RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map, 'empty_value>
{
raw_data: &'raw_data [CsvRecordRef],
multi_value_column_metadata_map: &'multi_value_column_metadata_map MultiValueColumnMetadataMap,
empty_value: &'empty_value Arc<String>,
}
impl<'raw_data, 'multi_value_column_metadata_map>
RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map>
impl<'raw_data, 'multi_value_column_metadata_map, 'empty_value>
RawDataMultiValueColumnJoiner<'raw_data, 'multi_value_column_metadata_map, 'empty_value>
{
/// Creates a new joiner
/// # Arguments
@@ -33,10 +35,12 @@ impl<'raw_data, 'multi_value_column_metadata_map>
pub fn new(
raw_data: &'raw_data [CsvRecordRef],
multi_value_column_metadata_map: &'multi_value_column_metadata_map MultiValueColumnMetadataMap,
empty_value: &'empty_value Arc<String>,
) -> Self {
RawDataMultiValueColumnJoiner {
raw_data,
multi_value_column_metadata_map,
empty_value,
}
}
@@ -113,21 +117,25 @@ impl<'raw_data, 'multi_value_column_metadata_map>
new_record.push(record[*value_index].clone());
}
JoinSpec::MultiValue(entry) => {
new_record.push(Arc::new(
record
.iter()
.take(entry.end_index + 1)
.skip(entry.start_index)
.enumerate()
.filter_map(|(attr_index, value)| {
if **value == "1" {
Some(entry.attributes[attr_index].clone())
} else {
None
}
})
.join(&entry.delimiter),
));
let new_value = record
.iter()
.take(entry.end_index + 1)
.skip(entry.start_index)
.enumerate()
.filter_map(|(attr_index, value)| {
if **value == "1" {
Some(entry.attributes[attr_index].clone())
} else {
None
}
})
.join(&entry.delimiter);
new_record.push(if !new_value.is_empty() {
Arc::new(new_value)
} else {
self.empty_value.clone()
});
}
}
}

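The bug fix above can be illustrated in isolation: when none of the spread attribute columns of a record is flagged with "1", the joined cell now falls back to the configured empty value instead of being left blank. A standalone sketch of that logic, not the library code itself (names are illustrative):

use std::sync::Arc;

// attributes: the multi-value attribute names (e.g. c1, c2); flags: their "0"/"1" cells for one record
fn join_multi_value_cell(
    attributes: &[Arc<String>],
    flags: &[Arc<String>],
    delimiter: &str,
    empty_value: &Arc<String>,
) -> Arc<String> {
    let joined = flags
        .iter()
        .enumerate()
        .filter_map(|(i, value)| {
            if **value == "1" {
                Some((*attributes[i]).clone())
            } else {
                None
            }
        })
        .collect::<Vec<String>>()
        .join(delimiter);
    if joined.is_empty() {
        // previously an empty string ended up here; the fix substitutes the empty value
        empty_value.clone()
    } else {
        Arc::new(joined)
    }
}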
View File

@@ -1,7 +1,7 @@
use csv::Writer;
use csv::WriterBuilder;
use log::info;
use std::io::Write;
use std::{io::Write, sync::Arc};
#[cfg(feature = "pyo3")]
use pyo3::prelude::*;
@@ -10,7 +10,9 @@ use pyo3::prelude::*;
use crate::data_block::CsvRecord;
use crate::{
data_block::{CsvIOError, MultiValueColumnMetadataMap, RawData, RawDataMultiValueColumnJoiner},
data_block::{
CsvIOError, DataBlock, MultiValueColumnMetadataMap, RawData, RawDataMultiValueColumnJoiner,
},
utils::time::ElapsedDurationLogger,
};
@@ -60,6 +62,7 @@ impl GeneratedData {
&self,
writer: &mut T,
delimiter: char,
empty_value: &str,
join_multi_value_columns: bool,
long_form: bool,
) -> Result<(), CsvIOError> {
@@ -72,6 +75,7 @@ impl GeneratedData {
joined_synthetic_data = RawDataMultiValueColumnJoiner::new(
&self.synthetic_data,
&self.multi_value_column_metadata_map,
&Arc::new(empty_value.to_owned()),
)
.join();
&joined_synthetic_data
@@ -161,12 +165,14 @@ impl GeneratedData {
/// # Arguments
/// * `path` - File path to be written
/// * `delimiter` - Delimiter to use when writing to `path`
/// * `empty_value` - Empty values will be replaced by this
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
/// * `long_form` - Pivots column headers and value pairs to key-value row entries.
pub fn write_synthetic_data(
&self,
path: &str,
delimiter: char,
empty_value: &str,
join_multi_value_columns: bool,
long_form: bool,
) -> Result<(), CsvIOError> {
@@ -178,17 +184,25 @@ impl GeneratedData {
info!("writing file {}", path);
self._write_synthetic_data(&mut file, delimiter, join_multi_value_columns, long_form)
self._write_synthetic_data(
&mut file,
delimiter,
empty_value,
join_multi_value_columns,
long_form,
)
}
/// Generates a CSV string from the synthetic data
/// # Arguments
/// * `delimiter` - CSV delimiter to use
/// * `empty_value` - Empty values will be replaced by this
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
/// * `long_form` - Pivots column headers and value pairs to key-value row entries.
pub fn synthetic_data_to_string(
&self,
delimiter: char,
empty_value: &str,
join_multi_value_columns: bool,
long_form: bool,
) -> Result<String, CsvIOError> {
@@ -197,6 +211,7 @@ impl GeneratedData {
self._write_synthetic_data(
&mut csv_data,
delimiter,
empty_value,
join_multi_value_columns,
long_form,
)?;
@@ -205,22 +220,20 @@ impl GeneratedData {
}
/// Clones the raw synthetic data to a `Vec<Vec<String>>`,
/// where the first entry are the headers
/// where the first entry is the headers
/// # Arguments
/// * `empty_value` - Empty values will be replaced by this
/// * `join_multi_value_columns` - Whether multi value columns should be joined back together or not
pub fn synthetic_data_to_vec(&self, join_multi_value_columns: bool) -> Vec<Vec<String>> {
let mut synthetic_data = if join_multi_value_columns {
RawDataMultiValueColumnJoiner::new(
&self.synthetic_data,
&self.multi_value_column_metadata_map,
)
.join()
} else {
self.synthetic_data.clone()
};
synthetic_data
.drain(..)
.map(|mut record| record.drain(..).map(|value| (*value).clone()).collect())
.collect()
pub fn synthetic_data_to_vec(
&self,
empty_value: &str,
join_multi_value_columns: bool,
) -> Vec<Vec<String>> {
DataBlock::raw_data_to_vec(
&self.synthetic_data,
&Arc::new(empty_value.to_owned()),
&self.multi_value_column_metadata_map,
join_multi_value_columns,
)
}
}

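With the added parameter, callers of GeneratedData now pass the empty-value placeholder explicitly. A sketch of both call sites, assuming generated_data: GeneratedData is in scope and the enclosing function can propagate CsvIOError (the path and placeholder strings are illustrative):

// write to disk: tab delimiter, "" for empty cells, multi-value columns kept spread, wide form
generated_data.write_synthetic_data("./synthetic_data.tsv", '\t', "", false, false)?;

// clone into memory, joining multi-value columns and marking empty cells with "empty"
let rows: Vec<Vec<String>> = generated_data.synthetic_data_to_vec("empty", true);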
View File

@@ -256,3 +256,97 @@ fn validate_normalize_reporting_length() {
assert_eq!(db.normalize_reporting_length(10), 4);
assert_eq!(db.normalize_reporting_length(2), 2);
}
#[test]
fn validate_to_raw_data_vec() {
let db = read_test_data_block(
TEST_FILE_PATH,
DELIMITER,
None,
&[],
&HashMap::default(),
&[],
0,
);
let raw_data = db.to_raw_data_vec(&Arc::new("".to_owned()), false);
assert_eq!(
raw_data,
vec![
vec![
"A".to_owned(),
"B".to_owned(),
"C".to_owned(),
"D".to_owned()
],
vec![
"a1".to_owned(),
"b1".to_owned(),
"c1".to_owned(),
"d1".to_owned()
],
vec![
"a2".to_owned(),
"b2".to_owned(),
"".to_owned(),
"d2".to_owned()
],
vec![
"a1".to_owned(),
"b2".to_owned(),
"".to_owned(),
"d3".to_owned()
],
]
);
}
#[test]
fn validate_to_raw_data_vec_with_multi_value_columns() {
let db = read_test_data_block(
TEST_FILE_PATH,
DELIMITER,
None,
&[],
&[
("C".to_owned(), ";".to_owned()),
("D".to_owned(), "|".to_owned()),
]
.iter()
.cloned()
.collect(),
&[],
0,
);
let raw_data = db.to_raw_data_vec(&Arc::new("empty".to_owned()), true);
assert_eq!(
raw_data,
vec![
vec![
"A".to_owned(),
"B".to_owned(),
"C".to_owned(),
"D".to_owned()
],
vec![
"a1".to_owned(),
"b1".to_owned(),
"c1".to_owned(),
"d1".to_owned()
],
vec![
"a2".to_owned(),
"b2".to_owned(),
"empty".to_owned(),
"d2".to_owned()
],
vec![
"a1".to_owned(),
"b2".to_owned(),
"empty".to_owned(),
"d3".to_owned()
],
]
);
}

View File

@@ -99,8 +99,12 @@ fn valid_duplicated_id() {
}));
assert!(
RawDataMultiValueColumnJoiner::new(&raw_data, &data_block.multi_value_column_metadata_map)
.join()
RawDataMultiValueColumnJoiner::new(
&raw_data,
&data_block.multi_value_column_metadata_map,
&empty_value,
)
.join()
== expected
);
}

View File

@@ -55,8 +55,75 @@ fn validate_multi_value_column_joiner() {
}));
assert!(
RawDataMultiValueColumnJoiner::new(&raw_data, &data_block.multi_value_column_metadata_map)
.join()
RawDataMultiValueColumnJoiner::new(
&raw_data,
&data_block.multi_value_column_metadata_map,
&empty_value,
)
.join()
== expected
);
}
#[test]
fn validate_multi_value_column_joiner_with_different_empty_values() {
let data_block = read_test_data_block(
TEST_FILE_PATH,
DELIMITER,
None,
&[],
&[
("B".to_owned(), ";".to_owned()),
("C".to_owned(), ";".to_owned()),
("D".to_owned(), "|".to_owned()),
]
.iter()
.cloned()
.collect(),
&["B".to_owned()],
0,
);
let mut raw_data = RawData::default();
let empty_value = Arc::new("empty".to_owned());
let expected = [
["ID", "A", "B", "C", "F", "G", "D"].map(|s| Arc::new(s.to_owned())),
[
"1",
"a1",
"0;b1;b3;b4",
"c1;c2;c3",
"empty",
"empty",
"d1|d3",
]
.map(|s| Arc::new(s.to_owned())),
["2", "a1", "b1", "c1", "empty", "empty", "d1"].map(|s| Arc::new(s.to_owned())),
["3", "a1", "b1", "c1", "empty", "empty", "d1"].map(|s| Arc::new(s.to_owned())),
["4", "a1", "b1;b2;b3", "c2", "1", "empty", "d2"].map(|s| Arc::new(s.to_owned())),
["5", "a2", "b1", "c2", "empty", "empty", "d2"].map(|s| Arc::new(s.to_owned())),
["6", "a2", "b2;b3", "c1", "empty", "empty", "d1|d4"].map(|s| Arc::new(s.to_owned())),
["7", "a2", "b2", "c2", "1", "empty", "d2"].map(|s| Arc::new(s.to_owned())),
["8", "a2", "b2", "c2", "empty", "empty", "d3"].map(|s| Arc::new(s.to_owned())),
["9", "a2", "b2", "c1;c3;c4", "empty", "empty", "d2"].map(|s| Arc::new(s.to_owned())),
["10", "a2", "empty_multi", "c2", "empty", "empty", "d2"].map(|s| Arc::new(s.to_owned())),
["11", "a3", "b2", "c2", "empty", "empty", "d2"].map(|s| Arc::new(s.to_owned())),
]
.iter()
.cloned()
.collect_vec();
raw_data.push(data_block.headers.to_vec());
raw_data.extend(data_block.records.iter().map(|r| {
SynthesizerCacheKey::new(data_block.headers.len(), &r.values).format_record(&empty_value)
}));
assert!(
RawDataMultiValueColumnJoiner::new(
&raw_data,
&data_block.multi_value_column_metadata_map,
&Arc::new("empty_multi".to_owned()),
)
.join()
== expected
);
}

File diff suppressed because it is too large. Load Diff

View File

@@ -9,7 +9,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
@@ -32,7 +32,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
@@ -50,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
@@ -66,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
@@ -98,7 +98,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
@@ -118,7 +118,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 97,
"metadata": {},
"outputs": [
{
@@ -170,29 +170,29 @@
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>1.008167</td>\n",
" <td>2.631000</td>\n",
" <td>4.590833</td>\n",
" <td>0.511667</td>\n",
" <td>0.510500</td>\n",
" <td>0.500000</td>\n",
" <td>0.496167</td>\n",
" <td>0.496500</td>\n",
" <td>0.498500</td>\n",
" <td>0.496333</td>\n",
" <td>1.012500</td>\n",
" <td>2.599167</td>\n",
" <td>4.537500</td>\n",
" <td>0.503000</td>\n",
" <td>0.500333</td>\n",
" <td>0.506500</td>\n",
" <td>0.504667</td>\n",
" <td>0.514333</td>\n",
" <td>0.502667</td>\n",
" <td>0.499833</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.816626</td>\n",
" <td>2.112167</td>\n",
" <td>3.340800</td>\n",
" <td>0.499906</td>\n",
" <td>0.499931</td>\n",
" <td>0.817999</td>\n",
" <td>2.117998</td>\n",
" <td>3.335733</td>\n",
" <td>0.500033</td>\n",
" <td>0.500042</td>\n",
" <td>0.499999</td>\n",
" <td>0.500020</td>\n",
" <td>0.499836</td>\n",
" <td>0.500035</td>\n",
" <td>0.500042</td>\n",
" <td>0.500027</td>\n",
" <td>0.500029</td>\n",
" <td>0.500039</td>\n",
" <td>0.500028</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
@@ -210,8 +210,8 @@
" <tr>\n",
" <th>25%</th>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
@@ -223,21 +223,21 @@
" <tr>\n",
" <th>50%</th>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>4.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.500000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>2.000000</td>\n",
" <td>5.000000</td>\n",
" <td>8.000000</td>\n",
" <td>4.000000</td>\n",
" <td>7.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
@@ -266,26 +266,26 @@
"text/plain": [
" H1 H2 H3 H4 H5 \\\n",
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
"mean 1.008167 2.631000 4.590833 0.511667 0.510500 \n",
"std 0.816626 2.112167 3.340800 0.499906 0.499931 \n",
"mean 1.012500 2.599167 4.537500 0.503000 0.500333 \n",
"std 0.817999 2.117998 3.335733 0.500033 0.500042 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 1.000000 2.000000 0.000000 0.000000 \n",
"50% 1.000000 3.000000 4.000000 1.000000 1.000000 \n",
"75% 2.000000 5.000000 8.000000 1.000000 1.000000 \n",
"25% 0.000000 0.000000 1.000000 0.000000 0.000000 \n",
"50% 1.000000 2.000000 4.000000 1.000000 1.000000 \n",
"75% 2.000000 4.000000 7.000000 1.000000 1.000000 \n",
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
"\n",
" H6 H7 H8 H9 H10 \n",
"count 6000.000000 6000.000000 6000.000000 6000.000000 6000.000000 \n",
"mean 0.500000 0.496167 0.496500 0.498500 0.496333 \n",
"std 0.500042 0.500027 0.500029 0.500039 0.500028 \n",
"mean 0.506500 0.504667 0.514333 0.502667 0.499833 \n",
"std 0.499999 0.500020 0.499836 0.500035 0.500042 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"50% 0.500000 0.000000 0.000000 0.000000 0.000000 \n",
"50% 1.000000 1.000000 1.000000 1.000000 0.000000 \n",
"75% 1.000000 1.000000 1.000000 1.000000 1.000000 \n",
"max 1.000000 1.000000 1.000000 1.000000 1.000000 "
]
},
"execution_count": 76,
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
@@ -296,7 +296,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 98,
"metadata": {},
"outputs": [
{
@@ -335,42 +335,42 @@
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6024.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" <td>6003.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.921647</td>\n",
" <td>2.335159</td>\n",
" <td>4.047643</td>\n",
" <td>0.495684</td>\n",
" <td>0.481740</td>\n",
" <td>0.475100</td>\n",
" <td>0.482404</td>\n",
" <td>0.469788</td>\n",
" <td>0.482902</td>\n",
" <td>0.483898</td>\n",
" <td>0.913043</td>\n",
" <td>2.156755</td>\n",
" <td>3.909545</td>\n",
" <td>0.489255</td>\n",
" <td>0.488089</td>\n",
" <td>0.492754</td>\n",
" <td>0.478094</td>\n",
" <td>0.481759</td>\n",
" <td>0.489755</td>\n",
" <td>0.479427</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.827974</td>\n",
" <td>2.155591</td>\n",
" <td>3.490394</td>\n",
" <td>0.500023</td>\n",
" <td>0.499708</td>\n",
" <td>0.499421</td>\n",
" <td>0.499732</td>\n",
" <td>0.499128</td>\n",
" <td>0.499749</td>\n",
" <td>0.499782</td>\n",
" <td>0.832789</td>\n",
" <td>2.159173</td>\n",
" <td>3.476390</td>\n",
" <td>0.499926</td>\n",
" <td>0.499900</td>\n",
" <td>0.499989</td>\n",
" <td>0.499562</td>\n",
" <td>0.499709</td>\n",
" <td>0.499937</td>\n",
" <td>0.499618</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
@@ -402,7 +402,7 @@
" <th>50%</th>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>4.000000</td>\n",
" <td>3.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
@@ -443,19 +443,19 @@
],
"text/plain": [
" H1 H2 H3 H4 H5 \\\n",
"count 6024.000000 6024.000000 6024.000000 6024.000000 6024.000000 \n",
"mean 0.921647 2.335159 4.047643 0.495684 0.481740 \n",
"std 0.827974 2.155591 3.490394 0.500023 0.499708 \n",
"count 6003.000000 6003.000000 6003.000000 6003.000000 6003.000000 \n",
"mean 0.913043 2.156755 3.909545 0.489255 0.488089 \n",
"std 0.832789 2.159173 3.476390 0.499926 0.499900 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"50% 1.000000 2.000000 4.000000 0.000000 0.000000 \n",
"50% 1.000000 2.000000 3.000000 0.000000 0.000000 \n",
"75% 2.000000 4.000000 7.000000 1.000000 1.000000 \n",
"max 2.000000 6.000000 10.000000 1.000000 1.000000 \n",
"\n",
" H6 H7 H8 H9 H10 \n",
"count 6024.000000 6024.000000 6024.000000 6024.000000 6024.000000 \n",
"mean 0.475100 0.482404 0.469788 0.482902 0.483898 \n",
"std 0.499421 0.499732 0.499128 0.499749 0.499782 \n",
"count 6003.000000 6003.000000 6003.000000 6003.000000 6003.000000 \n",
"mean 0.492754 0.478094 0.481759 0.489755 0.479427 \n",
"std 0.499989 0.499562 0.499709 0.499937 0.499618 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"50% 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
@@ -463,7 +463,7 @@
"max 1.000000 1.000000 1.000000 1.000000 1.000000 "
]
},
"execution_count": 77,
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}

View File

@@ -79,7 +79,10 @@ impl DpAggregatedSeededSynthesizer {
target_number_of_records,
&mut create_progress_reporter(),
)?;
Ok(generated_data.synthetic_data_to_vec(join_multi_value_columns.unwrap_or(false)))
Ok(generated_data.synthetic_data_to_vec(
&self._parameters.empty_value,
join_multi_value_columns.unwrap_or(false),
))
} else {
Err(PyRuntimeError::new_err(
"make sure 'fit' method has been successfully called first",

View File

@@ -101,6 +101,27 @@ impl Dataset {
})
}
pub fn to_raw_data(
&self,
empty_value: Option<String>,
join_multi_value_columns: Option<bool>,
) -> DatasetRawData {
self.data_block.to_raw_data_vec(
&empty_value
.map(Arc::new)
.unwrap_or_else(|| Arc::new("".to_owned())),
join_multi_value_columns.unwrap_or(false),
)
}
pub fn to_data_frame(
&self,
empty_value: Option<String>,
join_multi_value_columns: Option<bool>,
) -> PyResult<PyObject> {
Self::raw_data_to_data_frame(self.to_raw_data(empty_value, join_multi_value_columns))
}
pub fn get_aggregates(
&self,
reporting_length: usize,

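A sketch of the two new Dataset methods from the Rust side, assuming a dataset: Dataset value and a PyResult-returning caller for the data-frame conversion (variable names are illustrative):

// defaults: empty cells become "" and multi-value columns stay spread out
let raw: DatasetRawData = dataset.to_raw_data(None, None);

// replace empty cells with "empty" and join multi-value columns back together
let joined = dataset.to_raw_data(Some("empty".to_owned()), Some(true));

// convert to a Python data frame object through the pyo3 bindings
let df = dataset.to_data_frame(None, Some(true))?;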
View File

@@ -51,7 +51,7 @@ impl WasmGenerateResult {
long_form: bool,
) -> JsResult<String> {
self.generated_data
.synthetic_data_to_string(delimiter, join_multi_value_columns, long_form)
.synthetic_data_to_string(delimiter, "", join_multi_value_columns, long_form)
.map_err(|err| JsValue::from(err.to_string()))
}

View File

@@ -93,7 +93,8 @@ def generate(config):
else:
raise ValueError(f'invalid synthesis mode: {synthesis_mode}')
generated_data.write_synthetic_data(synthetic_microdata_path, '\t', False, False)
generated_data.write_synthetic_data(
synthetic_microdata_path, '\t', '', False, False)
syn_ratio = generated_data.expansion_ratio
config['expansion_ratio'] = syn_ratio